├── .gitignore ├── .idea ├── encodings.xml ├── misc.xml ├── modules.xml ├── vcs.xml └── wl-graph-kernels.iml ├── LICENSE ├── README.md ├── data ├── .gitkeep ├── Lexicon_NamedRockUnit.nt └── download_datasets.sh ├── example_graphs ├── 07-Graph.dot ├── 07-Graph.pdf ├── 07-almost_relabeled.dot ├── 07-almost_relabeled.pdf ├── 07-relabeled.dot ├── 07-relabeled.pdf ├── 07-relabeled_vertical.dot ├── 07-relabeled_vertical.pdf ├── 07-subGraph_A1_B1.dot ├── 07-subGraph_A1_B1.pdf ├── 07-subGraph_A1_B1_vertical.dot ├── 07-subGraph_A1_B1_vertical.pdf └── \ ├── notebooks ├── affiliation_scores.ipynb ├── affiliation_timing.ipynb ├── lithogenesis_scores.ipynb ├── lithogenesis_timing.ipynb └── no_labels_scores.ipynb ├── presentation ├── img │ ├── 07-Graph.pdf │ ├── 07-almost_relabeled.pdf │ ├── 07-relabeled.pdf │ ├── 07-relabeled_vertical.pdf │ ├── 07-subGraph_A1_B1.pdf │ ├── 07-subGraph_A1_B1_vertical.pdf │ ├── wl_iteration_total.png │ └── wl_iteration_upper.png ├── presentation.nav ├── presentation.pdf └── presentation.tex ├── report ├── RefereeReport.pdf ├── RefereeReport.tex └── img │ ├── affiliation_timing.png │ └── lithogenesis_timing.png ├── requirements.txt ├── results ├── affiliation_timing.png ├── csv_to_latex.py ├── lithogenesis_timing.png ├── wl_affiliation_results.csv ├── wl_affiliation_results_with_normalization.csv ├── wl_affiliation_results_with_normalization.tex ├── wl_lithogenesis_results_with_normalization.csv ├── wl_lithogenesis_results_with_normalization.tex ├── wl_no_labels.csv ├── wl_no_labels.tex ├── wlrdf_affiliation_results.csv ├── wlrdf_affiliation_results_with_normalization.csv ├── wlrdf_affiliation_results_with_normalization.tex ├── wlrdf_lithogenesis_results.csv ├── wlrdf_lithogenesis_results_with_normalization.csv ├── wlrdf_lithogenesis_results_with_normalization.tex ├── wlrdf_no_labels.csv └── wlrdf_no_labels.tex ├── setup.py ├── tests ├── __init__.py ├── resources │ ├── __init__.py │ └── example.ttl └── wlkernel_test.py └── wlkernel ├── .idea ├── 
encodings.xml ├── misc.xml ├── modules.xml ├── vcs.xml ├── wlkernel.iml └── workspace.xml ├── __init__.py └── _wlkernel.py /.gitignore: -------------------------------------------------------------------------------- 1 | */aifbfixed_complete.n3 2 | */Lexicon_NamedRockUnit.nt 3 | ./Report/*.aux 4 | ./Report/*.out 5 | ./Report/*.synctex.gz 6 | ./Report/*.err 7 | ./Report/*.log 8 | # Created by https://www.gitignore.io/api/python,pycharm,jupyternotebook,jupyternotebooks 9 | # Edit at https://www.gitignore.io/?templates=python,pycharm,jupyternotebook,jupyternotebooks 10 | 11 | ### JupyterNotebook ### 12 | .ipynb_checkpoints 13 | */.ipynb_checkpoints/* 14 | 15 | # Remove previous ipynb_checkpoints 16 | # git rm -r .ipynb_checkpoints/ 17 | # 18 | 19 | ### JupyterNotebooks ### 20 | # gitignore template for Jupyter Notebooks 21 | # website: http://jupyter.org/ 22 | 23 | 24 | # Remove previous ipynb_checkpoints 25 | # git rm -r .ipynb_checkpoints/ 26 | 27 | ### PyCharm ### 28 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 29 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 30 | 31 | # User-specific stuff 32 | .idea/**/workspace.xml 33 | .idea/**/tasks.xml 34 | .idea/**/usage.statistics.xml 35 | .idea/**/dictionaries 36 | .idea/**/shelf 37 | 38 | # Generated files 39 | .idea/**/contentModel.xml 40 | 41 | # Sensitive or high-churn files 42 | .idea/**/dataSources/ 43 | .idea/**/dataSources.ids 44 | .idea/**/dataSources.local.xml 45 | .idea/**/sqlDataSources.xml 46 | .idea/**/dynamic.xml 47 | .idea/**/uiDesigner.xml 48 | .idea/**/dbnavigator.xml 49 | 50 | # Gradle 51 | .idea/**/gradle.xml 52 | .idea/**/libraries 53 | 54 | # Gradle and Maven with auto-import 55 | # When using Gradle or Maven with auto-import, you should exclude module files, 56 | # since they will be recreated, and may cause churn. Uncomment if using 57 | # auto-import. 
58 | # .idea/modules.xml 59 | # .idea/*.iml 60 | # .idea/modules 61 | 62 | # CMake 63 | cmake-build-*/ 64 | 65 | # Mongo Explorer plugin 66 | .idea/**/mongoSettings.xml 67 | 68 | # File-based project format 69 | *.iws 70 | 71 | # IntelliJ 72 | out/ 73 | 74 | # mpeltonen/sbt-idea plugin 75 | .idea_modules/ 76 | 77 | # JIRA plugin 78 | atlassian-ide-plugin.xml 79 | 80 | # Cursive Clojure plugin 81 | .idea/replstate.xml 82 | 83 | # Crashlytics plugin (for Android Studio and IntelliJ) 84 | com_crashlytics_export_strings.xml 85 | crashlytics.properties 86 | crashlytics-build.properties 87 | fabric.properties 88 | 89 | # Editor-based Rest Client 90 | .idea/httpRequests 91 | 92 | # Android studio 3.1+ serialized cache file 93 | .idea/caches/build_file_checksums.ser 94 | 95 | # JetBrains templates 96 | **___jb_tmp___ 97 | 98 | ### PyCharm Patch ### 99 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 100 | 101 | # *.iml 102 | # modules.xml 103 | # .idea/misc.xml 104 | # *.ipr 105 | 106 | # Sonarlint plugin 107 | .idea/sonarlint 108 | 109 | ### Python ### 110 | # Byte-compiled / optimized / DLL files 111 | __pycache__/ 112 | *.py[cod] 113 | *$py.class 114 | 115 | # C extensions 116 | *.so 117 | 118 | # Distribution / packaging 119 | .Python 120 | build/ 121 | develop-eggs/ 122 | dist/ 123 | downloads/ 124 | eggs/ 125 | .eggs/ 126 | lib/ 127 | lib64/ 128 | parts/ 129 | sdist/ 130 | var/ 131 | wheels/ 132 | pip-wheel-metadata/ 133 | share/python-wheels/ 134 | *.egg-info/ 135 | .installed.cfg 136 | *.egg 137 | MANIFEST 138 | 139 | # PyInstaller 140 | # Usually these files are written by a python script from a template 141 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
142 | *.manifest 143 | *.spec 144 | 145 | # Installer logs 146 | pip-log.txt 147 | pip-delete-this-directory.txt 148 | 149 | # Unit test / coverage reports 150 | htmlcov/ 151 | .tox/ 152 | .nox/ 153 | .coverage 154 | .coverage.* 155 | .cache 156 | nosetests.xml 157 | coverage.xml 158 | *.cover 159 | .hypothesis/ 160 | .pytest_cache/ 161 | 162 | # Translations 163 | *.mo 164 | *.pot 165 | 166 | # Django stuff: 167 | *.log 168 | local_settings.py 169 | db.sqlite3 170 | 171 | # Flask stuff: 172 | instance/ 173 | .webassets-cache 174 | 175 | # Scrapy stuff: 176 | .scrapy 177 | 178 | # Sphinx documentation 179 | docs/_build/ 180 | 181 | # PyBuilder 182 | target/ 183 | 184 | # Jupyter Notebook 185 | 186 | # IPython 187 | profile_default/ 188 | ipython_config.py 189 | 190 | # pyenv 191 | .python-version 192 | 193 | # pipenv 194 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 195 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 196 | # having no cross-platform support, pipenv may install dependencies that don’t work, or not 197 | # install all needed dependencies. 
198 | #Pipfile.lock 199 | 200 | # celery beat schedule file 201 | celerybeat-schedule 202 | 203 | # SageMath parsed files 204 | *.sage.py 205 | 206 | # Environments 207 | .env 208 | .venv 209 | env/ 210 | venv/ 211 | ENV/ 212 | env.bak/ 213 | venv.bak/ 214 | 215 | # Spyder project settings 216 | .spyderproject 217 | .spyproject 218 | 219 | # Rope project settings 220 | .ropeproject 221 | 222 | # mkdocs documentation 223 | /site 224 | 225 | # mypy 226 | .mypy_cache/ 227 | .dmypy.json 228 | dmypy.json 229 | 230 | # Pyre type checker 231 | .pyre/ 232 | 233 | # End of https://www.gitignore.io/api/python,pycharm,jupyternotebook,jupyternotebooks 234 | 235 | *.aux 236 | *.log 237 | *.out 238 | *synctex.gz 239 | *.toc 240 | *.vrb 241 | *.snm 242 | -------------------------------------------------------------------------------- /.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/wl-graph-kernels.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | 
Copyright (c) 2019 lorenzo palloni 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Weisfeiler-Lehman Graph Kernels 2 | 3 | ## Installation 4 | 5 | Python >= 3.6 is supported. 6 | 7 | $ git clone https://github.com/deeplego/wl-graph-kernels.git 8 | $ cd wl-graph-kernels 9 | $ pip install -r requirements.txt 10 | $ pip install . 11 | 12 | ## Usage 13 | 14 | To download the datasets of the experiments: 15 | 16 | $ cd data 17 | $ ./download_datasets.sh 18 | 19 | The experiments are replicated in the jupyter notebooks in the `notebooks` 20 | directory. 
21 | -------------------------------------------------------------------------------- /data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzopalloni/wl-graph-kernels/ea046737f91017380090cb8d061efc1a44fefb5e/data/.gitkeep -------------------------------------------------------------------------------- /data/download_datasets.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ $(basename $(pwd)) != "data" ]] 4 | then 5 | echo "This script must be run from ./data folder." 6 | exit 1 7 | fi 8 | 9 | if [[ ! -e "./aifbfixed_complete.n3" ]] 10 | then 11 | echo ">>> Downloading aifbfixed_complete.n3" 12 | wget -q https://ndownloader.figshare.com/files/1118822 13 | mv 1118822 aifbfixed_complete.n3 14 | fi 15 | 16 | if [[ ! -e "./Lexicon_NamedRockUnit.nt" ]] 17 | then 18 | echo ">>> Downloading Lexicon_NamedRockUnit.nt" 19 | wget -q http://data.bgs.ac.uk/downloads/Lexicon_NamedRockUnit.nt 20 | fi 21 | 22 | exit 0 23 | -------------------------------------------------------------------------------- /example_graphs/07-Graph.dot: -------------------------------------------------------------------------------- 1 | digraph G { 2 | rankdir = LR; 3 | color = "blue2"; 4 | node [color = "blue2", fontcolor = "blue2", style = "bold"]; 5 | edge [fontsize = 12, style = "bold"]; 6 | 7 | A [label = "class A", style = "filled", color = "lightgrey"] 8 | B [label = "class B", style = "filled", color = "lightgrey"] 9 | 10 | A1 -> A [label = "P1"] 11 | A2 -> A [label = "P1"] 12 | B2 -> B [label = "P1"] 13 | B1 -> B [label = "P1"] 14 | 15 | A1 -> C [label = "P2"] 16 | A1 -> D [label = "P3"] 17 | A2 -> D [label = "P2"] 18 | A2 -> E [label = "P3"] 19 | B2 -> E [label = "P3"] 20 | B2 -> F [label = "P2"] 21 | B1 -> F [label = "P3"] 22 | B1 -> G [label = "P2"] 23 | 24 | C -> H [label = "P4"] 25 | D -> H [label = "P4"] 26 | F -> I [label = "P5"] 27 | G -> I 
[label = "P5"] 28 | 29 | H -> A2 [label = "P6"] 30 | I -> B2 [label = "P6"] 31 | 32 | {rank = min; A; B;} 33 | {rank = same; A1; A2; B2; B1;} 34 | {rank = same; C; D; E; F; G;} 35 | {rank = max; H; I;} 36 | } 37 | -------------------------------------------------------------------------------- /example_graphs/07-Graph.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzopalloni/wl-graph-kernels/ea046737f91017380090cb8d061efc1a44fefb5e/example_graphs/07-Graph.pdf -------------------------------------------------------------------------------- /example_graphs/07-almost_relabeled.dot: -------------------------------------------------------------------------------- 1 | digraph G { 2 | 3 | newrank = true; 4 | rankdir = LR; 5 | node[style="bold"] 6 | edge[style="bold"] 7 | 8 | color = "aquamarine4"; 9 | node [color = "aquamarine4", fontcolor = "aquamarine4"]; 10 | root [label="ϵ (A1)"]; 11 | right_root [label="ϵ (B1)"]; 12 | depth_4_node[label = "d = 4", shape = plaintext, fontcolor = "aquamarine4"]; 13 | {rank = same; depth_4_node; root; right_root}; 14 | 15 | color = "blue4"; 16 | node [color = "blue4", fontcolor = "blue4"]; 17 | edge [color = "blue4", fontsize = 10, fontcolor = "blue4"]; 18 | C [label = "C,P2"]; 19 | D [label = "D,P3"]; 20 | root -> C [label = "P2,ϵ"]; 21 | root -> D [label = "P3,ϵ"]; 22 | right_F [label = "F,P3"]; 23 | right_G [label = "G,P2"]; 24 | right_root -> right_F [label = "P3,ϵ"]; 25 | right_root -> right_G [label = "P2,ϵ"]; 26 | depth_3_node[label = "d = 3", shape = plaintext, fontcolor = "blue4"]; 27 | {rank = same; depth_3_node; C; D; right_F; right_G}; 28 | 29 | color = "cyan4"; 30 | node [color = "cyan4", fontcolor = "cyan4"]; 31 | edge [color = "cyan4", fontsize = 10, fontcolor = "cyan4"]; 32 | H [label = "H,P4P4"]; 33 | C -> H [label = "P4,C"]; 34 | D -> H [label = "P4,D"]; 35 | right_I [label = "I,P5P5"]; 36 | right_F -> right_I [label = "P5,F"]; 37 | right_G -> 
right_I [label = "P5,G"]; 38 | depth_2_node[label = "d = 2", shape = plaintext, fontcolor = "cyan4"]; 39 | {rank = same; depth_2_node; H; right_I}; 40 | 41 | 42 | color = "darkorchid4"; 43 | node [color = "darkorchid4", fontcolor = "darkorchid4"]; 44 | edge [color = "darkorchid4", fontsize = 10, fontcolor = "darkorchid4"]; 45 | A2 [label = "A2,P6"] 46 | H -> A2 [label = "P6,H"]; 47 | right_B2 [label = "B2,P6"]; 48 | right_I -> right_B2 [label = "P6,I"]; 49 | depth_1_node[label = "d = 1", shape = plaintext, fontcolor = "darkorchid4"]; 50 | {rank = same; depth_1_node; right_B2; A2}; 51 | 52 | color = "green4"; 53 | node [color = "green4", fontcolor = "green4"]; 54 | edge [color = "green4", fontsize = 10, fontcolor = "green4"]; 55 | D_0 [label = "D,P2", style = "dotted"]; 56 | E_0 [label = "E,P3"]; 57 | F_0 [label = "F,P2", style = "dotted"]; 58 | A2 -> D_0 [label = "P2,A2"]; 59 | A2 -> E_0 [label = "P3,A2"]; 60 | right_B2 -> E_0 [label = "P3,B2"]; 61 | right_B2 -> F_0 [label = "P2,B2"]; 62 | 63 | depth_0_node[label = "d = 0", shape = plaintext, fontcolor = "green4"]; 64 | {rank = same; depth_0_node; D_0; E_0; F_0}; 65 | } 66 | -------------------------------------------------------------------------------- /example_graphs/07-almost_relabeled.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzopalloni/wl-graph-kernels/ea046737f91017380090cb8d061efc1a44fefb5e/example_graphs/07-almost_relabeled.pdf -------------------------------------------------------------------------------- /example_graphs/07-relabeled.dot: -------------------------------------------------------------------------------- 1 | digraph G { 2 | 3 | newrank = true; 4 | rankdir = LR; 5 | node [style = "bold"] 6 | edge [style = "bold"] 7 | 8 | color = "aquamarin4"; 9 | node [color = "aquamarine4", fontcolor = "aquamarine4"]; 10 | root [label = "ϵ (A1)", fontcolor = "aquamarine4"]; 11 | right_root [label = "ϵ (B1)", fontcolor = 
"aquamarine4"]; 12 | depth_4_node[label = "d = 4", shape = plaintext, fontcolor = "aquamarine4"]; 13 | {rank = same; depth_4_node; root; right_root}; 14 | 15 | color = "blue4"; 16 | node [color = "blue4", fontcolor = "blue4"]; 17 | edge [color = "blue4", fontsize = 10, fontcolor = "blue4"]; 18 | C [label = "3"]; 19 | D [label = "4"]; 20 | root -> C [label = "1", fontcolor = "red2"]; 21 | root -> D [label = "2", fontcolor = "red2"]; 22 | right_F [label = "5"]; 23 | right_G [label = "6"]; 24 | right_root -> right_F [label = "2", fontcolor = "red2"]; 25 | right_root -> right_G [label = "1", fontcolor = "red2"]; 26 | depth_3_node[label = "d = 3", shape = plaintext, fontcolor = "blue4"]; 27 | {rank = same; depth_3_node; C; D; right_F; right_G}; 28 | 29 | color = "cyan4"; 30 | node [color = "cyan4", fontcolor = "cyan4"]; 31 | edge [color = "cyan4", fontsize = 10, fontcolor = "cyan4"]; 32 | H [label = "11"]; 33 | C -> H [label = "7"]; 34 | D -> H [label = "8"]; 35 | right_I [label = "12"]; 36 | right_F -> right_I [label = "9"]; 37 | right_G -> right_I [label = "10"]; 38 | depth_2_node[label = "d = 2", shape = plaintext, fontcolor = "cyan4"]; 39 | {rank = same; depth_2_node; H; right_I}; 40 | 41 | 42 | color = "darkorchid4"; 43 | node [color = "darkorchid4", fontcolor = "darkorchid4"]; 44 | edge [color = "darkorchid4", fontsize = 10, fontcolor = "darkorchid4"]; 45 | A2 [label = "15"] 46 | H -> A2 [label = "13"]; 47 | right_B2 [label = "16"]; 48 | right_I -> right_B2 [label = "14"]; 49 | depth_1_node[label = "d = 1", shape = plaintext, fontcolor = "darkorchid4"]; 50 | {rank = same; depth_1_node; right_B2; A2}; 51 | 52 | color = "green4"; 53 | node [color = "green4", fontcolor = "green4"]; 54 | edge [color = "green4", fontsize = 10, fontcolor = "green4"]; 55 | D_0 [label = "21", style = "dotted"]; 56 | E_0 [label = "22", fontcolor = "red2"]; 57 | F_0 [label = "23", style = "dotted"]; 58 | A2 -> D_0 [label = "17"]; 59 | A2 -> E_0 [label = "18"]; 60 | right_B2 -> E_0 [label = 
"19"]; 61 | right_B2 -> F_0 [label = "20"]; 62 | 63 | depth_0_node[label = "d = 0", shape = plaintext, fontcolor = "green4"]; 64 | {rank = same; depth_0_node; D_0; E_0; F_0}; 65 | } 66 | -------------------------------------------------------------------------------- /example_graphs/07-relabeled.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzopalloni/wl-graph-kernels/ea046737f91017380090cb8d061efc1a44fefb5e/example_graphs/07-relabeled.pdf -------------------------------------------------------------------------------- /example_graphs/07-relabeled_vertical.dot: -------------------------------------------------------------------------------- 1 | digraph G { 2 | 3 | newrank = true; 4 | node [style = "bold"] 5 | edge [style = "bold"] 6 | 7 | color = "aquamarin4"; 8 | node [color = "aquamarine4", fontcolor = "aquamarine4"]; 9 | root [label = "ϵ", fontcolor = "aquamarine4"]; 10 | right_root [label = "ϵ", fontcolor = "aquamarine4"]; 11 | depth_4_node[label = "d = 4", shape = plaintext, fontcolor = "aquamarine4"]; 12 | {rank = same; depth_4_node; root; right_root}; 13 | 14 | color = "blue4"; 15 | node [color = "blue4", fontcolor = "blue4"]; 16 | edge [color = "blue4", fontsize = 10, fontcolor = "blue4"]; 17 | C [label = "3"]; 18 | D [label = "4"]; 19 | root -> C [label = "1", fontcolor = "red2"]; 20 | root -> D [label = "2", fontcolor = "red2"]; 21 | right_F [label = "5"]; 22 | right_G [label = "6"]; 23 | right_root -> right_F [label = "2", fontcolor = "red2"]; 24 | right_root -> right_G [label = "1", fontcolor = "red2"]; 25 | depth_3_node[label = "d = 3", shape = plaintext, fontcolor = "blue4"]; 26 | {rank = same; depth_3_node; C; D; right_F; right_G}; 27 | 28 | color = "cyan4"; 29 | node [color = "cyan4", fontcolor = "cyan4"]; 30 | edge [color = "cyan4", fontsize = 10, fontcolor = "cyan4"]; 31 | H [label = "11"]; 32 | C -> H [label = "7"]; 33 | D -> H [label = "8"]; 34 | right_I [label = 
"12"]; 35 | right_F -> right_I [label = "9"]; 36 | right_G -> right_I [label = "10"]; 37 | depth_2_node[label = "d = 2", shape = plaintext, fontcolor = "cyan4"]; 38 | {rank = same; depth_2_node; H; right_I}; 39 | 40 | 41 | color = "darkorchid4"; 42 | node [color = "darkorchid4", fontcolor = "darkorchid4"]; 43 | edge [color = "darkorchid4", fontsize = 10, fontcolor = "darkorchid4"]; 44 | A2 [label = "15"] 45 | H -> A2 [label = "13"]; 46 | right_B2 [label = "16"]; 47 | right_I -> right_B2 [label = "14"]; 48 | depth_1_node[label = "d = 1", shape = plaintext, fontcolor = "darkorchid4"]; 49 | {rank = same; depth_1_node; right_B2; A2}; 50 | 51 | color = "green4"; 52 | node [color = "green4", fontcolor = "green4"]; 53 | edge [color = "green4", fontsize = 10, fontcolor = "green4"]; 54 | D_0 [label = "21", style = "dotted"]; 55 | E_0 [label = "22", fontcolor = "red2"]; 56 | F_0 [label = "23", style = "dotted"]; 57 | A2 -> D_0 [label = "17"]; 58 | A2 -> E_0 [label = "18"]; 59 | right_B2 -> E_0 [label = "19"]; 60 | right_B2 -> F_0 [label = "20"]; 61 | 62 | depth_0_node[label = "d = 0", shape = plaintext, fontcolor = "green4"]; 63 | {rank = same; depth_0_node; D_0; E_0; F_0}; 64 | } 65 | -------------------------------------------------------------------------------- /example_graphs/07-relabeled_vertical.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzopalloni/wl-graph-kernels/ea046737f91017380090cb8d061efc1a44fefb5e/example_graphs/07-relabeled_vertical.pdf -------------------------------------------------------------------------------- /example_graphs/07-subGraph_A1_B1.dot: -------------------------------------------------------------------------------- 1 | digraph G { 2 | 3 | newrank = true; 4 | rankdir = LR; 5 | node[style = "bold"] 6 | edge[style = "bold"] 7 | color = "aquamarine4"; 8 | node [color = "aquamarine4", fontcolor = "aquamarine4"]; 9 | root [label="ϵ (A1)"]; 10 | o_root [label="ϵ 
(B1)"]; 11 | depth_4_node[label = "d = 4", shape = plaintext, fontcolor = "aquamarine4"]; 12 | {rank = same; depth_4_node; root; o_root}; 13 | 14 | color = "blue4"; 15 | node [color = "blue4", fontcolor = "blue4"]; 16 | edge [color = "blue4", fontsize = 10, fontcolor = "blue4"]; 17 | root -> C [label = "P2", fontcolor = "red2"]; 18 | root -> D [label = "P3", fontcolor = "red2"]; 19 | o_F [label = "F"]; 20 | o_G [label = "G"]; 21 | o_root -> o_F [label = "P3", fontcolor = "red2"]; 22 | o_root -> o_G [label = "P2", fontcolor = "red2"]; 23 | depth_3_node[label = "d = 3", shape = plaintext, fontcolor = "blue4"]; 24 | {rank = same; depth_3_node; C; D; o_F; o_G}; 25 | 26 | color = "cyan4"; 27 | node [color = "cyan4", fontcolor = "cyan4"]; 28 | edge [color = "cyan4", fontsize = 10, fontcolor = "cyan4"]; 29 | C -> H [label = "P4"]; 30 | D -> H [label = "P4"]; 31 | o_I [label = "I"]; 32 | o_F -> o_I [label = "P5"]; 33 | o_G -> o_I [label = "P5"]; 34 | depth_2_node[label = "d = 2", shape = plaintext, fontcolor = "cyan4"]; 35 | {rank = same; depth_2_node; H; o_I}; 36 | 37 | 38 | color = "darkorchid4"; 39 | node [color = "darkorchid4", fontcolor = "darkorchid4"]; 40 | edge [color = "darkorchid4", fontsize = 10, fontcolor = "darkorchid4"]; 41 | H -> A2 [label = "P6", fontcolor = "red2"]; 42 | o_B2 [label = "B2"]; 43 | o_I -> o_B2 [label = "P6", fontcolor = "red2"]; 44 | depth_1_node[label = "d = 1", shape = plaintext, fontcolor = "darkorchid4"]; 45 | {rank = same; depth_1_node; o_B2; A2}; 46 | 47 | color = "green4"; 48 | node [color = "green4", fontcolor = "green4"]; 49 | edge [color = "green4", fontsize = 10, fontcolor = "green4"]; 50 | D_0 [label = "D", style = "dotted"]; 51 | E_0 [label = "E", fontcolor = "red2"]; 52 | F_0 [label = "F", style = "dotted"]; 53 | A2 -> D_0 [label = "P2", fontcolor = "red2"]; 54 | A2 -> E_0 [label = "P3", fontcolor = "red2"]; 55 | o_B2 -> E_0 [label = "P3", fontcolor = "red2"]; 56 | o_B2 -> F_0 [label = "P2", fontcolor = "red2"]; 57 | 58 | 
depth_0_node[label = "d = 0", shape = plaintext, fontcolor = "green4"]; 59 | {rank = same; depth_0_node; D_0; E_0; F_0}; 60 | } 61 | -------------------------------------------------------------------------------- /example_graphs/07-subGraph_A1_B1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzopalloni/wl-graph-kernels/ea046737f91017380090cb8d061efc1a44fefb5e/example_graphs/07-subGraph_A1_B1.pdf -------------------------------------------------------------------------------- /example_graphs/07-subGraph_A1_B1_vertical.dot: -------------------------------------------------------------------------------- 1 | digraph G { 2 | 3 | newrank = true; 4 | node[style = "bold"] 5 | edge[style = "bold"] 6 | color = "aquamarine4"; 7 | node [color = "aquamarine4", fontcolor = "aquamarine4"]; 8 | root [label="ϵ"]; 9 | o_root [label="ϵ"]; 10 | depth_4_node[label = "d = 4", shape = plaintext, fontcolor = "aquamarine4"]; 11 | {rank = same; depth_4_node; root; o_root}; 12 | 13 | color = "blue4"; 14 | node [color = "blue4", fontcolor = "blue4"]; 15 | edge [color = "blue4", fontsize = 10, fontcolor = "blue4"]; 16 | root -> C [label = "P2", fontcolor = "red2"]; 17 | root -> D [label = "P3", fontcolor = "red2"]; 18 | o_F [label = "F"]; 19 | o_G [label = "G"]; 20 | o_root -> o_F [label = "P3", fontcolor = "red2"]; 21 | o_root -> o_G [label = "P2", fontcolor = "red2"]; 22 | depth_3_node[label = "d = 3", shape = plaintext, fontcolor = "blue4"]; 23 | {rank = same; depth_3_node; C; D; o_F; o_G}; 24 | 25 | color = "cyan4"; 26 | node [color = "cyan4", fontcolor = "cyan4"]; 27 | edge [color = "cyan4", fontsize = 10, fontcolor = "cyan4"]; 28 | C -> H [label = "P4"]; 29 | D -> H [label = "P4"]; 30 | o_I [label = "I"]; 31 | o_F -> o_I [label = "P5"]; 32 | o_G -> o_I [label = "P5"]; 33 | depth_2_node[label = "d = 2", shape = plaintext, fontcolor = "cyan4"]; 34 | {rank = same; depth_2_node; H; o_I}; 35 | 36 | 37 | color = 
"darkorchid4"; 38 | node [color = "darkorchid4", fontcolor = "darkorchid4"]; 39 | edge [color = "darkorchid4", fontsize = 10, fontcolor = "darkorchid4"]; 40 | H -> A2 [label = "P6", fontcolor = "red2"]; 41 | o_B2 [label = "B2"]; 42 | o_I -> o_B2 [label = "P6", fontcolor = "red2"]; 43 | depth_1_node[label = "d = 1", shape = plaintext, fontcolor = "darkorchid4"]; 44 | {rank = same; depth_1_node; o_B2; A2}; 45 | 46 | color = "green4"; 47 | node [color = "green4", fontcolor = "green4"]; 48 | edge [color = "green4", fontsize = 10, fontcolor = "green4"]; 49 | D_0 [label = "D", style = "dotted"]; 50 | E_0 [label = "E", fontcolor = "red2"]; 51 | F_0 [label = "F", style = "dotted"]; 52 | A2 -> D_0 [label = "P2", fontcolor = "red2"]; 53 | A2 -> E_0 [label = "P3", fontcolor = "red2"]; 54 | o_B2 -> E_0 [label = "P3", fontcolor = "red2"]; 55 | o_B2 -> F_0 [label = "P2", fontcolor = "red2"]; 56 | 57 | depth_0_node[label = "d = 0", shape = plaintext, fontcolor = "green4"]; 58 | {rank = same; depth_0_node; D_0; E_0; F_0}; 59 | } 60 | -------------------------------------------------------------------------------- /example_graphs/07-subGraph_A1_B1_vertical.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzopalloni/wl-graph-kernels/ea046737f91017380090cb8d061efc1a44fefb5e/example_graphs/07-subGraph_A1_B1_vertical.pdf -------------------------------------------------------------------------------- /example_graphs/\: -------------------------------------------------------------------------------- 1 | digraph G { 2 | 3 | newrank = true; 4 | rankdir = LR; 5 | node[style="bold"] 6 | edge[style="bold"] 7 | 8 | color = "aquamarine4"; 9 | node [color = "aquamarine4", fontcolor = "aquamarine4"]; 10 | root [label="ϵ (A1)"]; 11 | right_root [label="ϵ (B1)"]; 12 | depth_4_node[label = "d = 4", shape = plaintext, fontcolor = "aquamarine4"]; 13 | {rank = same; depth_4_node; root; right_root}; 14 | 15 | color = "blue4"; 16 
| node [color = "blue4", fontcolor = "blue4"]; 17 | edge [color = "blue4", fontsize = 10, fontcolor = "blue4"]; 18 | C [label = "C,P2"]; 19 | D [label = "D,P3"]; 20 | root -> C [label = "P2,ϵ"]; 21 | root -> D [label = "P3,ϵ"]; 22 | right_F [label = "F,P3"]; 23 | right_G [label = "G,P2"]; 24 | right_root -> right_F [label = "P3,ϵ"]; 25 | right_root -> right_G [label = "P2,ϵ"]; 26 | depth_3_node[label = "d = 3", shape = plaintext, fontcolor = "blue4"]; 27 | {rank = same; depth_3_node; C; D; right_F; right_G}; 28 | 29 | color = "cyan4"; 30 | node [color = "cyan4", fontcolor = "cyan4"]; 31 | edge [color = "cyan4", fontsize = 10, fontcolor = "cyan4"]; 32 | H [label = "H,P4P4"]; 33 | C -> H [label = "P4,C"]; 34 | D -> H [label = "P4,D"]; 35 | right_I [label = "I,P5P5"]; 36 | right_F -> right_I [label = "P5,F"]; 37 | right_G -> right_I [label = "P5,G"]; 38 | depth_2_node[label = "d = 2", shape = plaintext, fontcolor = "cyan4"]; 39 | {rank = same; depth_2_node; H; right_I}; 40 | 41 | 42 | color = "darkorchid4"; 43 | node [color = "darkorchid4", fontcolor = "darkorchid4"]; 44 | edge [color = "darkorchid4", fontsize = 10, fontcolor = "darkorchid4"]; 45 | A2 [label = "A2,P6"] 46 | H -> A2 [label = "P6,H"]; 47 | right_B2 [label = "B2,P6"]; 48 | right_I -> right_B2 [label = "P6,I"]; 49 | depth_1_node[label = "d = 1", shape = plaintext, fontcolor = "darkorchid4"]; 50 | {rank = same; depth_1_node; right_B2; A2}; 51 | 52 | color = "green4"; 53 | node [color = "green4", fontcolor = "green4"]; 54 | edge [color = "green4", fontsize = 10, fontcolor = "green4"]; 55 | D_0 [label = "D,P2", style = "dotted"]; 56 | E_0 [label = "E,P3"]; 57 | F_0 [label = "F,P2", style = "dotted"]; 58 | A2 -> D_0 [label = "P2,A2"]; 59 | A2 -> E_0 [label = "P3,A2"]; 60 | right_B2 -> E_0 [label = "P3,B2"]; 61 | right_B2 -> F_0 [label = "P2,B2"]; 62 | 63 | depth_0_node[label = "d = 0", shape = plaintext, fontcolor = "green4"]; 64 | {rank = same; depth_0_node; D_0; E_0; F_0}; 65 | } 66 | 
-------------------------------------------------------------------------------- /notebooks/affiliation_scores.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import sys\n", 10 | "sys.path.insert(0, '../')" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "from collections import Counter, OrderedDict\n", 20 | "import warnings\n", 21 | "\n", 22 | "import rdflib\n", 23 | "import numpy as np\n", 24 | "import pandas as pd\n", 25 | "from pprint import pprint\n", 26 | "from sklearn import svm\n", 27 | "from sklearn.model_selection import cross_validate\n", 28 | "\n", 29 | "import wlkernel" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "warnings.simplefilter('ignore')" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 4, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "rdf_graph = rdflib.Graph().parse('../data/aifbfixed_complete.n3', format='n3')" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 5, 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "name": "stdout", 57 | "output_type": "stream", 58 | "text": [ 59 | "Most common classes with predicate equal to 'affiliation':\n", 60 | "[('http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id1instance',\n", 61 | " 73),\n", 62 | " ('http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id3instance',\n", 63 | " 60),\n", 64 | " ('http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id2instance',\n", 65 | " 28),\n", 66 | " ('http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id4instance',\n", 67 | " 16),\n", 68 | " 
('http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id5instance',\n", 69 | " 1)]\n" 70 | ] 71 | } 72 | ], 73 | "source": [ 74 | "affiliation_most_common = Counter(\n", 75 | " str(o) \n", 76 | " for s, p, o in rdf_graph\n", 77 | " if 'affiliation' in str(p)\n", 78 | ").most_common()\n", 79 | "print(\"Most common classes with predicate equal to 'affiliation':\")\n", 80 | "pprint(affiliation_most_common)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 6, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "instances_class_map = {\n", 90 | " str(s): str(o) for s, p, o in rdf_graph \n", 91 | " if 'affiliation' in str(p)\n", 92 | " and 'id5instance' not in str(o)\n", 93 | "}\n", 94 | "instances = list(instances_class_map.keys())\n", 95 | "y = list(instances_class_map.values())" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 7, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "name": "stdout", 105 | "output_type": "stream", 106 | "text": [ 107 | "number of triples: 28699\n" 108 | ] 109 | } 110 | ], 111 | "source": [ 112 | "triples = list(\n", 113 | " (str(s), str(p), str(o)) for s, p, o in rdf_graph\n", 114 | " if 'affiliation' not in str(p)\n", 115 | " and 'employs' not in str(p)\n", 116 | " and 'member' not in str(p)\n", 117 | " and 'head' not in str(p)\n", 118 | ")\n", 119 | "print('number of triples:', len(triples))" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 44, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "wlrdf_graph = wlkernel.WLRDFGraph(triples, instances, max_depth=1)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 45, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "for i in range(len(wlrdf_graph.labels)):\n", 138 | " for k in wlrdf_graph.labels[i].keys():\n", 139 | " wlrdf_graph.labels[i][k] = 'banana'" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | 
"execution_count": 48, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "kernel_matrix = wlkernel.wlrdf_kernel_matrix(wlrdf_graph, instances, iterations=0)\n", 149 | "kernel_matrix = wlkernel.kernel_matrix_normalization(kernel_matrix)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "### Weisfeiler-Lehman RDF" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 11, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "RANDOM_STATE = 42\n", 166 | "\n", 167 | "depth_values = [1, 2, 3]\n", 168 | "iteration_values = [0, 2, 4, 6]\n", 169 | "C_values = [0.001, 0.01, 0.1, 1., 10., 100.]\n", 170 | "\n", 171 | "results = OrderedDict()\n", 172 | "\n", 173 | "for d in depth_values:\n", 174 | " for it in iteration_values:\n", 175 | " wlrdf_graph = wlkernel.WLRDFGraph(triples, instances, max_depth=d)\n", 176 | " kernel_matrix = wlkernel.wlrdf_kernel_matrix(wlrdf_graph, instances, iterations=it)\n", 177 | " kernel_matrix = wlkernel.kernel_matrix_normalization(kernel_matrix)\n", 178 | " \n", 179 | " results[(d, it)] = [0, 0, 0]\n", 180 | " for c in C_values:\n", 181 | " classifier = svm.SVC(C=c, kernel='precomputed', class_weight='balanced', random_state=RANDOM_STATE)\n", 182 | " scores = cross_validate(classifier, kernel_matrix, y, cv=10, scoring=('accuracy', 'f1_macro'))\n", 183 | " \n", 184 | " acc_mean = scores['test_accuracy'].mean()\n", 185 | " f1_mean = scores['test_f1_macro'].mean()\n", 186 | " \n", 187 | " if acc_mean > results[(d, it)][0]:\n", 188 | " results[(d, it)] = [acc_mean, f1_mean, c]" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 12, 194 | "metadata": {}, 195 | "outputs": [ 196 | { 197 | "data": { 198 | "text/html": [ 199 | "
\n", 200 | "\n", 213 | "\n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | "
accuracyf1C
depthiterations
100.8819550.795756100.0
20.8819550.795756100.0
40.8819550.795756100.0
60.8819550.795756100.0
200.8921140.826007100.0
20.8800570.812488100.0
40.8745010.803701100.0
60.8745010.800821100.0
300.8795790.812187100.0
20.9137510.867388100.0
40.9081960.863829100.0
60.9081960.863829100.0
\n", 308 | "
" 309 | ], 310 | "text/plain": [ 311 | " accuracy f1 C\n", 312 | "depth iterations \n", 313 | "1 0 0.881955 0.795756 100.0\n", 314 | " 2 0.881955 0.795756 100.0\n", 315 | " 4 0.881955 0.795756 100.0\n", 316 | " 6 0.881955 0.795756 100.0\n", 317 | "2 0 0.892114 0.826007 100.0\n", 318 | " 2 0.880057 0.812488 100.0\n", 319 | " 4 0.874501 0.803701 100.0\n", 320 | " 6 0.874501 0.800821 100.0\n", 321 | "3 0 0.879579 0.812187 100.0\n", 322 | " 2 0.913751 0.867388 100.0\n", 323 | " 4 0.908196 0.863829 100.0\n", 324 | " 6 0.908196 0.863829 100.0" 325 | ] 326 | }, 327 | "execution_count": 12, 328 | "metadata": {}, 329 | "output_type": "execute_result" 330 | } 331 | ], 332 | "source": [ 333 | "fn = 'wlrdf_affiliation_results_with_normalization'\n", 334 | "\n", 335 | "df_res = pd.DataFrame(index=list(results.keys()))\n", 336 | "df_res['accuracy'] = [t[0] for t in results.values()]\n", 337 | "df_res['f1'] = [t[1] for t in results.values()]\n", 338 | "df_res['C'] = [t[2] for t in results.values()]\n", 339 | "df_res = df_res.set_index(pd.MultiIndex.from_tuples(df_res.index, names=['depth', 'iterations']))\n", 340 | "df_res.to_csv(f'../results/{fn}.csv')\n", 341 | "df_res_test = pd.read_csv(f'../results/{fn}.csv', index_col=['depth', 'iterations'])\n", 342 | "df_res_test.to_html(f'../results/{fn}.html')\n", 343 | "df_res_test" 344 | ] 345 | }, 346 | { 347 | "cell_type": "markdown", 348 | "metadata": {}, 349 | "source": [ 350 | "### Weisfeiler-Lehman" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": 13, 356 | "metadata": {}, 357 | "outputs": [], 358 | "source": [ 359 | "RANDOM_STATE = 42\n", 360 | "\n", 361 | "depth_values = [1, 2, 3]\n", 362 | "iteration_values = [0, 2, 4, 6]\n", 363 | "C_values = [0.001, 0.01, 0.1, 1., 10., 100.]\n", 364 | "\n", 365 | "results = OrderedDict()\n", 366 | "\n", 367 | "for d in depth_values:\n", 368 | " for it in iteration_values:\n", 369 | " wl_graphs = [wlkernel.WLGraph(triples, instance, max_depth=d) for instance in 
instances]\n", 370 | " kernel_matrix = wlkernel.wl_kernel_matrix(wl_graphs, iterations=it)\n", 371 | " kernel_matrix = wlkernel.kernel_matrix_normalization(kernel_matrix)\n", 372 | " \n", 373 | " results[(d, it)] = [0, 0, 0]\n", 374 | " for c in C_values:\n", 375 | " classifier = svm.SVC(C=c, kernel='precomputed', class_weight='balanced', random_state=RANDOM_STATE)\n", 376 | " scores = cross_validate(classifier, kernel_matrix, y, cv=10, scoring=('accuracy', 'f1_macro'))\n", 377 | " \n", 378 | " acc_mean = scores['test_accuracy'].mean()\n", 379 | " f1_mean = scores['test_f1_macro'].mean()\n", 380 | " \n", 381 | " if acc_mean > results[(d, it)][0]:\n", 382 | " results[(d, it)] = [acc_mean, f1_mean, c]" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": 14, 388 | "metadata": {}, 389 | "outputs": [ 390 | { 391 | "data": { 392 | "text/html": [ 393 | "
\n", 394 | "\n", 407 | "\n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | "
accuracyf1C
depthiterations
100.8819550.795756100.0
20.8687610.788673100.0
40.8687610.788673100.0
60.8687610.788673100.0
200.8868510.819787100.0
20.8581270.781563100.0
40.7704460.604246100.0
60.7527580.579145100.0
300.8848430.818408100.0
20.8908000.824622100.0
40.8973430.840694100.0
60.8963560.821343100.0
\n", 502 | "
" 503 | ], 504 | "text/plain": [ 505 | " accuracy f1 C\n", 506 | "depth iterations \n", 507 | "1 0 0.881955 0.795756 100.0\n", 508 | " 2 0.868761 0.788673 100.0\n", 509 | " 4 0.868761 0.788673 100.0\n", 510 | " 6 0.868761 0.788673 100.0\n", 511 | "2 0 0.886851 0.819787 100.0\n", 512 | " 2 0.858127 0.781563 100.0\n", 513 | " 4 0.770446 0.604246 100.0\n", 514 | " 6 0.752758 0.579145 100.0\n", 515 | "3 0 0.884843 0.818408 100.0\n", 516 | " 2 0.890800 0.824622 100.0\n", 517 | " 4 0.897343 0.840694 100.0\n", 518 | " 6 0.896356 0.821343 100.0" 519 | ] 520 | }, 521 | "execution_count": 14, 522 | "metadata": {}, 523 | "output_type": "execute_result" 524 | } 525 | ], 526 | "source": [ 527 | "fn = 'wl_affiliation_results_with_normalization'\n", 528 | "\n", 529 | "df_res = pd.DataFrame(index=list(results.keys()))\n", 530 | "df_res['accuracy'] = [t[0] for t in results.values()]\n", 531 | "df_res['f1'] = [t[1] for t in results.values()]\n", 532 | "df_res['C'] = [t[2] for t in results.values()]\n", 533 | "df_res = df_res.set_index(pd.MultiIndex.from_tuples(df_res.index, names=['depth', 'iterations']))\n", 534 | "df_res.to_csv(f'../results/{fn}.csv')\n", 535 | "df_res_test = pd.read_csv(f'../results/{fn}.csv', index_col=['depth', 'iterations'])\n", 536 | "df_res_test.to_html(f'../results/{fn}.html')\n", 537 | "df_res_test" 538 | ] 539 | } 540 | ], 541 | "metadata": { 542 | "kernelspec": { 543 | "display_name": "Python 3", 544 | "language": "python", 545 | "name": "python3" 546 | }, 547 | "language_info": { 548 | "codemirror_mode": { 549 | "name": "ipython", 550 | "version": 3 551 | }, 552 | "file_extension": ".py", 553 | "mimetype": "text/x-python", 554 | "name": "python", 555 | "nbconvert_exporter": "python", 556 | "pygments_lexer": "ipython3", 557 | "version": "3.7.3" 558 | } 559 | }, 560 | "nbformat": 4, 561 | "nbformat_minor": 2 562 | } 563 | -------------------------------------------------------------------------------- /notebooks/affiliation_timing.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import sys\n", 10 | "sys.path.insert(0, '../')" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "from collections import Counter\n", 20 | "import time\n", 21 | "import random\n", 22 | "\n", 23 | "import rdflib\n", 24 | "import numpy as np\n", 25 | "from pprint import pprint\n", 26 | "from sklearn import svm\n", 27 | "\n", 28 | "import wlkernel" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 3, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "rdf_graph = rdflib.Graph().parse('../data/aifbfixed_complete.n3', format='n3')" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 4, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "all_triples = [\n", 47 | " (str(subj), str(pred), str(obj))\n", 48 | " for subj, pred, obj in rdf_graph\n", 49 | "]" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 5, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "quantiles = np.linspace(0.1, 1, 10) # [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. 
]\n", 59 | "results_wlrdf = []\n", 60 | "results_wl = []\n", 61 | "n = len(all_triples)\n", 62 | "RANDOM_STATE = 42" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 6, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "for q in quantiles:\n", 72 | " n_sub = int(n * q)\n", 73 | " random.seed(RANDOM_STATE)\n", 74 | " triples = random.sample(all_triples, n_sub)\n", 75 | " \n", 76 | " instances_class_map = {\n", 77 | " subj: obj\n", 78 | " for subj, pred, obj in triples\n", 79 | " if 'affiliation' in pred\n", 80 | " and 'id5instance' not in obj\n", 81 | " }\n", 82 | " instances = list(instances_class_map.keys())\n", 83 | " y = list(instances_class_map.values())\n", 84 | " \n", 85 | " triples = [\n", 86 | " (subj, pred, obj)\n", 87 | " for subj, pred, obj in triples\n", 88 | " if 'affiliation' not in pred\n", 89 | " and 'employs' not in pred\n", 90 | " and 'member' not in pred\n", 91 | " and 'head' not in pred\n", 92 | " ]\n", 93 | " t0 = time.time()\n", 94 | " wlrdf_graph = wlkernel.WLRDFGraph(triples, instances, max_depth=3)\n", 95 | " kernel_matrix = wlkernel.wlrdf_kernel_matrix(wlrdf_graph, instances, iterations=0)\n", 96 | " t1 = time.time()\n", 97 | "\n", 98 | " results_wlrdf.append(t1 - t0)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 7, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "rdf_graph = rdflib.Graph().parse('../data/aifbfixed_complete.n3', format='n3')" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 8, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "all_triples = [\n", 117 | " (str(subj), str(pred), str(obj))\n", 118 | " for subj, pred, obj in rdf_graph\n", 119 | "]" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 9, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "for q in quantiles:\n", 129 | " n_sub = int(n * q)\n", 130 | " random.seed(RANDOM_STATE)\n", 131 | " 
triples = random.sample(all_triples, n_sub)\n", 132 | " \n", 133 | " instances_class_map = {\n", 134 | " subj: obj\n", 135 | " for subj, pred, obj in triples\n", 136 | " if 'affiliation' in pred\n", 137 | " and 'id5instance' not in obj\n", 138 | " }\n", 139 | " instances = list(instances_class_map.keys())\n", 140 | " y = list(instances_class_map.values())\n", 141 | " \n", 142 | " triples = [\n", 143 | " (subj, pred, obj)\n", 144 | " for subj, pred, obj in triples\n", 145 | " if 'affiliation' not in pred\n", 146 | " and 'employs' not in pred\n", 147 | " and 'member' not in pred\n", 148 | " and 'head' not in pred\n", 149 | " ]\n", 150 | " t0 = time.time()\n", 151 | " wl_graphs = [wlkernel.WLGraph(triples, instance, max_depth=3) for instance in instances]\n", 152 | " kernel_matrix = wlkernel.wl_kernel_matrix(wl_graphs, iterations=0)\n", 153 | " t1 = time.time()\n", 154 | "\n", 155 | " results_wl.append(t1 - t0)" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 10, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "import matplotlib.pyplot as plt\n", 165 | "\n", 166 | "class Result:\n", 167 | " def __init__(self, values = None, color = 'red', name = ''):\n", 168 | " if values is not None:\n", 169 | " self.values = values\n", 170 | " self.color = color\n", 171 | " self.name = name\n", 172 | "\n", 173 | "x = quantiles\n", 174 | "y = Result(results_wlrdf, 'orange', 'WL RDF')\n", 175 | "y1 = Result(results_wl, 'purple', 'WL')\n", 176 | "n = len(x)\n", 177 | "\n", 178 | "fig, ax = plt.subplots(figsize=(15, 8))\n", 179 | "for i in range(n - 1):\n", 180 | " plt.plot(x[i: i+2], y.values[i: i+2],\n", 181 | " 'o-', color=y.color, markersize=8)\n", 182 | " plt.plot(x[i: i+2], y1.values[i: i+2],\n", 183 | " 'o-', color= y1.color, markersize=8)\n", 184 | "\n", 185 | "ax.xaxis.label.set_text('fraction of the dataset')\n", 186 | "ax.yaxis.label.set_text('running time (s)')\n", 187 | "\n", 188 | "custom_lines = [plt.Line2D([0], [0], 
color=y.color, lw=4),\n", 189 | " plt.Line2D([0], [0], color=y1.color, lw=4)]\n", 190 | "ax.legend(custom_lines, [y.name, y1.name])\n", 191 | "plt.savefig('../results/affiliation_timing.png', format='png')" 192 | ] 193 | } 194 | ], 195 | "metadata": { 196 | "kernelspec": { 197 | "display_name": "Python 3", 198 | "language": "python", 199 | "name": "python3" 200 | }, 201 | "language_info": { 202 | "codemirror_mode": { 203 | "name": "ipython", 204 | "version": 3 205 | }, 206 | "file_extension": ".py", 207 | "mimetype": "text/x-python", 208 | "name": "python", 209 | "nbconvert_exporter": "python", 210 | "pygments_lexer": "ipython3", 211 | "version": "3.7.3" 212 | } 213 | }, 214 | "nbformat": 4, 215 | "nbformat_minor": 2 216 | } 217 | -------------------------------------------------------------------------------- /notebooks/lithogenesis_scores.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import sys\n", 10 | "sys.path.insert(0, '../')" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "from collections import Counter, OrderedDict\n", 20 | "import warnings\n", 21 | "\n", 22 | "import rdflib\n", 23 | "import numpy as np\n", 24 | "from pprint import pprint\n", 25 | "from sklearn import svm\n", 26 | "from sklearn.model_selection import cross_validate\n", 27 | "\n", 28 | "import wlkernel" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 3, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "warnings.simplefilter('ignore')" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 4, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "rdf_graph = rdflib.Graph().parse('../data/Lexicon_NamedRockUnit.nt', format='nt')" 47 | ] 48 | }, 49 | { 50 | 
"cell_type": "code", 51 | "execution_count": 5, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "name": "stdout", 56 | "output_type": "stream", 57 | "text": [ 58 | "Most common classes with predicate equal to 'hasLithogenesis':\n", 59 | "[('http://data.bgs.ac.uk/id/Lexicon/LithogeneticType/FLUV', 93),\n", 60 | " ('http://data.bgs.ac.uk/id/Lexicon/LithogeneticType/GLACI', 53)]\n" 61 | ] 62 | } 63 | ], 64 | "source": [ 65 | "hasLithogenesis_most_common = Counter(\n", 66 | " str(o)\n", 67 | " for s, p, o in rdf_graph\n", 68 | " if 'hasLithogenesis' in str(p)\n", 69 | ").most_common(2)\n", 70 | "print(\"Most common classes with predicate equal to 'hasLithogenesis':\")\n", 71 | "pprint(hasLithogenesis_most_common)\n", 72 | "classes = { c for c, _ in hasLithogenesis_most_common }" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 6, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "instances_class_map = {\n", 82 | " str(s): str(o)\n", 83 | " for s, p, o in rdf_graph\n", 84 | " if str(o) in classes\n", 85 | "}\n", 86 | "assert len(instances_class_map) == 146\n", 87 | "instances = list(instances_class_map.keys())\n", 88 | "assert len(instances) == len(set(instances))\n", 89 | "y = np.array(list(instances_class_map.values()))" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 7, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "number of tripes: 313901\n" 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "triples = list(\n", 107 | " (str(s), str(p), str(o))\n", 108 | " for s, p, o in rdf_graph\n", 109 | " if 'hasLithogenesis' not in str(p)\n", 110 | ")\n", 111 | "print('number of tripes: ', len(triples))" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 9, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "RANDOM_STATE = 42\n", 121 | "\n", 122 | "depth_values = [1, 2, 3]\n", 123 | "iteration_values = 
[0, 2, 4, 6]\n", 124 | "C_values = [0.001, 0.01, 0.1, 1., 10., 100.]\n", 125 | "\n", 126 | "results = OrderedDict()\n", 127 | "\n", 128 | "for d in depth_values:\n", 129 | " for it in iteration_values:\n", 130 | " wlrdf_graph = wlkernel.WLRDFGraph(triples, instances, max_depth=d)\n", 131 | " kernel_matrix = wlkernel.wlrdf_kernel_matrix(wlrdf_graph, instances, iterations=it)\n", 132 | " kernel_matrix = wlkernel.kernel_matrix_normalization(kernel_matrix)\n", 133 | " \n", 134 | " results[(d, it)] = [0, 0, 0]\n", 135 | " for c in C_values:\n", 136 | " classifier = svm.SVC(C=c, kernel='precomputed', class_weight='balanced', random_state=RANDOM_STATE)\n", 137 | " scores = cross_validate(classifier, kernel_matrix, y, cv=10, scoring=('accuracy', 'f1_macro'))\n", 138 | " \n", 139 | " acc_mean = scores['test_accuracy'].mean()\n", 140 | " f1_mean = scores['test_f1_macro'].mean()\n", 141 | " \n", 142 | " if acc_mean > results[(d, it)][0]:\n", 143 | " results[(d, it)] = [acc_mean, f1_mean, c]" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 11, 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "data": { 153 | "text/html": [ 154 | "
\n", 155 | "\n", 168 | "\n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | "
accuracyf1C
depthiterations
100.7955360.76373910.0
20.7955360.76373910.0
40.7955360.76373910.0
60.7955360.76373910.0
200.9062500.891229100.0
20.8928570.8740921.0
40.8928570.8740921.0
60.8857140.8666061.0
300.8910710.875862100.0
20.8919640.8734221.0
40.9062500.8901041.0
60.9071430.8888291.0
\n", 263 | "
" 264 | ], 265 | "text/plain": [ 266 | " accuracy f1 C\n", 267 | "depth iterations \n", 268 | "1 0 0.795536 0.763739 10.0\n", 269 | " 2 0.795536 0.763739 10.0\n", 270 | " 4 0.795536 0.763739 10.0\n", 271 | " 6 0.795536 0.763739 10.0\n", 272 | "2 0 0.906250 0.891229 100.0\n", 273 | " 2 0.892857 0.874092 1.0\n", 274 | " 4 0.892857 0.874092 1.0\n", 275 | " 6 0.885714 0.866606 1.0\n", 276 | "3 0 0.891071 0.875862 100.0\n", 277 | " 2 0.891964 0.873422 1.0\n", 278 | " 4 0.906250 0.890104 1.0\n", 279 | " 6 0.907143 0.888829 1.0" 280 | ] 281 | }, 282 | "execution_count": 11, 283 | "metadata": {}, 284 | "output_type": "execute_result" 285 | } 286 | ], 287 | "source": [ 288 | "import pandas as pd\n", 289 | "\n", 290 | "fn = 'wlrdf_lithogenesis_results_with_normalization'\n", 291 | "\n", 292 | "df_res = pd.DataFrame(index=list(results.keys()))\n", 293 | "df_res['accuracy'] = [t[0] for t in results.values()]\n", 294 | "df_res['f1'] = [t[1] for t in results.values()]\n", 295 | "df_res['C'] = [t[2] for t in results.values()]\n", 296 | "df_res = df_res.set_index(pd.MultiIndex.from_tuples(df_res.index, names=['depth', 'iterations']))\n", 297 | "df_res.to_csv(f'../results/{fn}.csv')\n", 298 | "df_res_test = pd.read_csv(f'../results/{fn}.csv', index_col=['depth', 'iterations'])\n", 299 | "df_res_test.to_html(f'../results/{fn}.html')\n", 300 | "df_res_test" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 13, 306 | "metadata": {}, 307 | "outputs": [], 308 | "source": [ 309 | "RANDOM_STATE = 42\n", 310 | "\n", 311 | "depth_values = [1, 2, 3]\n", 312 | "iteration_values = [0, 2, 4, 6]\n", 313 | "C_values = [0.001, 0.01, 0.1, 1., 10., 100.]\n", 314 | "\n", 315 | "results = OrderedDict()\n", 316 | "\n", 317 | "for d in depth_values:\n", 318 | " for it in iteration_values:\n", 319 | " wl_graphs = [wlkernel.WLGraph(triples, instance, max_depth=d) for instance in instances]\n", 320 | " kernel_matrix = wlkernel.wl_kernel_matrix(wl_graphs, iterations=it)\n", 321 | 
" kernel_matrix = wlkernel.kernel_matrix_normalization(kernel_matrix)\n", 322 | " \n", 323 | " results[(d, it)] = [0, 0, 0]\n", 324 | " for c in C_values:\n", 325 | " classifier = svm.SVC(C=c, kernel='precomputed', class_weight='balanced', random_state=RANDOM_STATE)\n", 326 | " scores = cross_validate(classifier, kernel_matrix, y, cv=10, scoring=('accuracy', 'f1_macro'))\n", 327 | " \n", 328 | " acc_mean = scores['test_accuracy'].mean()\n", 329 | " f1_mean = scores['test_f1_macro'].mean()\n", 330 | " \n", 331 | " if acc_mean > results[(d, it)][0]:\n", 332 | " results[(d, it)] = [acc_mean, f1_mean, c]" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 14, 338 | "metadata": {}, 339 | "outputs": [ 340 | { 341 | "data": { 342 | "text/html": [ 343 | "
\n", 344 | "\n", 357 | "\n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | "
accuracyf1C
depthiterations
100.8026790.77438310.0
20.7964290.76884210.0
40.7964290.76884210.0
60.7964290.76884210.0
200.8919640.877311100.0
20.8928570.8740921.0
40.8732140.8544851.0
60.8651790.8413531.0
300.8839290.871406100.0
20.9133930.8982911.0
40.9062500.8909221.0
60.9062500.8909221.0
\n", 452 | "
" 453 | ], 454 | "text/plain": [ 455 | " accuracy f1 C\n", 456 | "depth iterations \n", 457 | "1 0 0.802679 0.774383 10.0\n", 458 | " 2 0.796429 0.768842 10.0\n", 459 | " 4 0.796429 0.768842 10.0\n", 460 | " 6 0.796429 0.768842 10.0\n", 461 | "2 0 0.891964 0.877311 100.0\n", 462 | " 2 0.892857 0.874092 1.0\n", 463 | " 4 0.873214 0.854485 1.0\n", 464 | " 6 0.865179 0.841353 1.0\n", 465 | "3 0 0.883929 0.871406 100.0\n", 466 | " 2 0.913393 0.898291 1.0\n", 467 | " 4 0.906250 0.890922 1.0\n", 468 | " 6 0.906250 0.890922 1.0" 469 | ] 470 | }, 471 | "execution_count": 14, 472 | "metadata": {}, 473 | "output_type": "execute_result" 474 | } 475 | ], 476 | "source": [ 477 | "fn = 'wl_lithogenesis_results_with_normalization'\n", 478 | "\n", 479 | "df_res = pd.DataFrame(index=list(results.keys()))\n", 480 | "df_res['accuracy'] = [t[0] for t in results.values()]\n", 481 | "df_res['f1'] = [t[1] for t in results.values()]\n", 482 | "df_res['C'] = [t[2] for t in results.values()]\n", 483 | "df_res = df_res.set_index(pd.MultiIndex.from_tuples(df_res.index, names=['depth', 'iterations']))\n", 484 | "df_res.to_csv(f'../results/{fn}.csv')\n", 485 | "df_res_test = pd.read_csv(f'../results/{fn}.csv', index_col=['depth', 'iterations'])\n", 486 | "df_res_test.to_html(f'../results/{fn}.html')\n", 487 | "df_res_test" 488 | ] 489 | } 490 | ], 491 | "metadata": { 492 | "kernelspec": { 493 | "display_name": "Python 3", 494 | "language": "python", 495 | "name": "python3" 496 | }, 497 | "language_info": { 498 | "codemirror_mode": { 499 | "name": "ipython", 500 | "version": 3 501 | }, 502 | "file_extension": ".py", 503 | "mimetype": "text/x-python", 504 | "name": "python", 505 | "nbconvert_exporter": "python", 506 | "pygments_lexer": "ipython3", 507 | "version": "3.7.3" 508 | } 509 | }, 510 | "nbformat": 4, 511 | "nbformat_minor": 2 512 | } 513 | -------------------------------------------------------------------------------- /notebooks/lithogenesis_timing.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import sys\n", 10 | "sys.path.insert(0, '../')" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "from collections import Counter\n", 20 | "import time\n", 21 | "import random\n", 22 | "\n", 23 | "import rdflib\n", 24 | "import numpy as np\n", 25 | "from pprint import pprint\n", 26 | "from sklearn import svm\n", 27 | "\n", 28 | "import wlkernel" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 3, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "rdf_graph = rdflib.Graph().parse('../data/Lexicon_NamedRockUnit.nt', format='nt')" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 4, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "all_triples = [\n", 47 | " (str(subj), str(pred), str(obj))\n", 48 | " for subj, pred, obj in rdf_graph\n", 49 | "]" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 5, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "quantiles = np.linspace(0.1, 1, 10) # [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. 
]\n", 59 | "results_wlrdf = []\n", 60 | "results_wl = []\n", 61 | "n = len(all_triples)\n", 62 | "RANDOM_STATE = 42" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 6, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "for q in quantiles:\n", 72 | " n_sub = int(n * q)\n", 73 | " random.seed(RANDOM_STATE)\n", 74 | " triples = random.sample(all_triples, n_sub)\n", 75 | " \n", 76 | " instances_class_map = {\n", 77 | " subj: obj\n", 78 | " for subj, pred, obj in triples\n", 79 | " if 'hasLithogenesis' in pred\n", 80 | " }\n", 81 | " instances = list(instances_class_map.keys())\n", 82 | " y = list(instances_class_map.values())\n", 83 | " \n", 84 | " triples = [\n", 85 | " (subj, pred, obj)\n", 86 | " for subj, pred, obj in triples\n", 87 | " if 'hasLithogenesis' not in pred\n", 88 | " ]\n", 89 | " t0 = time.time()\n", 90 | " wlrdf_graph = wlkernel.WLRDFGraph(triples, instances, max_depth=3)\n", 91 | " kernel_matrix = wlkernel.wlrdf_kernel_matrix(wlrdf_graph, instances, iterations=0)\n", 92 | " t1 = time.time()\n", 93 | "\n", 94 | " results_wlrdf.append(t1 - t0)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 7, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "rdf_graph = rdflib.Graph().parse('../data/Lexicon_NamedRockUnit.nt', format='nt')\n", 104 | "all_triples = [\n", 105 | " (str(subj), str(pred), str(obj))\n", 106 | " for subj, pred, obj in rdf_graph\n", 107 | "]" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 8, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "for q in quantiles:\n", 117 | " n_sub = int(n * q)\n", 118 | " random.seed(RANDOM_STATE)\n", 119 | " triples = random.sample(all_triples, n_sub)\n", 120 | " \n", 121 | " instances_class_map = {\n", 122 | " subj: obj\n", 123 | " for subj, pred, obj in triples\n", 124 | " if 'hasLithogenesis' in pred\n", 125 | " }\n", 126 | " instances = list(instances_class_map.keys())\n", 127 | " y 
= list(instances_class_map.values())\n", 128 | " \n", 129 | " triples = [\n", 130 | " (subj, pred, obj)\n", 131 | " for subj, pred, obj in triples\n", 132 | " if 'hasLithogenesis' not in pred\n", 133 | " ]\n", 134 | " t0 = time.time()\n", 135 | " wl_graphs = [wlkernel.WLGraph(triples, instance, max_depth=3) for instance in instances]\n", 136 | " kernel_matrix = wlkernel.wl_kernel_matrix(wl_graphs, iterations=0)\n", 137 | " t1 = time.time()\n", 138 | "\n", 139 | " results_wl.append(t1 - t0)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 10, 145 | "metadata": {}, 146 | "outputs": [ 147 | { 148 | "data": { 149 | "image/png": "\n", 150 | "text/plain": [ 151 | "
" 152 | ] 153 | }, 154 | "metadata": { 155 | "needs_background": "light" 156 | }, 157 | "output_type": "display_data" 158 | } 159 | ], 160 | "source": [ 161 | "import matplotlib.pyplot as plt\n", 162 | "\n", 163 | "class Result:\n", 164 | " def __init__(self, values = None, color = 'red', name = ''):\n", 165 | " if values is not None:\n", 166 | " self.values = values\n", 167 | " self.color = color\n", 168 | " self.name = name\n", 169 | "\n", 170 | "x = quantiles\n", 171 | "y = Result(results_wlrdf, 'orange', 'WL RDF')\n", 172 | "y1 = Result(results_wl, 'purple', 'WL')\n", 173 | "n = len(x)\n", 174 | "\n", 175 | "fig, ax = plt.subplots(figsize=(15, 8))\n", 176 | "for i in range(n - 1):\n", 177 | " plt.plot(x[i: i+2], y.values[i: i+2],\n", 178 | " 'o-', color=y.color, markersize=8)\n", 179 | " plt.plot(x[i: i+2], y1.values[i: i+2],\n", 180 | " 'o-', color= y1.color, markersize=8)\n", 181 | "\n", 182 | "ax.xaxis.label.set_text('fraction of the dataset')\n", 183 | "ax.yaxis.label.set_text('runnning time (s)')\n", 184 | "\n", 185 | "custom_lines = [plt.Line2D([0], [0], color=y.color, lw=4),\n", 186 | " plt.Line2D([0], [0], color=y1.color, lw=4)]\n", 187 | "ax.legend(custom_lines, [y.name, y1.name])\n", 188 | "plt.savefig('../results/lithogenesis_timing.png', format='png')" 189 | ] 190 | } 191 | ], 192 | "metadata": { 193 | "kernelspec": { 194 | "display_name": "Python 3", 195 | "language": "python", 196 | "name": "python3" 197 | }, 198 | "language_info": { 199 | "codemirror_mode": { 200 | "name": "ipython", 201 | "version": 3 202 | }, 203 | "file_extension": ".py", 204 | "mimetype": "text/x-python", 205 | "name": "python", 206 | "nbconvert_exporter": "python", 207 | "pygments_lexer": "ipython3", 208 | "version": "3.7.3" 209 | } 210 | }, 211 | "nbformat": 4, 212 | "nbformat_minor": 2 213 | } 214 | -------------------------------------------------------------------------------- /notebooks/no_labels_scores.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import sys\n", 10 | "sys.path.insert(0, '../')" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "from typing import Union\n", 20 | "from collections import Counter, OrderedDict\n", 21 | "import warnings\n", 22 | "\n", 23 | "import rdflib\n", 24 | "import numpy as np\n", 25 | "import pandas as pd\n", 26 | "from pprint import pprint\n", 27 | "from sklearn import svm\n", 28 | "from sklearn.model_selection import cross_validate\n", 29 | "\n", 30 | "import wlkernel" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 3, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "warnings.simplefilter('ignore')" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 4, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "rdf_graph = rdflib.Graph().parse('../data/aifbfixed_complete.n3', format='n3')" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 5, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "triples = [\n", 58 | " (str(subj), str(pred), str(obj))\n", 59 | " for subj, pred, obj in rdf_graph\n", 60 | "]" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 6, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "instances_class_map = {\n", 70 | " subj: obj\n", 71 | " for subj, pred, obj in triples\n", 72 | " if 'affiliation' in pred\n", 73 | " and 'id5instance' not in obj\n", 74 | "}\n", 75 | "instances = list(instances_class_map.keys())\n", 76 | "y = list(instances_class_map.values())" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 7, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "triples = [\n", 86 | " 
(subj, pred, obj)\n", 87 | " for subj, pred, obj in triples\n", 88 | " if 'affiliation' not in pred\n", 89 | " and 'employs' not in pred\n", 90 | " and 'member' not in pred\n", 91 | " and 'head' not in pred\n", 92 | "]" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "### Weisfeiler-Lehman RDF" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 1, 105 | "metadata": {}, 106 | "outputs": [ 107 | { 108 | "ename": "NameError", 109 | "evalue": "name 'Union' is not defined", 110 | "output_type": "error", 111 | "traceback": [ 112 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 113 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 114 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mdef\u001b[0m \u001b[0mbananize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mg\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mUnion\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mwlkernel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mWLRDFGraph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwlkernel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mWLGraph\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mUnion\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mwlkernel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mWLRDFGraph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwlkernel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mWLGraph\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;34m'All the label in the WLRDFGraph are replaced with the same label'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m 
\u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlabels\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlabels\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkeys\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlabels\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'banana'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 115 | "\u001b[0;31mNameError\u001b[0m: name 'Union' is not defined" 116 | ] 117 | } 118 | ], 119 | "source": [ 120 | "def bananize(g: Union[wlkernel.WLRDFGraph, wlkernel.WLGraph]) -> Union[wlkernel.WLRDFGraph, wlkernel.WLGraph]:\n", 121 | " 'All the label in the WLRDFGraph are replaced with the same label'\n", 122 | " for i in range(len(g.labels)):\n", 123 | " for k in g.labels[i].keys():\n", 124 | " g.labels[i][k] = 'banana'\n", 125 | " return g" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 9, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "RANDOM_STATE = 42\n", 135 | "\n", 136 | "depth_values = [1, 2, 3]\n", 137 | "iteration_values = [0, 2, 4, 6]\n", 138 | "C_values = [0.001, 0.01, 0.1, 1., 10., 100.]\n", 139 | "\n", 140 | "results = OrderedDict()\n", 141 | "\n", 142 | "for d in depth_values:\n", 143 | " for it in iteration_values:\n", 144 | " wlrdf_graph = wlkernel.WLRDFGraph(triples, instances, max_depth=d)\n", 145 | " bananize(wlrdf_graph)\n", 146 
| " kernel_matrix = wlkernel.wlrdf_kernel_matrix(wlrdf_graph, instances, iterations=it)\n", 147 | " kernel_matrix = wlkernel.kernel_matrix_normalization(kernel_matrix)\n", 148 | " \n", 149 | " results[(d, it)] = [0, 0, 0]\n", 150 | " for c in C_values:\n", 151 | " classifier = svm.SVC(C=c, kernel='precomputed', class_weight='balanced', random_state=RANDOM_STATE)\n", 152 | " scores = cross_validate(classifier, kernel_matrix, y, cv=10, scoring=('accuracy', 'f1_macro'))\n", 153 | " \n", 154 | " acc_mean = scores['test_accuracy'].mean()\n", 155 | " f1_mean = scores['test_f1_macro'].mean()\n", 156 | " \n", 157 | " if acc_mean > results[(d, it)][0]:\n", 158 | " results[(d, it)] = [acc_mean, f1_mean, c]" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 10, 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "data": { 168 | "text/html": [ 169 | "
\n", 170 | "\n", 183 | "\n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | "
accuracyf1C
depthiterations
100.5248470.305547100.0
20.6475360.566394100.0
40.6707800.591060100.0
60.6770300.594329100.0
200.5659360.34073210.0
20.6814220.622212100.0
40.7400480.663960100.0
60.7625970.688069100.0
300.4073940.2933201.0
20.8989140.861681100.0
40.8920790.854304100.0
60.8930660.85135810.0
\n", 278 | "
" 279 | ], 280 | "text/plain": [ 281 | " accuracy f1 C\n", 282 | "depth iterations \n", 283 | "1 0 0.524847 0.305547 100.0\n", 284 | " 2 0.647536 0.566394 100.0\n", 285 | " 4 0.670780 0.591060 100.0\n", 286 | " 6 0.677030 0.594329 100.0\n", 287 | "2 0 0.565936 0.340732 10.0\n", 288 | " 2 0.681422 0.622212 100.0\n", 289 | " 4 0.740048 0.663960 100.0\n", 290 | " 6 0.762597 0.688069 100.0\n", 291 | "3 0 0.407394 0.293320 1.0\n", 292 | " 2 0.898914 0.861681 100.0\n", 293 | " 4 0.892079 0.854304 100.0\n", 294 | " 6 0.893066 0.851358 10.0" 295 | ] 296 | }, 297 | "execution_count": 10, 298 | "metadata": {}, 299 | "output_type": "execute_result" 300 | } 301 | ], 302 | "source": [ 303 | "fn = 'wlrdf_no_labels'\n", 304 | "\n", 305 | "df_res = pd.DataFrame(index=list(results.keys()))\n", 306 | "df_res['accuracy'] = [t[0] for t in results.values()]\n", 307 | "df_res['f1'] = [t[1] for t in results.values()]\n", 308 | "df_res['C'] = [t[2] for t in results.values()]\n", 309 | "df_res = df_res.set_index(pd.MultiIndex.from_tuples(df_res.index, names=['depth', 'iterations']))\n", 310 | "df_res.to_csv(f'../results/{fn}.csv')\n", 311 | "df_res_test = pd.read_csv(f'../results/{fn}.csv', index_col=['depth', 'iterations'])\n", 312 | "df_res_test.to_html(f'../results/{fn}.html')\n", 313 | "df_res_test" 314 | ] 315 | }, 316 | { 317 | "cell_type": "markdown", 318 | "metadata": {}, 319 | "source": [ 320 | "### Weisfeiler-Lehman" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 11, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "RANDOM_STATE = 42\n", 330 | "\n", 331 | "depth_values = [1, 2, 3]\n", 332 | "iteration_values = [0, 2, 4, 6]\n", 333 | "C_values = [0.001, 0.01, 0.1, 1., 10., 100.]\n", 334 | "\n", 335 | "results = OrderedDict()\n", 336 | "\n", 337 | "for d in depth_values:\n", 338 | " for it in iteration_values:\n", 339 | " wl_graphs = [bananize(\n", 340 | " wlkernel.WLGraph(triples, instance, max_depth=d)\n", 341 | " ) for instance in 
instances]\n", 342 | " kernel_matrix = wlkernel.wl_kernel_matrix(wl_graphs, iterations=it)\n", 343 | " kernel_matrix = wlkernel.kernel_matrix_normalization(kernel_matrix)\n", 344 | " \n", 345 | " results[(d, it)] = [0, 0, 0]\n", 346 | " for c in C_values:\n", 347 | " classifier = svm.SVC(C=c, kernel='precomputed', class_weight='balanced', random_state=RANDOM_STATE)\n", 348 | " scores = cross_validate(classifier, kernel_matrix, y, cv=10, scoring=('accuracy', 'f1_macro'))\n", 349 | " \n", 350 | " acc_mean = scores['test_accuracy'].mean()\n", 351 | " f1_mean = scores['test_f1_macro'].mean()\n", 352 | " \n", 353 | " if acc_mean > results[(d, it)][0]:\n", 354 | " results[(d, it)] = [acc_mean, f1_mean, c]" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": 12, 360 | "metadata": {}, 361 | "outputs": [ 362 | { 363 | "data": { 364 | "text/html": [ 365 | "
\n", 366 | "\n", 379 | "\n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | "
accuracyf1C
depthiterations
100.3221530.194477100.0
20.5301110.34867210.0
40.5301110.34704910.0
60.5301110.34704910.0
200.5645470.35525310.0
20.5037240.3431481.0
40.4814370.392366100.0
60.5029990.3834611.0
300.4916970.343404100.0
20.6413330.527556100.0
40.7245510.60267710.0
60.7134740.557335100.0
\n", 474 | "
" 475 | ], 476 | "text/plain": [ 477 | " accuracy f1 C\n", 478 | "depth iterations \n", 479 | "1 0 0.322153 0.194477 100.0\n", 480 | " 2 0.530111 0.348672 10.0\n", 481 | " 4 0.530111 0.347049 10.0\n", 482 | " 6 0.530111 0.347049 10.0\n", 483 | "2 0 0.564547 0.355253 10.0\n", 484 | " 2 0.503724 0.343148 1.0\n", 485 | " 4 0.481437 0.392366 100.0\n", 486 | " 6 0.502999 0.383461 1.0\n", 487 | "3 0 0.491697 0.343404 100.0\n", 488 | " 2 0.641333 0.527556 100.0\n", 489 | " 4 0.724551 0.602677 10.0\n", 490 | " 6 0.713474 0.557335 100.0" 491 | ] 492 | }, 493 | "execution_count": 12, 494 | "metadata": {}, 495 | "output_type": "execute_result" 496 | } 497 | ], 498 | "source": [ 499 | "fn = 'wl_no_labels'\n", 500 | "\n", 501 | "df_res = pd.DataFrame(index=list(results.keys()))\n", 502 | "df_res['accuracy'] = [t[0] for t in results.values()]\n", 503 | "df_res['f1'] = [t[1] for t in results.values()]\n", 504 | "df_res['C'] = [t[2] for t in results.values()]\n", 505 | "df_res = df_res.set_index(pd.MultiIndex.from_tuples(df_res.index, names=['depth', 'iterations']))\n", 506 | "df_res.to_csv(f'../results/{fn}.csv')\n", 507 | "df_res_test = pd.read_csv(f'../results/{fn}.csv', index_col=['depth', 'iterations'])\n", 508 | "df_res_test.to_html(f'../results/{fn}.html')\n", 509 | "df_res_test" 510 | ] 511 | } 512 | ], 513 | "metadata": { 514 | "kernelspec": { 515 | "display_name": "Python 3", 516 | "language": "python", 517 | "name": "python3" 518 | }, 519 | "language_info": { 520 | "codemirror_mode": { 521 | "name": "ipython", 522 | "version": 3 523 | }, 524 | "file_extension": ".py", 525 | "mimetype": "text/x-python", 526 | "name": "python", 527 | "nbconvert_exporter": "python", 528 | "pygments_lexer": "ipython3", 529 | "version": "3.7.3" 530 | } 531 | }, 532 | "nbformat": 4, 533 | "nbformat_minor": 2 534 | } 535 | -------------------------------------------------------------------------------- /presentation/img/07-Graph.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzopalloni/wl-graph-kernels/ea046737f91017380090cb8d061efc1a44fefb5e/presentation/img/07-Graph.pdf -------------------------------------------------------------------------------- /presentation/img/07-almost_relabeled.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzopalloni/wl-graph-kernels/ea046737f91017380090cb8d061efc1a44fefb5e/presentation/img/07-almost_relabeled.pdf -------------------------------------------------------------------------------- /presentation/img/07-relabeled.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzopalloni/wl-graph-kernels/ea046737f91017380090cb8d061efc1a44fefb5e/presentation/img/07-relabeled.pdf -------------------------------------------------------------------------------- /presentation/img/07-relabeled_vertical.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzopalloni/wl-graph-kernels/ea046737f91017380090cb8d061efc1a44fefb5e/presentation/img/07-relabeled_vertical.pdf -------------------------------------------------------------------------------- /presentation/img/07-subGraph_A1_B1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzopalloni/wl-graph-kernels/ea046737f91017380090cb8d061efc1a44fefb5e/presentation/img/07-subGraph_A1_B1.pdf -------------------------------------------------------------------------------- /presentation/img/07-subGraph_A1_B1_vertical.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzopalloni/wl-graph-kernels/ea046737f91017380090cb8d061efc1a44fefb5e/presentation/img/07-subGraph_A1_B1_vertical.pdf 
-------------------------------------------------------------------------------- /presentation/img/wl_iteration_total.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzopalloni/wl-graph-kernels/ea046737f91017380090cb8d061efc1a44fefb5e/presentation/img/wl_iteration_total.png -------------------------------------------------------------------------------- /presentation/img/wl_iteration_upper.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzopalloni/wl-graph-kernels/ea046737f91017380090cb8d061efc1a44fefb5e/presentation/img/wl_iteration_upper.png -------------------------------------------------------------------------------- /presentation/presentation.nav: -------------------------------------------------------------------------------- 1 | \headcommand {\slideentry {0}{0}{1}{1/1}{}{0}} 2 | \headcommand {\beamer@framepages {1}{1}} 3 | \headcommand {\slideentry {0}{0}{2}{2/2}{}{0}} 4 | \headcommand {\beamer@framepages {2}{2}} 5 | \headcommand {\slideentry {0}{0}{3}{3/3}{}{0}} 6 | \headcommand {\beamer@framepages {3}{3}} 7 | \headcommand {\slideentry {0}{0}{4}{4/4}{}{0}} 8 | \headcommand {\beamer@framepages {4}{4}} 9 | \headcommand {\slideentry {0}{0}{5}{5/5}{}{0}} 10 | \headcommand {\beamer@framepages {5}{5}} 11 | \headcommand {\slideentry {0}{0}{6}{6/6}{}{0}} 12 | \headcommand {\beamer@framepages {6}{6}} 13 | \headcommand {\slideentry {0}{0}{7}{7/7}{}{0}} 14 | \headcommand {\beamer@framepages {7}{7}} 15 | \headcommand {\slideentry {0}{0}{8}{8/8}{}{0}} 16 | \headcommand {\beamer@framepages {8}{8}} 17 | \headcommand {\slideentry {0}{0}{9}{9/9}{}{0}} 18 | \headcommand {\beamer@framepages {9}{9}} 19 | \headcommand {\slideentry {0}{0}{10}{10/10}{}{0}} 20 | \headcommand {\beamer@framepages {10}{10}} 21 | \headcommand {\slideentry {0}{0}{11}{11/11}{}{0}} 22 | \headcommand {\beamer@framepages {11}{11}} 23 | \headcommand 
{\slideentry {0}{0}{12}{12/12}{}{0}} 24 | \headcommand {\beamer@framepages {12}{12}} 25 | \headcommand {\slideentry {0}{0}{13}{13/13}{}{0}} 26 | \headcommand {\beamer@framepages {13}{13}} 27 | \headcommand {\slideentry {0}{0}{14}{14/14}{}{0}} 28 | \headcommand {\beamer@framepages {14}{14}} 29 | \headcommand {\slideentry {0}{0}{15}{15/15}{}{0}} 30 | \headcommand {\beamer@framepages {15}{15}} 31 | \headcommand {\slideentry {0}{0}{16}{16/16}{}{0}} 32 | \headcommand {\beamer@framepages {16}{16}} 33 | \headcommand {\slideentry {0}{0}{17}{17/17}{}{0}} 34 | \headcommand {\beamer@framepages {17}{17}} 35 | \headcommand {\slideentry {0}{0}{18}{18/18}{}{0}} 36 | \headcommand {\beamer@framepages {18}{18}} 37 | \headcommand {\slideentry {0}{0}{19}{19/19}{}{0}} 38 | \headcommand {\beamer@framepages {19}{19}} 39 | \headcommand {\slideentry {0}{0}{20}{20/20}{}{0}} 40 | \headcommand {\beamer@framepages {20}{20}} 41 | \headcommand {\slideentry {0}{0}{21}{21/21}{}{0}} 42 | \headcommand {\beamer@framepages {21}{21}} 43 | \headcommand {\beamer@partpages {1}{21}} 44 | \headcommand {\beamer@subsectionpages {1}{21}} 45 | \headcommand {\beamer@sectionpages {1}{21}} 46 | \headcommand {\beamer@documentpages {21}} 47 | \headcommand {\gdef \inserttotalframenumber {21}} 48 | -------------------------------------------------------------------------------- /presentation/presentation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzopalloni/wl-graph-kernels/ea046737f91017380090cb8d061efc1a44fefb5e/presentation/presentation.pdf -------------------------------------------------------------------------------- /presentation/presentation.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | 3 | \mode { 4 | 5 | % The Beamer class comes with a number of default slide themes 6 | % which change the colors and layouts of slides. 
Below this is a list 7 | % of all the themes, uncomment each in turn to see what they look like. 8 | 9 | %\usetheme{default} 10 | %\usetheme{AnnArbor} 11 | %\usetheme{Antibes} 12 | %\usetheme{Bergen} 13 | %\usetheme{Berkeley} 14 | %\usetheme{Berlin} 15 | %\usetheme{Boadilla} 16 | %\usetheme{CambridgeUS} 17 | %\usetheme{Copenhagen} 18 | %\usetheme{Darmstadt} 19 | %\usetheme{Dresden} 20 | %\usetheme{Frankfurt} 21 | %\usetheme{Goettingen} 22 | %\usetheme{Hannover} 23 | %\usetheme{Ilmenau} 24 | %\usetheme{JuanLesPins} 25 | %\usetheme{Luebeck} 26 | \usetheme{Madrid} 27 | %\usetheme{Malmoe} 28 | %\usetheme{Marburg} 29 | %\usetheme{Montpellier} 30 | %\usetheme{PaloAlto} 31 | %\usetheme{Pittsburgh} 32 | %\usetheme{Rochester} 33 | %\usetheme{Singapore} 34 | %\usetheme{Szeged} 35 | %\usetheme{Warsaw} 36 | 37 | % As well as themes, the Beamer class has a number of color themes 38 | % for any slide theme. Uncomment each of these in turn to see how it 39 | % changes the colors of your current slide theme. 
40 | 41 | %\usecolortheme{albatross} 42 | %\usecolortheme{beaver} 43 | %\usecolortheme{beetle} 44 | %\usecolortheme{crane} 45 | %\usecolortheme{dolphin} 46 | %\usecolortheme{dove} 47 | %\usecolortheme{fly} 48 | %\usecolortheme{lily} 49 | %\usecolortheme{orchid} 50 | %\usecolortheme{rose} 51 | %\usecolortheme{seagull} 52 | %\usecolortheme{seahorse} 53 | %\usecolortheme{whale} 54 | %\usecolortheme{wolverine} 55 | 56 | %\setbeamertemplate{footline} % To remove the footer line in all slides uncomment this line 57 | %\setbeamertemplate{footline}[page number] % To replace the footer line in all slides with a simple slide count uncomment this line 58 | 59 | %\setbeamertemplate{navigation symbols}{} % To remove the navigation symbols from the bottom of all slides uncomment this line 60 | } 61 | 62 | \usepackage{graphicx} % Allows including images 63 | \usepackage{booktabs} % Allows the use of \toprule, \midrule and \bottomrule in tables 64 | \usepackage[utf8]{inputenc} 65 | \usepackage{float} 66 | \usepackage{subcaption} 67 | 68 | %---------------------------------------------------------------------------------------- 69 | % TITLE PAGE 70 | %---------------------------------------------------------------------------------------- 71 | 72 | \title[A Fast Approximation of WL RDF kernel]{A Fast Approximation of the Weisfeiler-Lehman Graph Kernel for RDF Data} 73 | \subtitle{Advanced Algorithms and Graph Mining} 74 | \author{Lorenzo Palloni \and Emilio Cecchini} 75 | \institute[]{ 76 | Università Degli Studi di Firenze \\ 77 | \medskip 78 | \textit{lorenzo.palloni@stud.unifi.it \and emilio.cecchini@stud.unifi.it} 79 | } 80 | \date{\today} 81 | 82 | \begin{document} 83 | 84 | \begin{frame} 85 | \titlepage % Print the title page as the first slide 86 | \end{frame} 87 | 88 | %---------------------------------------------------------------------------------------- 89 | % PRESENTATION SLIDES 90 | 
%---------------------------------------------------------------------------------------- 91 | 92 | \begin{frame} 93 | \frametitle{Introduction} 94 | 95 | \begin{itemize} 96 | \item 97 | In order to apply machine learning algorithms on graphs it is necessary to develop algorithms to compute how similar two graphs are. 98 | 99 | \item 100 | Starting from the well-known Weisfeiler-Lehman isomorphism test, kernel methods to measure the similarity between graphs have been developed. 101 | 102 | \item 103 | This paper proposes a fast approximation of a Weisfeiler-Lehman kernel applied to RDF data. 104 | 105 | 106 | \end{itemize} 107 | 108 | \end{frame} 109 | 110 | \begin{frame} 111 | \frametitle{Graph Kernels} 112 | 113 | Kernel-based machine learning algorithms abandon the explicit vector representations of data items by means of the \textit{kernel function}. 114 | 115 | \begin{definition}[Graph Kernel] 116 | Let $\mathbb{G}$ be a non-empty set of graphs. Any function $k: \mathbb{G} \times \mathbb{G} \rightarrow \mathbb{R}$ that takes as input two graphs $G$ and $G^\prime$ and returns a real number that is equal to the scalar product between $G$ and $G^\prime$ in a (even unknown) feature space is a valid kernel function. 117 | \end{definition} 118 | 119 | \end{frame} 120 | 121 | %------------------------------------------------ 122 | 123 | \begin{frame} 124 | \frametitle{Graph isomorphism} 125 | 126 | \begin{itemize} 127 | \item 128 | Two graphs $G$ and $G^\prime$ are isomorphic if there exists a bijective mapping from the labels of $G$ to the labels of $G^\prime$. 129 | 130 | \item 131 | The graph isomorphism problem is in NP, and no polynomial-time algorithm for it is known. 132 | 133 | \item 134 | The graph kernel introduced in this paper uses concepts from the \textit{Weisfeiler-Lehman test} of isomorphism. 
135 | \end{itemize} 136 | 137 | \end{frame} 138 | 139 | %------------------------------------------------ 140 | 141 | \begin{frame} 142 | \frametitle{Weisfeiler-Lehman test} 143 | 144 | \begin{itemize} 145 | \item 146 | Assume we are given two graphs $G$ and $G^\prime$ and we would like to test whether they are isomorphic. 147 | \item 148 | The Weisfeiler-Lehman test performs $h$ iterations. 149 | \item 150 | The key idea of the algorithm is to augment the node labels by the sorted set of node labels of neighbouring nodes, and compress these augmented labels into new, short labels. 151 | \item 152 | These steps are then repeated until the node label sets of $G$ and $G^\prime$ differ, or the number of iterations reaches $h$. 153 | \item 154 | The runtime complexity of the Weisfeiler-Lehman algorithm with $h$ iterations is $O(hk)$, where $k$ is the number of labels in $G$ and $G^\prime$. 155 | \end{itemize} 156 | 157 | \end{frame} 158 | 159 | %------------------------------------------------ 160 | 161 | \begin{frame} 162 | \frametitle{Weisfeiler-Lehman test} 163 | 164 | \begin{center} 165 | \begin{figure} 166 | \end{figure} 167 | \includegraphics[width=\textwidth]{img/wl_iteration_upper.png} 168 | \end{center} 169 | 170 | \end{frame} 171 | 172 | %------------------------------------------------ 173 | 174 | \begingroup 175 | \small 176 | \begin{frame} 177 | \frametitle{Weisfeiler-Lehman kernel} 178 | 179 | \begin{definition}[Weisfeiler-Lehman kernel] 180 | Let $G_i = (V, E, \ell_i)$ and $G_i^\prime = (V^\prime , E^\prime , \ell_i^\prime)$ be the $i$-th iteration rewriting of the graphs $G$ and $G^\prime$ with the Weisfeiler-Lehman algorithm and $h$ the number of iterations. 
Then the Weisfeiler-Lehman kernel is defined as: 181 | 182 | \begin{align} 183 | k_{\mathrm{WL}}^{h}\left(G, G^\prime\right)=\sum_{i=0}^h k_\delta\left(G_i, G_i^\prime\right) 184 | \end{align} 185 | 186 | where 187 | 188 | \begin{align} 189 | k_\delta\left((V, E, \ell),\left(V^\prime, E^\prime, \ell^\prime\right)\right)=\sum_{v \in V} \sum_{v^{\prime} \in V^{\prime}} \delta\left(\ell(v), \ell^{\prime}\left(v^{\prime}\right)\right) 190 | \end{align} 191 | 192 | Here $\delta$ is the Dirac kernel, which tests for equality: it is 1 if its arguments are equal, and 0 otherwise. 193 | \end{definition} 194 | 195 | \end{frame} 196 | \endgroup 197 | 198 | %------------------------------------------------ 199 | 200 | \begin{frame} 201 | \frametitle{Weisfeiler-Lehman Subtree Kernel} 202 | 203 | \begin{center} 204 | \begin{figure} 205 | \end{figure} 206 | \includegraphics[width=\textwidth,height=0.8\textheight,keepaspectratio]{img/wl_iteration_total.png} 207 | \end{center} 208 | 209 | \end{frame} 210 | 211 | %------------------------------------------------ 212 | 213 | \begin{frame} 214 | \frametitle{The Resource Description Framework} 215 | 216 | \begin{itemize} 217 | \item 218 | The Resource Description Framework (RDF) is the foundation for knowledge representation on the semantic web. 219 | 220 | \item 221 | It is based on the idea of making statements about resources in a \textit{subject-predicate-object} form, called \textit{triples}. 222 | 223 | \item 224 | A set of triples represents a graph that has subjects and objects as nodes and predicates as edges (note that it is a \textit{directed multigraph with labeled edges}). 
225 | \end{itemize} 226 | 227 | \end{frame} 228 | 229 | %------------------------------------------------ 230 | 231 | \begin{frame} 232 | \frametitle{The Resource Description Framework} 233 | 234 | \begin{center} 235 | \begin{figure} 236 | \includegraphics[scale=0.55,keepaspectratio]{img/07-Graph} 237 | \end{figure} 238 | \end{center} 239 | 240 | \end{frame} 241 | 242 | %------------------------------------------------ 243 | 244 | \begin{frame} 245 | \frametitle{Fast Weisfeiler-Lehman for RDF} 246 | 247 | \begin{itemize} 248 | \item 249 | The most immediate approach to apply graph kernels to RDF is to extract subgraphs for the instances that we are interested in and to compute the kernel on these subgraphs. 250 | 251 | \item 252 | Potentially it can be more efficient to do the kernel computation directly on the larger underlying RDF graph, instead of extracting many subgraphs. 253 | 254 | \item 255 | This paper proposes an approximation of the Weisfeiler-Lehman kernel designed for RDF data. 256 | \end{itemize} 257 | 258 | \end{frame} 259 | 260 | %------------------------------------------------ 261 | 262 | \begin{frame} 263 | \frametitle{Weisfeiler-Lehman RDF graph} 264 | 265 | \begin{definition}[Weisfeiler-Lehman RDF graph] 266 | A Weisfeiler-Lehman RDF graph is a graph $G = (V, E, \ell)$, where $V$ is a set of vertices, $E$ a set of directed edges, and $\ell:(V \cup E) \times \mathbb{N} \rightarrow \Sigma$ a labeling function from vertices $V$ or edges $E$ and a depth index $j \in \mathbb{N}$ to a set of labels $\Sigma$. 267 | \end{definition} 268 | 269 | \begin{definition}[Neighborhood] 270 | The neighborhood $N(v) = \{(v^\prime, v) \in E\}$ of a vertex is the set of edges going to the vertex $v$, and the neighborhood $N((v, v^\prime)) = \{v\}$ of an edge is the vertex that the edge comes from. 
271 | \end{definition} 272 | 273 | \end{frame} 274 | 275 | 276 | 277 | \begin{frame} 278 | \frametitle{Graph extraction from RDF} 279 | 280 | \begin{itemize} 281 | \item 282 | Given a set of RDF triples and a set of instances $I$, there is an algorithm to build a Weisfeiler-Lehman RDF graph. 283 | 284 | \item 285 | For each instance $i$ a subgraph up to depth $d$ is extracted from the RDF dataset and this subgraph is added to the total graph $G$ that the algorithm is building. Thus, vertices and edges are only added if they have not been added to the graph already. 286 | 287 | \item 288 | Alongside the graph $G$ we also construct mappings $\mathcal{V}_i$ and $\mathcal{E}_i$ for each instance $i$, which record which vertices and edges belong to the subgraph of instance $i$ and at which depth. 289 | \end{itemize} 290 | 291 | \end{frame} 292 | 293 | 294 | \begingroup 295 | \small 296 | \begin{frame} 297 | \frametitle{Graph extraction from RDF} 298 | 299 | Extraction of the instances A1 and B1. 300 | 301 | \begin{center} 302 | \begin{figure} 303 | \includegraphics[width=\textwidth,keepaspectratio]{img/07-subGraph_A1_B1} 304 | \end{figure} 305 | \end{center} 306 | 307 | \end{frame} 308 | \endgroup 309 | 310 | 311 | \begin{frame} 312 | \frametitle{Relabeling of the Weisfeiler-Lehman RDF graph} 313 | 314 | \begin{itemize} 315 | \item 316 | The relabeling process is quite similar to the standard one. 317 | 318 | \item 319 | It is extended to directed and labeled edges. 320 | 321 | \item 322 | The augmented labels are constructed taking into account the new definition of neighborhood and the depths. 323 | \end{itemize} 324 | 325 | \end{frame} 326 | 327 | %---------------------------------------------------------------------------- 328 | 329 | \begingroup 330 | \small 331 | \begin{frame} 332 | \frametitle{Relabeling of the Weisfeiler-Lehman RDF graph} 333 | 334 | Label propagation. 
335 | 336 | \begin{center} 337 | \begin{figure} 338 | \includegraphics[width=\textwidth,keepaspectratio]{img/07-almost_relabeled} 339 | \end{figure} 340 | \end{center} 341 | 342 | \end{frame} 343 | \endgroup 344 | 345 | %---------------------------------------------------------------------------- 346 | 347 | \begingroup 348 | \small 349 | \begin{frame} 350 | \frametitle{Relabeling of the Weisfeiler-Lehman RDF graph} 351 | 352 | Relabeling. 353 | 354 | \begin{center} 355 | \begin{figure} 356 | \includegraphics[width=\textwidth,keepaspectratio]{img/07-relabeled} 357 | \end{figure} 358 | \end{center} 359 | 360 | \end{frame} 361 | \endgroup 362 | 363 | %---------------------------------------------------------------------------- 364 | 365 | \begingroup 366 | \small 367 | \begin{frame} 368 | \frametitle{Weisfeiler-Lehman kernel for RDF} 369 | 370 | \begin{definition}[Weisfeiler-Lehman kernel for RDF] 371 | Let $G$ be a Weisfeiler-Lehman RDF graph rewritten for $h$ iterations, and let $\ell_0$ to $\ell_h$ be the resulting label functions. 
Then we compute a kernel between two instances $i$, $i^\prime \in I$, as: 372 | 373 | \begin{align} 374 | k_{\mathrm{WLRDF}}^{h}\left(i, i^{\prime}\right)=\sum_{n=0}^{h} \frac{n+1}{h+1} k_{\delta, \mathrm{RDF}}^{n}\left(\left(\mathcal{V}_{i}, \mathcal{E}_{i}\right),\left(\mathcal{V}_{i^{\prime}}, \mathcal{E}_{i^{\prime}}\right)\right) 375 | \end{align} 376 | 377 | where 378 | 379 | \begin{align} 380 | k_{\delta, \mathrm{RDF}}^{n}\left(\left(\mathcal{V}_{i}, \mathcal{E}_{i}\right),\left(\mathcal{V}_{i^{\prime}}, \mathcal{E}_{i^{\prime}}\right)\right) &=\sum_{(v, d) \in \mathcal{V}_{i}} \sum_{\left(v^{\prime}, d^{\prime}\right) \in \mathcal{V}_{i^{\prime}}} \delta\left(\ell_{n}(v, d), \ell_{n}\left(v^{\prime}, d^{\prime}\right)\right) \\ 381 | &+\sum_{(e, d) \in \mathcal{E}_{i}} \sum_{\left(e^{\prime}, d^{\prime}\right) \in \mathcal{E}_{i^{\prime}}} \delta\left(\ell_{n}(e, d), \ell_{n}\left(e^{\prime}, d^{\prime}\right)\right) 382 | \end{align} 383 | 384 | \end{definition} 385 | 386 | \end{frame} 387 | \endgroup 388 | 389 | %------------------------------------------------ 390 | 391 | \begingroup 392 | \footnotesize 393 | \begin{frame} 394 | \frametitle{Weisfeiler-Lehman kernel for RDF} 395 | 396 | \begin{exampleblock}{Example} 397 | \begin{align*} 398 | k_{\mathrm{WLRDF}}^h\left(A1, B1\right) &= \sum_{n=0}^h \frac{n+1}{h+1} k_{\delta, \mathrm{RDF}}^{n}\left(\left(\mathcal{V}_{A1}, \mathcal{E}_{A1}\right),\left(\mathcal{V}_{B1}, \mathcal{E}_{B1}\right)\right) = \dfrac{1}{2} \cdot 10 + \dfrac{2}{2} \cdot 3 = 8 399 | \end{align*} 400 | \end{exampleblock} 401 | 402 | \begin{figure} 403 | \centering 404 | \begin{subfigure}{.5\textwidth} 405 | \centering 406 | \includegraphics[width=0.9\linewidth]{img/07-subGraph_A1_B1_vertical} 407 | \end{subfigure}% 408 | \begin{subfigure}{.5\textwidth} 409 | \centering 410 | \includegraphics[width=0.9\linewidth]{img/07-relabeled_vertical} 411 | \end{subfigure} 412 | \end{figure} 413 | 414 | \end{frame} 415 | \endgroup 416 | 
%------------------------------------------------ 417 | 418 | \begin{frame} 419 | \frametitle{Complexity} 420 | 421 | \begin{itemize} 422 | \item 423 | The complexity of the standard relabeling algorithm on a set of graphs is $O(Nh(n + m))$, where $N$ is the number of graphs, $h$ is the number of iterations and $n$ and $m$ are the number of vertices and edges per graph. 424 | \item 425 | This new relabeling method does not have $N$ graphs, but it introduces $d$ labels per vertex/edge, where $d$ is the extraction depth. 426 | \item 427 | If the WL RDF graph has $k$ nodes and edges, the complexity of this new algorithm is $O(dhk)$. 428 | \item 429 | The new proposed method is faster than the regular one if $dk < N(n+m)$. 430 | \end{itemize} 431 | 432 | \end{frame} 433 | 434 | %------------------------------------------------ 435 | 436 | \begingroup 437 | \footnotesize 438 | \begin{frame} 439 | \frametitle{References} 440 | 441 | \begin{thebibliography}{99} % Beamer does not support BibTeX so references must be inserted manually as below 442 | 443 | \bibitem{lamport94} 444 | Vries Gerben Klaas Dirk, 445 | A Fast Approximation of the Weisfeiler-Lehman Graph Kernel for RDF Data, 446 | 2013 447 | 448 | \bibitem{wl-kernels} 449 | Shervashidze, N., Schweitzer, P., van Leeuwen, E.J., Mehlhorn, K., Borgwardt, K.M. 
450 | Weisfeiler-lehman graph kernels, 451 | 2011 452 | \end{thebibliography} 453 | 454 | \end{frame} 455 | \endgroup 456 | 457 | %------------------------------------------------ 458 | 459 | \end{document} 460 | -------------------------------------------------------------------------------- /report/RefereeReport.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzopalloni/wl-graph-kernels/ea046737f91017380090cb8d061efc1a44fefb5e/report/RefereeReport.pdf -------------------------------------------------------------------------------- /report/RefereeReport.tex: -------------------------------------------------------------------------------- 1 | \documentclass[12pt]{scrartcl} 2 | \usepackage[utf8]{inputenc} 3 | \usepackage{hyperref} 4 | \usepackage{booktabs} 5 | \usepackage{caption} 6 | \usepackage{graphicx} 7 | 8 | \begin{document} 9 | 10 | 11 | \title{A Fast Approximation of the Weisfeiler-Lehman Graph Kernel for RDF Data} 12 | \subtitle{Referee report} 13 | \author{ 14 | Emilio Cecchini \\ \href{mailto:emilio.cecchini@stud.unifi.it}{emilio.cecchini@stud.unifi.it} 15 | \and 16 | Lorenzo Palloni \\ \href{mailto:lorenzo.palloni@stud.unifi.it}{lorenzo.palloni@stud.unifi.it} 17 | } 18 | 19 | \maketitle 20 | 21 | \section{Summary} 22 | 23 | The goal of this paper is to introduce a faster version of the Weisfeiler-Lehman graph kernel algorithm when applied to Resource Description Framework (RDF) data. 24 | 25 | The \textit{Resource Description Framework} (RDF) is the foundation for knowledge representation on the semantic web. A resource is described by a set of \textit{triples} which are of the form \textit{subject-predicate-object}. The entire collection of triples form a graph where the subjects and the objects are the nodes and the predicates are the edges. 26 | 27 | The \textit{Weisfeiler-Lehman test} is an algorithm that is used to compute graph isomorphism. 
The test proceeds in iterations where the key idea is to augment the node labels by the sorted set of node labels of neighbouring nodes, and compress these augmented labels into new, short labels. These steps are then repeated until the node label sets of the two graphs differ, or the number of iterations reaches a predefined maximum. 28 | 29 | The \textit{Weisfeiler-Lehman kernel} is the state-of-the-art for graph kernels. It computes the number of subtrees shared between two graphs by using the Weisfeiler-Lehman test of graph isomorphism. 30 | 31 | This paper introduces an approximation of the Weisfeiler-Lehman kernel, which first extracts a set of subgraphs from the entire RDF graph and then computes the kernels. For each instance a subgraph up to a certain depth is extracted from the RDF dataset and this subgraph is added to a total graph that the extraction algorithm is building. Thus, vertices and edges are only added if they have not been added to the graph already. For each node and edge, together with their labels, their extraction depth is stored. The relabeling process is the same as in the standard Weisfeiler-Lehman test, extended to the labels on the edges. Finally, the kernel is computed by counting the number of common labels at each depth. 32 | 33 | \section{Evaluation} 34 | 35 | In this paper there is no formal theorem or proof. The author states that this kernel yields an approximation of the standard Weisfeiler-Lehman graph kernel, but he never actually gives any formal proof of the accuracy of that approximation. The comparison of the results with the standard Weisfeiler-Lehman graph kernel can be found only in the experiments section. However, there is a good explanation of how the complexity of the algorithm is improved with this approximation. 
36 | 37 | In the first experiment, where a classification on the SWRC ontology \cite{swrc} is performed, the author specifies that the \textit{affiliation} relation and its inverse (the \textit{employs} relation) were removed from the dataset for training purposes. We instead discovered that there are two other relationships that must be removed because they link the instances to their corresponding class: these relationships are \textit{member} and \textit{head}. The fact that these two predicates were not removed from the training dataset led to a higher accuracy than the real one. 38 | 39 | The plots of the runtime experiments are inverted: the lithogenesis dataset is about ten times larger than the affiliation dataset, but the reported runtimes of the lithogenesis classification are ten times smaller than the runtimes of the affiliation prediction. 40 | 41 | This paper proposes a new method for computing graph kernels, but it is limited only to RDF data. This method exploits the fact that usually, in the RDF graphs, the extracted subgraphs share many nodes and edges. This fact limits the number of scenarios in which the method is applicable with good results. 42 | 43 | The algorithm described in this paper is an approximation of the Weisfeiler-Lehman graph kernel proposed in \cite{wl-kernels}. The approximation algorithm is very similar to the standard one described in \cite{wl-kernels}; the only differences are that the label expansion process is also extended to the edges and the concept of \textit{depth} is introduced in order to have bigger graphs without storing duplicated nodes or edges. These two simple modifications seem to lead to a faster version, but there is not much innovation in this new proposed method. 44 | 45 | The proposed kernel method is a tool to perform machine learning algorithms on RDF data. 
There is a small section in the paper where the author introduces the \textit{Resource Description Framework}, but it is never clearly explained what it means to perform a classification on that kind of data. 46 | 47 | During the extraction process of the subgraphs of the instances, the algorithm keeps track of the extraction depth at which each node and edge was extracted. In the paper there is confusion about the order of the index of the depth. In the pseudocode of the algorithm the depth is counted backward, that is, the root has index equal to the maximum extraction depth while the leaves of the tree have depth equal to zero. In the explanation of the algorithm, however, the author describes the process with the indexes inverted. 48 | 49 | The datasets used in the experiments are still available online. There is a GitHub repository that contains the source code of the experiments but it is quite old and we were not able to compile and to run it. 50 | 51 | 52 | \section{Replication of the experiments} 53 | 54 | Since we were not able to compile and to run the experiments done by the author, we have implemented a small part of the experiments in order to assess the validity of the results in the paper. We have implemented the standard Weisfeiler-Lehman graph kernel and its approximation proposed in the paper. The source code for the kernels and the experiments is available online\footnote{\url{https://github.com/deeplego/wl-graph-kernels}}. We have performed a classification on the AIFB dataset \cite{swrc} and the `Named Rock Units' dataset of the British Geological Survey. We have used the C-Support Vector Machine algorithm found in the scikit-learn Python package. We have tried to compute the accuracy of the classification with the same method described in the paper, that is, a 10-fold cross-validation; however, we slightly simplified the process of computing the accuracy of the model in relation to the C parameter of the SVM. 
We executed a 10-fold cross-validation for each value of C in $\{10^{-3}, 10^{-2}, 10^{-1}, 1, 10^1, 10^2, 10^3\}$ and then we took the best accuracy value. The results of the classifications are reported in the tables below. 55 | 56 | \newpage 57 | 58 | \begin{center} 59 | \captionof{table}{Affiliation prediction with the standard Weisfeiler-Lehman kernel} 60 | \begin{tabular}{ccccc} 61 | \toprule 62 | depth & iterations & accuracy & f1 & C \\ 63 | \midrule 64 | 1 & 0 & 0.842337 & 0.772552 & 100.0 \\ 65 | & 2 & 0.836782 & 0.755789 & 100.0 \\ 66 | & 4 & 0.836782 & 0.755789 & 100.0 \\ 67 | & 6 & 0.836782 & 0.755789 & 100.0 \\ 68 | \hline 69 | 2 & 0 & 0.892516 & 0.836455 & 100.0 \\ 70 | & 2 & 0.826180 & 0.742251 & 100.0 \\ 71 | & 4 & 0.774069 & 0.618519 & 100.0 \\ 72 | & 6 & 0.740048 & 0.568392 & 100.0 \\ 73 | \hline 74 | 3 & 0 & 0.892591 & 0.850147 & 100.0 \\ 75 | & 2 & 0.897779 & 0.848919 & 100.0 \\ 76 | & 4 & 0.909258 & 0.860964 & 100.0 \\ 77 | & 6 & 0.881044 & 0.796105 & 100.0 \\ 78 | \bottomrule 79 | \end{tabular} 80 | \end{center} 81 | 82 | \begin{center} 83 | \captionof{table}{Affiliation prediction with the Weisfeiler-Lehman for RDF} 84 | \begin{tabular}{ccccc} 85 | \toprule 86 | depth & iterations & accuracy & f1 & C \\ 87 | \midrule 88 | 1 & 0 & 0.881955 & 0.795756 & 100.0 \\ 89 | & 2 & 0.881955 & 0.795756 & 100.0 \\ 90 | & 4 & 0.881955 & 0.795756 & 100.0 \\ 91 | & 6 & 0.881955 & 0.795756 & 100.0 \\ 92 | \hline 93 | 2 & 0 & 0.892114 & 0.826007 & 100.0 \\ 94 | & 2 & 0.880057 & 0.812488 & 100.0 \\ 95 | & 4 & 0.874501 & 0.803701 & 100.0 \\ 96 | & 6 & 0.874501 & 0.800821 & 100.0 \\ 97 | \hline 98 | 3 & 0 & 0.879579 & 0.812187 & 100.0 \\ 99 | & 2 & 0.913751 & 0.867388 & 100.0 \\ 100 | & 4 & 0.908196 & 0.863829 & 100.0 \\ 101 | & 6 & 0.908196 & 0.863829 & 100.0 \\ 102 | \bottomrule 103 | \end{tabular} 104 | \end{center} 105 | 106 | \newpage 107 | 108 | \begin{center} 109 | \captionof{table}{Lithogenesis prediction with the standard Weisfeiler-Lehman kernel} 110 | 
\begin{tabular}{ccccc} 111 | \toprule 112 | depth & iterations & accuracy & f1 & C \\ 113 | \midrule 114 | 1 & 0 & 0.802679 & 0.774383 & 10.0 \\ 115 | & 2 & 0.796429 & 0.768842 & 10.0 \\ 116 | & 4 & 0.796429 & 0.768842 & 10.0 \\ 117 | & 6 & 0.796429 & 0.768842 & 10.0 \\ 118 | \hline 119 | 2 & 0 & 0.891964 & 0.877311 & 100.0 \\ 120 | & 2 & 0.892857 & 0.874092 & 1.0 \\ 121 | & 4 & 0.873214 & 0.854485 & 1.0 \\ 122 | & 6 & 0.865179 & 0.841353 & 1.0 \\ 123 | \hline 124 | 3 & 0 & 0.883929 & 0.871406 & 100.0 \\ 125 | & 2 & 0.913393 & 0.898291 & 1.0 \\ 126 | & 4 & 0.906250 & 0.890922 & 1.0 \\ 127 | & 6 & 0.906250 & 0.890922 & 1.0 \\ 128 | \bottomrule 129 | \end{tabular} 130 | \end{center} 131 | 132 | \begin{center} 133 | \captionof{table}{Lithogenesis prediction with the Weisfeiler-Lehman kernel for RDF} 134 | \begin{tabular}{ccccc} 135 | \toprule 136 | depth & iterations & accuracy & f1 & C \\ 137 | \midrule 138 | 1 & 0 & 0.795536 & 0.763739 & 10.0 \\ 139 | & 2 & 0.795536 & 0.763739 & 10.0 \\ 140 | & 4 & 0.795536 & 0.763739 & 10.0 \\ 141 | & 6 & 0.795536 & 0.763739 & 10.0 \\ 142 | \hline 143 | 2 & 0 & 0.906250 & 0.891229 & 100.0 \\ 144 | & 2 & 0.892857 & 0.874092 & 1.0 \\ 145 | & 4 & 0.892857 & 0.874092 & 1.0 \\ 146 | & 6 & 0.885714 & 0.866606 & 1.0 \\ 147 | \hline 148 | 3 & 0 & 0.891071 & 0.875862 & 100.0 \\ 149 | & 2 & 0.891964 & 0.873422 & 1.0 \\ 150 | & 4 & 0.906250 & 0.890104 & 1.0 \\ 151 | & 6 & 0.907143 & 0.888829 & 1.0 \\ 152 | \bottomrule 153 | \end{tabular} 154 | \end{center} 155 | 156 | The accuracy values are almost the same as those reported in the paper. There is not much difference between the standard Weisfeiler-Lehman algorithm and its approximation in terms of accuracy. 157 | 158 | We have also replicated the experiment of the affiliation prediction where all the labels were removed from the graph. The results are given in the two tables below. This is the best scenario for the Weisfeiler-Lehman kernel for RDF data. 
As reported in the paper, these results are very similar to the performance on labeled graphs. 159 | 160 | \newpage 161 | 162 | \begin{center} 163 | \captionof{table}{Affiliation prediction with the standard Weisfeiler-Lehman kernel with all labels removed} 164 | \begin{tabular}{ccccc} 165 | \toprule 166 | depth & iterations & accuracy & f1 & C \\ 167 | \midrule 168 | 1 & 0 & 0.322153 & 0.194477 & 100.0 \\ 169 | & 2 & 0.530111 & 0.348672 & 10.0 \\ 170 | & 4 & 0.530111 & 0.347049 & 10.0 \\ 171 | & 6 & 0.530111 & 0.347049 & 10.0 \\ 172 | \hline 173 | 2 & 0 & 0.564547 & 0.355253 & 10.0 \\ 174 | & 2 & 0.503724 & 0.343148 & 1.0 \\ 175 | & 4 & 0.481437 & 0.392366 & 100.0 \\ 176 | & 6 & 0.502999 & 0.383461 & 1.0 \\ 177 | \hline 178 | 3 & 0 & 0.491697 & 0.343404 & 100.0 \\ 179 | & 2 & 0.641333 & 0.527556 & 100.0 \\ 180 | & 4 & 0.724551 & 0.602677 & 10.0 \\ 181 | & 6 & 0.713474 & 0.557335 & 100.0 \\ 182 | \bottomrule 183 | \end{tabular} 184 | \end{center} 185 | 186 | \begin{center} 187 | \captionof{table}{Affiliation prediction with the Weisfeiler-Lehman kernel for RDF with all labels removed} 188 | \begin{tabular}{ccccc} 189 | \toprule 190 | depth & iterations & accuracy & f1 & C \\ 191 | \midrule 192 | 1 & 0 & 0.524847 & 0.305547 & 100.0 \\ 193 | & 2 & 0.647536 & 0.566394 & 100.0 \\ 194 | & 4 & 0.670780 & 0.591060 & 100.0 \\ 195 | & 6 & 0.677030 & 0.594329 & 100.0 \\ 196 | \hline 197 | 2 & 0 & 0.565936 & 0.340732 & 10.0 \\ 198 | & 2 & 0.681422 & 0.622212 & 100.0 \\ 199 | & 4 & 0.740048 & 0.663960 & 100.0 \\ 200 | & 6 & 0.762597 & 0.688069 & 100.0 \\ 201 | \hline 202 | 3 & 0 & 0.407394 & 0.293320 & 1.0 \\ 203 | & 2 & 0.898914 & 0.861681 & 100.0 \\ 204 | & 4 & 0.892079 & 0.854304 & 100.0 \\ 205 | & 6 & 0.893066 & 0.851358 & 10.0 \\ 206 | \bottomrule 207 | \end{tabular} 208 | \end{center} 209 | 210 | \newpage 211 | 212 | Since this new method is supposed to be faster, we also replicated the experiments on the runtimes. 
The Weisfeiler-Lehman for RDF method is slightly faster than the regular one, but we were not able to see as large an improvement in the runtime as reported in the paper. 213 | 214 | \begin{center} 215 | \begin{figure}[h] 216 | \caption{Runtimes of the two kernels on the affiliation dataset} 217 | \includegraphics[width=\textwidth]{img/affiliation_timing.png} 218 | \end{figure} 219 | \end{center} 220 | 221 | \begin{center} 222 | \begin{figure}[h] 223 | \caption{Runtimes of the two kernels on the lithogenesis dataset} 224 | \includegraphics[width=\textwidth]{img/lithogenesis_timing.png} 225 | \end{figure} 226 | \end{center} 227 | 228 | \newpage 229 | 230 | \begin{thebibliography}{9} 231 | 232 | \bibitem{lamport94} 233 | Vries Gerben Klaas Dirk, 234 | A Fast Approximation of the Weisfeiler-Lehman Graph Kernel for RDF Data, 235 | 2013 236 | 237 | \bibitem{swrc} 238 | Sure, Y., Bloehdorn, S., Haase, P., Hartmann, J., Oberle, D., 239 | The swrc ontology - semantic web for research communities. 240 | Volume 3803 of LNCS., Covilha, 241 | Portugal, Springer (Dezember 2005) 218 – 231 242 | 243 | \bibitem{wl-kernels} 244 | Shervashidze, N., Schweitzer, P., van Leeuwen, E.J., Mehlhorn, K., Borgwardt, K.M. 
245 | Weisfeiler-lehman graph kernels, 246 | 2011 247 | 248 | \end{thebibliography} 249 | 250 | \end{document} 251 | -------------------------------------------------------------------------------- /report/img/affiliation_timing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzopalloni/wl-graph-kernels/ea046737f91017380090cb8d061efc1a44fefb5e/report/img/affiliation_timing.png -------------------------------------------------------------------------------- /report/img/lithogenesis_timing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzopalloni/wl-graph-kernels/ea046737f91017380090cb8d061efc1a44fefb5e/report/img/lithogenesis_timing.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | sklearn 2 | numpy 3 | nptyping 4 | rdflib 5 | path.py 6 | pytest 7 | pytest-cov 8 | -------------------------------------------------------------------------------- /results/affiliation_timing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzopalloni/wl-graph-kernels/ea046737f91017380090cb8d061efc1a44fefb5e/results/affiliation_timing.png -------------------------------------------------------------------------------- /results/csv_to_latex.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | 3 | from path import Path 4 | import pandas as pd 5 | 6 | 7 | def convert(fn): 8 | df = pd.read_csv(fn, index_col=['depth', 'iterations']) 9 | df.to_latex(f'{fn.stripext()}.tex') 10 | 11 | 12 | if __name__ == '__main__': 13 | parser = ArgumentParser() 14 | parser.add_argument('--file', '-f', type=str) 15 | flags = parser.parse_args() 16 | convert(Path(flags.file)) 
17 | -------------------------------------------------------------------------------- /results/lithogenesis_timing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzopalloni/wl-graph-kernels/ea046737f91017380090cb8d061efc1a44fefb5e/results/lithogenesis_timing.png -------------------------------------------------------------------------------- /results/wl_affiliation_results.csv: -------------------------------------------------------------------------------- 1 | depth,iterations,accuracy,f1,C 2 | 1,0,0.7564907980736154,0.6433234000261864,1.0 3 | 1,2,0.7735251117991055,0.6534813180885163,1.0 4 | 1,4,0.7735251117991055,0.6534813180885163,1.0 5 | 1,6,0.7735251117991055,0.6534813180885163,1.0 6 | 2,0,0.8082408840729274,0.7098104342338087,1.0 7 | 2,2,0.7577700378396972,0.6579990189549013,1.0 8 | 2,4,0.7183350533195736,0.5632661696748384,1.0 9 | 2,6,0.6957451840385277,0.5284068915727786,0.1 10 | 3,0,0.8545042139662883,0.7886428236795884,10.0 11 | 3,2,0.8531217750257998,0.8065749405822935,0.001 12 | 3,4,0.832546439628483,0.7681329409754333,1.0 13 | 3,6,0.79578173374613,0.7226295853269538,0.001 14 | -------------------------------------------------------------------------------- /results/wl_affiliation_results_with_normalization.csv: -------------------------------------------------------------------------------- 1 | depth,iterations,accuracy,f1,C 2 | 1,0,0.881955194358445,0.795756261282577,100.0 3 | 1,2,0.8687607499140008,0.7886729279492436,100.0 4 | 1,4,0.8687607499140008,0.7886729279492436,100.0 5 | 1,6,0.8687607499140008,0.7886729279492436,100.0 6 | 2,0,0.8868507051943585,0.8197871572871573,100.0 7 | 2,2,0.8581269349845201,0.7815634647000745,100.0 8 | 2,4,0.7704463364293086,0.6042463799100796,100.0 9 | 2,6,0.7527584279325765,0.5791451324733059,100.0 10 | 3,0,0.8848426212590299,0.8184077034077035,100.0 11 | 3,2,0.8908002235982112,0.8246224200635967,100.0 12 | 
3,4,0.8973426212590299,0.840693848635025,100.0 13 | 3,6,0.8963557791537667,0.8213432686594452,100.0 14 | -------------------------------------------------------------------------------- /results/wl_affiliation_results_with_normalization.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{llrrr} 2 | \toprule 3 | & & accuracy & f1 & C \\ 4 | depth & iterations & & & \\ 5 | \midrule 6 | 1 & 0 & 0.842337 & 0.772552 & 100.0 \\ 7 | & 2 & 0.836782 & 0.755789 & 100.0 \\ 8 | & 4 & 0.836782 & 0.755789 & 100.0 \\ 9 | & 6 & 0.836782 & 0.755789 & 100.0 \\ 10 | 2 & 0 & 0.892516 & 0.836455 & 100.0 \\ 11 | & 2 & 0.826180 & 0.742251 & 100.0 \\ 12 | & 4 & 0.774069 & 0.618519 & 100.0 \\ 13 | & 6 & 0.740048 & 0.568392 & 100.0 \\ 14 | 3 & 0 & 0.892591 & 0.850147 & 100.0 \\ 15 | & 2 & 0.897779 & 0.848919 & 100.0 \\ 16 | & 4 & 0.909258 & 0.860964 & 100.0 \\ 17 | & 6 & 0.881044 & 0.796105 & 100.0 \\ 18 | \bottomrule 19 | \end{tabular} 20 | -------------------------------------------------------------------------------- /results/wl_lithogenesis_results_with_normalization.csv: -------------------------------------------------------------------------------- 1 | depth,iterations,accuracy,f1,C 2 | 1,0,0.8026785714285714,0.7743826999976886,10.0 3 | 1,2,0.7964285714285714,0.7688415744565631,10.0 4 | 1,4,0.7964285714285714,0.7688415744565631,10.0 5 | 1,6,0.7964285714285714,0.7688415744565631,10.0 6 | 2,0,0.8919642857142858,0.8773113283868433,100.0 7 | 2,2,0.8928571428571429,0.8740923537433837,1.0 8 | 2,4,0.8732142857142857,0.8544845106061286,1.0 9 | 2,6,0.8651785714285714,0.841352739556401,1.0 10 | 3,0,0.8839285714285715,0.8714062451136755,100.0 11 | 3,2,0.9133928571428571,0.8982908605505859,1.0 12 | 3,4,0.90625,0.8909224394979542,1.0 13 | 3,6,0.90625,0.8909224394979542,1.0 14 | -------------------------------------------------------------------------------- /results/wl_lithogenesis_results_with_normalization.tex: 
-------------------------------------------------------------------------------- 1 | \begin{tabular}{llrrr} 2 | \toprule 3 | & & accuracy & f1 & C \\ 4 | depth & iterations & & & \\ 5 | \midrule 6 | 1 & 0 & 0.802679 & 0.774383 & 10.0 \\ 7 | & 2 & 0.796429 & 0.768842 & 10.0 \\ 8 | & 4 & 0.796429 & 0.768842 & 10.0 \\ 9 | & 6 & 0.796429 & 0.768842 & 10.0 \\ 10 | 2 & 0 & 0.891964 & 0.877311 & 100.0 \\ 11 | & 2 & 0.892857 & 0.874092 & 1.0 \\ 12 | & 4 & 0.873214 & 0.854485 & 1.0 \\ 13 | & 6 & 0.865179 & 0.841353 & 1.0 \\ 14 | 3 & 0 & 0.883929 & 0.871406 & 100.0 \\ 15 | & 2 & 0.913393 & 0.898291 & 1.0 \\ 16 | & 4 & 0.906250 & 0.890922 & 1.0 \\ 17 | & 6 & 0.906250 & 0.890922 & 1.0 \\ 18 | \bottomrule 19 | \end{tabular} 20 | -------------------------------------------------------------------------------- /results/wl_no_labels.csv: -------------------------------------------------------------------------------- 1 | depth,iterations,accuracy,f1,C 2 | 1,0,0.3221534227726178,0.19447672210830105,100.0 3 | 1,2,0.530110509115927,0.34867194939563356,10.0 4 | 1,4,0.530110509115927,0.3470491423780897,10.0 5 | 1,6,0.530110509115927,0.3470491423780897,10.0 6 | 2,0,0.564546783625731,0.35525302548328863,10.0 7 | 2,2,0.5037237702098383,0.3431478203169379,1.0 8 | 2,4,0.481437048503612,0.3923659673659673,100.0 9 | 2,6,0.502999226006192,0.38346129360835246,1.0 10 | 3,0,0.49169676642586857,0.34340422713681223,100.0 11 | 3,2,0.6413334193326453,0.527556055056055,100.0 12 | 3,4,0.7245506535947712,0.6026768084856319,10.0 13 | 3,6,0.7134739422084623,0.5573347090645852,100.0 14 | -------------------------------------------------------------------------------- /results/wl_no_labels.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{llrrr} 2 | \toprule 3 | & & accuracy & f1 & C \\ 4 | depth & iterations & & & \\ 5 | \midrule 6 | 1 & 0 & 0.322153 & 0.194477 & 100.0 \\ 7 | & 2 & 0.530111 & 0.348672 & 10.0 \\ 8 | & 4 & 0.530111 & 0.347049 & 10.0 \\ 
9 | & 6 & 0.530111 & 0.347049 & 10.0 \\ 10 | 2 & 0 & 0.564547 & 0.355253 & 10.0 \\ 11 | & 2 & 0.503724 & 0.343148 & 1.0 \\ 12 | & 4 & 0.481437 & 0.392366 & 100.0 \\ 13 | & 6 & 0.502999 & 0.383461 & 1.0 \\ 14 | 3 & 0 & 0.491697 & 0.343404 & 100.0 \\ 15 | & 2 & 0.641333 & 0.527556 & 100.0 \\ 16 | & 4 & 0.724551 & 0.602677 & 10.0 \\ 17 | & 6 & 0.713474 & 0.557335 & 100.0 \\ 18 | \bottomrule 19 | \end{tabular} 20 | -------------------------------------------------------------------------------- /results/wlrdf_affiliation_results.csv: -------------------------------------------------------------------------------- 1 | depth,iterations,accuracy,f1,C 2 | 1,0,0.7848877708978328,0.657970231522863,10.0 3 | 1,2,0.7848877708978328,0.657970231522863,10.0 4 | 1,4,0.7848877708978328,0.657970231522863,1.0 5 | 1,6,0.7848877708978328,0.657970231522863,1.0 6 | 2,0,0.8262555899552803,0.7310782851049878,1.0 7 | 2,2,0.7898929308565531,0.6821006728050072,1.0 8 | 2,4,0.7960676814585483,0.684156578500619,1.0 9 | 2,6,0.8023176814585483,0.6868838512278915,0.1 10 | 3,0,0.8416430168558652,0.7626665813546618,0.01 11 | 3,2,0.897703818369453,0.8631294273322137,0.001 12 | 3,4,0.8924406604747162,0.8590949650624573,0.001 13 | 3,6,0.8806759545923633,0.839124102591595,0.001 14 | -------------------------------------------------------------------------------- /results/wlrdf_affiliation_results_with_normalization.csv: -------------------------------------------------------------------------------- 1 | depth,iterations,accuracy,f1,C 2 | 1,0,0.881955194358445,0.795756261282577,100.0 3 | 1,2,0.881955194358445,0.795756261282577,100.0 4 | 1,4,0.881955194358445,0.795756261282577,100.0 5 | 1,6,0.881955194358445,0.795756261282577,100.0 6 | 2,0,0.8921138630890952,0.8260073953823953,100.0 7 | 2,2,0.8800567595459237,0.8124879573041339,100.0 8 | 2,4,0.8745012039903681,0.8037011925982516,100.0 9 | 2,6,0.8745012039903681,0.8008212906374672,100.0 10 | 3,0,0.879579463364293,0.8121874653124653,100.0 11 | 
3,2,0.9137512899896801,0.8673881673881674,100.0 12 | 3,4,0.9081957344341245,0.8638286754095578,100.0 13 | 3,6,0.9081957344341245,0.8638286754095578,100.0 14 | -------------------------------------------------------------------------------- /results/wlrdf_affiliation_results_with_normalization.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{llrrr} 2 | \toprule 3 | & & accuracy & f1 & C \\ 4 | depth & iterations & & & \\ 5 | \midrule 6 | 1 & 0 & 0.881955 & 0.795756 & 100.0 \\ 7 | & 2 & 0.881955 & 0.795756 & 100.0 \\ 8 | & 4 & 0.881955 & 0.795756 & 100.0 \\ 9 | & 6 & 0.881955 & 0.795756 & 100.0 \\ 10 | 2 & 0 & 0.892114 & 0.826007 & 100.0 \\ 11 | & 2 & 0.880057 & 0.812488 & 100.0 \\ 12 | & 4 & 0.874501 & 0.803701 & 100.0 \\ 13 | & 6 & 0.874501 & 0.800821 & 100.0 \\ 14 | 3 & 0 & 0.879579 & 0.812187 & 100.0 \\ 15 | & 2 & 0.913751 & 0.867388 & 100.0 \\ 16 | & 4 & 0.908196 & 0.863829 & 100.0 \\ 17 | & 6 & 0.908196 & 0.863829 & 100.0 \\ 18 | \bottomrule 19 | \end{tabular} 20 | -------------------------------------------------------------------------------- /results/wlrdf_lithogenesis_results.csv: -------------------------------------------------------------------------------- 1 | depth,iterations,accuracy,f1,C 2 | 1,0,0.7946428571428571,0.7668365041391357,0.001 3 | 1,2,0.7946428571428571,0.7668365041391357,0.001 4 | 1,4,0.7946428571428571,0.7668365041391357,0.001 5 | 1,6,0.7946428571428571,0.7668365041391357,0.001 6 | 2,0,0.8991071428571429,0.8823099993065668,0.001 7 | 2,2,0.8571428571428571,0.8347124068405533,0.001 8 | 2,4,0.8571428571428571,0.8347124068405533,0.001 9 | 2,6,0.8571428571428571,0.8347124068405533,0.001 10 | 3,0,0.8866071428571429,0.8704565801079465,0.001 11 | 3,2,0.8928571428571427,0.8762965244773024,0.001 12 | 3,4,0.8857142857142856,0.8681093899743786,0.001 13 | 3,6,0.8857142857142856,0.8681093899743786,0.001 14 | -------------------------------------------------------------------------------- 
/results/wlrdf_lithogenesis_results_with_normalization.csv: -------------------------------------------------------------------------------- 1 | depth,iterations,accuracy,f1,C 2 | 1,0,0.7955357142857142,0.7637394251438875,10.0 3 | 1,2,0.7955357142857142,0.7637394251438875,10.0 4 | 1,4,0.7955357142857142,0.7637394251438875,10.0 5 | 1,6,0.7955357142857142,0.7637394251438875,10.0 6 | 2,0,0.90625,0.891229457041814,100.0 7 | 2,2,0.8928571428571429,0.8740923537433837,1.0 8 | 2,4,0.8928571428571429,0.8740923537433837,1.0 9 | 2,6,0.8857142857142858,0.8666057227273407,1.0 10 | 3,0,0.8910714285714286,0.8758615567439098,100.0 11 | 3,2,0.8919642857142858,0.8734224394979544,1.0 12 | 3,4,0.90625,0.8901037260476619,1.0 13 | 3,6,0.9071428571428571,0.8888291958486466,1.0 14 | -------------------------------------------------------------------------------- /results/wlrdf_lithogenesis_results_with_normalization.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{llrrr} 2 | \toprule 3 | & & accuracy & f1 & C \\ 4 | depth & iterations & & & \\ 5 | \midrule 6 | 1 & 0 & 0.795536 & 0.763739 & 10.0 \\ 7 | & 2 & 0.795536 & 0.763739 & 10.0 \\ 8 | & 4 & 0.795536 & 0.763739 & 10.0 \\ 9 | & 6 & 0.795536 & 0.763739 & 10.0 \\ 10 | 2 & 0 & 0.906250 & 0.891229 & 100.0 \\ 11 | & 2 & 0.892857 & 0.874092 & 1.0 \\ 12 | & 4 & 0.892857 & 0.874092 & 1.0 \\ 13 | & 6 & 0.885714 & 0.866606 & 1.0 \\ 14 | 3 & 0 & 0.891071 & 0.875862 & 100.0 \\ 15 | & 2 & 0.891964 & 0.873422 & 1.0 \\ 16 | & 4 & 0.906250 & 0.890104 & 1.0 \\ 17 | & 6 & 0.907143 & 0.888829 & 1.0 \\ 18 | \bottomrule 19 | \end{tabular} 20 | -------------------------------------------------------------------------------- /results/wlrdf_no_labels.csv: -------------------------------------------------------------------------------- 1 | depth,iterations,accuracy,f1,C 2 | 1,0,0.5248473512211902,0.30554684499217744,100.0 3 | 1,2,0.6475361197110423,0.5663935370185369,100.0 4 | 
1,4,0.6707795837633299,0.5910602591852591,100.0 5 | 1,6,0.6770295837633299,0.59432948995449,100.0 6 | 2,0,0.5659356725146198,0.34073240549440237,10.0 7 | 2,2,0.6814219986240111,0.622212370962371,100.0 8 | 2,4,0.7400477296181631,0.6639600024158847,100.0 9 | 2,6,0.7625967492260062,0.6880690877749702,100.0 10 | 3,0,0.4073937908496732,0.2933196589272441,1.0 11 | 3,2,0.8989142586859306,0.8616813859944665,100.0 12 | 3,4,0.8920794633642931,0.8543043879924686,100.0 13 | 3,6,0.8930663054695562,0.85135795942104,10.0 14 | -------------------------------------------------------------------------------- /results/wlrdf_no_labels.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{llrrr} 2 | \toprule 3 | & & accuracy & f1 & C \\ 4 | depth & iterations & & & \\ 5 | \midrule 6 | 1 & 0 & 0.524847 & 0.305547 & 100.0 \\ 7 | & 2 & 0.647536 & 0.566394 & 100.0 \\ 8 | & 4 & 0.670780 & 0.591060 & 100.0 \\ 9 | & 6 & 0.677030 & 0.594329 & 100.0 \\ 10 | 2 & 0 & 0.565936 & 0.340732 & 10.0 \\ 11 | & 2 & 0.681422 & 0.622212 & 100.0 \\ 12 | & 4 & 0.740048 & 0.663960 & 100.0 \\ 13 | & 6 & 0.762597 & 0.688069 & 100.0 \\ 14 | 3 & 0 & 0.407394 & 0.293320 & 1.0 \\ 15 | & 2 & 0.898914 & 0.861681 & 100.0 \\ 16 | & 4 & 0.892079 & 0.854304 & 100.0 \\ 17 | & 6 & 0.893066 & 0.851358 & 10.0 \\ 18 | \bottomrule 19 | \end{tabular} 20 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | from wlkernel import __version__ 4 | 5 | 6 | setup( 7 | name='wlkernel', 8 | version=__version__, 9 | description='Weisfeiler-Lehman kernel for RDF graphs', 10 | packages=find_packages(exclude=['tests']), 11 | ) 12 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lorenzopalloni/wl-graph-kernels/ea046737f91017380090cb8d061efc1a44fefb5e/tests/__init__.py -------------------------------------------------------------------------------- /tests/resources/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzopalloni/wl-graph-kernels/ea046737f91017380090cb8d061efc1a44fefb5e/tests/resources/__init__.py -------------------------------------------------------------------------------- /tests/resources/example.ttl: -------------------------------------------------------------------------------- 1 | 'A1' 'P2' 'C' . 2 | 'A1' 'P3' 'D' . 3 | 'A2' 'P2' 'D' . 4 | 'A2' 'P3' 'E' . 5 | 'B2' 'P3' 'E' . 6 | 'B2' 'P2' 'F' . 7 | 'B1' 'P3' 'F' . 8 | 'B1' 'P2' 'G' . 9 | 'C' 'P4' 'H' . 10 | 'D' 'P4' 'H' . 11 | 'F' 'P5' 'I' . 12 | 'G' 'P5' 'I' . 13 | 'H' 'P6' 'A2' . 14 | 'I' 'P6' 'B2' . 15 | -------------------------------------------------------------------------------- /tests/wlkernel_test.py: -------------------------------------------------------------------------------- 1 | from os.path import abspath 2 | from pkg_resources import resource_filename 3 | 4 | import pytest 5 | import rdflib 6 | 7 | import wlkernel 8 | 9 | 10 | example_data = abspath(resource_filename('tests.resources', 'example.ttl')) 11 | 12 | 13 | def test_node_hash(): 14 | n1 = wlkernel.Node() 15 | n1_bis = n1 16 | n2 = wlkernel.Node() 17 | n2_bis = n2 18 | assert hash(n1) != hash(n2) 19 | assert hash(n1) == hash(n1_bis) 20 | assert hash(n2) == hash(n2_bis) 21 | 22 | 23 | def test_edge_hash(): 24 | e1 = wlkernel.Edge() 25 | e1_bis = e1 26 | e2 = wlkernel.Edge() 27 | e2_bis = e2 28 | assert hash(e1) != hash(e2) 29 | assert hash(e1) == hash(e1_bis) 30 | assert hash(e2) == hash(e2_bis) 31 | 32 | 33 | def test_wlgraph_depth_0(): 34 | ''' 35 | ###### 36 | # A1 # 37 | ###### 38 | ''' 39 | rdf_graph = rdflib.Graph().parse(example_data, format='turtle') 40 | triples = 
((str(s), str(p), str(o)) for s, p, o in rdf_graph) 41 | wl_graph = wlkernel.WLGraph(triples, 'A1', 0) 42 | assert len(wl_graph.nodes) == 1 43 | assert len(wl_graph.edges) == 0 44 | assert len(wl_graph.labels) == 1 45 | assert len(wl_graph.labels[0]) == len(wl_graph.nodes) + len(wl_graph.edges) 46 | 47 | 48 | def test_wlgraph_depth_1(): 49 | r''' 50 | ###### 51 | # A1 # 52 | ###### 53 | / \ 54 | P2 / \ P3 55 | / \ 56 | ##### ##### 57 | # C # # D # 58 | ##### ##### 59 | ''' 60 | rdf_graph = rdflib.Graph().parse(example_data, format='turtle') 61 | triples = ((str(s), str(p), str(o)) for s, p, o in rdf_graph) 62 | wl_graph = wlkernel.WLGraph(triples, 'A1', 1) 63 | assert len(wl_graph.nodes) == 3 64 | assert len(wl_graph.edges) == 2 65 | assert len(wl_graph.labels) == 1 66 | assert len(wl_graph.labels[0]) == len(wl_graph.nodes) + len(wl_graph.edges) 67 | 68 | 69 | def test_wlgraph_depth_2(): 70 | r''' 71 | ###### 72 | # A1 # 73 | ###### 74 | / \ 75 | P2 / \ P3 76 | / \ 77 | ##### ##### 78 | # C # # D # 79 | ##### ##### 80 | \ / 81 | P4 \ / P4 82 | \ / 83 | ##### 84 | # H # 85 | ##### 86 | ''' 87 | rdf_graph = rdflib.Graph().parse(example_data, format='turtle') 88 | triples = ((str(s), str(p), str(o)) for s, p, o in rdf_graph) 89 | wl_graph = wlkernel.WLGraph(triples, 'A1', 2) 90 | assert len(wl_graph.nodes) == 4 91 | assert len(wl_graph.edges) == 4 92 | assert len(wl_graph.labels) == 1 93 | assert len(wl_graph.labels[0]) == len(wl_graph.nodes) + len(wl_graph.edges) 94 | 95 | 96 | def test_wlgraph_depth_4(): 97 | r''' 98 | ###### 99 | # A1 # 100 | ###### 101 | / \ 102 | P2 / \ P3 103 | / \ 104 | ##### ##### 105 | # C # # D #<---- 106 | ##### ##### | 107 | \ / | 108 | P4 \ / P4 | 109 | \ / | 110 | ##### | 111 | # H # | P2 112 | ##### | 113 | | | 114 | | P6 | 115 | | | 116 | ###### | 117 | # A2 #----------- 118 | ###### 119 | | 120 | | P3 121 | | 122 | ##### 123 | # E # 124 | ##### 125 | ''' 126 | rdf_graph = rdflib.Graph().parse(example_data, format='turtle') 127 | 
triples = ((str(s), str(p), str(o)) for s, p, o in rdf_graph) 128 | wl_graph = wlkernel.WLGraph(triples, 'A1', 4) 129 | assert len(wl_graph.nodes) == 6 130 | assert len(wl_graph.edges) == 7 131 | assert len(wl_graph.labels) == 1 132 | assert len(wl_graph.labels[0]) == len(wl_graph.nodes) + len(wl_graph.edges) 133 | 134 | 135 | def test_wl_relabel(): 136 | rdf_graph = rdflib.Graph().parse(example_data, format='turtle') 137 | triples = [(str(s), str(p), str(o)) for s, p, o in rdf_graph] 138 | wl_graph_a1 = wlkernel.WLGraph(triples, 'A1', 4) 139 | wl_graph_b1 = wlkernel.WLGraph(triples, 'B1', 4) 140 | 141 | uniq_labels_a1_0 = set(wl_graph_a1.labels[0].values()) 142 | uniq_labels_b1_0 = set(wl_graph_b1.labels[0].values()) 143 | 144 | wlkernel.wl_relabel([wl_graph_a1, wl_graph_b1]) 145 | uniq_labels_a1_1 = set(wl_graph_a1.labels[1].values()) 146 | uniq_labels_b1_1 = set(wl_graph_b1.labels[1].values()) 147 | assert len(wl_graph_a1.labels) == len(wl_graph_b1.labels) == 2 148 | assert len(uniq_labels_a1_0) < len(uniq_labels_a1_1) 149 | assert len(uniq_labels_b1_0) < len(uniq_labels_b1_1) 150 | 151 | wlkernel.wl_relabel([wl_graph_a1, wl_graph_b1]) 152 | uniq_labels_a1_2 = set(wl_graph_a1.labels[2].values()) 153 | uniq_labels_b1_2 = set(wl_graph_b1.labels[2].values()) 154 | assert len(wl_graph_a1.labels) == len(wl_graph_b1.labels) == 3 155 | 156 | wlkernel.wl_relabel([wl_graph_a1, wl_graph_b1]) 157 | uniq_labels_a1_3 = set(wl_graph_a1.labels[3].values()) 158 | uniq_labels_b1_3 = set(wl_graph_b1.labels[3].values()) 159 | assert len(wl_graph_a1.labels) == len(wl_graph_b1.labels) == 4 160 | 161 | 162 | def test_wl_kernel(): 163 | rdf_graph = rdflib.Graph().parse(example_data, format='turtle') 164 | triples = [(str(s), str(p), str(o)) for s, p, o in rdf_graph] 165 | wl_graph_a1 = wlkernel.WLGraph(triples, 'A1', 4) 166 | wl_graph_b1 = wlkernel.WLGraph(triples, 'B1', 4) 167 | 168 | assert wlkernel.wl_kernel(wl_graph_a1, wl_graph_b1) == 11*1 169 | assert 
wlkernel.wl_kernel(wl_graph_a1, wl_graph_b1, 1) == 11*0.5 + 4*1 170 | 171 | 172 | def test_wl_kernel_matrix(): 173 | rdf_graph = rdflib.Graph().parse(example_data, format='turtle') 174 | triples = [(str(s), str(p), str(o)) for s, p, o in rdf_graph] 175 | wl_graph_a1 = wlkernel.WLGraph(triples, 'A1', 4) 176 | wl_graph_b1 = wlkernel.WLGraph(triples, 'B1', 4) 177 | wl_graph_a2 = wlkernel.WLGraph(triples, 'A2', 4) 178 | 179 | kernel_matrix = wlkernel.wl_kernel_matrix( 180 | [wl_graph_a1, wl_graph_b1, wl_graph_a2], iterations=1 181 | ) 182 | 183 | assert len(kernel_matrix) == len(kernel_matrix[0]) == 3 184 | assert kernel_matrix[0][1] == wlkernel.wl_kernel( 185 | wl_graph_a1, wl_graph_b1, iterations=1 186 | ) 187 | assert kernel_matrix[0][2] == wlkernel.wl_kernel( 188 | wl_graph_a1, wl_graph_a2, iterations=1 189 | ) 190 | assert kernel_matrix[1][0] == wlkernel.wl_kernel( 191 | wl_graph_a1, wl_graph_b1, iterations=1 192 | ) 193 | 194 | 195 | def test_wlrdfgraph_depth_0(): 196 | ''' 197 | ###### 198 | # A1 # 199 | ###### 200 | ''' 201 | rdf_graph = rdflib.Graph().parse(example_data, format='turtle') 202 | triples = ((str(s), str(p), str(o)) for s, p, o in rdf_graph) 203 | wlrdf_graph = wlkernel.WLRDFGraph(triples, ['A1'], 0) 204 | assert len(wlrdf_graph.nodes) == 1 205 | assert len(wlrdf_graph.edges) == 0 206 | assert len(wlrdf_graph.labels) == 1 207 | assert len(wlrdf_graph.labels[0]) == 1 208 | assert len(wlrdf_graph.instance_nodes) == 1 209 | assert len(wlrdf_graph.instance_nodes['A1']) == 0 210 | assert len(wlrdf_graph.instance_edges) == 1 211 | assert len(wlrdf_graph.instance_edges['A1']) == 0 212 | 213 | 214 | def test_wlrdfgraph_depth_1(): 215 | r''' 216 | ###### 217 | # A1 # 218 | ###### 219 | / \ 220 | P2 / \ P3 221 | / \ 222 | ##### ##### 223 | # C # # D # 224 | ##### ##### 225 | ''' 226 | rdf_graph = rdflib.Graph().parse(example_data, format='turtle') 227 | triples = ((str(s), str(p), str(o)) for s, p, o in rdf_graph) 228 | wlrdf_graph = 
wlkernel.WLRDFGraph(triples, ['A1'], 1) 229 | assert len(wlrdf_graph.nodes) == 3 230 | assert len(wlrdf_graph.edges) == 2 231 | assert len(wlrdf_graph.labels) == 1 232 | assert len(wlrdf_graph.labels[0]) == 5 233 | assert len(wlrdf_graph.instance_nodes) == 1 234 | assert len(wlrdf_graph.instance_nodes['A1']) == 2 235 | assert len(wlrdf_graph.instance_edges) == 1 236 | assert len(wlrdf_graph.instance_edges['A1']) == 2 237 | 238 | 239 | def test_wlrdfgraph_depth_2(): 240 | r''' 241 | ###### 242 | # A1 # 243 | ###### 244 | / \ 245 | P2 / \ P3 246 | / \ 247 | ##### ##### 248 | # C # # D # 249 | ##### ##### 250 | \ / 251 | P4 \ / P4 252 | \ / 253 | ##### 254 | # H # 255 | ##### 256 | ''' 257 | rdf_graph = rdflib.Graph().parse(example_data, format='turtle') 258 | triples = ((str(s), str(p), str(o)) for s, p, o in rdf_graph) 259 | wlrdf_graph = wlkernel.WLRDFGraph(triples, ['A1'], 2) 260 | assert len(wlrdf_graph.nodes) == 4 261 | assert len(wlrdf_graph.edges) == 4 262 | assert len(wlrdf_graph.labels) == 1 263 | assert len(wlrdf_graph.labels[0]) == 8 264 | assert len(wlrdf_graph.instance_nodes) == 1 265 | assert len(wlrdf_graph.instance_nodes['A1']) == 3 266 | assert len(wlrdf_graph.instance_edges) == 1 267 | assert len(wlrdf_graph.instance_edges['A1']) == 4 268 | 269 | 270 | def test_wlrdfgraph_depth_4(): 271 | r''' 272 | ###### 273 | # A1 # 274 | ###### 275 | / \ 276 | P2 / \ P3 277 | / \ 278 | ##### ##### 279 | # C # # D # 280 | ##### ##### 281 | \ / 282 | P4 \ / P4 283 | \ / 284 | ##### 285 | # H # 286 | ##### 287 | | 288 | | P6 289 | | 290 | ###### 291 | # A2 # 292 | ###### 293 | / \ 294 | P3 / \ P2 295 | / \ 296 | ##### ##### 297 | # E # # D # 298 | ##### ##### 299 | ''' 300 | rdf_graph = rdflib.Graph().parse(example_data, format='turtle') 301 | triples = ((str(s), str(p), str(o)) for s, p, o in rdf_graph) 302 | wlrdf_graph = wlkernel.WLRDFGraph(triples, ['A1'], 4) 303 | assert len(wlrdf_graph.nodes) == 6 304 | assert len(wlrdf_graph.edges) == 7 305 | assert 
len(wlrdf_graph.labels) == 1 306 | assert len(wlrdf_graph.labels[0]) == 14 307 | assert len(wlrdf_graph.instance_nodes) == 1 308 | assert len(wlrdf_graph.instance_nodes['A1']) == 5 309 | assert len(wlrdf_graph.instance_edges) == 1 310 | assert len(wlrdf_graph.instance_edges['A1']) == 7 311 | 312 | 313 | def test_wlrdf_relabel(): 314 | rdf_graph = rdflib.Graph().parse(example_data, format='turtle') 315 | triples = ((str(s), str(p), str(o)) for s, p, o in rdf_graph) 316 | wlrdf_graph = wlkernel.WLRDFGraph(triples, ['A1', 'B1'], 4) 317 | 318 | uniq_labels_0 = set(wlrdf_graph.labels[0].values()) 319 | 320 | wlrdf_graph.relabel() 321 | uniq_labels_1 = set(wlrdf_graph.labels[1].values()) 322 | 323 | wlrdf_graph.relabel() 324 | uniq_labels_2 = set(wlrdf_graph.labels[2].values()) 325 | 326 | assert len(wlrdf_graph.labels) == 3 327 | assert len(uniq_labels_0) < len(uniq_labels_1) 328 | assert len(uniq_labels_1) == len(uniq_labels_2) 329 | 330 | 331 | def test_wlrdf_kernel(): 332 | rdf_graph = rdflib.Graph().parse(example_data, format='turtle') 333 | triples = ((str(s), str(p), str(o)) for s, p, o in rdf_graph) 334 | wlrdf_graph = wlkernel.WLRDFGraph(triples, ['A1', 'B1'], 4) 335 | assert wlkernel.wlrdf_kernel(wlrdf_graph, 'A1', 'B1') == 10*1 336 | assert wlkernel.wlrdf_kernel(wlrdf_graph, 'A1', 'B1', 1) == 10*0.5 + 3 337 | 338 | 339 | def test_wlrdf_kernel_matrix(): 340 | rdf_graph = rdflib.Graph().parse(example_data, format='turtle') 341 | triples = ((str(s), str(p), str(o)) for s, p, o in rdf_graph) 342 | wlrdf_graph = wlkernel.WLRDFGraph(triples, ['A1', 'B1'], 4) 343 | 344 | kernel_matrix = wlkernel.wlrdf_kernel_matrix(wlrdf_graph, ['A1', 'B1']) 345 | 346 | assert len(kernel_matrix) == len(kernel_matrix[0]) == 2 347 | assert kernel_matrix[0][1] == wlkernel.wlrdf_kernel( 348 | wlrdf_graph, 'A1', 'B1' 349 | ) 350 | assert kernel_matrix[1][0] == wlkernel.wlrdf_kernel( 351 | wlrdf_graph, 'A1', 'B1' 352 | ) 353 | 
-------------------------------------------------------------------------------- /wlkernel/.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /wlkernel/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /wlkernel/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /wlkernel/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /wlkernel/.idea/wlkernel.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /wlkernel/.idea/workspace.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 49 | 50 | 55 | 56 | 57 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 |