├── .gitignore
├── LICENSE
├── README.md
├── kgsearch.gif
├── kgsearch
│   ├── __init__.py
│   ├── __version__.py
│   ├── app
│   │   ├── __init__.py
│   │   └── app.py
│   ├── data
│   │   ├── .gitkeep
│   │   ├── data.csv
│   │   └── metadata.json
│   └── web
│       ├── app.html
│       └── style.css
├── requirements.txt
└── setup.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | *.pkl
7 |
8 | *.DS_Store
9 |
10 | # C extensions
11 | *.so
12 |
13 | # Distribution / packaging
14 | .Python
15 | build/
16 | develop-eggs/
17 | dist/
18 | downloads/
19 | eggs/
20 | .eggs/
21 | lib/
22 | lib64/
23 | parts/
24 | sdist/
25 | var/
26 | wheels/
27 | share/python-wheels/
28 | *.egg-info/
29 | .installed.cfg
30 | *.egg
31 | MANIFEST
32 |
33 | # PyInstaller
34 | # Usually these files are written by a python script from a template
35 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
36 | *.manifest
37 | *.spec
38 |
39 | # Installer logs
40 | pip-log.txt
41 | pip-delete-this-directory.txt
42 |
43 | # Unit test / coverage reports
44 | htmlcov/
45 | .tox/
46 | .nox/
47 | .coverage
48 | .coverage.*
49 | .cache
50 | nosetests.xml
51 | coverage.xml
52 | *.cover
53 | *.py,cover
54 | .hypothesis/
55 | .pytest_cache/
56 | cover/
57 |
58 | # Translations
59 | *.mo
60 | *.pot
61 |
62 | # Django stuff:
63 | *.log
64 | local_settings.py
65 | db.sqlite3
66 | db.sqlite3-journal
67 |
68 | # Flask stuff:
69 | instance/
70 | .webassets-cache
71 |
72 | # Scrapy stuff:
73 | .scrapy
74 |
75 | # Sphinx documentation
76 | docs/_build/
77 |
78 | # PyBuilder
79 | .pybuilder/
80 | target/
81 |
82 | # Jupyter Notebook
83 | .ipynb_checkpoints
84 |
85 | # IPython
86 | profile_default/
87 | ipython_config.py
88 |
89 | # pyenv
90 | # For a library or package, you might want to ignore these files since the code is
91 | # intended to run in multiple environments; otherwise, check them in:
92 | # .python-version
93 |
94 | # pipenv
95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
98 | # install all needed dependencies.
99 | #Pipfile.lock
100 |
101 | # poetry
102 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
103 | # This is especially recommended for binary packages to ensure reproducibility, and is more
104 | # commonly ignored for libraries.
105 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
106 | #poetry.lock
107 |
108 | # pdm
109 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
110 | #pdm.lock
111 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
112 | # in version control.
113 | # https://pdm.fming.dev/#use-with-ide
114 | .pdm.toml
115 |
116 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
117 | __pypackages__/
118 |
119 | # Celery stuff
120 | celerybeat-schedule
121 | celerybeat.pid
122 |
123 | # SageMath parsed files
124 | *.sage.py
125 |
126 | # Environments
127 | .env
128 | .venv
129 | env/
130 | venv/
131 | ENV/
132 | env.bak/
133 | venv.bak/
134 |
135 | # Spyder project settings
136 | .spyderproject
137 | .spyproject
138 |
139 | # Rope project settings
140 | .ropeproject
141 |
142 | # mkdocs documentation
143 | /site
144 |
145 | # mypy
146 | .mypy_cache/
147 | .dmypy.json
148 | dmypy.json
149 |
150 | # Pyre type checker
151 | .pyre/
152 |
153 | # pytype static type analyzer
154 | .pytype/
155 |
156 | # Cython debug symbols
157 | cython_debug/
158 |
159 | # PyCharm
160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
162 | # and can be added to the global gitignore or merged into this file. For a more nuclear
163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
164 | #.idea/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Raphael Sourty
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # KGSearch
3 |
4 |
5 | 
6 |
7 | KGSearch is a minimalist tool for searching and viewing entities in a graph and is dedicated to a local environment. The application provides a Python client with three distinct terminal commands: `add, start, open`. The application default proposes to search through the knowledge graph [Countries](https://www.aaai.org/ocs/index.php/SSS/SSS15/paper/view/10257/10026). You can explore the borders we must cross to get from one country to another and see how small the 🌍 is.
8 |
9 | ## Installation
10 |
11 | ```sh
12 | pip install git+https://github.com/raphaelsty/kgsearch
13 | ```
14 |
15 | ## ✅ Quick Start
16 |
17 | The `start` command starts the API and opens the user interface:
18 |
19 | ```sh
20 | kg start
21 | ```
22 |
23 | ## ⭐️ Query
24 |
25 | KGSearch suggests performing multiple queries via the `;` separator.
26 |
27 | The query `france;germany;china` will be divided into three subqueries `france`, `germany`, and `china` to visualize the interactions between the entities of our choice.
28 |
29 | The `top K` field allows selecting the number of candidate entities retrieved by the search engine (1 by default).
30 |
31 | The `neighbours` field selects the number of neighbors to be displayed (1 by default).
32 |
33 | The `prune` field removes entities that have fewer than `prune` connections to other entities (1 by default).
34 |
35 | ## 🤖 Custom KG
36 |
37 | We can add our graph to KGSearch via the command:
38 |
39 | ```sh
40 | kg add -f data.csv
41 | ```
42 |
43 | The graph must be saved in CSV format and structured as triples (head, relation, tail) with a comma separator and without column names. Here is an example of a compatible CSV file:
44 |
45 | ```sh
46 | senegal,neighbor,gambia
47 | senegal,neighbor,mauritania
48 | senegal,neighbor,mali
49 | senegal,neighbor,guinea-bissau
50 | senegal,neighbor,guinea
51 | ```
52 |
53 | We can also add custom metadata for each entity to be displayed in the user interface using `meta -f`:
54 |
55 | ```sh
56 | kg meta -f metadata.json
57 | ```
58 |
59 | where the `metadata.json` file has the label of the entity as a key and a set of characteristics:
60 |
61 | ```json
62 | {
63 | "senegal": {"url": "https://en.wikipedia.org/wiki/senegal"},
64 | "gambia": {"url": "https://en.wikipedia.org/wiki/gambia"},
65 | "mauritania": {"url": "https://en.wikipedia.org/wiki/mauritania"},
66 | "mali": {"url": "https://en.wikipedia.org/wiki/mali"}
67 | }
68 | ```
69 |
70 | ## 📑 Notes
71 |
72 | If you have already started the application, you can reopen a window with the open command:
73 |
74 | ```sh
75 | kg open
76 | ```
77 |
78 | The library [Cherche](https://github.com/raphaelsty/cherche) provides the entity search engine. KGSearch relies on a local flask API. The user interface is developed in React and uses the [3D Force-Directed Graph](https://github.com/vasturiano/3d-force-graph) library.
--------------------------------------------------------------------------------
/kgsearch.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raphaelsty/kgsearch/64dc6117e11ca689d7df8dc6635ec1798d895de3/kgsearch.gif
--------------------------------------------------------------------------------
/kgsearch/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import webbrowser
3 |
4 | import click
5 | from rich import print
6 |
7 | from .app import Search, create_app, save_metadata
8 |
9 | __all__ = ["app"]
10 |
11 | path = os.path.abspath(os.path.dirname(__file__))
12 |
13 |
@click.command("start", short_help="Start the app")
@click.argument("arg", type=str)
@click.option("-f", help="Csv file with triples.")
def start(arg, f):
    """CLI entry point: dispatch on ARG (start, add, meta or open).

    `-f` supplies the input file for the `add` (triples CSV) and `meta`
    (metadata JSON) sub-commands; it is unused by `start` and `open`.
    """
    # Local file:// URL of the bundled user interface.
    ui_page = os.path.join("file://" + path, "web/app.html")

    if arg == "open":
        # Reopen a browser window on an already-running API.
        print("😎 Opening web.")
        webbrowser.open(ui_page)
        return

    if arg == "add":
        # Index the user's triples and persist the Search object.
        Search(file=f).save(path=os.path.join(path, "data/search.pkl"))
        return

    if arg == "meta":
        # Copy the user's metadata JSON into the package data folder.
        save_metadata(origin=f, source=os.path.join(path, "data/metadata.json"))
        return

    if arg == "start":
        # Busy ports can be inspected with `lsof -i:9200` / `lsof -i:5000`
        # and freed with `kill -9`.
        app = create_app()

        print("🎉 Starting the app.")
        webbrowser.open(ui_page)
        app.run()
39 |
--------------------------------------------------------------------------------
/kgsearch/__version__.py:
--------------------------------------------------------------------------------
# Semantic version kept as a tuple so components stay individually comparable.
VERSION = (0, 0, 1)

# Human-readable "major.minor.patch" string derived from VERSION.
__version__ = "{}.{}.{}".format(*VERSION)
4 |
--------------------------------------------------------------------------------
/kgsearch/app/__init__.py:
--------------------------------------------------------------------------------
1 | from .app import Search, create_app, save_metadata
2 |
3 | __all__ = ["Search", "create_app", "save_metadata"]
4 |
--------------------------------------------------------------------------------
/kgsearch/app/app.py:
--------------------------------------------------------------------------------
1 | import collections
2 | import json
3 | import os
4 | import pathlib
5 | import pickle
6 | from functools import lru_cache
7 |
8 | import pandas as pd
9 | from cherche import retrieve
10 | from flask import Flask
11 | from flask_cors import CORS, cross_origin
12 | from sklearn.feature_extraction.text import TfidfVectorizer
13 |
14 | __all__ = ["Search", "create_app", "save_metadata"]
15 |
16 |
def save_metadata(origin, source):
    """Export metadata to the library.

    Reads the JSON file at `origin` and rewrites it, pretty-printed with a
    4-space indent, at `source` (the package's bundled metadata location).
    """
    metadata = json.loads(pathlib.Path(origin).read_text())
    pathlib.Path(source).write_text(json.dumps(metadata, indent=4))
24 |
25 |
class Search:
    """Search over a knowledge graph of (head, relation, tail) triples.

    Builds a character n-gram TF-IDF retriever over the distinct entity
    labels of a triples CSV file, plus adjacency and relation maps used to
    expand matched entities into a neighbourhood subgraph for the UI.
    """

    def __init__(self, file: str) -> None:

        # Palette cycled per query group when colouring neighbour nodes.
        self.colors = ["#00A36C", "#9370DB", "#bbae98", "#7393B3", "#677179", "#318ce7", "#088F8F"]
        # Optional per-entity metadata shown in the UI; set via load_metadata.
        self.metadata = {}

        # Headerless CSV: column 0 = head, 1 = relation, 2 = tail.
        triples = pd.read_csv(file, header=None, sep=",")

        # One document per distinct entity label (heads and tails merged);
        # the positional index after reset_index becomes the document key.
        documents = [
            {"key": key, "label": label}
            for key, label in pd.concat([triples[0], triples[2]], axis="rows")
            .drop_duplicates(keep="first")
            .reset_index(drop=True)
            .to_dict()
            .items()
        ]

        # Character (3..7)-gram TF-IDF retriever over labels; "+ documents"
        # is cherche pipeline syntax mapping retrieved keys back to the full
        # documents.
        self.retriever = (
            retrieve.TfIdf(
                key="key",
                on="label",
                documents=documents,
                tfidf=TfidfVectorizer(lowercase=True, ngram_range=(3, 7), analyzer="char"),
                k=30,
            )
            + documents
        )

        # Undirected adjacency: entity -> tuple of neighbours (both directions).
        self.triples = collections.defaultdict(tuple)
        # Relation labels for a directed edge, keyed "head_tail".
        self.relations = collections.defaultdict(list)

        for h, r, t in triples.to_records(index=False).tolist():
            self.triples[h] += tuple([t])
            self.triples[t] += tuple([h])
            self.relations[f"{h}_{t}"].append(r)

        # explore is memoised; drop entries computed for a previous graph.
        self.explore.cache_clear()

    def save(self, path):
        """Pickle this Search object to `path` and return self."""
        with open(path, "wb") as f:
            pickle.dump(self, f)
        return self

    def load_metadata(self, path):
        """Load the entity metadata JSON at `path` and return self."""
        with open(path, "r") as f:
            self.metadata = json.load(f)
        return self

    # NOTE(review): lru_cache on an instance method keys on `self` and keeps
    # the instance alive while entries are cached; __init__ clears the cache
    # whenever a new graph is loaded, which mitigates stale results.
    @lru_cache(maxsize=10000)
    def explore(self, entities, neighbours, entity, depth, max_depth):
        """Depth-first walk collecting (entity, neighbour) edge pairs up to max_depth hops."""
        depth += 1

        for neighbour in neighbours:

            # Tuples (not lists) keep every argument hashable for lru_cache.
            entities += tuple([tuple([entity, neighbour])])

            if depth < max_depth:

                entities = self.explore(
                    entities=entities,
                    neighbours=self.triples.get(neighbour, tuple([])),
                    entity=neighbour,
                    depth=depth,
                    max_depth=max_depth,
                )

        return entities

    def __call__(self, query: str, k: int, n: int, p: int):
        """Answer `query` with a force-graph payload {"nodes": [...], "links": [...]}.

        Parameters
        ----------
        query: one or several sub-queries separated by ";".
        k: number of candidate entities retrieved per sub-query.
        n: neighbourhood depth (hops) explored around each candidate.
        p: prune threshold; when > 1, nodes/links with fewer than p
           connections are dropped from the result.
        """
        nodes, links = [], []
        entities, h_r_t = {}, {}
        prune = collections.defaultdict(int)

        # Retrieve the top-k entities of every sub-query, de-duplicated
        # across sub-queries by label.
        candidates, seen = [], {}
        for q in query.split(";"):
            answer = self.retriever(q.strip())[: int(k)]
            for candidate in answer:
                if candidate["label"] not in seen:
                    candidates.append(candidate)
                    seen[candidate["label"]] = True

        # Seed nodes: the matched entities themselves, highlighted in
        # carmine red and bold.
        for group, e in enumerate(candidates):

            e = e["label"]

            nodes.append(
                {
                    "id": e,
                    "group": group,
                    "color": "#960018",
                    "fontWeight": "bold",
                    "metadata": self.metadata.get(e, {}),
                }
            )

            entities[e] = True

        # Expand each seed into its n-hop neighbourhood, emitting one node
        # per newly seen entity (coloured by query group).
        for group, e in enumerate(candidates):

            e = e["label"]
            color = self.colors[group % len(self.colors)]
            match = self.explore(
                entities=tuple([]), neighbours=self.triples[e], entity=e, depth=0, max_depth=n
            )

            for h, t in list(match):

                if h not in entities:
                    nodes.append(
                        {
                            "id": h,
                            "group": group,
                            "color": color,
                            "metadata": self.metadata.get(h, {}),
                        }
                    )
                    entities[h] = True

                if t not in entities:
                    nodes.append(
                        {
                            "id": t,
                            "group": group,
                            "color": color,
                            "metadata": self.metadata.get(t, {}),
                        }
                    )
                    entities[t] = True

                # One link per relation label and direction, skipping
                # duplicates; degrees are counted for pruning below.
                for r in self.relations[f"{h}_{t}"]:
                    if f"{h}_{r}_{t}" not in h_r_t:
                        links.append({"source": h, "target": t, "value": 1, "relation": r})
                        h_r_t[f"{h}_{r}_{t}"] = True
                        prune[h] += 1
                        prune[t] += 1

                for r in self.relations[f"{t}_{h}"]:
                    if f"{t}_{r}_{h}" not in h_r_t:
                        links.append({"source": t, "relation": r, "target": h, "value": 1})
                        h_r_t[f"{t}_{r}_{h}"] = True
                        prune[h] += 1
                        prune[t] += 1
        # Prune: keep only nodes/links whose endpoints have >= p connections.
        if p > 1:
            links = [
                link for link in links if prune[link["source"]] >= p and prune[link["target"]] >= p
            ]

            nodes = [node for node in nodes if prune[node["id"]] >= p]

        return {"nodes": nodes, "links": links}
181 |
182 |
183 | def create_app():
184 | app = Flask(__name__)
185 | app.config["JSONIFY_PRETTYPRINT_REGULAR"] = True
186 | app.config["CORS_HEADERS"] = "Content-Type"
187 | CORS(app, resources={r"/search/*": {"origins": "*"}})
188 |
189 | @app.route("/search///