├── pycleora ├── py.typed ├── .gitignore ├── __init__.py └── pycleora.pyi ├── legacy ├── docs │ ├── requirements.txt │ ├── source │ │ ├── _static │ │ │ ├── cleora_logo.png │ │ │ ├── cleora-columns.png │ │ │ ├── cleora-sparse-matrix.png │ │ │ ├── hypergraph-expansion.png │ │ │ └── hypergraph-expansion-for-each-hyperedge.png │ │ ├── examples.rst │ │ ├── index.rst │ │ ├── algorithms.rst │ │ ├── conf.py │ │ ├── why_cleora.rst │ │ ├── graph_creation.rst │ │ └── running.rst │ ├── Makefile │ └── make.bat ├── files │ ├── samples │ │ ├── edgelist_2.tsv │ │ ├── edgelist_1.tsv │ │ └── edgelist_2.json │ └── images │ │ ├── cleora.png │ │ └── figure_1.png ├── .cargo │ └── config ├── src │ ├── lib.rs │ ├── configuration.rs │ ├── pipeline.rs │ ├── main.rs │ ├── persistence.rs │ ├── embedding.rs │ └── entity.rs ├── Cargo.toml ├── .github │ └── workflows │ │ ├── ci.yml │ │ └── release.yml ├── CHANGELOG.md ├── tests │ └── snapshot.rs ├── benches │ └── cleora_benchmark.rs └── example_classification.ipynb ├── files ├── samples │ ├── edgelist_2.tsv │ ├── edgelist_1.tsv │ └── edgelist_2.json └── images │ ├── cleora.png │ └── figure_1.png ├── .cargo └── config ├── .gitignore ├── examples ├── predefined_cleora_loop.py ├── column_indices.py ├── graph_pickle.py ├── cleora_loop.py └── from_iterator.py ├── pyproject.toml ├── Cargo.toml ├── LICENSE ├── src ├── embedding.rs ├── configuration.rs ├── sparse_matrix.rs ├── entity.rs ├── pipeline.rs ├── lib.rs └── sparse_matrix_builder.rs ├── CHANGELOG.md ├── tests └── snapshot.rs ├── benches └── cleora_benchmark.rs ├── .github └── workflows │ └── CI.yml └── README.md /pycleora/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /legacy/docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx==3.5.3 2 | -------------------------------------------------------------------------------- /files/samples/edgelist_2.tsv: -------------------------------------------------------------------------------- 1 | u1 p1 p2 b1 b2 2 | u2 p2 p3 p4 b1 -------------------------------------------------------------------------------- /legacy/files/samples/edgelist_2.tsv: -------------------------------------------------------------------------------- 1 | u1 p1 p2 b1 b2 2 | u2 p2 p3 p4 b1 -------------------------------------------------------------------------------- /pycleora/.gitignore: -------------------------------------------------------------------------------- 1 | cleora.cpython-310-x86_64-linux-gnu.so 2 | __pycache__ -------------------------------------------------------------------------------- /.cargo/config: -------------------------------------------------------------------------------- 1 | [target.x86_64-unknown-linux-musl] 2 | linker = "x86_64-linux-musl-gcc" 3 | -------------------------------------------------------------------------------- /files/samples/edgelist_1.tsv: -------------------------------------------------------------------------------- 1 | a ba bac abb r rrr rr 2 | a ab bca bcc rr r 3 | ba ab a aa abb r rrr -------------------------------------------------------------------------------- /legacy/.cargo/config: -------------------------------------------------------------------------------- 1 | [target.x86_64-unknown-linux-musl] 2 | linker = "x86_64-linux-musl-gcc" 3 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | .DS_Store 2 | /target 3 | **/*.rs.bk 4 | .idea 5 | cleora-light-rust.iml 6 | *.out 7 | *.so -------------------------------------------------------------------------------- /files/images/cleora.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BaseModelAI/cleora/HEAD/files/images/cleora.png -------------------------------------------------------------------------------- /legacy/files/samples/edgelist_1.tsv: -------------------------------------------------------------------------------- 1 | a ba bac abb r rrr rr 2 | a ab bca bcc rr r 3 | ba ab a aa abb r rrr -------------------------------------------------------------------------------- /files/images/figure_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BaseModelAI/cleora/HEAD/files/images/figure_1.png -------------------------------------------------------------------------------- /legacy/files/images/cleora.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BaseModelAI/cleora/HEAD/legacy/files/images/cleora.png -------------------------------------------------------------------------------- /legacy/files/images/figure_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BaseModelAI/cleora/HEAD/legacy/files/images/figure_1.png -------------------------------------------------------------------------------- /legacy/docs/source/_static/cleora_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BaseModelAI/cleora/HEAD/legacy/docs/source/_static/cleora_logo.png -------------------------------------------------------------------------------- /legacy/docs/source/_static/cleora-columns.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BaseModelAI/cleora/HEAD/legacy/docs/source/_static/cleora-columns.png -------------------------------------------------------------------------------- /legacy/docs/source/_static/cleora-sparse-matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BaseModelAI/cleora/HEAD/legacy/docs/source/_static/cleora-sparse-matrix.png -------------------------------------------------------------------------------- /legacy/docs/source/_static/hypergraph-expansion.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BaseModelAI/cleora/HEAD/legacy/docs/source/_static/hypergraph-expansion.png -------------------------------------------------------------------------------- /legacy/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod configuration; 2 | pub mod embedding; 3 | pub mod entity; 4 | pub mod persistence; 5 | pub mod pipeline; 6 | pub mod sparse_matrix; 7 | -------------------------------------------------------------------------------- /files/samples/edgelist_2.json: -------------------------------------------------------------------------------- 1 | {"users": "u1", "products": ["p1", "p2"], "brands": ["b1", "b2"]} 2 | {"users": "u2", "products": ["p2", "p3", "p4"], "brands": ["b1"]} 3 | 
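A minimal usage sketch (not a file from the repository): it shows how hyperedge lines shaped like the files/samples/edgelist_2.tsv sample could be embedded with the pycleora API declared in pycleora/pycleora.pyi. Only the users and products fields are used so that the column specification defines a single relation, and the dimension, seed and iteration values are illustrative assumptions.

```python
# Illustrative sketch based on examples/cleora_loop.py and examples/column_indices.py.
import numpy as np
from pycleora import SparseMatrix

# Tab-separated columns; "products" is a complex column (space-separated identifiers).
hyperedges = [
    "u1\tp1 p2",
    "u2\tp2 p3 p4",
]

graph = SparseMatrix.from_iterator((e for e in hyperedges), "users complex::products")

embeddings = graph.initialize_deterministically(feature_dim=8, seed=0)
for _ in range(3):
    embeddings = graph.left_markov_propagate(embeddings)
    # L2-normalize after every propagation step, as the bundled examples do.
    embeddings /= np.linalg.norm(embeddings, ord=2, axis=-1, keepdims=True)

print(graph.entity_ids)
print(embeddings.shape)
```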
-------------------------------------------------------------------------------- /legacy/files/samples/edgelist_2.json: -------------------------------------------------------------------------------- 1 | {"users": "u1", "products": ["p1", "p2"], "brands": ["b1", "b2"]} 2 | {"users": "u2", "products": ["p2", "p3", "p4"], "brands": ["b1"]} 3 | -------------------------------------------------------------------------------- /legacy/docs/source/_static/hypergraph-expansion-for-each-hyperedge.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BaseModelAI/cleora/HEAD/legacy/docs/source/_static/hypergraph-expansion-for-each-hyperedge.png -------------------------------------------------------------------------------- /pycleora/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .pycleora import SparseMatrix 4 | 5 | def embed_using_baseline_cleora(graph, feature_dim: int, iter: int): 6 | embeddings = graph.initialize_deterministically(feature_dim) 7 | for i in range(iter): 8 | embeddings = graph.left_markov_propagate(embeddings) 9 | embeddings /= np.linalg.norm(embeddings, ord=2, axis=-1, keepdims=True) 10 | return embeddings -------------------------------------------------------------------------------- /examples/predefined_cleora_loop.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | from pycleora import embed_using_baseline_cleora, SparseMatrix 4 | 5 | start_time = time.time() 6 | graph = SparseMatrix.from_files(["perf_inputs/0.tsv", "perf_inputs/1.tsv", "perf_inputs/2.tsv", "perf_inputs/3.tsv", "perf_inputs/4.tsv", "perf_inputs/5.tsv", "perf_inputs/6.tsv", "perf_inputs/7.tsv"], "complex::reflexive::name") 7 | embeddings = embed_using_baseline_cleora(graph, 128, 3) 8 | print(f"Took {time.time() - start_time} seconds ") 9 | -------------------------------------------------------------------------------- /examples/column_indices.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pycleora import SparseMatrix 3 | 4 | hyperedges = [ 5 | 'a\t1', 6 | 'a\t2', 7 | 'b\t5', 8 | 'b\t2', 9 | 'c\t8', 10 | ] 11 | 12 | graph = SparseMatrix.from_iterator((e for e in hyperedges), "char num") 13 | 14 | entity_ids = np.array(graph.entity_ids) 15 | print(entity_ids) 16 | print(graph.entity_degrees) 17 | 18 | print(graph.get_entity_column_mask('char')) 19 | print(graph.get_entity_column_mask('num')) 20 | 21 | print(entity_ids[graph.get_entity_column_mask('char')]) 22 | print(entity_ids[graph.get_entity_column_mask('num')]) -------------------------------------------------------------------------------- /legacy/docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /examples/graph_pickle.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import numpy as np 4 | from pycleora import SparseMatrix 5 | 6 | import pickle 7 | 8 | start_time = time.time() 9 | 10 | graph = SparseMatrix.from_files(["perf_inputs/0.tsv"], "complex::reflexive::name") 11 | 12 | print("Entities n", len(graph.entity_ids)) 13 | print(graph.entity_ids[:10]) 14 | 15 | with open('graph.pkl', 'wb') as f: 16 | pickle.dump(graph, f) 17 | 18 | with open('graph.pkl', 'rb') as f: 19 | graph_reread = pickle.load(f) 20 | 21 | print(graph.entity_ids[:10]) 22 | print(graph_reread.entity_ids[:10]) 23 | 24 | embeddings = graph_reread.initialize_deterministically(feature_dim=128, seed=0) 25 | embeddings = graph_reread.left_markov_propagate(embeddings) 26 | 27 | print(embeddings) -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["maturin>=1.2.3"] 3 | build-backend = "maturin" 4 | 5 | [project] 6 | name = "pycleora" 7 | requires-python = ">=3.7" 8 | classifiers = [ 9 | "Programming Language :: Rust", 10 | "Programming Language :: Python :: Implementation :: CPython", 11 | "Programming Language :: Python :: Implementation :: PyPy", 12 | ] 13 | version = "2.1.0" 14 | description = "Sparse hypergraph structure and Markov propagation for node embeddings, exposed via Python bindings." 15 | readme = { file = "README.md", content-type = "text/markdown" } 16 | authors = [ 17 | { name = "Jacek Dabrowski", email = "jack.dabrowski@synerise.com" } 18 | ] 19 | license = { file = "LICENSE" } 20 | 21 | 22 | [tool.maturin] 23 | features = ["pyo3/extension-module"] -------------------------------------------------------------------------------- /legacy/docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | 	set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | 	echo. 18 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | 	echo.installed, then set the SPHINXBUILD environment variable to point 20 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | 	echo.may add the Sphinx directory to PATH. 22 | 	echo.
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /examples/cleora_loop.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import numpy as np 4 | from pycleora import SparseMatrix 5 | 6 | start_time = time.time() 7 | 8 | # graph = SparseMatrix.from_files(["zaba30_large_5m.tsv"], "basket complex::product", hyperedge_trim_n=16) 9 | graph = SparseMatrix.from_files(["perf_inputs/0.tsv", "perf_inputs/1.tsv", "perf_inputs/2.tsv", "perf_inputs/3.tsv", "perf_inputs/4.tsv", "perf_inputs/5.tsv", "perf_inputs/6.tsv", "perf_inputs/7.tsv"], "complex::reflexive::name") 10 | 11 | print("Entities n", len(graph.entity_ids)) 12 | # embeddings = np.random.randn(len(graph.entity_ids), 128).astype(np.float32) 13 | embeddings = graph.initialize_deterministically(feature_dim=128, seed=0) 14 | 15 | for i in range(3): 16 | embeddings = graph.left_markov_propagate(embeddings) 17 | # embeddings = graph.symmetric_markov_propagate(embeddings) 18 | 19 | embeddings /= np.linalg.norm(embeddings, ord=2, axis=-1, keepdims=True) 20 | print(f"Iter {i} finished") 21 | 22 | print(graph.entity_ids[:10]) 23 | 24 | print(f"Took {time.time() - start_time} seconds ") 25 | -------------------------------------------------------------------------------- /examples/from_iterator.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import numpy as np 4 | from pycleora import SparseMatrix 5 | 6 | start_time = time.time() 7 | 8 | def edges_iterator(): 9 | lines = [] 10 | 11 | files = ["perf_inputs/0.tsv", "perf_inputs/1.tsv", "perf_inputs/2.tsv", "perf_inputs/3.tsv", "perf_inputs/4.tsv", "perf_inputs/5.tsv", "perf_inputs/6.tsv", "perf_inputs/7.tsv"] 12 | for file in files: 13 | with open(file, 'rt') as f: 14 | lines.extend(f) 15 | 16 | iteration_start_time = time.time() 17 | for line in lines: 18 | yield line 19 | print(f"Iteration took {time.time() - iteration_start_time} seconds ") 20 | 21 | graph = SparseMatrix.from_iterator(edges_iterator(), "complex::reflexive::product") 22 | 23 | print("Entities n", len(graph.entity_ids)) 24 | print(graph.entity_ids[:10]) 25 | 26 | embeddings = np.random.randn(len(graph.entity_ids), 256).astype(np.float32) 27 | 28 | for i in range(3): 29 | embeddings = graph.left_markov_propagate(embeddings) 30 | embeddings /= np.linalg.norm(embeddings, ord=2, axis=-1, keepdims=True) 31 | print(f"Iter {i} finished") 32 | 33 | print(f"Took {time.time() - start_time} seconds ") -------------------------------------------------------------------------------- /legacy/docs/source/examples.rst: -------------------------------------------------------------------------------- 1 | .. _examples: 2 | 3 | Use cases examples 4 | ===================== 5 | 6 | .. 
list-table:: 7 | :widths: 40 80 80 8 | :header-rows: 1 9 | 10 | * - Examples 11 | - Description 12 | - Dataset 13 | * - `Classification `_ 14 | - Synerise Cleora Classification Example for Facebook Large Page-Page Network 15 | - `Facebook Large Page-Page Network `_ 16 | * - `Link Prediction `_ 17 | - Synerise Cleora Link Prediction Example for Facebook Large Page-Page Network 18 | - `Facebook Large Page-Page Network `_ 19 | * - `Link Prediction `_ 20 | - Synerise Cleora Link Prediction Example for The Complete Journey 21 | - `The Complete Journey `_ -------------------------------------------------------------------------------- /legacy/docs/source/index.rst: -------------------------------------------------------------------------------- 1 | :github_url: https://github.com/Synerise/cleora 2 | 3 | Synerise Cleora AI Documentation 4 | ===================================== 5 | 6 | **Synerise Cleora AI** is a general-purpose model for efficient, scalable learning of stable and inductive entity embeddings for heterogeneous relational data. **Cleora** embeds entities in n-dimensional spherical spaces utilizing extremely fast, stable, iterative random projections, which allows for unparalleled performance and scalability. 7 | 8 | Types of data which can be embedded include, for example: 9 | 10 | - heterogeneous undirected graphs 11 | - heterogeneous undirected hypergraphs 12 | - text and other categorical array data 13 | - any combination of the above 14 | 15 | Read the whitepaper `Cleora: A Simple, Strong and Scalable Graph Embedding Scheme `_ 16 | 17 | =========================================== 18 | 19 | .. toctree:: 20 | :maxdepth: 2 21 | :caption: Contents: 22 | 23 | graph_creation 24 | algorithms 25 | running 26 | examples 27 | why_cleora 28 | 29 | 30 | 31 | Indices and tables 32 | ================== 33 | 34 | * :ref:`genindex` 35 | * :ref:`modindex` 36 | * :ref:`search` 37 | -------------------------------------------------------------------------------- /legacy/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "cleora" 3 | version = "1.2.3" 4 | authors = ["Piotr Babel ", "Jacek Dabrowski ", "Konrad Goluchowski "] 5 | edition = "2018" 6 | license-file = "LICENSE" 7 | readme = "README.md" 8 | description = """ 9 | Cleora is a general-purpose model for efficient, scalable learning of stable and inductive entity embeddings for heterogeneous relational data.
10 | """ 11 | 12 | [build] 13 | rustflags = ["-C", "target-cpu=native"] 14 | 15 | [dependencies] 16 | bus = "2.2.4" 17 | clap = { version = "3.1.8", features = ["cargo"] } 18 | env_logger = "0.9.0" 19 | log = "0.4.17" 20 | memmap = "0.7.0" 21 | rayon = "1.5.3" 22 | rustc-hash = "1.1.0" 23 | smallvec = "1.8.1" 24 | twox-hash = "1.6.3" 25 | simdjson-rust = {git = "https://github.com/SunDoge/simdjson-rust"} 26 | ryu = "1.0.10" 27 | ndarray = "0.15.4" 28 | ndarray-npy = "0.8.1" 29 | serde_json = "1.0.81" 30 | uuid = { version = "1.1.2", features = ["v4"] } 31 | 32 | [dev-dependencies] 33 | criterion = "0.3.3" 34 | insta = "1.3.0" 35 | 36 | [[bench]] 37 | name = "cleora_benchmark" 38 | harness = false 39 | 40 | [profile.release] 41 | opt-level = 3 42 | lto = true 43 | codegen-units = 1 44 | -------------------------------------------------------------------------------- /pycleora/pycleora.pyi: -------------------------------------------------------------------------------- 1 | from typing import Any, Iterable, Optional, Self 2 | 3 | import numpy as np 4 | from numpy.typing import NDArray 5 | 6 | 7 | class SparseMatrix: 8 | def __new__(cls, *args: Any) -> Self: 9 | pass 10 | 11 | @classmethod 12 | def from_iterator( 13 | cls, hyperedges: Iterable[str], columns: str, hyperedge_trim_n: int = 16, num_workers: Optional[int] = None 14 | ) -> Self: 15 | pass 16 | 17 | @classmethod 18 | def from_files( 19 | cls, filepaths: list[str], columns: str, hyperedge_trim_n: int = 16, num_workers: Optional[int] = None 20 | ) -> Self: 21 | pass 22 | 23 | def left_markov_propagate(self, x: NDArray[np.float32], num_workers: Optional[int] = None) -> NDArray[np.float32]: 24 | pass 25 | 26 | def symmetric_markov_propagate( 27 | self, x: NDArray[np.float32], num_workers: Optional[int] = None 28 | ) -> NDArray[np.float32]: 29 | pass 30 | 31 | def get_entity_column_mask(self, column_name: str) -> NDArray[np.bool]: 32 | pass 33 | 34 | def entity_degrees(self) -> NDArray[np.float32]: 35 | pass 36 | 37 | def initialize_deterministically(self, feature_dim: int, seed: int = 0) -> NDArray[np.float32]: 38 | pass 39 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "pycleora" 3 | version = "2.1.0" 4 | edition = "2018" 5 | license-file = "LICENSE" 6 | readme = "README.md" 7 | documentation = "https://github.com/synerise/cleora" 8 | homepage = "https://github.com/synerise/cleora" 9 | repository = "https://github.com/synerise/cleora" 10 | description = """ 11 | Sparse hypergraph structure and markov-propagation for node embeddings embeddings exposed via Python bindings. 
12 | """ 13 | 14 | [lib] 15 | crate-type = ["cdylib", "rlib"] 16 | 17 | [build] 18 | rustflags = ["-C", "target-cpu=native"] 19 | 20 | [dependencies] 21 | log = "0.4.17" 22 | rayon = "1.5.3" 23 | rustc-hash = "1.1.0" 24 | smallvec = "1.8.1" 25 | twox-hash = "1.6.3" 26 | ndarray = { version = "0.15.4", features = ["rayon"] } 27 | ndarray-npy = "0.8.1" 28 | uuid = { version = "1.1.2", features = ["v4"] } 29 | crossbeam = "0.8.1" 30 | dashmap = { version = "5.3.4", features = ["rayon"] } 31 | num_cpus = "1.13.1" 32 | itertools = "0.10.3" 33 | serde = { version = "1.0.163", features = ["derive"] } 34 | bincode = "1.3.3" 35 | 36 | pyo3 = "0.18.1" 37 | numpy = "0.18" 38 | 39 | [dev-dependencies] 40 | criterion = "0.3.3" 41 | insta = "1.3.0" 42 | ndarray-rand = "0.14.0" 43 | 44 | [[bench]] 45 | name = "cleora_benchmark" 46 | harness = false 47 | 48 | [profile.release] 49 | opt-level = 3 50 | lto = true 51 | codegen-units = 1 52 | 53 | [target.aarch64-apple-darwin] 54 | linker = "aarch64-apple-darwin21.4-clang" 55 | ar = "aarch64-apple-darwin21.4-ar" 56 | rustflags = [ 57 | "-C", "link-arg=-undefined", 58 | "-C", "link-arg=dynamic_lookup", 59 | ] 60 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | SOFTWARE LICENSING 2 | 3 | You are licensed to use Synerise Cleora produced by Synerise SA. under an MIT LICENSE 4 | 5 | Synerise Cleora MIT License 6 | 7 | Copyright (c) 2020 Synerise SA - [entered into the Register of 8 | Entrepreneurs of the National Court Register maintained by the 9 | District Court for Kraków-Śródmieście in Kraków, XI Commercial Division 10 | of the National Court Register with the KRS number 0000468034, 11 | NIP (tax identification number) number 679309 32 92, 12 | share capital in the amount of PLN 556 150,00. paid up in full.] 13 | 14 | Permission is hereby granted, free of charge, to any person obtaining 15 | a copy of this software and associated documentation files (the 16 | "Software"), to deal in the Software without restriction, including 17 | without limitation the rights to use, copy, modify, merge, publish, 18 | distribute, sublicense, and/or sell copies of the Software, and to 19 | permit persons to whom the Software is furnished to do so, subject to 20 | the following conditions: 21 | 22 | The above copyright notice and this permission notice shall be included 23 | in all copies or substantial portions of the Software. 24 | 25 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 26 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 27 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 28 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 29 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 30 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 31 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /legacy/docs/source/algorithms.rst: -------------------------------------------------------------------------------- 1 | .. _algorithms: 2 | 3 | Hypergraph expansion methods 4 | ============================================================= 5 | 6 | 7 | .. 
figure:: _static/hypergraph-expansion.png 8 | :figwidth: 100 % 9 | :width: 100 % 10 | :align: center 11 | :alt: hypergraph expansion 12 | 13 | **Hypergraph Expansion** - Cleora needs to break down all existing hyperedges into edges, as the algorithm relies on the pairwise notion of node transition. Hypergraph-to-graph expansion is done using one of two alternative strategies: 14 | 15 | 16 | Clique Expansion 17 | --------------------------- 18 | 19 | - Each hyperedge is transformed into a clique - a subgraph where each pair of nodes is connected with an edge. Space/time complexity of this approach is: 20 | 21 | .. math:: 22 | 23 | O(|V| \times d + |E| \times k^2) 24 | 25 | where |E| is the number of hyperedges and k is the maximal hyperedge width. 26 | 27 | With cliques the number of created edges can be significant, but this approach guarantees better fidelity to the original hyperedge relationship. We apply this scheme to smaller graphs. 28 | 29 | 30 | .. figure:: _static/hypergraph-expansion-for-each-hyperedge.png 31 | :figwidth: 100 % 32 | :width: 100 % 33 | :align: center 34 | :alt: hypergraph expansion for each hyperedge 35 | 36 | 37 | Star Expansion 38 | --------------------------- 39 | - An extra node is introduced which links to the original nodes contained in a hyperedge. Space/time complexity of this approach is: 40 | 41 | .. math:: 42 | 43 | O((|V| + |E|) \times d + |E| \times k) 44 | 45 | Here we must also count the time and space needed to embed an extra entity for each hyperedge, but we save on the number of created edges, which would be only k for each hyperedge. This approach is suited for large graphs. 46 | -------------------------------------------------------------------------------- /legacy/.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | env: 10 | RUST_BACKTRACE: 1 11 | 12 | jobs: 13 | check: 14 | name: Check 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v2 18 | - uses: actions-rs/toolchain@v1 19 | with: 20 | profile: minimal 21 | toolchain: stable 22 | override: true 23 | - uses: actions-rs/cargo@v1 24 | with: 25 | command: check 26 | 27 | test: 28 | name: Test Suite 29 | runs-on: ubuntu-latest 30 | steps: 31 | - uses: actions/checkout@v2 32 | - uses: actions-rs/toolchain@v1 33 | with: 34 | profile: minimal 35 | toolchain: stable 36 | override: true 37 | - uses: actions-rs/cargo@v1 38 | with: 39 | command: test 40 | 41 | fmt: 42 | name: Rustfmt 43 | runs-on: ubuntu-latest 44 | steps: 45 | - uses: actions/checkout@v2 46 | - uses: actions-rs/toolchain@v1 47 | with: 48 | profile: minimal 49 | toolchain: stable 50 | override: true 51 | - run: rustup component add rustfmt 52 | - uses: actions-rs/cargo@v1 53 | with: 54 | command: fmt 55 | args: --all -- --check 56 | 57 | clippy: 58 | name: Clippy 59 | runs-on: ubuntu-latest 60 | steps: 61 | - uses: actions/checkout@v2 62 | - uses: actions-rs/toolchain@v1 63 | with: 64 | profile: minimal 65 | toolchain: stable 66 | override: true 67 | - run: rustup component add clippy 68 | - uses: actions-rs/cargo@v1 69 | with: 70 | command: clippy 71 | args: -- -D warnings 72 | -------------------------------------------------------------------------------- /legacy/docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder.
2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'Synerise Cleora' 21 | copyright = '2021, Synerise' 22 | author = 'Synerise' 23 | 24 | # The full version, including alpha/beta/rc tags 25 | release = '1.1.0' 26 | 27 | 28 | # -- General configuration --------------------------------------------------- 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 32 | # ones. 33 | extensions = [ 34 | "sphinx.ext.intersphinx", 35 | "sphinx.ext.autodoc", 36 | "sphinx.ext.mathjax", 37 | "sphinx.ext.viewcode", 38 | ] 39 | 40 | 41 | # Add any paths that contain templates here, relative to this directory. 42 | templates_path = ['_templates'] 43 | 44 | # List of patterns, relative to source directory, that match files and 45 | # directories to ignore when looking for source files. 46 | # This pattern also affects html_static_path and html_extra_path. 47 | exclude_patterns = [] 48 | 49 | 50 | # -- Options for HTML output ------------------------------------------------- 51 | 52 | # The theme to use for HTML and HTML Help pages. See the documentation for 53 | # a list of builtin themes. 54 | # 55 | html_theme = 'alabaster' 56 | # Add any paths that contain custom static files (such as style sheets) here, 57 | # relative to this directory. They are copied after the builtin static files, 58 | # so a file named "default.css" will overwrite the builtin "default.css". 
59 | 60 | # sets the darker appearance 61 | html_theme_options = { 62 | 'style': 'darker' 63 | } 64 | 65 | html_static_path = ['_static'] 66 | html_logo = '_static/cleora_logo.png' 67 | -------------------------------------------------------------------------------- /src/embedding.rs: -------------------------------------------------------------------------------- 1 | use crate::sparse_matrix::Edge; 2 | use crate::sparse_matrix::SparseMatrix; 3 | use ndarray::{Array, Array1, Array2, ArrayView2, Axis}; 4 | use rayon::prelude::*; 5 | use rayon::ThreadPoolBuilder; 6 | 7 | pub enum MarkovType { 8 | Left, 9 | Symmetric, 10 | } 11 | 12 | pub struct NdArrayMatrix; 13 | 14 | impl NdArrayMatrix { 15 | pub fn multiply( 16 | sparse_matrix_reader: &SparseMatrix, 17 | other: ArrayView2<f32>, 18 | markov_type: MarkovType, 19 | num_workers: usize, 20 | ) -> Array2<f32> { 21 | let mut new_matrix: Array2<f32> = Array::zeros(other.raw_dim()); 22 | ThreadPoolBuilder::new() 23 | .num_threads(num_workers) 24 | .build() 25 | .unwrap() 26 | .install(|| { 27 | new_matrix 28 | .axis_iter_mut(Axis(0)) 29 | .into_par_iter() 30 | .zip(sparse_matrix_reader.slices.par_iter()) 31 | .for_each(|(mut row, (start, end))| { 32 | let edges = &sparse_matrix_reader.edges[*start..*end]; 33 | 34 | let new_row: Array1<f32> = edges 35 | .par_iter() 36 | .fold( 37 | || Array1::zeros(other.shape()[1]), 38 | |mut row, edge| { 39 | let Edge { 40 | left_markov_value, 41 | symmetric_markov_value, 42 | other_entity_ix, 43 | } = edge; 44 | let value = match markov_type { 45 | MarkovType::Left => left_markov_value, 46 | MarkovType::Symmetric => symmetric_markov_value, 47 | }; 48 | let other_row = &other.row(*other_entity_ix as usize); 49 | row.scaled_add(*value, other_row); 50 | row 51 | }, 52 | ) 53 | .reduce_with(|v1, v2| v1 + v2) 54 | .expect("Must have at least one edge"); 55 | 56 | row.assign(&new_row); 57 | }); 58 | }); 59 | new_matrix 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /legacy/docs/source/why_cleora.rst: -------------------------------------------------------------------------------- 1 | .. _why-cleora: 2 | 3 | Why is Synerise Cleora worth using? 4 | =============================================== 5 | 6 | Key technical features of Cleora embeddings 7 | -------------------------------------------------------------------------------------------- 8 | 9 | The embeddings produced by Cleora differ from those produced by Node2vec, Word2vec, DeepWalk or other systems in this class in a number of key properties: 10 | 11 | - **efficiency** - Cleora is two orders of magnitude faster than Node2Vec or DeepWalk 12 | - **inductivity** - as Cleora embeddings of an entity are defined only by interactions with other entities, vectors for new entities can be computed on-the-fly 13 | - **updatability** - refreshing a Cleora embedding for an entity is a very fast operation allowing for real-time updates without retraining 14 | - **stability** - all starting vectors for entities are deterministic, which means that Cleora embeddings on similar datasets will end up being similar. Methods like Word2vec, Node2vec or DeepWalk return different results with every run. 15 | - **cross-dataset compositionality** - thanks to the stability of Cleora embeddings, embeddings of the same entity on multiple datasets can be combined by averaging, yielding meaningful vectors 16 | - **dim-wise independence** - thanks to the process producing Cleora embeddings, every dimension is independent of others.
This property allows for efficient and low-parameter method for combining multi-view embeddings with Conv1d layers. 17 | - **extreme parallelism and performance** - Cleora is written in Rust utilizing thread-level parallelism for all calculations except input file loading. In practice this means that the embedding process is often faster than loading the input data. 18 | 19 | Key usability features of Cleora embeddings 20 | -------------------------------------------------------------------------------------------- 21 | 22 | The technical properties described above imply good production-readiness of Cleora, which from the end-user perspective can be summarized as follows: 23 | 24 | - heterogeneous relational tables can be embedded without any artificial data pre-processing 25 | - mixed interaction + text datasets can be embedded with ease 26 | - cold start problem for new entities is non-existent 27 | - real-time updates of the embeddings do not require any separate solutions 28 | - multi-view embeddings work out of the box 29 | - temporal, incremental embeddings are stable out of the box, with no need for re-alignment, rotations or other methods 30 | - extremely large datasets are supported and can be embedded within seconds / minutes 31 | -------------------------------------------------------------------------------- /legacy/.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*' 7 | 8 | jobs: 9 | unix-release: 10 | name: ${{ matrix.target }} 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | matrix: 14 | include: 15 | - os: ubuntu-18.04 16 | target: x86_64-unknown-linux-gnu 17 | 18 | - os: ubuntu-18.04 19 | target: x86_64-unknown-linux-musl 20 | 21 | - os: macos-latest 22 | target: x86_64-apple-darwin 23 | 24 | steps: 25 | - name: Checkout repository 26 | uses: actions/checkout@v2 27 | 28 | - name: Set the version 29 | id: version 30 | run: echo ::set-output name=VERSION::${GITHUB_REF#refs/tags/} 31 | 32 | - name: Install Rust 33 | uses: actions-rs/toolchain@v1 34 | with: 35 | toolchain: stable 36 | profile: minimal 37 | override: true 38 | target: ${{ matrix.target }} 39 | 40 | - name: Build 41 | run: cargo build --release --locked 42 | 43 | - name: Strip binary 44 | run: strip target/release/cleora 45 | 46 | - name: Upload binaries to release 47 | uses: svenstaro/upload-release-action@v1-release 48 | with: 49 | repo_token: ${{ secrets.GITHUB_TOKEN }} 50 | file: target/release/cleora 51 | asset_name: cleora-${{ steps.version.outputs.VERSION }}-${{ matrix.target }} 52 | tag: ${{ github.ref }} 53 | 54 | windows-release: 55 | name: ${{ matrix.target }} 56 | runs-on: ${{ matrix.os }} 57 | strategy: 58 | matrix: 59 | include: 60 | - os: windows-latest 61 | target: x86_64-pc-windows-msvc 62 | 63 | steps: 64 | - name: Checkout repository 65 | uses: actions/checkout@v2 66 | 67 | - name: Set the version 68 | id: version 69 | run: | 70 | $TAG=${env:GITHUB_REF} -replace 'refs/tags/', '' 71 | echo "::set-output name=VERSION::$TAG" 72 | 73 | - name: Install Rust 74 | uses: actions-rs/toolchain@v1 75 | with: 76 | toolchain: stable 77 | profile: minimal 78 | override: true 79 | target: ${{ matrix.target }} 80 | 81 | - name: Build 82 | run: cargo build --release --locked 83 | 84 | - name: Upload binaries to release 85 | uses: svenstaro/upload-release-action@v1-release 86 | with: 87 | repo_token: ${{ secrets.GITHUB_TOKEN }} 88 | file: target/release/cleora.exe 89 | asset_name: cleora-${{ 
steps.version.outputs.VERSION }}-${{ matrix.target }} 90 | tag: ${{ github.ref }} -------------------------------------------------------------------------------- /src/configuration.rs: -------------------------------------------------------------------------------- 1 | use crate::sparse_matrix::SparseMatrixDescriptor; 2 | 3 | #[derive(Debug)] 4 | pub struct Configuration { 5 | pub seed: Option, 6 | pub matrix_desc: SparseMatrixDescriptor, 7 | pub columns: Vec, 8 | pub hyperedge_trim_n: usize, 9 | pub num_workers_graph_building: usize, 10 | } 11 | 12 | #[derive(Debug, Default)] 13 | pub struct Column { 14 | /// Name, header of the column 15 | pub name: String, 16 | 17 | /// The field is composite, containing multiple entity identifiers separated by space 18 | pub complex: bool, 19 | 20 | /// The field is reflexive, which means that it interacts with itself, additional output file is written for every such field 21 | pub reflexive: bool, 22 | } 23 | 24 | /// Extract columns config based on raw strings. 25 | pub fn parse_fields(columns: &str) -> Result, String> { 26 | let cols: Vec<&str> = columns.split(' ').collect(); 27 | 28 | let mut columns: Vec = Vec::new(); 29 | for col in cols { 30 | let parts: Vec<&str> = col.split("::").collect(); 31 | 32 | let column_name: &str; 33 | let mut complex = false; 34 | let mut reflexive = false; 35 | 36 | let parts_len = parts.len(); 37 | if parts_len > 1 { 38 | column_name = *parts.last().unwrap(); 39 | let column_name_idx = parts_len - 1; 40 | for &part in &parts[..column_name_idx] { 41 | if part.eq_ignore_ascii_case("complex") { 42 | complex = true; 43 | } else if part.eq_ignore_ascii_case("reflexive") { 44 | reflexive = true; 45 | } else { 46 | let message = format!("Unrecognized column field modifier: {}", part); 47 | return Err(message); 48 | } 49 | } 50 | } else { 51 | column_name = col; 52 | } 53 | let column = Column { 54 | name: column_name.to_string(), 55 | complex, 56 | reflexive, 57 | }; 58 | columns.push(column); 59 | } 60 | 61 | let columns = validate_column_modifiers(columns)?; 62 | Ok(columns) 63 | } 64 | 65 | fn validate_column_modifiers(cols: Vec) -> Result, String> { 66 | for col in &cols { 67 | // transient::reflexive - this would generate no output 68 | // transient::reflexive::complex - this would generate no output 69 | if col.reflexive && !col.complex { 70 | let message = format!( 71 | "A field cannot be REFLEXIVE but NOT COMPLEX. It does not make sense: {}", 72 | col.name 73 | ); 74 | return Err(message); 75 | } 76 | } 77 | Ok(cols) 78 | } 79 | -------------------------------------------------------------------------------- /legacy/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # 1.2.3 (June 29, 2022) 2 | 3 | ### Changed 4 | - Bump libs ([#60]). 5 | 6 | [#60]: https://github.com/Synerise/cleora/pull/60 7 | 8 | ### Fixed 9 | - Check for malformed lines in input ([#59]). 10 | 11 | [#59]: https://github.com/Synerise/cleora/pull/59 12 | 13 | 14 | # 1.2.2 (June 24, 2022) 15 | 16 | ### Changed 17 | - Allow cleora to accept multiple input files as positional args. Named argument 'input' is getting deprecated. 18 | 19 | [#55]: https://github.com/Synerise/cleora/pull/55 20 | 21 | 22 | # 1.2.1 (April 13, 2022) 23 | 24 | ### Changed 25 | - Optimize "--output-format numpy" mode, so it doesn't require additional memory when writing output file ([#50]). 26 | - Bump libs ([#52]). 
27 | 28 | [#50]: https://github.com/Synerise/cleora/pull/50 29 | [#52]: https://github.com/Synerise/cleora/pull/52 30 | 31 | 32 | # 1.2.0 (March 17, 2022) 33 | 34 | ### Added 35 | - Use default hasher for vector init ([#47]). 36 | 37 | [#47]: https://github.com/Synerise/cleora/pull/47 38 | 39 | 40 | # 1.1.1 (May 14, 2021) 41 | 42 | ### Added 43 | - Init embedding with seed during training ([#27]). 44 | 45 | [#27]: https://github.com/Synerise/cleora/pull/27 46 | 47 | 48 | # 1.1.0 (December 23, 2020) 49 | 50 | ### Changed 51 | - Bumped `env_logger` to `0.8.2`, `smallvec` to `1.5.1`, removed `fnv` hasher ([#11]). 52 | 53 | [#11]: https://github.com/Synerise/cleora/pull/11 54 | 55 | ### Added 56 | - Tests (snapshots) for in-memory and memory-mapped files calculations of embeddings ([#12]). 57 | - Support for `NumPy` output format (available via `--output-format` program argument) ([#15]). 58 | - Jupyter notebooks with experiments ([#16]). 59 | 60 | [#12]: https://github.com/Synerise/cleora/pull/12 61 | [#15]: https://github.com/Synerise/cleora/pull/15 62 | [#16]: https://github.com/Synerise/cleora/pull/16 63 | 64 | ### Improved 65 | - Used `vector` for `hash_to_id` mappings, non-allocating cartesian product, `ryu` crate for faster write ([#13]). 66 | - Sparse Matrix refactor (cleanup, simplification, using iter, speedup). Use Cargo.toml data for clap crate ([#17]). 67 | - Unify and simplify embeddings calculation for in-memory and mmap matrices ([#18]). 68 | 69 | [#13]: https://github.com/Synerise/cleora/pull/13 70 | [#17]: https://github.com/Synerise/cleora/pull/17 71 | [#18]: https://github.com/Synerise/cleora/pull/18 72 | 73 | 74 | # 1.0.1 (November 23, 2020) 75 | 76 | ### Fixed 77 | - Skip reading invalid UTF-8 line ([#8]). 78 | - Fix clippy warnings ([#7]). 79 | 80 | [#8]: https://github.com/Synerise/cleora/pull/8 81 | [#7]: https://github.com/Synerise/cleora/pull/7 82 | 83 | ### Added 84 | - JSON support ([#3]). 85 | - Snapshot testing ([#5]). 86 | 87 | [#3]: https://github.com/Synerise/cleora/pull/3 88 | [#5]: https://github.com/Synerise/cleora/pull/5 89 | 90 | 91 | # 1.0.0 (November 6, 2020) 92 | 93 | - Initial release. -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # 2.0.0 (Nov 24, 2024) 2 | 3 | ### New version released 4 | - Python package with rust bindings 5 | - Improved performance, memory usage and concurrency 6 | - Allowing external embedding as seed 7 | - Interoperable with numpy 8 | - Not a standalone console application any more 9 | - old version available in legacy/ folder will not be maintained 10 | 11 | # 1.2.3 (June 29, 2022) 12 | 13 | ### Changed 14 | - Bump libs ([#60]). 15 | 16 | [#60]: https://github.com/Synerise/cleora/pull/60 17 | 18 | ### Fixed 19 | - Check for malformed lines in input ([#59]). 20 | 21 | [#59]: https://github.com/Synerise/cleora/pull/59 22 | 23 | 24 | # 1.2.2 (June 24, 2022) 25 | 26 | ### Changed 27 | - Allow cleora to accept multiple input files as positional args. Named argument 'input' is getting deprecated. 28 | 29 | [#55]: https://github.com/Synerise/cleora/pull/55 30 | 31 | 32 | # 1.2.1 (April 13, 2022) 33 | 34 | ### Changed 35 | - Optimize "--output-format numpy" mode, so it doesn't require additional memory when writing output file ([#50]). 36 | - Bump libs ([#52]). 
37 | 38 | [#50]: https://github.com/Synerise/cleora/pull/50 39 | [#52]: https://github.com/Synerise/cleora/pull/52 40 | 41 | 42 | # 1.2.0 (March 17, 2022) 43 | 44 | ### Added 45 | - Use default hasher for vector init ([#47]). 46 | 47 | [#47]: https://github.com/Synerise/cleora/pull/47 48 | 49 | 50 | # 1.1.1 (May 14, 2021) 51 | 52 | ### Added 53 | - Init embedding with seed during training ([#27]). 54 | 55 | [#27]: https://github.com/Synerise/cleora/pull/27 56 | 57 | 58 | # 1.1.0 (December 23, 2020) 59 | 60 | ### Changed 61 | - Bumped `env_logger` to `0.8.2`, `smallvec` to `1.5.1`, removed `fnv` hasher ([#11]). 62 | 63 | [#11]: https://github.com/Synerise/cleora/pull/11 64 | 65 | ### Added 66 | - Tests (snapshots) for in-memory and memory-mapped files calculations of embeddings ([#12]). 67 | - Support for `NumPy` output format (available via `--output-format` program argument) ([#15]). 68 | - Jupyter notebooks with experiments ([#16]). 69 | 70 | [#12]: https://github.com/Synerise/cleora/pull/12 71 | [#15]: https://github.com/Synerise/cleora/pull/15 72 | [#16]: https://github.com/Synerise/cleora/pull/16 73 | 74 | ### Improved 75 | - Used `vector` for `hash_to_id` mappings, non-allocating cartesian product, `ryu` crate for faster write ([#13]). 76 | - Sparse Matrix refactor (cleanup, simplification, using iter, speedup). Use Cargo.toml data for clap crate ([#17]). 77 | - Unify and simplify embeddings calculation for in-memory and mmap matrices ([#18]). 78 | 79 | [#13]: https://github.com/Synerise/cleora/pull/13 80 | [#17]: https://github.com/Synerise/cleora/pull/17 81 | [#18]: https://github.com/Synerise/cleora/pull/18 82 | 83 | 84 | # 1.0.1 (November 23, 2020) 85 | 86 | ### Fixed 87 | - Skip reading invalid UTF-8 line ([#8]). 88 | - Fix clippy warnings ([#7]). 89 | 90 | [#8]: https://github.com/Synerise/cleora/pull/8 91 | [#7]: https://github.com/Synerise/cleora/pull/7 92 | 93 | ### Added 94 | - JSON support ([#3]). 95 | - Snapshot testing ([#5]). 96 | 97 | [#3]: https://github.com/Synerise/cleora/pull/3 98 | [#5]: https://github.com/Synerise/cleora/pull/5 99 | 100 | 101 | # 1.0.0 (November 6, 2020) 102 | 103 | - Initial release. -------------------------------------------------------------------------------- /src/sparse_matrix.rs: -------------------------------------------------------------------------------- 1 | use crate::configuration::Column; 2 | use pyo3::pyclass; 3 | use serde::{Deserialize, Serialize}; 4 | 5 | pub fn create_sparse_matrix_descriptor( 6 | colums: &Vec, 7 | ) -> Result { 8 | let mut matrices_descs = create_sparse_matrices_descriptors(colums); 9 | if matrices_descs.len() != 1 { 10 | return Err("More than one relation! Adjust your columns so there is only one relation."); 11 | } 12 | Ok(matrices_descs.remove(0)) 13 | } 14 | 15 | /// Creates combinations of column pairs as sparse matrices. 16 | /// Let's say that we have such columns configuration: complex::a reflexive::complex::b c. This is provided 17 | /// as `&[Column]` after parsing the config. 18 | /// The allowed column modifiers are: 19 | /// - transient - the field is virtual - it is considered during embedding process, no entity is written for the column, 20 | /// - complex - the field is composite, containing multiple entity identifiers separated by space, 21 | /// - reflexive - the field is reflexive, which means that it interacts with itself, additional output file is written for every such field. 22 | /// We create sparse matrix for every columns relations (based on column modifiers). 
23 | /// For our example we have: 24 | /// - sparse matrix for column a and b, 25 | /// - sparse matrix for column a and c, 26 | /// - sparse matrix for column b and c, 27 | /// - sparse matrix for column b and b (reflexive column). 28 | /// Apart from column names in sparse matrix we provide indices for incoming data. We have 3 columns such as a, b and c 29 | /// but column b is reflexive so we need to include this column. The result is: (a, b, c, b). 30 | /// The rule is that every reflexive column is append with the order of occurrence to the end of constructed array. 31 | pub fn create_sparse_matrices_descriptors(cols: &Vec) -> Vec { 32 | let mut sparse_matrix_builders: Vec = Vec::new(); 33 | let num_fields = cols.len(); 34 | let mut reflexive_count = 0; 35 | 36 | for i in 0..num_fields { 37 | for j in i..num_fields { 38 | let col_i = &cols[i]; 39 | let col_j = &cols[j]; 40 | if i < j { 41 | let sm = SparseMatrixDescriptor::new( 42 | i as u8, 43 | col_i.name.clone(), 44 | j as u8, 45 | col_j.name.clone(), 46 | ); 47 | sparse_matrix_builders.push(sm); 48 | } else if i == j && col_i.reflexive { 49 | let new_j = num_fields + reflexive_count; 50 | reflexive_count += 1; 51 | let sm = SparseMatrixDescriptor::new( 52 | i as u8, 53 | col_i.name.clone(), 54 | new_j as u8, 55 | col_j.name.clone(), 56 | ); 57 | sparse_matrix_builders.push(sm); 58 | } 59 | } 60 | } 61 | sparse_matrix_builders 62 | } 63 | 64 | #[derive(Debug, Clone, Eq, PartialEq, Serialize, Deserialize)] 65 | pub struct SparseMatrixDescriptor { 66 | /// First column index for which we creates subgraph 67 | pub col_a_id: u8, 68 | 69 | /// First column name 70 | pub col_a_name: String, 71 | 72 | /// Second column index for which we creates subgraph 73 | pub col_b_id: u8, 74 | 75 | /// Second column name 76 | pub col_b_name: String, 77 | } 78 | 79 | #[pyclass(name = "SparseMatrix", module = "cleora")] 80 | #[derive(Debug, Serialize, Deserialize)] 81 | pub struct SparseMatrix { 82 | pub descriptor: SparseMatrixDescriptor, 83 | #[pyo3(get, set)] 84 | pub entity_ids: Vec, 85 | pub entities: Vec, 86 | pub edges: Vec, 87 | /// Maps entities to its edges 88 | /// I-th slice represent edges going out of ith node 89 | /// Example: 90 | /// Given slices=[(0, 4), (4, 10), (10, 11)] 91 | /// edges[0..4] are outgoing edges for entity=0 92 | /// edges[4..10] are outgoing edges for entity=1 93 | /// edges[10..11] are outgoing edges for entity=2 94 | pub slices: Vec<(usize, usize)>, 95 | pub column_ids: Vec, 96 | } 97 | 98 | #[derive(Debug, Clone, Copy, Serialize, Deserialize)] 99 | pub struct Entity { 100 | pub row_sum: f32, 101 | } 102 | 103 | #[derive(Debug, Serialize, Deserialize)] 104 | pub struct Edge { 105 | pub other_entity_ix: u32, 106 | pub left_markov_value: f32, 107 | pub symmetric_markov_value: f32, 108 | } 109 | -------------------------------------------------------------------------------- /tests/snapshot.rs: -------------------------------------------------------------------------------- 1 | #[cfg(test)] 2 | mod tests { 3 | use insta::assert_debug_snapshot; 4 | use ndarray; 5 | use ndarray::{Array, Array2, ArrayBase, Dim, Ix, OwnedRepr}; 6 | use ndarray_rand::rand::rngs::StdRng; 7 | use ndarray_rand::rand::{RngCore, SeedableRng}; 8 | use ndarray_rand::rand_distr::Uniform; 9 | use ndarray_rand::RandomExt; 10 | 11 | use cleora::embedding::{MarkovType, NdArrayMatrix}; 12 | use cleora::sparse_matrix::SparseMatrix; 13 | 14 | fn round(arr: Array2) -> Array2 { 15 | arr.map(|v| (v * 1000.) 
as i32) 16 | } 17 | 18 | #[test] 19 | fn test_markov_left_01() { 20 | let (graph, embeddings) = create_graph_embeddings_complex_reflexive(); 21 | let embedding_out = NdArrayMatrix::multiply(&graph, embeddings.view(), MarkovType::Left, 8); 22 | let embedding_out = round(embedding_out); 23 | assert_debug_snapshot!(embedding_out); 24 | } 25 | 26 | #[test] 27 | fn test_markov_left_02() { 28 | let (graph, embeddings) = create_graph_embeddings_complex_complex(); 29 | let embedding_out = NdArrayMatrix::multiply(&graph, embeddings.view(), MarkovType::Left, 8); 30 | let embedding_out = round(embedding_out); 31 | assert_debug_snapshot!(embedding_out); 32 | } 33 | 34 | #[test] 35 | fn test_markov_sym_01() { 36 | let (graph, embeddings) = create_graph_embeddings_complex_reflexive(); 37 | let embedding_out = 38 | NdArrayMatrix::multiply(&graph, embeddings.view(), MarkovType::Symmetric, 8); 39 | let embedding_out = round(embedding_out); 40 | assert_debug_snapshot!(embedding_out); 41 | } 42 | 43 | #[test] 44 | fn test_markov_sym_02() { 45 | let (graph, embeddings) = create_graph_embeddings_complex_complex(); 46 | let embedding_out = 47 | NdArrayMatrix::multiply(&graph, embeddings.view(), MarkovType::Symmetric, 8); 48 | let embedding_out = round(embedding_out); 49 | assert_debug_snapshot!(embedding_out); 50 | } 51 | 52 | fn create_graph_embeddings_complex_complex( 53 | ) -> (SparseMatrix, ArrayBase, Dim<[Ix; 2]>>) { 54 | let num_embeddings: usize = 100; 55 | let mut rng: StdRng = SeedableRng::seed_from_u64(21_37); 56 | 57 | let mut edges: Vec<_> = vec![]; 58 | for _ in 0..1000 { 59 | let col_1_node_1 = rng.next_u32() % (num_embeddings as u32); 60 | let col_1_node_2 = rng.next_u32() % (num_embeddings as u32); 61 | 62 | let col_2_node_1 = rng.next_u32() % (num_embeddings as u32); 63 | let col_2_node_2 = rng.next_u32() % (num_embeddings as u32); 64 | 65 | edges.push(format!( 66 | "{} {}\t{} {}", 67 | col_1_node_1, col_1_node_2, col_2_node_1, col_2_node_2 68 | )) 69 | } 70 | let edges_ref: Vec<&str> = edges.iter().map(|s| s.as_ref()).collect(); 71 | let graph = SparseMatrix::from_rust_iterator( 72 | "complex::entity_a complex::entity_b", 73 | 16, 74 | edges_ref.into_iter(), 75 | None, 76 | ) 77 | .unwrap(); 78 | 79 | let feature_dim: usize = 32; 80 | 81 | let embeddings = Array::random_using( 82 | (num_embeddings, feature_dim), 83 | Uniform::new(0., 10.), 84 | &mut rng, 85 | ); 86 | (graph, embeddings) 87 | } 88 | 89 | fn create_graph_embeddings_complex_reflexive( 90 | ) -> (SparseMatrix, ArrayBase, Dim<[Ix; 2]>>) { 91 | let num_embeddings: usize = 100; 92 | let mut rng: StdRng = SeedableRng::seed_from_u64(21_37); 93 | 94 | let mut edges: Vec<_> = vec![]; 95 | for _ in 0..1000 { 96 | let node_a = rng.next_u32() % (num_embeddings as u32); 97 | let node_b = rng.next_u32() % (num_embeddings as u32); 98 | edges.push(format!("{} {}", node_a, node_b)) 99 | } 100 | let edges_ref: Vec<&str> = edges.iter().map(|s| s.as_ref()).collect(); 101 | let graph = SparseMatrix::from_rust_iterator( 102 | "reflexive::complex::entity_id", 103 | 16, 104 | edges_ref.into_iter(), 105 | None, 106 | ) 107 | .unwrap(); 108 | 109 | let feature_dim: usize = 32; 110 | 111 | let embeddings = Array::random_using( 112 | (num_embeddings, feature_dim), 113 | Uniform::new(0., 10.), 114 | &mut rng, 115 | ); 116 | (graph, embeddings) 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /legacy/docs/source/graph_creation.rst: 
-------------------------------------------------------------------------------- 1 | .. _graph-creation: 2 | 3 | Graph Creation 4 | ========================= 5 | 6 | Cleora as a tool 7 | ---------------------- 8 | 9 | Cleora is built as a multi-purpose "just embed it" tool, suitable for many different data types and formats. Our tool ingests a relational table of rows representing a typed and undirected heterogeneous hypergraph, which can contain multiple: 10 | 11 | - typed categorical columns 12 | - typed categorical array columns 13 | 14 | Based on the column format specification, Cleora performs: 15 | 16 | - Star decomposition of hyper-edges 17 | - Creation of pairwise graphs for all pairs of entity types 18 | - Embedding of each graph 19 | - The final output of Cleora consists of multiple files for each (undirected) pair of entity types in the table. 20 | 21 | Those embeddings can then be utilized in a novel way thanks to their dim-wise independence property, which is described further below. 22 | 23 | Graph construction 24 | ------------------------ 25 | 26 | 27 | .. figure:: _static/cleora-sparse-matrix.png 28 | :figwidth: 100 % 29 | :width: 100 % 30 | :align: center 31 | :alt: Sparse Matrix 32 | 33 | 34 | 35 | Graph construction starts with the creation of a helper matrix *P* object as a regular 2-D Rust array, which is built according to the selected 36 | expansion method. An example involving clique expansion is presented in Figure - a Cartesian product (all combinations) of all columns is created. 37 | Each entity identifier from the original input file is hashed with `xxhash `_ - a fast and efficient hashing method. 38 | We hash the identifiers to store them in a unified, small data format. From the first line of our example: 39 | 40 | .. math:: 41 | 42 | U1\:P1\:P2\:B1\:B2 43 | 44 | we get 4 combinations produced by the Cartesian product: 45 | 46 | .. math:: 47 | 48 | [4,\:U1hash,\:P1hash,\:B1hash] \\ 49 | [4,\:U1hash,\:P1hash,\:B2hash] \\ 50 | [4,\:U1hash,\:P2hash,\:B1hash] \\ 51 | [4,\:U1hash,\:P2hash,\:B2hash] 52 | 53 | At the beginning we insert the total number of combinations (in this case 4). Then we add another 3 rows representing combinations from the second row of the input. 54 | 55 | Subsequently, for each relation pair from matrix `P` we create a separate matrix `M` as a `SparseMatrix` struct (the matrices `M` will usually hold mostly zeros). 56 | Each matrix `M` object is produced in a separate thread in a stepwise fashion. The rows of matrix `P` object are broadcasted to all matrix `M` objects, 57 | and each matrix `M` object reads the buffer selecting the appropriate values, updating its content. 58 | For example, M3 (users and products) reads the hashes from indexes 1 and 2. After reading the first vector: 59 | 60 | .. math:: 61 | 62 | [4,\:U1hash,\:P1hash,\:B1hash] 63 | 64 | the edge value for **U1hash <-> P1hash** equals 1/4 (1 divided by the total number of Cartesian products). After reading the next vector: 65 | 66 | .. math:: 67 | 68 | [4,\:U1hash,\:P1hash,\:B2hash] 69 | 70 | 71 | the edge value for **U1hash <-> P1hash** updates to 1/2 (1/4 + 1/4). After reading the next two, we finally have: 72 | 73 | **U1hash <-> P1hash** = 1/2 74 | 75 | **U1hash <-> P2hash** = 1/2 76 | 77 | Sparse Matrix 78 | --------------------- 79 | 80 | For maximum efficiency we created a custom implementation of a sparse matrix data structure - the SparseMatrix struct. It follows the sparse matrix coordinate format (COO). 
Its purpose is to save space by holding only the coordinates and values of nonzero entities. 81 | 82 | Embedding is done in 2 basic steps: graph construction and training. 83 | 84 | Let's assume that the basic configuration of the program looks like this: 85 | 86 | .. code-block:: bash 87 | 88 | --input files/samples/edgelist_2.tsv 89 | --columns="users complex::products complex::brands" 90 | --dimension 3 91 | --number-of-iterations 4 92 | 93 | Every SparseMatrix is created based on the program argument **--columns**. For our example, there will be three SparseMatrix'es that will only read data from the columns: 94 | 95 | - users and brands by M1 96 | - products and brands by M2 97 | - users and products by M3 98 | 99 | 100 | Memory consumption 101 | ------------------- 102 | 103 | Every **SparseMatrix** object allocates space for: 104 | 105 | - **|V|** objects, each occupying 40 bytes, 106 | - **2 x |E|** objects (in undirected graphs we need to count an edge in both directions), each occupying 24 bytes. 107 | 108 | 109 | During training we need an additional 110 | 111 | .. math:: 112 | 113 | 114 | 2 \times d \times |V| 115 | 116 | objects, each occupying 4 bytes (this can be avoided by using memory-mapped files, see the `--in-memory-embedding-calculation` argument of the program). 117 | -------------------------------------------------------------------------------- /legacy/docs/source/running.rst: -------------------------------------------------------------------------------- 1 | .. _running: 2 | 3 | Running configuration 4 | ====================== 5 | 6 | This page details how to use the Cleora run command to define the embedding resources at runtime. 7 | 8 | Synopsis 9 | -------- 10 | 11 | **cleora** [*options 1*] <*params 1*> [*options 2*] <*params 2*> ... 12 | 13 | Run options 14 | -------------- 15 | 16 | - input 17 | 18 | Using input param: *--input* or *-i* 19 | 20 | Param description: A parameter that defines the path to the input file. Both absolute and relative paths are accepted. 21 | 22 | 23 | - file type 24 | 25 | Using file type param: *--type* or *-t* 26 | 27 | Param description: This parameter defines the input file format. Cleora supports two kinds of input files: .tsv (tab-separated values) and .json. 28 | 29 | - dimension 30 | 31 | Using dimension param: *--dimension* or *-d* 32 | 33 | Param description: Embedding dimension size. 34 | 35 | - number of iterations 36 | 37 | Using number of iterations param: *--number-of-iterations* or *-n* 38 | 39 | Param description: Set the maximum number of iterations. 40 | 41 | - columns 42 | 43 | Using columns param: *--columns* or *-c* 44 | 45 | Param description: Set column names (max. 12), with modifiers from the list: [transient::, reflexive::, complex::] 46 | 47 | ..
list-table:: 48 | :widths: 20 80 49 | :header-rows: 1 50 | 51 | * - Modifiers 52 | - Description 53 | * - transient 54 | - The field is virtual - it is considered during the embedding process, but no entity is written for the column 55 | * - complex 56 | - The field is composite, containing multiple entity identifiers separated by space in TSV or an array in JSON 57 | * - reflexive 58 | - The field is reflexive, which means that it interacts with itself; an additional output file is written for every such field 59 | * - ignore 60 | - The field is ignored, no output file is written for the field 61 | 62 | 63 | Allowed combinations of modifiers are: 64 | - `transient` 65 | - `complex` 66 | - `transient::complex` 67 | - `reflexive::complex` 68 | 69 | 70 | 71 | For TSV datasets containing composite fields (categorical arrays), multiple items within a field are separated by space. 72 | 73 | The specification of an input format is as follows: 74 | 75 | .. code-block:: none 76 | 77 | --columns="[column modifiers, ::] ..." 78 | 79 | 80 | Combinations which don't make sense are: 81 | 82 | .. list-table:: 83 | :widths: 40 80 84 | :header-rows: 1 85 | 86 | * - Modifiers 87 | - Description 88 | * - reflexive 89 | - This would represent an identity relation 90 | * - transient::reflexive 91 | - This would generate no output 92 | * - reflexive::transient::complex 93 | - This would generate no output 94 | 95 | The picture below shows how the column modifiers work: 96 | 97 | .. figure:: _static/cleora-columns.png 98 | :figwidth: 100 % 99 | :width: 60 % 100 | :align: center 101 | :alt: example use cases of column modifiers 102 | 103 | 104 | - relation name 105 | 106 | Using relation name param: *--relation-name* or *-r* 107 | 108 | Param description: Name of the relation, for output filename generation. 109 | 110 | - prepend field name 111 | 112 | Using prepend field name param: *--prepend-field-name* or *-p* 113 | 114 | Param description: Parameter responsible for prepending the field name to each entity in the output. 115 | 116 | - log every n 117 | 118 | Using log every n param: *--log-every-n* or *-l* 119 | 120 | Param description: Log progress output every N lines. 121 | 122 | - in memory embedding calculation 123 | 124 | Using in memory embedding calculation param: *--in-memory-embedding-calculation* or *-e* 125 | 126 | Param description: Parameter that controls whether embeddings are calculated in memory or with memory-mapped files. In-memory calculation is on by default (*-e 1*); use *-e 0* to switch to memory-mapped files. 127 | 128 | - output dir 129 | 130 | Using output dir param: *--output-dir* or *-o* 131 | 132 | Param description: Set the output directory for files with embeddings. 133 | 134 | - output format 135 | 136 | Using output format param: *-f* 137 | 138 | Param description: A parameter that defines the format of the output file. Possible output formats are textfile (.txt) and numpy (.npy) 139 | 140 | 141 | Example Cleora run configuration 142 | --------------------------------- 143 | 144 | Before running Cleora training for the first time (after downloading the binary from the repository), remember to set the execute permission using *chmod +x*: 145 | 146 | ..
code-block:: bash 147 | 148 | 149 | chmod +x cleora 150 | ./cleora -i files/samples/edgelist_sample.tsv 151 | --columns="complex::reflexive::a b complex::c" 152 | -d 128 153 | -n 5 154 | --relation-name=test_realation_name 155 | -p 0 156 | 157 | 158 | -------------------------------------------------------------------------------- /legacy/tests/snapshot.rs: -------------------------------------------------------------------------------- 1 | use cleora::configuration::{Column, Configuration, FileType, OutputFormat}; 2 | use cleora::embedding::{calculate_embeddings, calculate_embeddings_mmap}; 3 | use cleora::persistence::embedding::EmbeddingPersistor; 4 | use cleora::persistence::entity::InMemoryEntityMappingPersistor; 5 | use cleora::pipeline::build_graphs; 6 | use insta::assert_debug_snapshot; 7 | use std::io; 8 | use std::sync::Arc; 9 | 10 | /// This test performs work for sample case and saves snapshot file. 11 | /// Snapshot testing takes advantage of deterministic character of Cleora. 12 | /// Any discrepancies between original snapshot results and current ones can be then 13 | /// reviewed along with the code which introduced discrepancy. 14 | /// 15 | /// Differing snapshot has to be renamed by removing .new from the name. 16 | /// For more information, please review https://crates.io/crates/insta 17 | /// 18 | /// Code executed performs roughly the same work as: 19 | /// ./cleora -i files/samples/edgelist_1.tsv --columns="complex::reflexive::a b complex::c" 20 | /// -d 128 -n 4 --relation-name=R1 -p 0 21 | #[test] 22 | fn test_build_graphs_and_create_embeddings() { 23 | let config = prepare_config(); 24 | 25 | let in_memory_entity_mapping_persistor = InMemoryEntityMappingPersistor::default(); 26 | let in_memory_entity_mapping_persistor = Arc::new(in_memory_entity_mapping_persistor); 27 | 28 | // build sparse matrices 29 | let sparse_matrices = build_graphs(&config, in_memory_entity_mapping_persistor.clone()); 30 | 31 | let config = Arc::new(config); 32 | 33 | // embeddings for in-memory and mmap files calculation should be the same 34 | for sparse_matrix in sparse_matrices.into_iter() { 35 | let sparse_matrix = Arc::new(sparse_matrix); 36 | let snapshot_name = format!( 37 | "embeddings_{}_{}", 38 | sparse_matrix.col_a_name, sparse_matrix.col_b_name 39 | ); 40 | 41 | let mut in_memory_embedding_persistor = InMemoryEmbeddingPersistor::default(); 42 | // calculate embeddings in memory 43 | calculate_embeddings( 44 | config.clone(), 45 | sparse_matrix.clone(), 46 | in_memory_entity_mapping_persistor.clone(), 47 | &mut in_memory_embedding_persistor, 48 | ); 49 | assert_debug_snapshot!(snapshot_name.clone(), in_memory_embedding_persistor); 50 | 51 | let mut in_memory_embedding_persistor = InMemoryEmbeddingPersistor::default(); 52 | // calculate embeddings with mmap files 53 | calculate_embeddings_mmap( 54 | config.clone(), 55 | sparse_matrix.clone(), 56 | in_memory_entity_mapping_persistor.clone(), 57 | &mut in_memory_embedding_persistor, 58 | ); 59 | assert_debug_snapshot!(snapshot_name, in_memory_embedding_persistor); 60 | } 61 | } 62 | 63 | fn prepare_config() -> Configuration { 64 | let columns = vec![ 65 | Column { 66 | name: "a".to_string(), 67 | complex: true, 68 | reflexive: true, 69 | ..Column::default() 70 | }, 71 | Column { 72 | name: "b".to_string(), 73 | ..Column::default() 74 | }, 75 | Column { 76 | name: "c".to_string(), 77 | complex: true, 78 | ..Column::default() 79 | }, 80 | ]; 81 | 82 | let config = Configuration { 83 | produce_entity_occurrence_count: true, 84 | 
embeddings_dimension: 128, 85 | max_number_of_iteration: 4, 86 | seed: None, 87 | prepend_field: false, 88 | log_every_n: 10000, 89 | in_memory_embedding_calculation: true, 90 | input: vec!["files/samples/edgelist_1.tsv".to_string()], 91 | file_type: FileType::Tsv, 92 | output_format: OutputFormat::TextFile, 93 | output_dir: None, 94 | relation_name: "r1".to_string(), 95 | columns, 96 | }; 97 | config 98 | } 99 | 100 | #[derive(Debug, Default)] 101 | struct InMemoryEmbeddingPersistor { 102 | entity_count: u32, 103 | dimenstion: u16, 104 | entities: Vec, 105 | } 106 | 107 | #[derive(Debug)] 108 | struct InMemoryEntity { 109 | entity: String, 110 | occur_count: u32, 111 | vector: Vec, 112 | } 113 | 114 | impl EmbeddingPersistor for InMemoryEmbeddingPersistor { 115 | fn put_metadata(&mut self, entity_count: u32, dimension: u16) -> Result<(), io::Error> { 116 | self.entity_count = entity_count; 117 | self.dimenstion = dimension; 118 | Ok(()) 119 | } 120 | fn put_data( 121 | &mut self, 122 | entity: &str, 123 | occur_count: u32, 124 | vector: Vec, 125 | ) -> Result<(), io::Error> { 126 | let entity = entity.to_string(); 127 | self.entities.push(InMemoryEntity { 128 | entity, 129 | occur_count, 130 | vector, 131 | }); 132 | Ok(()) 133 | } 134 | fn finish(&mut self) -> Result<(), io::Error> { 135 | Ok(()) 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /benches/cleora_benchmark.rs: -------------------------------------------------------------------------------- 1 | use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; 2 | use fnv::FnvHasher; 3 | use std::collections::hash_map::DefaultHasher; 4 | use std::hash::Hasher; 5 | use twox_hash::XxHash64; 6 | 7 | fn default_hash(entity: &str) -> u64 { 8 | let mut hasher = DefaultHasher::new(); 9 | hasher.write(entity.as_bytes()); 10 | hasher.finish() 11 | } 12 | 13 | fn xx_hash(entity: &str) -> u64 { 14 | let mut hasher = XxHash64::default(); 15 | hasher.write(entity.as_bytes()); 16 | hasher.finish() 17 | } 18 | 19 | fn fnv_hash(entity: &str) -> u64 { 20 | let mut hasher = FnvHasher::default(); 21 | hasher.write(entity.as_bytes()); 22 | hasher.finish() 23 | } 24 | 25 | fn hash_benchmark(c: &mut Criterion) { 26 | c.bench_function("hash", |b| b.iter(|| fnv_hash(black_box("cleora")))); 27 | } 28 | 29 | fn bench_hashes(c: &mut Criterion) { 30 | let mut group = c.benchmark_group("Hashing"); 31 | for s in ["Poland", "Germany", "USA", "United Kingdom", "Norway"].iter() { 32 | group.bench_with_input(BenchmarkId::new("Default", s), s, |b, s| { 33 | b.iter(|| default_hash(s)) 34 | }); 35 | group.bench_with_input(BenchmarkId::new("XXHash", s), s, |b, s| { 36 | b.iter(|| xx_hash(s)) 37 | }); 38 | group.bench_with_input(BenchmarkId::new("FnvHash", s), s, |b, s| { 39 | b.iter(|| fnv_hash(s)) 40 | }); 41 | } 42 | group.finish(); 43 | } 44 | 45 | struct CartesianProduct { 46 | lengths: Vec, 47 | indices: Vec, 48 | } 49 | 50 | impl CartesianProduct { 51 | fn new(lengths: Vec) -> CartesianProduct { 52 | let indices = vec![0; lengths.len()]; 53 | CartesianProduct { lengths, indices } 54 | } 55 | } 56 | 57 | impl Iterator for CartesianProduct { 58 | type Item = Vec; 59 | 60 | fn next(&mut self) -> Option { 61 | let result = self.indices.clone(); 62 | let len = self.indices.len(); 63 | for i in (0..len).rev() { 64 | if self.indices[i] == (self.lengths[i] - 1) { 65 | self.indices[i] = 0; 66 | if i == 0 { 67 | return None; 68 | } 69 | } else { 70 | self.indices[i] += 1; 71 | break; 72 | } 73 | 
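                // The loop above advances `indices` like a mixed-radix odometer: the least
                // significant position is bumped first; a position at its maximum wraps to 0
                // and carries to the next one, while `break` stops the scan once a position is
                // bumped. Note that when the most significant position wraps, `None` is returned
                // before the pre-advance snapshot in `result` is yielded, so the very last
                // combination is never produced by this iterator.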
} 74 | Some(result) 75 | } 76 | } 77 | 78 | fn generate_combinations_with_length( 79 | hashes: Vec>, 80 | lens: Vec, 81 | transient_lens: Vec, 82 | ) -> Vec> { 83 | let row_length = lens.len(); 84 | let mut combinations = 1; 85 | for &len in &lens { 86 | combinations *= len; 87 | } 88 | 89 | let mut transient_combinations = 1; 90 | for transient_len in transient_lens { 91 | transient_combinations *= transient_len; 92 | } 93 | 94 | let total_combinations = u64::from(combinations * transient_combinations); 95 | 96 | let mut result: Vec> = Vec::with_capacity(combinations as usize); 97 | let cartesian = CartesianProduct::new(lens); 98 | let mut counter = 0; 99 | 100 | for indices in cartesian { 101 | let mut arr: Vec = Vec::with_capacity(row_length + 1); 102 | arr.push(total_combinations); 103 | let hashes_length = hashes.len(); 104 | for i in 0..hashes_length { 105 | let id = indices[i]; 106 | let value = hashes.get(i).unwrap().get(id as usize).unwrap(); 107 | arr.push(*value); 108 | } 109 | result.insert(counter, arr); 110 | counter += 1; 111 | } 112 | 113 | result 114 | } 115 | 116 | fn generate_combinations_with_length_benchmark(c: &mut Criterion) { 117 | let hashes = vec![ 118 | vec![ 119 | 12528106613309397869, 120 | 9708327007652588651, 121 | 14980293948133487802, 122 | 12266831465718424827, 123 | 17286486014462130850, 124 | 11758309849656381133, 125 | 10347099512938872293, 126 | 804562942093240192, 127 | 3059164883323983321, 128 | ], 129 | vec![ 130 | 12528106613309397869, 131 | 9708327007652588651, 132 | 14980293948133487802, 133 | 12266831465718424827, 134 | 17286486014462130850, 135 | 11758309849656381133, 136 | 10347099512938872293, 137 | 804562942093240192, 138 | 3059164883323983321, 139 | ], 140 | ]; 141 | let lens = vec![9, 9]; 142 | let transient_lens = vec![1]; 143 | c.bench_function("generate_combinations_with_length", |b| { 144 | b.iter(|| { 145 | generate_combinations_with_length( 146 | black_box(hashes.clone()), 147 | black_box(lens.clone()), 148 | black_box(transient_lens.clone()), 149 | ) 150 | }) 151 | }); 152 | } 153 | 154 | criterion_group!( 155 | benches, 156 | generate_combinations_with_length_benchmark, 157 | bench_hashes 158 | ); 159 | criterion_main!(benches); 160 | -------------------------------------------------------------------------------- /legacy/benches/cleora_benchmark.rs: -------------------------------------------------------------------------------- 1 | use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; 2 | use fnv::FnvHasher; 3 | use std::collections::hash_map::DefaultHasher; 4 | use std::hash::Hasher; 5 | use twox_hash::XxHash64; 6 | 7 | fn default_hash(entity: &str) -> u64 { 8 | let mut hasher = DefaultHasher::new(); 9 | hasher.write(entity.as_bytes()); 10 | hasher.finish() 11 | } 12 | 13 | fn xx_hash(entity: &str) -> u64 { 14 | let mut hasher = XxHash64::default(); 15 | hasher.write(entity.as_bytes()); 16 | hasher.finish() 17 | } 18 | 19 | fn fnv_hash(entity: &str) -> u64 { 20 | let mut hasher = FnvHasher::default(); 21 | hasher.write(entity.as_bytes()); 22 | hasher.finish() 23 | } 24 | 25 | fn hash_benchmark(c: &mut Criterion) { 26 | c.bench_function("hash", |b| b.iter(|| fnv_hash(black_box("cleora")))); 27 | } 28 | 29 | fn bench_hashes(c: &mut Criterion) { 30 | let mut group = c.benchmark_group("Hashing"); 31 | for s in ["Poland", "Germany", "USA", "United Kingdom", "Norway"].iter() { 32 | group.bench_with_input(BenchmarkId::new("Default", s), s, |b, s| { 33 | b.iter(|| default_hash(s)) 34 | }); 35 | 
group.bench_with_input(BenchmarkId::new("XXHash", s), s, |b, s| { 36 | b.iter(|| xx_hash(s)) 37 | }); 38 | group.bench_with_input(BenchmarkId::new("FnvHash", s), s, |b, s| { 39 | b.iter(|| fnv_hash(s)) 40 | }); 41 | } 42 | group.finish(); 43 | } 44 | 45 | struct CartesianProduct { 46 | lengths: Vec, 47 | indices: Vec, 48 | } 49 | 50 | impl CartesianProduct { 51 | fn new(lengths: Vec) -> CartesianProduct { 52 | let indices = vec![0; lengths.len()]; 53 | CartesianProduct { lengths, indices } 54 | } 55 | } 56 | 57 | impl Iterator for CartesianProduct { 58 | type Item = Vec; 59 | 60 | fn next(&mut self) -> Option { 61 | let result = self.indices.clone(); 62 | let len = self.indices.len(); 63 | for i in (0..len).rev() { 64 | if self.indices[i] == (self.lengths[i] - 1) { 65 | self.indices[i] = 0; 66 | if i == 0 { 67 | return None; 68 | } 69 | } else { 70 | self.indices[i] += 1; 71 | break; 72 | } 73 | } 74 | Some(result) 75 | } 76 | } 77 | 78 | fn generate_combinations_with_length( 79 | hashes: Vec>, 80 | lens: Vec, 81 | transient_lens: Vec, 82 | ) -> Vec> { 83 | let row_length = lens.len(); 84 | let mut combinations = 1; 85 | for &len in &lens { 86 | combinations *= len; 87 | } 88 | 89 | let mut transient_combinations = 1; 90 | for transient_len in transient_lens { 91 | transient_combinations *= transient_len; 92 | } 93 | 94 | let total_combinations = u64::from(combinations * transient_combinations); 95 | 96 | let mut result: Vec> = Vec::with_capacity(combinations as usize); 97 | let cartesian = CartesianProduct::new(lens); 98 | let mut counter = 0; 99 | 100 | for indices in cartesian { 101 | let mut arr: Vec = Vec::with_capacity(row_length + 1); 102 | arr.push(total_combinations); 103 | let hashes_length = hashes.len(); 104 | for i in 0..hashes_length { 105 | let id = indices[i]; 106 | let value = hashes.get(i).unwrap().get(id as usize).unwrap(); 107 | arr.push(*value); 108 | } 109 | result.insert(counter, arr); 110 | counter += 1; 111 | } 112 | 113 | result 114 | } 115 | 116 | fn generate_combinations_with_length_benchmark(c: &mut Criterion) { 117 | let hashes = vec![ 118 | vec![ 119 | 12528106613309397869, 120 | 9708327007652588651, 121 | 14980293948133487802, 122 | 12266831465718424827, 123 | 17286486014462130850, 124 | 11758309849656381133, 125 | 10347099512938872293, 126 | 804562942093240192, 127 | 3059164883323983321, 128 | ], 129 | vec![ 130 | 12528106613309397869, 131 | 9708327007652588651, 132 | 14980293948133487802, 133 | 12266831465718424827, 134 | 17286486014462130850, 135 | 11758309849656381133, 136 | 10347099512938872293, 137 | 804562942093240192, 138 | 3059164883323983321, 139 | ], 140 | ]; 141 | let lens = vec![9, 9]; 142 | let transient_lens = vec![1]; 143 | c.bench_function("generate_combinations_with_length", |b| { 144 | b.iter(|| { 145 | generate_combinations_with_length( 146 | black_box(hashes.clone()), 147 | black_box(lens.clone()), 148 | black_box(transient_lens.clone()), 149 | ) 150 | }) 151 | }); 152 | } 153 | 154 | criterion_group!( 155 | benches, 156 | generate_combinations_with_length_benchmark, 157 | bench_hashes 158 | ); 159 | criterion_main!(benches); 160 | -------------------------------------------------------------------------------- /src/entity.rs: -------------------------------------------------------------------------------- 1 | use itertools::{Itertools, Product}; 2 | use std::hash::Hasher; 3 | use std::ops::Range; 4 | use std::sync::Arc; 5 | 6 | use smallvec::{IntoIter, SmallVec}; 7 | use twox_hash::XxHash64; 8 | 9 | use 
crate::configuration::Configuration; 10 | use crate::sparse_matrix_builder::NodeIndexerBuilder; 11 | 12 | /// Indicates how many elements in a vector can be placed on Stack (used by smallvec crate). The rest 13 | /// of the vector is placed on Heap. 14 | pub const SMALL_VECTOR_SIZE: usize = 8; 15 | 16 | #[derive(Debug, Clone)] 17 | pub struct Hyperedge { 18 | hashes: SmallVec<[u64; SMALL_VECTOR_SIZE]>, 19 | slices: [Range; 2], 20 | } 21 | 22 | impl Hyperedge { 23 | #[inline] 24 | pub fn nodes(&self, column_id: usize) -> SmallVec<[u64; SMALL_VECTOR_SIZE]> { 25 | let slice = self.slices.get(column_id).unwrap(); 26 | let mut v = SmallVec::with_capacity(slice.len()); 27 | for ix in slice.start..slice.end { 28 | v.push(self.hashes[ix as usize]) 29 | } 30 | v 31 | } 32 | 33 | #[inline(always)] 34 | pub fn edges_iter( 35 | &self, 36 | col_id_a: u8, 37 | col_id_b: u8, 38 | ) -> Product, IntoIter<[u64; 8]>> { 39 | let nodes_a = self.nodes(col_id_a as usize); 40 | let nodes_b = self.nodes(col_id_b as usize); 41 | nodes_a.into_iter().cartesian_product(nodes_b) 42 | } 43 | 44 | pub fn edges_num(&self, col_id_a: u8, col_id_b: u8) -> usize { 45 | self.slices[col_id_a as usize].len() * self.slices[col_id_b as usize].len() 46 | } 47 | } 48 | 49 | pub struct EntityProcessor<'a, S: NodeIndexerBuilder> { 50 | config: &'a Configuration, 51 | not_ignored_columns_count: u16, 52 | node_indexer: Arc, 53 | } 54 | 55 | impl<'a, S: NodeIndexerBuilder> EntityProcessor<'a, S> { 56 | pub fn new(config: &'a Configuration, node_indexer: Arc) -> EntityProcessor<'a, S> { 57 | let not_ignored_columns_count = config.columns.len() as u16; 58 | EntityProcessor { 59 | config, 60 | not_ignored_columns_count, 61 | node_indexer, 62 | } 63 | } 64 | 65 | /// Every row can create few combinations (cartesian products) which are hashed and provided for sparse matrix creation. 66 | /// `row` - array of strings such as: ("userId1", "productId1 productId2", "brandId1"). 
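    /// Returns a `Hyperedge` with the xxhash of every entity in the row plus per-column index
    /// ranges (`slices`), so that `edges_iter` can later enumerate the cartesian product of any
    /// two columns without re-hashing. For reflexive columns an extra slice covering the same
    /// hash range is recorded, and every hash is also registered with the `NodeIndexerBuilder`.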
67 | pub fn process_row_and_get_edges( 68 | &self, 69 | row: &[SmallVec<[&str; SMALL_VECTOR_SIZE]>], 70 | ) -> Hyperedge { 71 | let mut hashes: SmallVec<[u64; SMALL_VECTOR_SIZE]> = 72 | SmallVec::with_capacity(self.not_ignored_columns_count as usize); 73 | let mut slices: [Range; 2] = [0..0, 0..0]; 74 | let mut reflexive_count = 0; 75 | let mut current_offset = 0u32; 76 | 77 | for (i, column_entities) in row.iter().enumerate() { 78 | let column = &self.config.columns[i]; 79 | let column_id = i as u8; 80 | if column.complex { 81 | for entity in column_entities { 82 | let hash = hash_entity(entity); 83 | hashes.push(hash); 84 | self.node_indexer.process(hash, entity, column_id); 85 | } 86 | let length = column_entities.len() as u32; 87 | slices[i] = current_offset..(current_offset + length); 88 | if column.reflexive { 89 | // put reflexive column data to the end of the buffers 90 | let reflexive_id = (self.not_ignored_columns_count + reflexive_count) as usize; 91 | slices[reflexive_id] = current_offset..(current_offset + length); 92 | reflexive_count += 1; 93 | } 94 | current_offset += length; 95 | } else { 96 | let entity = column_entities.first().unwrap(); 97 | let hash = hash_entity(entity); 98 | hashes.push(hash); 99 | self.node_indexer.process(hash, entity, column_id); 100 | let length = 1u32; 101 | slices[i] = current_offset..(current_offset + length); 102 | current_offset += length; 103 | } 104 | } 105 | Hyperedge { hashes, slices } 106 | } 107 | } 108 | 109 | #[inline(always)] 110 | pub fn hash_entity(entity: &str) -> u64 { 111 | let mut hasher = XxHash64::default(); 112 | hasher.write(entity.as_bytes()); 113 | hasher.finish() 114 | } 115 | 116 | #[cfg(test)] 117 | mod tests { 118 | use smallvec::{smallvec, SmallVec}; 119 | 120 | use crate::entity::{Hyperedge, SMALL_VECTOR_SIZE}; 121 | 122 | #[test] 123 | fn generate_cartesian_product_hashes() { 124 | // hashes for entities in every column 125 | // column_1: 1 entity 126 | // column_2: 2 entities 127 | // column_3: 3 entities 128 | let slices = [0..2, 2..5]; 129 | let hashes: SmallVec<[u64; SMALL_VECTOR_SIZE]> = smallvec![10, 20, 30, 40, 50]; 130 | let hyperedge = Hyperedge { hashes, slices }; 131 | let combinations: Vec<_> = hyperedge.edges_iter(0, 1).collect(); 132 | assert_eq!((10, 30), *combinations.get(0).unwrap()); 133 | assert_eq!((10, 40), *combinations.get(1).unwrap()); 134 | assert_eq!((10, 50), *combinations.get(2).unwrap()); 135 | assert_eq!((20, 30), *combinations.get(3).unwrap()); 136 | assert_eq!((20, 40), *combinations.get(4).unwrap()); 137 | assert_eq!((20, 50), *combinations.get(5).unwrap()); 138 | assert_eq!(None, combinations.get(6)); 139 | } 140 | } 141 | -------------------------------------------------------------------------------- /legacy/src/configuration.rs: -------------------------------------------------------------------------------- 1 | #[derive(Debug)] 2 | pub enum FileType { 3 | Json, 4 | Tsv, 5 | } 6 | 7 | #[derive(Debug)] 8 | pub enum OutputFormat { 9 | TextFile, 10 | Numpy, 11 | } 12 | 13 | /// Pipeline configuration 14 | #[derive(Debug)] 15 | pub struct Configuration { 16 | /// Produce or not entity counter to the output file 17 | pub produce_entity_occurrence_count: bool, 18 | 19 | /// Dimension of the embedding 20 | pub embeddings_dimension: u16, 21 | 22 | /// Maximum number of iteration for training 23 | pub max_number_of_iteration: u8, 24 | 25 | /// Seed for embedding initialization 26 | pub seed: Option, 27 | 28 | /// Prepend field name to entity in the output file. 
It differentiates entities with the same 29 | /// name from different columns 30 | pub prepend_field: bool, 31 | 32 | /// After how many lines we log the progress 33 | pub log_every_n: u32, 34 | 35 | /// Calculate embeddings in memory or with memory-mapped files. If we don't have enough 36 | /// RAM we can support training with mmap files 37 | pub in_memory_embedding_calculation: bool, 38 | 39 | /// Paths to the input files 40 | pub input: Vec, 41 | 42 | /// Type of the input file 43 | pub file_type: FileType, 44 | 45 | /// Output directory for files with embeddings 46 | pub output_dir: Option, 47 | 48 | /// Output format 49 | pub output_format: OutputFormat, 50 | 51 | /// Name of the relation, for output filename generation 52 | pub relation_name: String, 53 | 54 | /// Columns configuration 55 | pub columns: Vec, 56 | } 57 | 58 | /// Column configuration 59 | #[derive(Debug, Default)] 60 | pub struct Column { 61 | /// Name, header of the column 62 | pub name: String, 63 | 64 | /// The field is virtual - it is considered during embedding process, no entity is written for the column 65 | pub transient: bool, 66 | 67 | /// The field is composite, containing multiple entity identifiers separated by space 68 | pub complex: bool, 69 | 70 | /// The field is reflexive, which means that it interacts with itself, additional output file is written for every such field 71 | pub reflexive: bool, 72 | 73 | /// The field is ignored, no output file is written for the field 74 | pub ignored: bool, 75 | } 76 | 77 | impl Configuration { 78 | /// Create default configuration with specified input file path and columns. 79 | pub fn default(input: String, columns: Vec) -> Configuration { 80 | Configuration { 81 | produce_entity_occurrence_count: true, 82 | embeddings_dimension: 128, 83 | max_number_of_iteration: 4, 84 | seed: None, 85 | prepend_field: true, 86 | log_every_n: 1000, 87 | in_memory_embedding_calculation: true, 88 | file_type: FileType::Tsv, 89 | input: vec![input], 90 | output_dir: None, 91 | output_format: OutputFormat::TextFile, 92 | relation_name: String::from("emb"), 93 | columns, 94 | } 95 | } 96 | 97 | /// Filter out ignored columns. Entities from such columns are omitted. 98 | pub fn not_ignored_columns(&self) -> Vec<&Column> { 99 | self.columns.iter().filter(|&c| !c.ignored).collect() 100 | } 101 | } 102 | 103 | /// Extract columns config based on raw strings. 
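/// For example, `["complex::reflexive::a", "b", "transient::c"]` yields three `Column`s:
/// "a" with `complex` and `reflexive` set, a plain "b", and "c" with `transient` set.
/// Modifiers are matched case-insensitively; an unrecognized modifier returns an `Err`.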
104 | pub fn extract_fields(cols: Vec<&str>) -> Result, String> { 105 | let mut columns: Vec = Vec::new(); 106 | 107 | for col in cols { 108 | let parts: Vec<&str> = col.split("::").collect(); 109 | 110 | let column_name: &str; 111 | let mut transient = false; 112 | let mut complex = false; 113 | let mut reflexive = false; 114 | let mut ignored = false; 115 | 116 | let parts_len = parts.len(); 117 | if parts_len > 1 { 118 | column_name = *parts.last().unwrap(); 119 | let column_name_idx = parts_len - 1; 120 | for &part in &parts[..column_name_idx] { 121 | if part.eq_ignore_ascii_case("transient") { 122 | transient = true; 123 | } else if part.eq_ignore_ascii_case("complex") { 124 | complex = true; 125 | } else if part.eq_ignore_ascii_case("reflexive") { 126 | reflexive = true; 127 | } else if part.eq_ignore_ascii_case("ignore") { 128 | ignored = true; 129 | } else { 130 | let message = format!("Unrecognized column field modifier: {}", part); 131 | return Err(message); 132 | } 133 | } 134 | } else { 135 | column_name = col; 136 | } 137 | let column = Column { 138 | name: column_name.to_string(), 139 | transient, 140 | complex, 141 | reflexive, 142 | ignored, 143 | }; 144 | columns.push(column); 145 | } 146 | Ok(columns) 147 | } 148 | 149 | /// Validate column modifiers. 150 | pub fn validate_fields(cols: Vec) -> Result, String> { 151 | for col in &cols { 152 | // transient::reflexive - this would generate no output 153 | // transient::reflexive::complex - this would generate no output 154 | if col.reflexive && col.transient { 155 | let message = format!("A field cannot be REFLEXIVE and simultaneously TRANSIENT. It does not make sense: {}", col.name); 156 | return Err(message); 157 | } 158 | if col.reflexive && !col.complex { 159 | let message = format!( 160 | "A field cannot be REFLEXIVE but NOT COMPLEX. 
It does not make sense: {}", 161 | col.name 162 | ); 163 | return Err(message); 164 | } 165 | } 166 | Ok(cols) 167 | } 168 | -------------------------------------------------------------------------------- /.github/workflows/CI.yml: -------------------------------------------------------------------------------- 1 | # This file is autogenerated by maturin v1.7.4 2 | # To update, run 3 | # 4 | # maturin generate-ci github 5 | # 6 | name: CI 7 | 8 | on: 9 | push: 10 | branches: 11 | - main 12 | - master 13 | tags: 14 | - '*' 15 | pull_request: 16 | workflow_dispatch: 17 | 18 | permissions: 19 | contents: read 20 | 21 | jobs: 22 | linux: 23 | runs-on: ${{ matrix.platform.runner }} 24 | strategy: 25 | matrix: 26 | platform: 27 | - runner: ubuntu-latest 28 | target: x86_64 29 | - runner: ubuntu-latest 30 | target: x86 31 | - runner: ubuntu-latest 32 | target: aarch64 33 | # - runner: ubuntu-latest 34 | # target: armv7 35 | # - runner: ubuntu-latest 36 | # target: s390x 37 | # - runner: ubuntu-latest 38 | # target: ppc64le 39 | steps: 40 | - uses: actions/checkout@v4 41 | - name: Install Dependencies 42 | run: | 43 | sudo apt-get update 44 | sudo apt-get install -y musl-tools gcc musl-dev 45 | - uses: actions/setup-python@v5 46 | with: 47 | python-version: 3.x 48 | - name: Build wheels 49 | uses: PyO3/maturin-action@v1 50 | with: 51 | target: ${{ matrix.platform.target }} 52 | args: --release --out dist --find-interpreter 53 | sccache: 'true' 54 | manylinux: auto 55 | - name: Upload wheels 56 | uses: actions/upload-artifact@v4 57 | with: 58 | name: wheels-linux-${{ matrix.platform.target }} 59 | path: dist 60 | 61 | # musllinux: 62 | # runs-on: ${{ matrix.platform.runner }} 63 | # strategy: 64 | # matrix: 65 | # platform: 66 | # - runner: ubuntu-latest 67 | # target: x86_64 68 | # - runner: ubuntu-latest 69 | # target: x86 70 | # - runner: ubuntu-latest 71 | # target: aarch64 72 | # - runner: ubuntu-latest 73 | # target: armv7 74 | # steps: 75 | # - uses: actions/checkout@v4 76 | # - name: Install Dependencies 77 | # run: | 78 | # sudo apt-get update 79 | # sudo apt-get install -y musl-tools gcc musl-dev 80 | # - uses: actions/setup-python@v5 81 | # with: 82 | # python-version: 3.x 83 | # - name: Build wheels 84 | # uses: PyO3/maturin-action@v1 85 | # with: 86 | # target: ${{ matrix.platform.target }} 87 | # args: --release --out dist --find-interpreter 88 | # sccache: 'true' 89 | # manylinux: musllinux_1_2 90 | # - name: Upload wheels 91 | # uses: actions/upload-artifact@v4 92 | # with: 93 | # name: wheels-musllinux-${{ matrix.platform.target }} 94 | # path: dist 95 | 96 | windows: 97 | runs-on: ${{ matrix.platform.runner }} 98 | strategy: 99 | matrix: 100 | platform: 101 | - runner: windows-latest 102 | target: x64 103 | - runner: windows-latest 104 | target: x86 105 | steps: 106 | - uses: actions/checkout@v4 107 | - uses: actions/setup-python@v5 108 | with: 109 | python-version: 3.x 110 | architecture: ${{ matrix.platform.target }} 111 | - name: Build wheels 112 | uses: PyO3/maturin-action@v1 113 | with: 114 | target: ${{ matrix.platform.target }} 115 | args: --release --out dist --find-interpreter 116 | sccache: 'true' 117 | - name: Upload wheels 118 | uses: actions/upload-artifact@v4 119 | with: 120 | name: wheels-windows-${{ matrix.platform.target }} 121 | path: dist 122 | 123 | macos: 124 | runs-on: ${{ matrix.platform.runner }} 125 | strategy: 126 | matrix: 127 | platform: 128 | # - runner: macos-12 129 | # target: x86_64 130 | - runner: macos-14 131 | target: aarch64 132 | steps: 133 | 
- uses: actions/checkout@v4 134 | - uses: actions/setup-python@v5 135 | with: 136 | python-version: 3.x 137 | - name: Build wheels 138 | uses: PyO3/maturin-action@v1 139 | with: 140 | target: ${{ matrix.platform.target }} 141 | args: --release --out dist --find-interpreter 142 | sccache: 'true' 143 | - name: Upload wheels 144 | uses: actions/upload-artifact@v4 145 | with: 146 | name: wheels-macos-${{ matrix.platform.target }} 147 | path: dist 148 | 149 | sdist: 150 | runs-on: ubuntu-latest 151 | steps: 152 | - uses: actions/checkout@v4 153 | - name: Build sdist 154 | uses: PyO3/maturin-action@v1 155 | with: 156 | command: sdist 157 | args: --out dist 158 | - name: Upload sdist 159 | uses: actions/upload-artifact@v4 160 | with: 161 | name: wheels-sdist 162 | path: dist 163 | 164 | release: 165 | name: Release 166 | runs-on: ubuntu-latest 167 | if: ${{ startsWith(github.ref, 'refs/tags/') || github.event_name == 'workflow_dispatch' }} 168 | needs: [linux, windows, macos, sdist] 169 | permissions: 170 | # Use to sign the release artifacts 171 | id-token: write 172 | # Used to upload release artifacts 173 | contents: write 174 | # Used to generate artifact attestation 175 | attestations: write 176 | steps: 177 | - uses: actions/download-artifact@v4 178 | - name: Generate artifact attestation 179 | uses: actions/attest-build-provenance@v1 180 | with: 181 | subject-path: 'wheels-*/*' 182 | - name: Publish to PyPI 183 | if: "startsWith(github.ref, 'refs/tags/')" 184 | uses: PyO3/maturin-action@v1 185 | env: 186 | MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }} 187 | with: 188 | command: upload 189 | args: --non-interactive --skip-existing wheels-*/* -------------------------------------------------------------------------------- /legacy/src/pipeline.rs: -------------------------------------------------------------------------------- 1 | use std::fs::File; 2 | use std::io::{BufRead, BufReader}; 3 | 4 | use crate::configuration::{Column, Configuration, FileType, OutputFormat}; 5 | use crate::embedding::{calculate_embeddings, calculate_embeddings_mmap}; 6 | use crate::entity::{EntityProcessor, SMALL_VECTOR_SIZE}; 7 | use crate::persistence::embedding::{EmbeddingPersistor, NpyPersistor, TextFileVectorPersistor}; 8 | use crate::persistence::entity::InMemoryEntityMappingPersistor; 9 | use crate::sparse_matrix::{create_sparse_matrices, SparseMatrix}; 10 | use bus::Bus; 11 | use log::{error, info, warn}; 12 | use simdjson_rust::dom; 13 | use smallvec::{smallvec, SmallVec}; 14 | use std::sync::Arc; 15 | use std::thread; 16 | 17 | /// Create SparseMatrix'es based on columns config. Every SparseMatrix operates in separate 18 | /// thread. EntityProcessor reads data in main thread and broadcast cartesian products 19 | /// to SparseMatrix'es. 
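/// Each matrix thread consumes hashed rows from a broadcast `Bus`, updates its own matrix via
/// `handle_pair`, and finishes once the bus is dropped; the joined threads then return the
/// completed `SparseMatrix` values, which are handed back to the caller for training.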
20 | pub fn build_graphs( 21 | config: &Configuration, 22 | in_memory_entity_mapping_persistor: Arc, 23 | ) -> Vec { 24 | let sparse_matrices = create_sparse_matrices(&config.columns); 25 | dbg!(&sparse_matrices); 26 | 27 | let mut bus: Bus> = Bus::new(128); 28 | let mut sparse_matrix_threads = Vec::new(); 29 | for mut sparse_matrix in sparse_matrices { 30 | let rx = bus.add_rx(); 31 | let handle = thread::spawn(move || { 32 | for received in rx { 33 | sparse_matrix.handle_pair(&received); 34 | } 35 | sparse_matrix.finish(); 36 | sparse_matrix 37 | }); 38 | sparse_matrix_threads.push(handle); 39 | } 40 | 41 | for input in config.input.iter() { 42 | let mut entity_processor = EntityProcessor::new( 43 | config, 44 | in_memory_entity_mapping_persistor.clone(), 45 | |hashes| { 46 | bus.broadcast(hashes); 47 | }, 48 | ); 49 | 50 | match &config.file_type { 51 | FileType::Json => { 52 | let mut parser = dom::Parser::default(); 53 | read_file(input, config.log_every_n as u64, move |line| { 54 | let row = parse_json_line(line, &mut parser, &config.columns); 55 | entity_processor.process_row(&row); 56 | }); 57 | } 58 | FileType::Tsv => { 59 | let config_col_num = config.columns.len(); 60 | read_file(input, config.log_every_n as u64, move |line| { 61 | let row = parse_tsv_line(line); 62 | let line_col_num = row.len(); 63 | if line_col_num == config_col_num { 64 | entity_processor.process_row(&row); 65 | } else { 66 | warn!("Wrong number of columns (expected: {}, provided: {}). The line [{}] is skipped.", config_col_num, line_col_num, line); 67 | } 68 | }); 69 | } 70 | } 71 | } 72 | 73 | drop(bus); 74 | 75 | let mut sparse_matrices = vec![]; 76 | for join_handle in sparse_matrix_threads { 77 | let sparse_matrix = join_handle 78 | .join() 79 | .expect("Couldn't join on the associated thread"); 80 | sparse_matrices.push(sparse_matrix); 81 | } 82 | 83 | sparse_matrices 84 | } 85 | 86 | /// Read file line by line. Pass every valid line to handler for parsing. 87 | fn read_file(filepath: &str, log_every: u64, mut line_handler: F) 88 | where 89 | F: FnMut(&str), 90 | { 91 | let input_file = File::open(filepath).expect("Can't open file"); 92 | let mut buffered = BufReader::new(input_file); 93 | 94 | let mut line_number = 1u64; 95 | let mut line = String::new(); 96 | loop { 97 | match buffered.read_line(&mut line) { 98 | Ok(bytes_read) => { 99 | // EOF 100 | if bytes_read == 0 { 101 | break; 102 | } 103 | 104 | line_handler(&line); 105 | } 106 | Err(err) => { 107 | error!("Can't read line number: {}. Error: {}.", line_number, err); 108 | } 109 | }; 110 | 111 | // clear to reuse the buffer 112 | line.clear(); 113 | 114 | if line_number % log_every == 0 { 115 | info!("Number of lines processed: {}", line_number); 116 | } 117 | 118 | line_number += 1; 119 | } 120 | } 121 | 122 | /// Parse a line of JSON and read its columns into a vector for processing. 
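/// Simple columns must hold a single value and complex columns a JSON array; for columns
/// `a` and `complex::b`, the line `{"a": "x", "b": ["y", "z"]}` is read as the row
/// `[["x"], ["y", "z"]]`. Non-string values are minified back to their JSON text.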
123 | fn parse_json_line( 124 | line: &str, 125 | parser: &mut dom::Parser, 126 | columns: &[Column], 127 | ) -> Vec> { 128 | let parsed = parser.parse(line).unwrap(); 129 | columns 130 | .iter() 131 | .map(|c| { 132 | if !c.complex { 133 | let elem = parsed.at_key(&c.name).unwrap(); 134 | let value = match elem.get_type() { 135 | dom::element::ElementType::String => elem.get_string().unwrap(), 136 | _ => elem.minify(), 137 | }; 138 | smallvec![value] 139 | } else { 140 | parsed 141 | .at_key(&c.name) 142 | .unwrap() 143 | .get_array() 144 | .expect("Values for complex columns must be arrays") 145 | .into_iter() 146 | .map(|v| match v.get_type() { 147 | dom::element::ElementType::String => v.get_string().unwrap(), 148 | _ => v.minify(), 149 | }) 150 | .collect() 151 | } 152 | }) 153 | .collect() 154 | } 155 | 156 | /// Parse a line of TSV and read its columns into a vector for processing. 157 | fn parse_tsv_line(line: &str) -> Vec> { 158 | let values = line.trim().split('\t'); 159 | values.map(|c| c.split(' ').collect()).collect() 160 | } 161 | 162 | /// Train SparseMatrix'es (graphs) in separated threads. 163 | pub fn train( 164 | config: Configuration, 165 | in_memory_entity_mapping_persistor: Arc, 166 | sparse_matrices: Vec, 167 | ) { 168 | let config = Arc::new(config); 169 | let mut embedding_threads = Vec::new(); 170 | for sparse_matrix in sparse_matrices { 171 | let sparse_matrix = Arc::new(sparse_matrix); 172 | let config = config.clone(); 173 | let in_memory_entity_mapping_persistor = in_memory_entity_mapping_persistor.clone(); 174 | let handle = thread::spawn(move || { 175 | let directory = match config.output_dir.as_ref() { 176 | Some(out) => format!("{}/", out.clone()), 177 | None => String::from(""), 178 | }; 179 | let ofp = format!( 180 | "{}{}__{}__{}.out", 181 | directory, 182 | config.relation_name, 183 | sparse_matrix.col_a_name.as_str(), 184 | sparse_matrix.col_b_name.as_str() 185 | ); 186 | 187 | let mut persistor: Box = match &config.output_format { 188 | OutputFormat::TextFile => Box::new(TextFileVectorPersistor::new( 189 | ofp, 190 | config.produce_entity_occurrence_count, 191 | )), 192 | OutputFormat::Numpy => Box::new(NpyPersistor::new( 193 | ofp, 194 | config.produce_entity_occurrence_count, 195 | )), 196 | }; 197 | if config.in_memory_embedding_calculation { 198 | calculate_embeddings( 199 | config.clone(), 200 | sparse_matrix.clone(), 201 | in_memory_entity_mapping_persistor, 202 | persistor.as_mut(), 203 | ); 204 | } else { 205 | calculate_embeddings_mmap( 206 | config.clone(), 207 | sparse_matrix.clone(), 208 | in_memory_entity_mapping_persistor, 209 | persistor.as_mut(), 210 | ); 211 | } 212 | }); 213 | embedding_threads.push(handle); 214 | } 215 | 216 | for join_handle in embedding_threads { 217 | join_handle 218 | .join() 219 | .expect("Couldn't join on the associated thread"); 220 | } 221 | } 222 | -------------------------------------------------------------------------------- /src/pipeline.rs: -------------------------------------------------------------------------------- 1 | use std::cmp::min; 2 | use std::fs::File; 3 | use std::io::{BufRead, BufReader}; 4 | use std::sync::Arc; 5 | use std::time::Instant; 6 | 7 | use crossbeam::channel; 8 | use crossbeam::channel::{Receiver, Sender}; 9 | use crossbeam::thread as cb_thread; 10 | use crossbeam::thread::{Scope, ScopedJoinHandle}; 11 | use itertools::Itertools; 12 | use log::{error, info, warn}; 13 | use smallvec::SmallVec; 14 | 15 | use crate::configuration::Configuration; 16 | use 
crate::entity::{EntityProcessor, Hyperedge, SMALL_VECTOR_SIZE}; 17 | use crate::sparse_matrix::SparseMatrix; 18 | use crate::sparse_matrix_builder::NodeIndexerBuilder; 19 | use crate::sparse_matrix_builder::{ 20 | AsyncNodeIndexerBuilder, NodeIndexer, SparseMatrixBuffer, SparseMatrixBuffersReducer, 21 | SyncNodeIndexerBuilder, 22 | }; 23 | 24 | pub fn build_graph_from_iterator<'a>( 25 | config: &Configuration, 26 | hyperedges: impl Iterator, 27 | ) -> SparseMatrix { 28 | cb_thread::scope(|s| { 29 | let (hyperedges_s, hyperedges_r) = channel::bounded(64 * config.num_workers_graph_building); 30 | 31 | // Consumer first, producer second to avoid deadlock 32 | let matrix_buffer = make_consumer(hyperedges_r, config, s); 33 | let node_indexer = make_producer_from_iterator(config, hyperedges, hyperedges_s); 34 | 35 | let buffers = matrix_buffer 36 | .into_iter() 37 | .map(|h| h.join().unwrap()) 38 | .collect_vec(); 39 | SparseMatrixBuffersReducer::new(node_indexer, buffers, config.num_workers_graph_building) 40 | .reduce() 41 | }) 42 | .expect("All work in thread scope finished") 43 | } 44 | 45 | fn make_producer_from_iterator<'a>( 46 | config: &Configuration, 47 | hyperedges: impl Iterator, 48 | hyperedges_s: Sender, 49 | ) -> NodeIndexer { 50 | let node_indexer_builder: Arc = Default::default(); 51 | let entity_processor = EntityProcessor::new(config, node_indexer_builder.clone()); 52 | for line in hyperedges { 53 | consume_line(config, &hyperedges_s, &entity_processor, line); 54 | } 55 | drop(entity_processor); 56 | let node_indexer_builder = 57 | Arc::try_unwrap(node_indexer_builder).expect("All other references should be dropped"); 58 | node_indexer_builder.finish() 59 | } 60 | 61 | fn consume_line( 62 | config: &Configuration, 63 | hyperedges_s: &Sender, 64 | entity_processor: &EntityProcessor, 65 | line: &str, 66 | ) { 67 | let row = parse_tsv_line(line); 68 | let line_col_num = row.len(); 69 | if line_col_num == config.columns.len() { 70 | let hyperedge = entity_processor.process_row_and_get_edges(&row); 71 | hyperedges_s.send(hyperedge).unwrap(); 72 | } else { 73 | warn!( 74 | "Wrong number of columns (expected: {}, provided: {}). 
The line [{}] is skipped.", 75 | config.columns.len(), 76 | line_col_num, 77 | line 78 | ); 79 | } 80 | } 81 | 82 | pub fn build_graph_from_files(config: &Configuration, input_files: Vec) -> SparseMatrix { 83 | let processing_worker_num = config.num_workers_graph_building; 84 | cb_thread::scope(|s| { 85 | let (hyperedges_s, hyperedges_r) = channel::bounded(processing_worker_num * 64); 86 | 87 | // Consumer first, producer second to avoid deadlock 88 | let matrix_buffers: Vec<_> = make_consumer(hyperedges_r, config, s); 89 | let node_indexer = make_producer_from_files(config, &input_files, s, hyperedges_s); 90 | 91 | let buffers = matrix_buffers 92 | .into_iter() 93 | .map(|h| h.join().unwrap()) 94 | .collect_vec(); 95 | 96 | let merging_start_time = Instant::now(); 97 | let result = 98 | SparseMatrixBuffersReducer::new(node_indexer, buffers, processing_worker_num).reduce(); 99 | info!( 100 | "Merging finished in {} sec", 101 | merging_start_time.elapsed().as_secs() 102 | ); 103 | result 104 | }) 105 | .expect("Threads finished work") 106 | } 107 | 108 | fn make_producer_from_files<'c: 'e, 'e: 's, 's>( 109 | config: &'c Configuration, 110 | input_files: &'c Vec, 111 | s: &'s Scope<'e>, 112 | hyperedges_s: Sender, 113 | ) -> NodeIndexer { 114 | let (files_s, files_r) = channel::unbounded(); 115 | 116 | for input in input_files { 117 | files_s.send(input).unwrap() 118 | } 119 | drop(files_s); 120 | 121 | let max_file_reading_worker_num = min(config.num_workers_graph_building, 4); 122 | let file_reading_worker_num = min(max_file_reading_worker_num, input_files.len()); 123 | 124 | let log_every_n = 10000; 125 | 126 | if file_reading_worker_num == 1 { 127 | let node_indexer_builder: Arc = Default::default(); 128 | let entity_processor = EntityProcessor::new(config, node_indexer_builder.clone()); 129 | consume_files(config, hyperedges_s, files_r, log_every_n, entity_processor); 130 | let node_indexer_builder = 131 | Arc::try_unwrap(node_indexer_builder).expect("All other references should be dropped"); 132 | node_indexer_builder.finish() 133 | } else { 134 | let node_indexer_builder: Arc = Default::default(); 135 | let producers = (0..file_reading_worker_num) 136 | .map(|_| { 137 | let hyperedges_s = hyperedges_s.clone(); 138 | let files_r = files_r.clone(); 139 | let entity_processor = EntityProcessor::new(config, node_indexer_builder.clone()); 140 | 141 | s.spawn(move |_| { 142 | consume_files(config, hyperedges_s, files_r, log_every_n, entity_processor); 143 | }) 144 | }) 145 | .collect_vec(); 146 | drop(hyperedges_s); // hyperedges_s got distributed among producers, drop seed object 147 | drop(files_r); 148 | 149 | producers.into_iter().for_each(|h| h.join().unwrap()); 150 | let node_indexer_builder = 151 | Arc::try_unwrap(node_indexer_builder).expect("All other references should be dropped"); 152 | node_indexer_builder.finish() 153 | } 154 | } 155 | 156 | fn consume_files( 157 | config: &Configuration, 158 | hyperedges_s: Sender, 159 | files_r: Receiver<&String>, 160 | log_every_n: u64, 161 | entity_processor: EntityProcessor, 162 | ) { 163 | for input in files_r { 164 | read_file(input, log_every_n, |line| { 165 | consume_line(config, &hyperedges_s, &entity_processor, line); 166 | }); 167 | } 168 | } 169 | 170 | fn make_consumer<'s, 'a: 'a>( 171 | hyperedges_r: Receiver, 172 | config: &'a Configuration, 173 | s: &'s Scope<'a>, 174 | ) -> Vec> { 175 | (0..config.num_workers_graph_building) 176 | .map(|_| { 177 | let hyperedges_r = hyperedges_r.clone(); 178 | let sparse_matrices = 
config.matrix_desc.clone(); 179 | 180 | s.spawn(move |_| { 181 | let mut buffer = sparse_matrices.make_buffer(config.hyperedge_trim_n); 182 | for hyperedge in hyperedges_r { 183 | buffer.handle_hyperedge(&hyperedge); 184 | } 185 | buffer 186 | }) 187 | }) 188 | .collect() 189 | } 190 | 191 | /// Read file line by line. Pass every valid line to handler for parsing. 192 | fn read_file(filepath: &str, log_every: u64, mut line_handler: F) 193 | where 194 | F: FnMut(&str), 195 | { 196 | let input_file = File::open(filepath).expect("Can't open file"); 197 | let mut buffered = BufReader::new(input_file); 198 | 199 | let mut line_number = 1u64; 200 | let mut line = String::new(); 201 | loop { 202 | match buffered.read_line(&mut line) { 203 | Ok(bytes_read) => { 204 | // EOF 205 | if bytes_read == 0 { 206 | break; 207 | } 208 | 209 | line_handler(&line); 210 | } 211 | Err(err) => { 212 | error!("Can't read line number: {}. Error: {}.", line_number, err); 213 | } 214 | }; 215 | 216 | // clear to reuse the buffer 217 | line.clear(); 218 | 219 | if line_number % log_every == 0 { 220 | info!("Number of lines processed: {}", line_number); 221 | } 222 | 223 | line_number += 1; 224 | } 225 | } 226 | 227 | /// Parse a line of TSV and read its columns into a vector for processing. 228 | fn parse_tsv_line(line: &str) -> Vec> { 229 | let values = line.trim().split('\t'); 230 | values.map(|c| c.split(' ').collect()).collect() 231 | } 232 | -------------------------------------------------------------------------------- /legacy/src/main.rs: -------------------------------------------------------------------------------- 1 | use std::time::Instant; 2 | 3 | use clap::{crate_authors, crate_description, crate_name, crate_version, Arg, Command}; 4 | use cleora::configuration; 5 | use cleora::configuration::Configuration; 6 | use cleora::configuration::OutputFormat; 7 | use cleora::persistence::entity::InMemoryEntityMappingPersistor; 8 | use cleora::pipeline::{build_graphs, train}; 9 | use env_logger::Env; 10 | use std::fs; 11 | use std::sync::Arc; 12 | 13 | #[macro_use] 14 | extern crate log; 15 | 16 | fn main() { 17 | let env = Env::default() 18 | .filter_or("MY_LOG_LEVEL", "info") 19 | .write_style_or("MY_LOG_STYLE", "always"); 20 | env_logger::init_from_env(env); 21 | 22 | let now = Instant::now(); 23 | 24 | let matches = Command::new(crate_name!()) 25 | .version(crate_version!()) 26 | .author(crate_authors!()) 27 | .about(crate_description!()) 28 | .arg( 29 | Arg::new("inputs") 30 | .multiple_values(true) 31 | .help("Input files paths") 32 | .takes_value(true), 33 | ) 34 | .arg( 35 | Arg::new("input") 36 | .short('i') 37 | .long("input") 38 | .help("Deprecated. 
Use positional args for input files") 39 | .takes_value(true), 40 | ) 41 | .arg( 42 | Arg::new("file-type") 43 | .short('t') 44 | .long("type") 45 | .possible_values(&["tsv", "json"]) 46 | .help("Input file type") 47 | .takes_value(true), 48 | ) 49 | .arg( 50 | Arg::new("output-dir") 51 | .short('o') 52 | .long("output-dir") 53 | .help("Output directory for files with embeddings") 54 | .takes_value(true), 55 | ) 56 | .arg( 57 | Arg::new("dimension") 58 | .short('d') 59 | .long("dimension") 60 | .required(true) 61 | .help("Embedding dimension size") 62 | .takes_value(true), 63 | ) 64 | .arg( 65 | Arg::new("number-of-iterations") 66 | .short('n') 67 | .long("number-of-iterations") 68 | .required(true) 69 | .help("Max number of iterations") 70 | .takes_value(true), 71 | ) 72 | .arg( 73 | Arg::new("seed") 74 | .short('s') 75 | .long("seed") 76 | .help("Seed (integer) for embedding initialization") 77 | .takes_value(true), 78 | ) 79 | .arg( 80 | Arg::new("columns") 81 | .short('c') 82 | .long("columns") 83 | .required(true) 84 | .help( 85 | "Column names (max 12), with modifiers: [transient::, reflexive::, complex::]", 86 | ) 87 | .takes_value(true), 88 | ) 89 | .arg( 90 | Arg::new("relation-name") 91 | .short('r') 92 | .long("relation-name") 93 | .default_value("emb") 94 | .help("Name of the relation, for output filename generation") 95 | .takes_value(true), 96 | ) 97 | .arg( 98 | Arg::new("prepend-field-name") 99 | .short('p') 100 | .long("prepend-field-name") 101 | .possible_values(&["0", "1"]) 102 | .default_value("0") 103 | .help("Prepend field name to entity in output") 104 | .takes_value(true), 105 | ) 106 | .arg( 107 | Arg::new("log-every-n") 108 | .short('l') 109 | .long("log-every-n") 110 | .default_value("10000") 111 | .help("Log output every N lines") 112 | .takes_value(true), 113 | ) 114 | .arg( 115 | Arg::new("in-memory-embedding-calculation") 116 | .short('e') 117 | .long("in-memory-embedding-calculation") 118 | .possible_values(&["0", "1"]) 119 | .default_value("1") 120 | .help("Calculate embeddings in memory or with memory-mapped files") 121 | .takes_value(true), 122 | ) 123 | .arg( 124 | Arg::new("output-format") 125 | .short('f') 126 | .help("Output format. 
One of: textfile|numpy") 127 | .possible_values(&["textfile", "numpy"]) 128 | .default_value("textfile") 129 | .takes_value(true), 130 | ) 131 | .get_matches(); 132 | 133 | info!("Reading args..."); 134 | 135 | let input: Vec = { 136 | let named_arg = matches.value_of("input"); 137 | let position_args = match matches.values_of("inputs") { 138 | None => vec![], 139 | Some(values) => values.into_iter().collect(), 140 | }; 141 | position_args 142 | .into_iter() 143 | .chain(named_arg.into_iter()) 144 | .map(|s| s.to_string()) 145 | .collect() 146 | }; 147 | if input.is_empty() { 148 | panic!("Missing input files") 149 | } 150 | 151 | let file_type = match matches.value_of("file-type") { 152 | Some(type_name) => match type_name { 153 | "tsv" => configuration::FileType::Tsv, 154 | "json" => configuration::FileType::Json, 155 | _ => panic!("Invalid file type {}", type_name), 156 | }, 157 | None => configuration::FileType::Tsv, 158 | }; 159 | let output_dir = matches.value_of("output-dir").map(|s| s.to_string()); 160 | // try to create output directory for files with embeddings 161 | if let Some(output_dir) = output_dir.as_ref() { 162 | fs::create_dir_all(output_dir).expect("Can't create output directory"); 163 | } 164 | let dimension: u16 = matches.value_of("dimension").unwrap().parse().unwrap(); 165 | let max_iter: u8 = matches 166 | .value_of("number-of-iterations") 167 | .unwrap() 168 | .parse() 169 | .unwrap(); 170 | let seed: Option = matches.value_of("seed").map(|s| s.parse().unwrap()); 171 | let relation_name = matches.value_of("relation-name").unwrap(); 172 | let prepend_field_name = { 173 | let value: u8 = matches 174 | .value_of("prepend-field-name") 175 | .unwrap() 176 | .parse() 177 | .unwrap(); 178 | value == 1 179 | }; 180 | let log_every: u32 = matches.value_of("log-every-n").unwrap().parse().unwrap(); 181 | let in_memory_embedding_calculation = { 182 | let value: u8 = matches 183 | .value_of("in-memory-embedding-calculation") 184 | .unwrap() 185 | .parse() 186 | .unwrap(); 187 | value == 1 188 | }; 189 | let columns = { 190 | let cols_str = matches.value_of("columns").unwrap(); 191 | let cols_str_separated: Vec<&str> = cols_str.split(' ').collect(); 192 | match configuration::extract_fields(cols_str_separated) { 193 | Ok(cols) => match configuration::validate_fields(cols) { 194 | Ok(validated_cols) => validated_cols, 195 | Err(msg) => panic!("Invalid column fields. Message: {}", msg), 196 | }, 197 | Err(msg) => panic!("Parsing problem. 
Message: {}", msg), 198 | } 199 | }; 200 | 201 | let output_format = match matches.value_of("output-format").unwrap() { 202 | "textfile" => OutputFormat::TextFile, 203 | "numpy" => OutputFormat::Numpy, 204 | _ => panic!("unsupported output format"), 205 | }; 206 | 207 | let config = Configuration { 208 | produce_entity_occurrence_count: true, 209 | embeddings_dimension: dimension, 210 | max_number_of_iteration: max_iter, 211 | seed, 212 | prepend_field: prepend_field_name, 213 | log_every_n: log_every, 214 | in_memory_embedding_calculation, 215 | input, 216 | file_type, 217 | output_dir, 218 | output_format, 219 | relation_name: relation_name.to_string(), 220 | columns, 221 | }; 222 | dbg!(&config); 223 | 224 | info!("Starting calculation..."); 225 | let in_memory_entity_mapping_persistor = InMemoryEntityMappingPersistor::default(); 226 | let in_memory_entity_mapping_persistor = Arc::new(in_memory_entity_mapping_persistor); 227 | 228 | let sparse_matrices = build_graphs(&config, in_memory_entity_mapping_persistor.clone()); 229 | info!( 230 | "Finished Sparse Matrices calculation in {} sec", 231 | now.elapsed().as_secs() 232 | ); 233 | 234 | train(config, in_memory_entity_mapping_persistor, sparse_matrices); 235 | info!("Finished in {} sec", now.elapsed().as_secs()); 236 | } 237 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | use std::cmp::min; 2 | use std::collections::hash_map::DefaultHasher; 3 | use std::collections::HashMap; 4 | use std::hash::Hasher; 5 | 6 | use bincode::{deserialize, serialize}; 7 | use ndarray::{Array1, Array2, ArrayViewMut2, Axis, Ix1, Ix2}; 8 | use numpy::{PyArray, PyArray2, ToPyArray}; 9 | use pyo3::exceptions::PyValueError; 10 | use pyo3::prelude::*; 11 | use pyo3::types::{PyBytes, PyIterator, PyString, PyTuple}; 12 | use rayon::iter::IndexedParallelIterator; 13 | use rayon::iter::ParallelIterator; 14 | use rayon::iter::{IntoParallelIterator, IntoParallelRefIterator}; 15 | 16 | use crate::configuration::Configuration; 17 | use crate::embedding::{MarkovType, NdArrayMatrix}; 18 | use crate::entity::hash_entity; 19 | use crate::pipeline::{build_graph_from_files, build_graph_from_iterator}; 20 | use crate::sparse_matrix::{create_sparse_matrix_descriptor, SparseMatrix, SparseMatrixDescriptor}; 21 | 22 | pub mod configuration; 23 | pub mod embedding; 24 | pub mod entity; 25 | pub mod pipeline; 26 | pub mod sparse_matrix; 27 | pub mod sparse_matrix_builder; 28 | 29 | // Methods not exposed to python 30 | impl SparseMatrix { 31 | fn markov_propagate<'py>( 32 | &self, 33 | x: &'py PyArray2, 34 | markov_type: MarkovType, 35 | num_workers: Option, 36 | ) -> &'py PyArray { 37 | let array = unsafe { x.as_array() }; 38 | let multiplication_workers: usize = num_workers.unwrap_or_else(num_cpus::get); 39 | let propagated = NdArrayMatrix::multiply(self, array, markov_type, multiplication_workers); 40 | propagated.to_pyarray(x.py()) 41 | } 42 | 43 | pub fn from_rust_iterator<'a>( 44 | columns: &str, 45 | hyperedge_trim_n: usize, 46 | hyperedges: impl Iterator, 47 | num_workers: Option, 48 | ) -> Result { 49 | let columns = configuration::parse_fields(columns).expect("Columns should be valid"); 50 | let matrix_desc = create_sparse_matrix_descriptor(&columns)?; 51 | let config = Configuration { 52 | seed: None, 53 | columns, 54 | matrix_desc, 55 | hyperedge_trim_n, 56 | num_workers_graph_building: num_workers.unwrap_or_else(|| min(num_cpus::get(), 8)), 
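            // When `num_workers` is not given, default to the CPU count capped at 8 workers.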
57 | }; 58 | 59 | Ok(build_graph_from_iterator(&config, hyperedges)) 60 | } 61 | 62 | fn initialize_deterministically_rust(&self, mut vectors: ArrayViewMut2, seed: i64) { 63 | vectors 64 | .axis_iter_mut(Axis(0)) 65 | .into_par_iter() 66 | .enumerate() 67 | .for_each(|(entity_ix, mut row)| { 68 | let entity_id_hash = hash_entity(self.entity_ids[entity_ix].as_str()); 69 | row.indexed_iter_mut().for_each(|(col_ix, v)| { 70 | let value = init_value(col_ix, entity_id_hash, seed); 71 | *v = value 72 | }); 73 | }); 74 | } 75 | } 76 | 77 | #[pymethods] 78 | impl SparseMatrix { 79 | #[pyo3(signature = (x, num_workers = None))] 80 | pub fn left_markov_propagate<'py>( 81 | &self, 82 | x: &'py PyArray2, 83 | num_workers: Option, 84 | ) -> &'py PyArray { 85 | self.markov_propagate(x, MarkovType::Left, num_workers) 86 | } 87 | 88 | #[pyo3(signature = (x, num_workers = None))] 89 | fn symmetric_markov_propagate<'py>( 90 | &self, 91 | x: &'py PyArray2, 92 | num_workers: Option, 93 | ) -> &'py PyArray { 94 | self.markov_propagate(x, MarkovType::Symmetric, num_workers) 95 | } 96 | 97 | #[staticmethod] 98 | #[pyo3(signature = (hyperedges, columns, hyperedge_trim_n = 16, num_workers = None))] 99 | fn from_iterator( 100 | hyperedges: &PyIterator, 101 | columns: &str, 102 | hyperedge_trim_n: usize, 103 | num_workers: Option, 104 | ) -> PyResult { 105 | let hyperedges = hyperedges.map(|line| { 106 | let line = line.expect("Should be proper line"); 107 | let line: &PyString = line 108 | .downcast() 109 | .expect("Iterator elements should be strings"); 110 | let line = line.to_str().expect("Should be proper UTF-8 string"); 111 | line 112 | }); 113 | SparseMatrix::from_rust_iterator(columns, hyperedge_trim_n, hyperedges, num_workers) 114 | .map_err(PyValueError::new_err) 115 | } 116 | 117 | #[staticmethod] 118 | #[pyo3(signature = (filepaths, columns, hyperedge_trim_n = 16, num_workers = None))] 119 | fn from_files( 120 | filepaths: Vec, 121 | columns: &str, 122 | hyperedge_trim_n: usize, 123 | num_workers: Option, 124 | ) -> PyResult { 125 | for filepath in filepaths.iter() { 126 | if !filepath.ends_with(".tsv") { 127 | return Err(PyValueError::new_err("Only .tsv files are supported")); 128 | } 129 | } 130 | 131 | let columns = configuration::parse_fields(columns).expect("Columns should be valid"); 132 | let matrix_desc = 133 | create_sparse_matrix_descriptor(&columns).map_err(PyValueError::new_err)?; 134 | 135 | let config = Configuration { 136 | seed: None, 137 | matrix_desc, 138 | columns, 139 | hyperedge_trim_n, 140 | // TODO consider limiting to some maximum no of workers 141 | num_workers_graph_building: num_workers.unwrap_or_else(num_cpus::get), 142 | }; 143 | Ok(build_graph_from_files(&config, filepaths)) 144 | } 145 | 146 | fn get_entity_column_mask<'py>( 147 | &self, 148 | py: Python<'py>, 149 | column_name: String, 150 | ) -> PyResult<&'py PyArray> { 151 | let column_id_by_name = HashMap::from([ 152 | (&self.descriptor.col_a_name, self.descriptor.col_a_id), 153 | (&self.descriptor.col_b_name, self.descriptor.col_b_id), 154 | ]); 155 | let column_id = column_id_by_name 156 | .get(&column_name) 157 | .ok_or(PyValueError::new_err("Column name invalid"))?; 158 | 159 | let mask: Vec = self 160 | .column_ids 161 | .par_iter() 162 | .map(|id| *id == *column_id) 163 | .collect(); 164 | let mask = Array1::from_vec(mask); 165 | Ok(mask.to_pyarray(py)) 166 | } 167 | 168 | #[getter] 169 | fn entity_degrees<'py>(&self, py: Python<'py>) -> &'py PyArray { 170 | let entity_degrees: Vec = 
self.entities.par_iter().map(|e| e.row_sum).collect(); 171 | Array1::from_vec(entity_degrees).to_pyarray(py) 172 | } 173 | 174 | #[pyo3(signature = (feature_dim, seed = 0))] 175 | fn initialize_deterministically<'py>( 176 | &self, 177 | py: Python<'py>, 178 | feature_dim: usize, 179 | seed: i64, 180 | ) -> &'py PyArray { 181 | let mut vectors = Array2::zeros([self.entity_ids.len(), feature_dim]); 182 | self.initialize_deterministically_rust(vectors.view_mut(), seed); 183 | vectors.to_pyarray(py) 184 | } 185 | 186 | // Stuff needed for pickle to work (new, getstate, setstate) 187 | #[new] 188 | #[pyo3(signature = (*args))] 189 | fn new(args: &PyTuple) -> Self { 190 | match args.len() { 191 | 0 => SparseMatrix { 192 | descriptor: SparseMatrixDescriptor { 193 | col_a_id: 0, 194 | col_a_name: "".to_string(), 195 | col_b_id: 0, 196 | col_b_name: "".to_string(), 197 | }, 198 | entity_ids: vec![], 199 | entities: vec![], 200 | edges: vec![], 201 | slices: vec![], 202 | column_ids: vec![], 203 | }, 204 | _ => panic!("SparseMatrix::new never meant to be called by user. Only 0-arg implementation provided to make pickle happy"), 205 | } 206 | } 207 | 208 | pub fn __getstate__(&self, py: Python) -> PyResult { 209 | Ok(PyBytes::new(py, &serialize(self).unwrap()).to_object(py)) 210 | } 211 | 212 | pub fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> { 213 | match state.extract::<&PyBytes>(py) { 214 | Ok(s) => { 215 | let sm: SparseMatrix = deserialize(s.as_bytes()).unwrap(); 216 | *self = sm; 217 | Ok(()) 218 | } 219 | Err(e) => Err(e), 220 | } 221 | } 222 | } 223 | 224 | fn init_value(col: usize, hsh: u64, fixed_random_value: i64) -> f32 { 225 | let hash = |num: i64| { 226 | let mut hasher = DefaultHasher::new(); 227 | hasher.write_i64(num); 228 | hasher.finish() as i64 229 | }; 230 | 231 | const MAX_HASH_I64: i64 = 8 * 1024 * 1024; 232 | const MAX_HASH_F32: f32 = MAX_HASH_I64 as f32; 233 | ((hash((hsh as i64) + (col as i64) + fixed_random_value) % MAX_HASH_I64) as f32) / MAX_HASH_F32 234 | } 235 | 236 | #[pymodule] 237 | #[pyo3(name = "pycleora")] 238 | fn pycleora(_py: Python, m: &PyModule) -> PyResult<()> { 239 | m.add_class::()?; 240 | Ok(()) 241 | } 242 | -------------------------------------------------------------------------------- /legacy/src/persistence.rs: -------------------------------------------------------------------------------- 1 | pub mod entity { 2 | use rustc_hash::FxHashMap; 3 | use std::sync::RwLock; 4 | 5 | pub trait EntityMappingPersistor { 6 | fn get_entity(&self, hash: u64) -> Option; 7 | fn put_data(&self, hash: u64, entity: String); 8 | fn contains(&self, hash: u64) -> bool; 9 | } 10 | 11 | #[derive(Debug, Default)] 12 | pub struct InMemoryEntityMappingPersistor { 13 | entity_mappings: RwLock>, 14 | } 15 | 16 | impl EntityMappingPersistor for InMemoryEntityMappingPersistor { 17 | fn get_entity(&self, hash: u64) -> Option { 18 | let entity_mappings_read = self.entity_mappings.read().unwrap(); 19 | entity_mappings_read.get(&hash).map(|s| s.to_string()) 20 | } 21 | 22 | fn put_data(&self, hash: u64, entity: String) { 23 | let mut entity_mappings_write = self.entity_mappings.write().unwrap(); 24 | entity_mappings_write.insert(hash, entity); 25 | } 26 | 27 | fn contains(&self, hash: u64) -> bool { 28 | let entity_mappings_read = self.entity_mappings.read().unwrap(); 29 | entity_mappings_read.contains_key(&hash) 30 | } 31 | } 32 | } 33 | 34 | pub mod embedding { 35 | use crate::persistence::embedding::memmap::OwnedMmapArrayViewMut; 36 | use ndarray::{s, 
Array}; 37 | use ndarray_npy::write_zeroed_npy; 38 | use std::fs::File; 39 | use std::io; 40 | use std::io::{BufWriter, Error, ErrorKind, Write}; 41 | 42 | pub trait EmbeddingPersistor { 43 | fn put_metadata(&mut self, entity_count: u32, dimension: u16) -> Result<(), io::Error>; 44 | fn put_data( 45 | &mut self, 46 | entity: &str, 47 | occur_count: u32, 48 | vector: Vec, 49 | ) -> Result<(), io::Error>; 50 | fn finish(&mut self) -> Result<(), io::Error>; 51 | } 52 | 53 | pub struct TextFileVectorPersistor { 54 | buf_writer: BufWriter, 55 | produce_entity_occurrence_count: bool, 56 | } 57 | 58 | impl TextFileVectorPersistor { 59 | pub fn new(filename: String, produce_entity_occurrence_count: bool) -> Self { 60 | let msg = format!("Unable to create file: {}", filename); 61 | let file = File::create(filename).expect(&msg); 62 | TextFileVectorPersistor { 63 | buf_writer: BufWriter::new(file), 64 | produce_entity_occurrence_count, 65 | } 66 | } 67 | } 68 | 69 | impl EmbeddingPersistor for TextFileVectorPersistor { 70 | fn put_metadata(&mut self, entity_count: u32, dimension: u16) -> Result<(), io::Error> { 71 | write!(&mut self.buf_writer, "{} {}", entity_count, dimension)?; 72 | Ok(()) 73 | } 74 | 75 | fn put_data( 76 | &mut self, 77 | entity: &str, 78 | occur_count: u32, 79 | vector: Vec, 80 | ) -> Result<(), io::Error> { 81 | self.buf_writer.write_all(b"\n")?; 82 | self.buf_writer.write_all(entity.as_bytes())?; 83 | 84 | if self.produce_entity_occurrence_count { 85 | write!(&mut self.buf_writer, " {}", occur_count)?; 86 | } 87 | 88 | for &v in &vector { 89 | self.buf_writer.write_all(b" ")?; 90 | let mut buf = ryu::Buffer::new(); // cheap op 91 | self.buf_writer.write_all(buf.format_finite(v).as_bytes())?; 92 | } 93 | 94 | Ok(()) 95 | } 96 | 97 | fn finish(&mut self) -> Result<(), io::Error> { 98 | self.buf_writer.write_all(b"\n")?; 99 | Ok(()) 100 | } 101 | } 102 | 103 | mod memmap { 104 | use memmap::MmapMut; 105 | use ndarray::ArrayViewMut2; 106 | use std::fs::OpenOptions; 107 | use std::io; 108 | use std::io::{Error, ErrorKind}; 109 | use std::ptr::drop_in_place; 110 | 111 | pub struct OwnedMmapArrayViewMut { 112 | mmap_ptr: *mut MmapMut, 113 | mmap_data: Option>, 114 | } 115 | 116 | impl OwnedMmapArrayViewMut { 117 | pub fn new(filename: &str) -> Result { 118 | use ndarray_npy::ViewMutNpyExt; 119 | 120 | let file = OpenOptions::new().read(true).write(true).open(filename)?; 121 | let mmap = unsafe { MmapMut::map_mut(&file)? }; 122 | let mmap = Box::new(mmap); 123 | let mmap = Box::leak(mmap); 124 | let mmap_ptr: *mut MmapMut = mmap as *mut _; 125 | 126 | let mmap_data = ArrayViewMut2::<'static, f32>::view_mut_npy(mmap) 127 | .map_err(|_| Error::new(ErrorKind::Other, "Mmap view error"))?; 128 | 129 | Ok(Self { 130 | mmap_ptr, 131 | mmap_data: Some(mmap_data), 132 | }) 133 | } 134 | 135 | pub fn data_view<'a>(&'a mut self) -> &'a mut ArrayViewMut2<'a, f32> { 136 | let view = self 137 | .mmap_data 138 | .as_mut() 139 | .expect("Should be always defined. None only used in Drop"); 140 | 141 | // SAFETY: shortening lifetime from 'static to 'a is safe because underlying buffer won't be dropped until view is borrowed 142 | unsafe { 143 | core::mem::transmute::< 144 | &mut ArrayViewMut2<'static, f32>, 145 | &mut ArrayViewMut2<'a, f32>, 146 | >(view) 147 | } 148 | } 149 | } 150 | 151 | impl Drop for OwnedMmapArrayViewMut { 152 | fn drop(&mut self) { 153 | // Unwind references with reverse order. 
154 | // First remove view that points to mmap_ptr 155 | self.mmap_data = None; 156 | // And now drop mmap_ptr 157 | // SAFETY: safe because pointer leaked in constructor. 158 | unsafe { drop_in_place(self.mmap_ptr) } 159 | } 160 | } 161 | } 162 | 163 | pub struct NpyPersistor { 164 | entities: Vec, 165 | occurences: Vec, 166 | array_file_name: String, 167 | array_file: File, 168 | array_write_context: Option, 169 | occurences_buf: Option>, 170 | entities_buf: BufWriter, 171 | } 172 | 173 | impl NpyPersistor { 174 | pub fn new(filename: String, produce_entity_occurrence_count: bool) -> Self { 175 | let entities_filename = format!("{}.entities", &filename); 176 | let entities_buf = BufWriter::new( 177 | File::create(&entities_filename) 178 | .unwrap_or_else(|_| panic!("Unable to create file: {}", &entities_filename)), 179 | ); 180 | 181 | let occurences_filename = format!("{}.occurences", &filename); 182 | let occurences_buf = if produce_entity_occurrence_count { 183 | Some(BufWriter::new( 184 | File::create(&occurences_filename).unwrap_or_else(|_| { 185 | panic!("Unable to create file: {}", &occurences_filename) 186 | }), 187 | )) 188 | } else { 189 | None 190 | }; 191 | 192 | let array_file_name = format!("{}.npy", &filename); 193 | let array_file = File::create(&array_file_name) 194 | .unwrap_or_else(|_| panic!("Unable to create file: {}", &array_file_name)); 195 | 196 | Self { 197 | entities: vec![], 198 | occurences: vec![], 199 | array_file_name, 200 | array_file, 201 | array_write_context: None, 202 | occurences_buf, 203 | entities_buf, 204 | } 205 | } 206 | } 207 | 208 | impl EmbeddingPersistor for NpyPersistor { 209 | fn put_metadata(&mut self, entity_count: u32, dimension: u16) -> Result<(), io::Error> { 210 | write_zeroed_npy::( 211 | &self.array_file, 212 | [entity_count as usize, dimension as usize], 213 | ) 214 | .map_err(|_| Error::new(ErrorKind::Other, "Write zeroed npy error"))?; 215 | self.array_write_context = Some(OwnedMmapArrayViewMut::new(&self.array_file_name)?); 216 | Ok(()) 217 | } 218 | 219 | fn put_data( 220 | &mut self, 221 | entity: &str, 222 | occur_count: u32, 223 | vector: Vec, 224 | ) -> Result<(), io::Error> { 225 | let array = &mut self 226 | .array_write_context 227 | .as_mut() 228 | .expect("Should be defined. Was put_metadata not called?") 229 | .data_view(); 230 | 231 | array 232 | .slice_mut(s![self.entities.len(), ..]) 233 | .assign(&Array::from(vector)); 234 | self.entities.push(entity.to_owned()); 235 | self.occurences.push(occur_count); 236 | Ok(()) 237 | } 238 | 239 | fn finish(&mut self) -> Result<(), io::Error> { 240 | use ndarray_npy::WriteNpyExt; 241 | 242 | serde_json::to_writer_pretty(&mut self.entities_buf, &self.entities)?; 243 | 244 | if let Some(occurences_buf) = self.occurences_buf.as_mut() { 245 | let occur = ndarray::ArrayView1::from(&self.occurences); 246 | occur.write_npy(occurences_buf).map_err(|e| { 247 | Error::new( 248 | ErrorKind::Other, 249 | format!("Could not save occurences: {}", e), 250 | ) 251 | })?; 252 | } 253 | 254 | Ok(()) 255 | } 256 | } 257 | } 258 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | ![Cleora logo](files/images/cleora.png) 4 | 5 |

6 | 7 | ## Achievements 8 | 9 | :one:st place at [SIGIR eCom Challenge 2020](https://sigir-ecom.github.io/ecom20DCPapers/SIGIR_eCom20_DC_paper_1.pdf) 10 | 11 | :two:nd place and Best Paper Award at [WSDM Booking.com Challenge 2021](http://ceur-ws.org/Vol-2855/challenge_short_3.pdf) 12 | 13 | :two:nd place at [Twitter Recsys Challenge 2021](https://recsys-twitter.com/competition_leaderboard/latest) 14 | 15 | :three:rd place at [KDD Cup 2021](https://ogb.stanford.edu/paper/kddcup2021/mag240m_SyneriseAI.pdf) 16 | 17 | 18 | # Cleora 19 | 20 | _**Cleora** is a genus of moths in the family **Geometridae**. Their scientific name derives from the Ancient Greek geo γῆ or γαῖα "the earth", and metron μέτρον "measure" in reference to the way their larvae, or "inchworms", appear to "**measure the earth**" as they move along in a looping fashion._ 21 | 22 | Cleora is a general-purpose model for efficient, scalable learning of stable and inductive entity embeddings for heterogeneous relational data. 23 | 24 | # Introducing Cleora 2.0.0 - Python native 25 | 26 | **Installation** 27 | ``` 28 | pip install pycleora 29 | ``` 30 | 31 | **Build instructions** 32 | ``` 33 | # prepare python env 34 | pip install maturin 35 | 36 | # Install pycleora in current env (meant for development) 37 | maturin develop 38 | 39 | # Usage example below. More examples in examples/ folder. 40 | ``` 41 | ## Changelog 42 | 43 | **Cleora** is now available as a Python package `pycleora`. Key improvements compared to the previous version: 44 | * _performance optimizations_: ~10x faster embedding times 45 | * _performance optimizations_: significantly reduced memory usage 46 | * _latest research_: improved embedding quality 47 | * _new feature_: can create graphs from a Python `iterators` in addition to `tsv` files 48 | * _new feature_: seamless integration with `NumPy` 49 | * _new feature_: item attributes support via custom embeddings initialization 50 | * _new feature_: adjustable vector projection / normalization after each propagation step 51 | 52 | **Breaking changes:** 53 | * _transient_ modifier not supported any more - creating `complex::reflexive` columns for hypergraph embeddings, _grouped by_ the transient entity gives better results. 54 | 55 | 56 | # Usage example: 57 | 58 | ``` 59 | from pycleora import SparseMatrix 60 | import numpy as np 61 | import pandas as pd 62 | import random 63 | 64 | # Generate example data 65 | customers = [f"Customer_{i}" for i in range(1, 20)] 66 | products = [f"Product_{j}" for j in range(1, 20)] 67 | 68 | data = { 69 | "customer": random.choices(customers, k=100), 70 | "product": random.choices(products, k=100), 71 | } 72 | 73 | # Create DataFrame 74 | df = pd.DataFrame(data) 75 | 76 | # Create hyperedges 77 | customer_products = df.groupby('customer')['product'].apply(list).values 78 | 79 | # Convert to Cleora input format 80 | cleora_input = map(lambda x: ' '.join(x), customer_products) 81 | 82 | # Create Markov transition matrix for the hypergraph 83 | mat = SparseMatrix.from_iterator(cleora_input, columns='complex::reflexive::product') 84 | 85 | # Look at entity ids in the matrix, corresponding to embedding vectors 86 | print(mat.entity_ids) 87 | # ['Product_5', 'Product_3', 'Product_2', 'Product_4', 'Product_1'] 88 | 89 | # Initialize embedding vectors externally, using text, image, random vectors 90 | # embeddings = ... 
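# A minimal sketch of external initialization (commented out): `load_features_for` and the
# projection below are hypothetical stand-ins, not part of pycleora. Feature rows must be
# aligned with mat.entity_ids, and the result should be a float32 array.
# features = load_features_for(mat.entity_ids)              # e.g. text/image/numeric features, shape (n_entities, 256)
# projection = np.random.randn(256, 1024) / np.sqrt(256)    # random projection up to the target dimension
# embeddings = (features @ projection).astype(np.float32)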
 91 | 92 | # Or use built-in random deterministic initialization 93 | embeddings = mat.initialize_deterministically(1024) 94 | 95 | # Perform Markov random walk, then normalize however many times we want 96 | 97 | NUM_WALKS = 3 # The optimal number depends on the graph, typically between 3 and 7 yields good results 98 | # lower values tend to capture co-occurrence, higher iterations capture substitutability in a context 99 | 100 | for i in range(NUM_WALKS): 101 | # Can propagate with a symmetric matrix as well, but left Markov is a great default 102 | embeddings = mat.left_markov_propagate(embeddings) 103 | # Normalize with L2 norm by default, for the embeddings to reside on a hypersphere. Can use standardization instead. 104 | embeddings /= np.linalg.norm(embeddings, ord=2, axis=-1, keepdims=True) 105 | 106 | # We're done, here are our embeddings 107 | 108 | for entity, embedding in zip(mat.entity_ids, embeddings): 109 | print(entity, embedding) 110 | 111 | # We can now compare our embeddings with dot product (since they are L2 normalized) 112 | 113 | print(np.dot(embeddings[0], embeddings[1])) 114 | print(np.dot(embeddings[0], embeddings[2])) 115 | print(np.dot(embeddings[0], embeddings[3])) 116 | ``` 117 | # FAQ 118 | 119 | **Q: What should I embed?** 120 | 121 | A: Any entities that interact with each other, co-occur or can be said to be present together in a given context. Examples can include: products in a shopping basket, locations frequented by the same people at similar times, employees collaborating together, chemical molecules being present in specific circumstances, proteins produced by the same bacteria, drug interactions, co-authors of the same academic papers, companies occurring together in the same LinkedIn profiles. 122 | 123 | **Q: How should I construct the input?** 124 | 125 | A: What works best is grouping entities co-occurring in a similar context and feeding them in as whitespace-separated lines, using the `complex::reflexive` modifier. E.g. if you have product data, you can group the products by shopping baskets or by users. If you have URLs, you can group them by browser sessions, or by (user, time window) pairs. Check out the usage example above. Grouping products by customers is just one possibility. 126 | 127 | **Q: Can I embed users and products simultaneously, to compare them with cosine similarity?** 128 | 129 | A: No, this is a methodologically wrong approach, stemming from outdated matrix factorization approaches. What you should do is come up with good product embeddings first, then create user embeddings from them. Feeding two columns, e.g. `user product`, into Cleora will result in a bipartite graph. Similar products will be close to each other, similar users will be close to each other, but users and products will not necessarily be similar to each other. 130 | 131 | **Q: What embedding dimensionality should I use?** 132 | 133 | A: The more, the better, but we typically work with _1024_ to _4096_ dimensions. Memory is cheap and machines are powerful, so don't skimp on embedding size. 134 | 135 | **Q: How many iterations of Markov propagation should I use?** 136 | 137 | A: It depends on what you want to achieve. Low iteration counts (3) tend to approximate the co-occurrence matrix, while high iteration counts (7+) tend to give contextual similarity (think skip-gram, but much more accurate and faster). 138 | 139 | **Q: How do I incorporate external information, e.g. 
entity metadata, images, texts into the embeddings?** 140 | 141 | A: Just initialize the embedding matrix with your own vectors coming from a ViT, sentence-transformers, or a random projection of your numeric features. In that scenario low numbers of Markov iterations (1 to 3) tend to work best. 142 | 143 | **Q: My embeddings don't fit in memory, what do I do?** 144 | 145 | A: Cleora operates on dimensions independently. Initialize your embeddings with a smaller number of dimensions, run Cleora, persist to disk, then repeat. You can concatenate the resulting embedding vectors afterwards, but remember to normalize them again after concatenation! 146 | 147 | **Q: Is there a minimum number of entity occurrences?** 148 | 149 | A: No, an entity `A` co-occurring just once with some other entity `B` will get a proper embedding, i.e. `B` will be the most similar to `A`. The other way around, `A` will be highly ranked among the nearest neighbors of `B`, which may or may not be desirable, depending on your use case. Feel free to prune your input to Cleora to eliminate low-frequency items. 150 | 151 | **Q: Are there any edge cases where Cleora can fail?** 152 | 153 | A: Cleora works best for relatively sparse hypergraphs. If all your hyperedges contain some very common entity `X`, e.g. a _shopping bag_, then it will degrade the quality of embeddings by degenerating shortest paths in the random walk. It is good practice to remove such entities from the hypergraph. 154 | 155 | **Q: How can Cleora be so fast and accurate at the same time?** 156 | 157 | A: Not using negative sampling is a great boon. By constructing the (sparse) Markov transition matrix, Cleora explicitly performs all possible random walks in a hypergraph in one big step (a single matrix multiplication). That's what we call a single _iteration_. We perform 3+ such iterations. Thanks to a highly efficient implementation in Rust, with special care for concurrency, memory layout and cache coherence, it is blazingly fast. Negative sampling or randomly selected random walks tend to introduce a lot of noise - Cleora is free of those burdens. 158 | 159 | # Science 160 | 161 | **Read the whitepaper ["Cleora: A Simple, Strong and Scalable Graph Embedding Scheme"](https://arxiv.org/abs/2102.02302)** 162 | 163 | Cleora embeds entities in *n-dimensional spherical spaces* utilizing extremely fast, stable, iterative random projections, which allows for unparalleled performance and scalability. 164 | 165 | Types of data which can be embedded include, for example: 166 | - heterogeneous undirected graphs 167 | - heterogeneous undirected hypergraphs 168 | - text and other categorical array data 169 | - any combination of the above 170 | 171 | **!!! 
Disclaimer: the numbers below are for Cleora 1.x; the new version is significantly faster, but we have yet to re-run the benchmarks** 172 | 173 | Key competitive advantages of Cleora: 174 | * more than **197x faster than DeepWalk** 175 | * **~4x-8x faster than [PyTorch-BigGraph](https://ai.facebook.com/blog/open-sourcing-pytorch-biggraph-for-faster-embeddings-of-extremely-large-graphs/)** (depends on use case) 176 | * star expansion, clique expansion, and no expansion support for hypergraphs 177 | * **quality of results outperforming or competitive** with other embedding frameworks like [PyTorch-BigGraph](https://ai.facebook.com/blog/open-sourcing-pytorch-biggraph-for-faster-embeddings-of-extremely-large-graphs/), GOSH, DeepWalk, LINE 178 | * can embed extremely large graphs & hypergraphs on a single machine 179 | 180 | Embedding times - example: 181 | 
| Algorithm | FB dataset | RoadNet dataset | LiveJournal dataset |
|---|---|---|---|
| Cleora | 00:00:43 h | 00:21:59 h | 01:31:42 h |
| PyTorch-BigGraph | 00:04.33 h | 00:31:11 h | 07:10:00 h |

Link Prediction results - example:

| Algorithm | FB dataset MRR | FB dataset HitRate@10 | RoadNet dataset MRR | RoadNet dataset HitRate@10 | LiveJournal dataset MRR | LiveJournal dataset HitRate@10 |
|---|---|---|---|---|---|---|
| Cleora | 0.072 | 0.172 | 0.929 | 0.942 | 0.586 | 0.627 |
| PyTorch-BigGraph | 0.035 | 0.072 | 0.850 | 0.866 | 0.565 | 0.672 |
253 | 254 | ## Cleora design principles 255 | Cleora is built as a multi-purpose "just embed it" tool, suitable for many different data types and formats. 256 | 257 | Cleora ingests a relational table of rows representing a typed and undirected heterogeneous hypergraph, which can contain multiple: 258 | - typed categorical columns 259 | - typed categorical array columns 260 | 261 | For example a relational table representing shopping baskets may have the following columns: 262 | 263 | user <\t> product <\t> store 264 | 265 | With the input file containing values: 266 | 267 | user_id <\t> product_id product_id product_id <\t> store_id 268 | 269 | Every column has a type, which is used to determine whether spaces of identifiers between different columns are shared or distinct. It is possible for two columns to share a type, which is the case for homogeneous graphs: 270 | 271 | user <\t> user 272 | 273 | Based on the column format specification, Cleora performs: 274 | - Star decomposition of hyper-edges 275 | - Creation of pairwise graphs for all pairs of entity types 276 | - Embedding of each graph 277 | 278 | The final output of Cleora consists of multiple files for each (undirected) pair of entity types in the table. 279 | 280 | Those embeddings can then be utilized in a novel way thanks to their dim-wise independence property, which is described further below. 281 | 282 | ## Key technical features of Cleora embeddings 283 | The embeddings produced by Cleora are different from those produced by Node2vec, Word2vec, DeepWalk or other systems in this class by a number of key properties: 284 | - **efficiency** - Cleora is two orders of magnitude faster than Node2Vec or DeepWalk 285 | - **inductivity** - as Cleora embeddings of an entity are defined only by interactions with other entities, vectors for new entities can be computed on-the-fly 286 | - **updatability** - refreshing a Cleora embedding for an entity is a very fast operation allowing for real-time updates without retraining 287 | - **stability** - all starting vectors for entities are deterministic, which means that Cleora embeddings on similar datasets will end up being similar. Methods like Word2vec, Node2vec or DeepWalk return different results with every run. 288 | - **cross-dataset compositionality** - thanks to stability of Cleora embeddings, embeddings of the same entity on multiple datasets can be combined by averaging, yielding meaningful vectors 289 | - **dim-wise independence** - thanks to the process producing Cleora embeddings, every dimension is independent of others. This property allows for efficient and low-parameter method for combining multi-view embeddings with Conv1d layers. 290 | - **extreme parallelism and performance** - Cleora is written in Rust utilizing thread-level parallelism for all calculations except input file loading. In practice this means that the embedding process is often faster than loading the input data. 
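The dim-wise independence property also has a practical consequence mentioned in the FAQ above: embeddings can be computed one slice of dimensions at a time and concatenated afterwards. Below is a minimal sketch of that workflow, not an official recipe; the toy hyperedges, chunk sizes and walk count are arbitrary assumptions, and a different seed is used per chunk so that the slices are not identical.

```
from pycleora import SparseMatrix
import numpy as np

hyperedges = ["p1 p2 p3", "p2 p3 p4", "p1 p4"]  # toy whitespace-separated hyperedges (assumption)
mat = SparseMatrix.from_iterator(iter(hyperedges), columns='complex::reflexive::product')

NUM_WALKS = 3
TOTAL_DIM = 2048
CHUNK_DIM = 512  # each slice of dimensions is embedded independently

chunks = []
for chunk_ix in range(TOTAL_DIM // CHUNK_DIM):
    # Deterministic initialization; vary the seed so the chunks differ
    emb = mat.initialize_deterministically(CHUNK_DIM, seed=chunk_ix)
    for _ in range(NUM_WALKS):
        emb = mat.left_markov_propagate(emb)
        emb /= np.linalg.norm(emb, ord=2, axis=-1, keepdims=True)
    chunks.append(emb)  # in a memory-constrained setting, persist each chunk to disk instead

# Concatenate the independently computed slices and normalize again
embeddings = np.concatenate(chunks, axis=-1)
embeddings /= np.linalg.norm(embeddings, ord=2, axis=-1, keepdims=True)
```
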
291 | 292 | ## Key usability features of Cleora embeddings 293 | 294 | The technical properties described above imply good production-readiness of Cleora, which from the end-user perspective can be summarized as follows: 295 | - heterogeneous relational tables can be embedded without any artificial data pre-processing 296 | - mixed interaction + text datasets can be embedded with ease 297 | - cold start problem for new entities is non-existent 298 | - real-time updates of the embeddings do not require any separate solutions 299 | - multi-view embeddings work out of the box 300 | - temporal, incremental embeddings are stable out of the box, with no need for re-alignment, rotations or other methods 301 | - extremely large datasets are supported and can be embedded within seconds / minutes 302 | 303 | ## Documentation 304 | 305 | **!!! Disclaimer the documentation below is for Cleora 1.x, to be updated for 2.x** 306 | 307 | More information can be found in [the full documentation](https://cleora.readthedocs.io/). 308 | 309 | For details contact us at cleora@synerise.com 310 | 311 | ## Cite 312 | 313 | Please cite [our paper](https://arxiv.org/abs/2102.02302) (and the respective papers of the methods used) if you use this code in your own work: 314 | 315 | ``` 316 | @article{DBLP:journals/corr/abs-2102-02302, 317 | author = {Barbara Rychalska, Piotr Babel, Konrad Goluchowski, Andrzej Michalowski, Jacek Dabrowski}, 318 | title = {Cleora: {A} Simple, Strong and Scalable Graph Embedding Scheme}, 319 | journal = {CoRR}, 320 | year = {2021} 321 | } 322 | ``` 323 | 324 | ## License 325 | 326 | Synerise Cleora is MIT licensed, as found in the [LICENSE](LICENSE) file. 327 | 328 | 329 | ## How to Contribute 330 | 331 | Pull requests are welcome. 332 | -------------------------------------------------------------------------------- /src/sparse_matrix_builder.rs: -------------------------------------------------------------------------------- 1 | use std::cell::RefCell; 2 | use std::cmp::Reverse; 3 | use std::collections::HashMap; 4 | use std::hash::BuildHasherDefault; 5 | use std::ptr; 6 | use std::sync::atomic::{AtomicUsize, Ordering}; 7 | 8 | use dashmap::DashMap; 9 | use itertools::Itertools; 10 | use rayon::iter::IntoParallelIterator; 11 | use rayon::iter::IntoParallelRefIterator; 12 | use rayon::iter::ParallelDrainFull; 13 | use rayon::iter::ParallelIterator; 14 | use rayon::prelude::ParallelSliceMut; 15 | use rayon::ThreadPoolBuilder; 16 | use rustc_hash::FxHasher; 17 | use smallvec::SmallVec; 18 | 19 | use crate::entity::{Hyperedge, SMALL_VECTOR_SIZE}; 20 | use crate::sparse_matrix::{Edge, Entity, SparseMatrix, SparseMatrixDescriptor}; 21 | 22 | #[derive(Debug, Default)] 23 | struct Row { 24 | occurrence: u32, 25 | row_sum: f32, 26 | } 27 | 28 | /// Data locality plays huge role in propagation phase performance 29 | /// We want connected nodes to have similar indices, as they will get updated together. 30 | /// NodeIndexer assigns successive indices to nodes connected via hyper-edges. 31 | /// Such ordering yields significant performance boost in propagation phase. 
32 | #[derive(Debug, Default)] 33 | pub struct NodeIndexer { 34 | pub key_2_index: HashMap>, 35 | pub index_2_key: Vec, 36 | pub index_2_entity_id: Vec, 37 | pub index_2_column_id: Vec, 38 | } 39 | 40 | pub trait NodeIndexerBuilder { 41 | fn process(&self, key: u64, entity_id: &str, column_id: u8); 42 | fn finish(self) -> NodeIndexer; 43 | } 44 | 45 | #[derive(Debug)] 46 | pub struct SyncNodeIndexerBuilder { 47 | node_indexer: RefCell, 48 | } 49 | 50 | impl Default for SyncNodeIndexerBuilder { 51 | fn default() -> Self { 52 | SyncNodeIndexerBuilder { 53 | node_indexer: RefCell::new(NodeIndexer { 54 | key_2_index: Default::default(), 55 | index_2_key: vec![], 56 | index_2_column_id: vec![], 57 | index_2_entity_id: vec![], 58 | }), 59 | } 60 | } 61 | } 62 | 63 | impl NodeIndexerBuilder for SyncNodeIndexerBuilder { 64 | fn process(&self, key: u64, entity_id: &str, column_id: u8) { 65 | let mut node_indexer = self.node_indexer.borrow_mut(); 66 | 67 | if node_indexer.key_2_index.contains_key(&key) { 68 | return; 69 | } 70 | let index = node_indexer.key_2_index.len(); 71 | node_indexer.key_2_index.insert(key, index); 72 | node_indexer.index_2_key.push(key); 73 | node_indexer.index_2_entity_id.push(entity_id.to_string()); 74 | node_indexer.index_2_column_id.push(column_id); 75 | } 76 | 77 | fn finish(self) -> NodeIndexer { 78 | self.node_indexer.into_inner() 79 | } 80 | } 81 | 82 | #[derive(Debug)] 83 | pub struct IndexedEntity { 84 | index: usize, 85 | id: String, 86 | column_id: u8, 87 | } 88 | 89 | #[derive(Debug, Default)] 90 | pub struct AsyncNodeIndexerBuilder { 91 | key_2_entity: DashMap>, 92 | next_index: AtomicUsize, 93 | } 94 | 95 | impl NodeIndexerBuilder for AsyncNodeIndexerBuilder { 96 | fn process(&self, key: u64, entity_id: &str, column_id: u8) { 97 | self.key_2_entity.entry(key).or_insert_with(|| { 98 | let index = self.next_index.fetch_add(1, Ordering::Relaxed); 99 | let id = entity_id.to_string(); 100 | IndexedEntity { 101 | index, 102 | id, 103 | column_id, 104 | } 105 | }); 106 | } 107 | 108 | fn finish(self) -> NodeIndexer { 109 | // Thin wrappers over pointer to make it Send/Sync 110 | // https://stackoverflow.com/a/70848420 111 | 112 | #[derive(Copy, Clone)] 113 | struct PointerU64(*mut u64); 114 | unsafe impl Send for PointerU64 {} 115 | unsafe impl Sync for PointerU64 {} 116 | 117 | #[derive(Copy, Clone)] 118 | struct PointerString(*mut String); 119 | unsafe impl Send for PointerString {} 120 | unsafe impl Sync for PointerString {} 121 | 122 | #[derive(Copy, Clone)] 123 | struct PointerU8(*mut u8); 124 | unsafe impl Send for PointerU8 {} 125 | unsafe impl Sync for PointerU8 {} 126 | 127 | let numel = self.next_index.into_inner(); 128 | let mut index_2_key: Vec = vec![0; numel]; 129 | let mut index_2_entity_id = vec![String::new(); numel]; 130 | let mut index_2_column_id = vec![0; numel]; 131 | 132 | let index_2_key_ptr = PointerU64(index_2_key.as_mut_ptr()); 133 | let index_2_entity_id_ptr = PointerString(index_2_entity_id.as_mut_ptr()); 134 | let index_2_column_id_ptr = PointerU8(index_2_column_id.as_mut_ptr()); 135 | 136 | let key_2_index = self 137 | .key_2_entity 138 | .into_par_iter() 139 | .map(|(key, indexed_entity)| { 140 | let IndexedEntity { 141 | index, 142 | id: entity_id, 143 | column_id, 144 | } = indexed_entity; 145 | unsafe { 146 | ptr::write(index_2_key_ptr.0.add(index), key); 147 | ptr::write(index_2_entity_id_ptr.0.add(index), entity_id); 148 | ptr::write(index_2_column_id_ptr.0.add(index), column_id); 149 | } 150 | (key, index) 151 | }) 152 | 
.collect(); 153 | 154 | NodeIndexer { 155 | key_2_index, 156 | index_2_key, 157 | index_2_entity_id, 158 | index_2_column_id, 159 | } 160 | } 161 | } 162 | 163 | impl SparseMatrixDescriptor { 164 | pub fn new(col_a_id: u8, col_a_name: String, col_b_id: u8, col_b_name: String) -> Self { 165 | Self { 166 | col_a_id, 167 | col_a_name, 168 | col_b_id, 169 | col_b_name, 170 | } 171 | } 172 | 173 | pub fn make_buffer(&self, hyperedge_trim_n: usize) -> SparseMatrixBuffer { 174 | SparseMatrixBuffer { 175 | descriptor: self.clone(), 176 | edge_count: 0, 177 | hash_2_row: Default::default(), 178 | hashes_2_edge: Default::default(), 179 | hyperedge_trim_n, 180 | } 181 | } 182 | } 183 | 184 | #[derive(Debug)] 185 | pub struct SparseMatrixBuffer { 186 | pub descriptor: SparseMatrixDescriptor, 187 | pub edge_count: u32, 188 | hash_2_row: HashMap>, 189 | hashes_2_edge: HashMap<(u64, u64), f32, BuildHasherDefault>, 190 | hyperedge_trim_n: usize, 191 | } 192 | 193 | impl SparseMatrixBuffer { 194 | pub fn handle_hyperedge(&mut self, hyperedge: &Hyperedge) { 195 | let SparseMatrixDescriptor { 196 | col_a_id, col_b_id, .. 197 | } = self.descriptor; 198 | let total_combinations = hyperedge.edges_num(col_a_id, col_b_id) as u32; 199 | 200 | let mut nodes_a = hyperedge.nodes(col_a_id as usize); 201 | let mut nodes_b = hyperedge.nodes(col_b_id as usize); 202 | 203 | for hash in &nodes_a { 204 | self.update_row(*hash, nodes_b.len() as u32); 205 | } 206 | for hash in &nodes_b { 207 | self.update_row(*hash, nodes_a.len() as u32); 208 | } 209 | 210 | let value = 1f32 / (total_combinations as f32); 211 | 212 | let (nodes_a_high, nodes_a_low) = self.get_high_low_nodes(&mut nodes_a); 213 | let (nodes_b_high, nodes_b_low) = self.get_high_low_nodes(&mut nodes_b); 214 | self.handle_combinations(nodes_a_high, nodes_b_high, value); 215 | self.handle_combinations(nodes_a_high, nodes_b_low, value); 216 | self.handle_combinations(nodes_a_low, nodes_b_high, value); 217 | // Ignore 'low-to-low' combinations 218 | } 219 | 220 | fn get_high_low_nodes<'a>( 221 | &self, 222 | nodes: &'a mut SmallVec<[u64; SMALL_VECTOR_SIZE]>, 223 | ) -> (&'a [u64], &'a [u64]) { 224 | if nodes.len() > self.hyperedge_trim_n { 225 | nodes.select_nth_unstable_by_key(self.hyperedge_trim_n, |h| { 226 | Reverse(self.hash_2_row.get(h).map_or(0, |r| r.occurrence)) 227 | }); 228 | nodes.split_at(self.hyperedge_trim_n) 229 | } else { 230 | (nodes, &[]) 231 | } 232 | } 233 | 234 | fn handle_combinations(&mut self, a_hashes: &[u64], b_hashes: &[u64], value: f32) { 235 | for a_hash in a_hashes { 236 | for b_hash in b_hashes { 237 | self.add_pair_symmetric(*a_hash, *b_hash, value); 238 | } 239 | } 240 | } 241 | 242 | /// It creates sparse matrix for two columns in the incoming data. 243 | /// Let's say that we have such columns: 244 | /// customers | products | brands 245 | /// incoming data: 246 | /// userId1 | productId1, productId2 | brandId1, brandId2 247 | /// userId2 | productId1 | brandId3, brandId4, brandId5 248 | /// etc. 249 | /// One of the sparse matrices could represent customers and products relation (products and brands relation, customers and brands relation). 250 | /// This sparse matrix (customers and products relation) handles every combination in these columns according to 251 | /// total combinations in a row. 
252 | /// The first row in the incoming data produces two combinations according to 4 total combinations: 253 | /// userId1, productId1 and userId1, productId2 254 | /// The second row produces one combination userId2, productId1 according to 3 total combinations. 255 | /// `a_hash` - hash of a entity for a column A 256 | /// `b_hash` - hash of a entity for a column B 257 | /// `count` - total number of combinations in a row 258 | fn add_pair_symmetric(&mut self, a_hash: u64, b_hash: u64, value: f32) { 259 | self.edge_count += 1; 260 | self.update_edge(a_hash, b_hash, value); 261 | self.update_edge(b_hash, a_hash, value); 262 | } 263 | 264 | fn update_row(&mut self, hash: u64, count: u32) { 265 | let val = 1f32 / (count as f32); 266 | let e = self.hash_2_row.entry(hash).or_default(); 267 | e.occurrence += count; 268 | e.row_sum += val 269 | } 270 | 271 | fn update_edge(&mut self, a_hash: u64, b_hash: u64, val: f32) { 272 | let e = self.hashes_2_edge.entry((a_hash, b_hash)).or_default(); 273 | *e += val; 274 | } 275 | } 276 | 277 | #[derive(Debug)] 278 | pub struct SparseMatrixBuffersReducer { 279 | descriptor: SparseMatrixDescriptor, 280 | buffers: Vec, 281 | node_indexer: NodeIndexer, 282 | num_workers: usize, 283 | } 284 | 285 | pub struct EdgeEntry { 286 | pub row: u32, 287 | pub col: u32, 288 | pub value: f32, 289 | } 290 | 291 | impl SparseMatrixBuffersReducer { 292 | pub fn new( 293 | node_indexer: NodeIndexer, 294 | buffers: Vec, 295 | num_workers: usize, 296 | ) -> Self { 297 | if buffers.is_empty() { 298 | panic!("Cannot reduce 0 buffers") 299 | } 300 | 301 | let descriptor = buffers[0].descriptor.clone(); 302 | for buffer in &buffers { 303 | if descriptor != buffer.descriptor { 304 | panic!("Can only reduce buffers with the same sparse matrix description") 305 | } 306 | } 307 | 308 | Self { 309 | descriptor, 310 | buffers, 311 | node_indexer, 312 | num_workers, 313 | } 314 | } 315 | 316 | pub fn reduce(self) -> SparseMatrix { 317 | ThreadPoolBuilder::new() 318 | .num_threads(self.num_workers) 319 | .build() 320 | .unwrap() 321 | .install(|| { 322 | let node_indexer = self.node_indexer; 323 | 324 | // Extract buffers so their fields can be moved to reducing functions 325 | let (hash_2_row_maps, hashes_2_edge_map): (Vec<_>, Vec<_>) = self 326 | .buffers 327 | .into_iter() 328 | .map(|b| (b.hash_2_row, b.hashes_2_edge)) 329 | .unzip(); 330 | let entities = 331 | SparseMatrixBuffersReducer::reduce_to_entities(&node_indexer, hash_2_row_maps); 332 | let mut edges: Vec<_> = 333 | SparseMatrixBuffersReducer::reduce_to_edges(&node_indexer, hashes_2_edge_map); 334 | edges.par_sort_by_key(|entry| (entry.row, entry.col)); 335 | 336 | let slices: Vec<_> = edges 337 | .iter() 338 | .enumerate() 339 | .group_by(|(_, entry)| entry.row) 340 | .into_iter() 341 | .map(|(_, mut group)| { 342 | let first = group.next().expect("Group have at least one element"); 343 | let last = group.last().unwrap_or(first); 344 | (first.0, last.0 + 1) 345 | }) 346 | .collect(); 347 | 348 | let mut edges: Vec<_> = edges 349 | .into_par_iter() 350 | .map(|entry| Edge { 351 | other_entity_ix: entry.col, 352 | // use this field for different purpose to avoid reallocation 353 | left_markov_value: entry.value, 354 | symmetric_markov_value: 0.0, 355 | }) 356 | .collect(); 357 | 358 | slices 359 | .iter() 360 | .enumerate() 361 | .for_each(|(row_ix, (start_ix, end_ix))| { 362 | let row_sum = entities[row_ix].row_sum; 363 | let slice = &mut edges[(*start_ix)..(*end_ix)]; 364 | slice.iter_mut().for_each(|edge| { 365 | let 
value = edge.left_markov_value; 366 | 367 | let left_markov_normalization = row_sum; 368 | let symmetric_markov_normalization = { 369 | let col_sum = entities[edge.other_entity_ix as usize].row_sum; 370 | (row_sum * col_sum).sqrt() 371 | }; 372 | edge.left_markov_value = value / left_markov_normalization; 373 | edge.symmetric_markov_value = value / symmetric_markov_normalization; 374 | }) 375 | }); 376 | 377 | SparseMatrix { 378 | descriptor: self.descriptor, 379 | entity_ids: node_indexer.index_2_entity_id, 380 | column_ids: node_indexer.index_2_column_id, 381 | entities, 382 | edges, 383 | slices, 384 | } 385 | }) 386 | } 387 | 388 | fn reduce_to_entities( 389 | node_indexer: &NodeIndexer, 390 | entity_maps: Vec>>, 391 | ) -> Vec { 392 | node_indexer 393 | .index_2_key 394 | .par_iter() 395 | .map(|hash| { 396 | let mut entity_agg = Entity { row_sum: 0.0 }; 397 | for entity_map in entity_maps.iter() { 398 | if let Some(entity) = entity_map.get(hash) { 399 | entity_agg.row_sum += entity.row_sum; 400 | } 401 | } 402 | entity_agg 403 | }) 404 | .collect() 405 | } 406 | 407 | fn reduce_to_edges( 408 | node_indexer: &NodeIndexer, 409 | edge_maps: Vec>>, 410 | ) -> Vec { 411 | // Dashmap to have concurrent write access with par_drain 412 | // par_drain is recommended to not increase peak memory usage 413 | let reduced_edge_map: DashMap<(u64, u64), f32, BuildHasherDefault> = 414 | Default::default(); 415 | for mut edge_map in edge_maps.into_iter() { 416 | edge_map.par_drain().for_each(|(k, v)| { 417 | reduced_edge_map 418 | .entry(k) 419 | .and_modify(|rv| *rv += v) 420 | .or_insert(v); 421 | }) 422 | } 423 | reduced_edge_map 424 | .into_par_iter() 425 | .map(|((row_hash, col_hash), value)| { 426 | let row = *node_indexer 427 | .key_2_index 428 | .get(&row_hash) 429 | .expect("Hash value was indexed") as u32; 430 | let col = *node_indexer 431 | .key_2_index 432 | .get(&col_hash) 433 | .expect("Hash value was indexed") as u32; 434 | EdgeEntry { row, col, value } 435 | }) 436 | .collect() 437 | } 438 | } 439 | -------------------------------------------------------------------------------- /legacy/example_classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 145, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "from sklearn.model_selection import train_test_split\n", 11 | "from sklearn.linear_model import SGDClassifier\n", 12 | "from sklearn.utils import shuffle\n", 13 | "from tqdm import tqdm\n", 14 | "import pickle as pkl\n", 15 | "import pandas as pd\n", 16 | "import random\n", 17 | "import sys\n", 18 | "import os\n", 19 | "from sklearn.metrics import f1_score" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 146, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "random.seed(0)\n", 29 | "np.random.seed(0)" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 147, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "config = {\n", 39 | " #embedding computation\n", 40 | " 'cleora_n_iter': 5,\n", 41 | " 'cleora_dim': 1024,\n", 42 | " \n", 43 | " #dataset preparation\n", 44 | " 'train_test_split': 0.2,\n", 45 | " \n", 46 | " #training classification\n", 47 | " 'input_embeddings': [\n", 48 | " 'output/emb__cluster_id__StarNode.out',\n", 49 | " 'output/emb__CliqueNode__CliqueNode.out',\n", 50 | " ],\n", 51 | " 'batch_size': 256,\n", 52 | " 'test_batch_size': 1000,\n", 53 | 
" 'epochs': [20],\n", 54 | "}" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "# Dataset preparation" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "1. Download the Facebook dataset from SNAP: https://snap.stanford.edu/data/facebook-large-page-page-network.html\n", 69 | "2. Extract the dataset to ./facebook_large/\n", 70 | "3. Compute Cleora embeddings as shown in \"Cleora training\" section in `example_link_prediction.ipynb`" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 148, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "df_cleora = pd.read_csv(\"./facebook_large/musae_facebook_edges.csv\")" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 149, 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "data": { 89 | "text/html": [ 90 | "
\n", 91 | "\n", 104 | "\n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | "
id_1id_2
0018427
1121708
2122208
3122171
416829
\n", 140 | "
" 141 | ], 142 | "text/plain": [ 143 | " id_1 id_2\n", 144 | "0 0 18427\n", 145 | "1 1 21708\n", 146 | "2 1 22208\n", 147 | "3 1 22171\n", 148 | "4 1 6829" 149 | ] 150 | }, 151 | "execution_count": 149, 152 | "metadata": {}, 153 | "output_type": "execute_result" 154 | } 155 | ], 156 | "source": [ 157 | "df_cleora.head()" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 150, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "train_cleora, test_cleora = train_test_split(df_cleora, test_size=config['train_test_split'])" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 151, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "fb_cleora_input_clique_filename = \"fb_cleora_input_clique.txt\"\n", 176 | "fb_cleora_input_star_filename = \"fb_cleora_input_star.txt\"\n", 177 | "output_dir = 'output'" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 152, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "with open(fb_cleora_input_clique_filename, \"w\") as f_cleora_clique, open(fb_cleora_input_star_filename, \"w\") as f_cleora_star:\n", 187 | " grouped_train = train_cleora.groupby('id_1')\n", 188 | " for n, (name, group) in enumerate(grouped_train):\n", 189 | " group_list = group['id_2'].tolist()\n", 190 | " group_elems = list(map(str, group_list))\n", 191 | " f_cleora_clique.write(\"{} {}\\n\".format(name, ' '.join(group_elems)))\n", 192 | " f_cleora_star.write(\"{}\\t{}\\n\".format(n, name))\n", 193 | " for elem in group_elems:\n", 194 | " f_cleora_star.write(\"{}\\t{}\\n\".format(n, elem))" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 153, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "df = pd.read_csv(\"facebook_large/musae_facebook_target.csv\")" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 154, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [ 219 | "classes = df['page_type'].unique()\n", 220 | "class_ids = list(range(0, len(classes)))\n", 221 | "class_dict = {k:v for k,v in zip(classes, class_ids)}\n", 222 | "df['page_type'] = [class_dict[item] for item in df['page_type']] " 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 155, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "train_filename = \"fb_classification_train.txt\"\n", 232 | "test_filename = \"fb_classification_test.txt\"" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 156, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "train, test = train_test_split(df, test_size=config['train_test_split'])" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 157, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "with open(train_filename, \"w\") as f_train:\n", 251 | " for index, row in train.iterrows():\n", 252 | " f_train.write(\"{} {}\\n\".format(row['id'], row['page_type']))" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 158, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "with open(test_filename, \"w\") as f_test:\n", 262 | " for index, row in test.iterrows():\n", 263 | " f_test.write(\"{} {}\\n\".format(row['id'], row['page_type']))" 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | 
"metadata": {}, 269 | "source": [ 270 | "# Cleora training" 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": {}, 276 | "source": [ 277 | "Download an appropriate binary Cleora release from: https://github.com/Synerise/cleora/releases . \n", 278 | "\n", 279 | "A Linux GNU version is assumed in this example, but any other will do." 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 159, 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [ 288 | "import subprocess\n", 289 | "\n", 290 | "\n", 291 | "def columns2output_filename(output_dir, columns):\n", 292 | " columns_split = columns.split()\n", 293 | " if len(columns_split) == 1 and 'reflexive' in columns:\n", 294 | " column_name = columns.split('::')[-1]\n", 295 | " return os.path.join(output_dir, f'emb__{column_name}__{column_name}.out')\n", 296 | "\n", 297 | " column_names = [i.split('::')[-1] for i in columns_split]\n", 298 | " return os.path.join(output_dir, 'emb__' + '__'.join(column_names) + '.out')\n", 299 | "\n", 300 | "\n", 301 | "def train_cleora(dim, n_iter, columns, input_filename, output_dir):\n", 302 | " command = ['./cleora-v1.0.1-x86_64-unknown-linux-gnu',\n", 303 | " '--columns', columns,\n", 304 | " '--dimension', str(dim), \n", 305 | " '-n', str(n_iter), \n", 306 | " '--input', input_filename, \n", 307 | " '-o', output_dir]\n", 308 | " subprocess.run(command, check=True)\n", 309 | " return columns2output_filename(output_dir, columns)" 310 | ] 311 | }, 312 | { 313 | "cell_type": "markdown", 314 | "metadata": {}, 315 | "source": [ 316 | "## Star expansion\n", 317 | "\n", 318 | "In the `fb_cleora_input_star.txt` file the first column is a virtual node. The parameter `-c \"transient::cluster_id node\"` means that embeddings will not be created for nodes from this column. This translates to star expansion scheme." 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 160, 324 | "metadata": {}, 325 | "outputs": [ 326 | { 327 | "name": "stdout", 328 | "output_type": "stream", 329 | "text": [ 330 | "CPU times: user 1.37 ms, sys: 8.1 ms, total: 9.47 ms\n", 331 | "Wall time: 8.59 s\n" 332 | ] 333 | } 334 | ], 335 | "source": [ 336 | "%%time\n", 337 | "cleora_output_star_filename = train_cleora(config['cleora_dim'], config['cleora_n_iter'], \"transient::cluster_id StarNode\", fb_cleora_input_star_filename, output_dir)" 338 | ] 339 | }, 340 | { 341 | "cell_type": "markdown", 342 | "metadata": {}, 343 | "source": [ 344 | "## Clique expansion\n", 345 | "\n", 346 | "The `fb_cleora_input_clique.txt` file has the structure of adjacency list. The parameter `-c \"complex::reflexive::node\"` means that edges will be created for all cominations of nodes from each line. This translates to clique expansion scheme." 
347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 161, 352 | "metadata": {}, 353 | "outputs": [ 354 | { 355 | "name": "stdout", 356 | "output_type": "stream", 357 | "text": [ 358 | "CPU times: user 4.42 ms, sys: 8.34 ms, total: 12.8 ms\n", 359 | "Wall time: 13.7 s\n" 360 | ] 361 | } 362 | ], 363 | "source": [ 364 | "%%time\n", 365 | "cleora_output_clique_filename = train_cleora(config['cleora_dim'], config['cleora_n_iter'], \"complex::reflexive::CliqueNode\", fb_cleora_input_clique_filename, output_dir)" 366 | ] 367 | }, 368 | { 369 | "cell_type": "markdown", 370 | "metadata": {}, 371 | "source": [ 372 | "## No expansion\n", 373 | "\n", 374 | "You can also compute Cleora without any expansion scheme by providing an input file in the edgelist format (single pair of nodes per line). Run with a simple parameter: `-c \"node1 node2\"`." 375 | ] 376 | }, 377 | { 378 | "cell_type": "markdown", 379 | "metadata": {}, 380 | "source": [ 381 | "# Classification\n", 382 | "\n", 383 | "We train a simple multiclass Logistic Regression classifier to predict the class of node based on its embedding. We assess the quality of the classifier with of 2 metrics: micro-F1 and macro-F1." 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 162, 389 | "metadata": {}, 390 | "outputs": [], 391 | "source": [ 392 | "def read_embeddings(input_file):\n", 393 | " df_full = pd.read_csv(input_file, delimiter = \" \", skiprows=[0], header=None, \n", 394 | " index_col=0)\n", 395 | " df_full = df_full.drop([1], axis=1)\n", 396 | "\n", 397 | " return df_full" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": 163, 403 | "metadata": {}, 404 | "outputs": [], 405 | "source": [ 406 | "def read_train_test(embeddings):\n", 407 | " valid_idx = embeddings.index.to_numpy()\n", 408 | " \n", 409 | " train = np.loadtxt(train_filename, delimiter=\" \", dtype=np.int) \n", 410 | " test = np.loadtxt(test_filename, delimiter=\" \", dtype=np.int)\n", 411 | " \n", 412 | " train = train[np.isin(train[:,0], valid_idx) & np.isin(train[:,1], valid_idx)]\n", 413 | " test = [t for t in test if (t[0] in valid_idx) and (t[1] in valid_idx)] \n", 414 | " \n", 415 | " train = np.array(train)\n", 416 | " test = np.array(test)\n", 417 | " \n", 418 | " return train,test" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": 164, 424 | "metadata": {}, 425 | "outputs": [], 426 | "source": [ 427 | "batch_size = config['batch_size']\n", 428 | "test_batch_size = config['test_batch_size']" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": 165, 434 | "metadata": { 435 | "scrolled": true 436 | }, 437 | "outputs": [ 438 | { 439 | "name": "stderr", 440 | "output_type": "stream", 441 | "text": [ 442 | "100%|██████████| 20/20 [00:15<00:00, 1.29it/s]" 443 | ] 444 | }, 445 | { 446 | "name": "stdout", 447 | "output_type": "stream", 448 | "text": [ 449 | "algo: output/emb__cluster_id__StarNode.out epochs: 20, micro f1: 0.9093110871905274, macro f1:0.9094875754311472\n" 450 | ] 451 | }, 452 | { 453 | "name": "stderr", 454 | "output_type": "stream", 455 | "text": [ 456 | "\n", 457 | "100%|██████████| 20/20 [00:15<00:00, 1.33it/s]" 458 | ] 459 | }, 460 | { 461 | "name": "stdout", 462 | "output_type": "stream", 463 | "text": [ 464 | "algo: output/emb__CliqueNode__CliqueNode.out epochs: 20, micro f1: 0.9171151776103337, macro f1:0.9169262311726959\n" 465 | ] 466 | }, 467 | { 468 | "name": "stderr", 469 | "output_type": "stream", 470 | "text": [ 471 | "\n" 
472 | ] 473 | } 474 | ], 475 | "source": [ 476 | "for algo in config['input_embeddings']:\n", 477 | " embeddings = read_embeddings(algo)\n", 478 | " train,test = read_train_test(embeddings)\n", 479 | " \n", 480 | " y_train = train[:, 1]\n", 481 | " y_test = test[:, 1]\n", 482 | "\n", 483 | " clf = SGDClassifier(random_state=0, loss='log', alpha=0.0001)\n", 484 | " for e in tqdm(range(0, max(config['epochs']))):\n", 485 | " for idx in range(0,train.shape[0],batch_size):\n", 486 | " ex=train[idx:min(idx+batch_size,train.shape[0]),:]\n", 487 | "\n", 488 | " ex_emb_in = embeddings.loc[ex[:,0]].to_numpy()\n", 489 | " ex_y = y_train[idx:min(idx+batch_size,train.shape[0])]\n", 490 | " \n", 491 | " clf.partial_fit(ex_emb_in, ex_y, classes=[0,1,2,3])\n", 492 | " \n", 493 | " if e+1 in config['epochs']:\n", 494 | " acc = 0.0\n", 495 | " y_pred = []\n", 496 | " for n, idx in enumerate(range(0,test.shape[0],test_batch_size)):\n", 497 | " ex=test[idx:min(idx+test_batch_size,train.shape[0]),:]\n", 498 | " ex_emb_in = embeddings.loc[ex[:,0]].to_numpy()\n", 499 | " pred = clf.predict_proba(ex_emb_in)\n", 500 | " \n", 501 | " classes = np.argmax(pred, axis=1)\n", 502 | " y_pred.extend(classes)\n", 503 | "\n", 504 | " f1_micro = f1_score(y_test, y_pred, average='micro')\n", 505 | " f1_macro = f1_score(y_test, y_pred, average='macro')\n", 506 | " print('algo: {} epochs: {}, micro f1: {}, macro f1:{}'.format(algo, e+1, f1_micro, f1_macro))\n" 507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "execution_count": null, 512 | "metadata": {}, 513 | "outputs": [], 514 | "source": [] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": null, 519 | "metadata": {}, 520 | "outputs": [], 521 | "source": [] 522 | } 523 | ], 524 | "metadata": { 525 | "kernelspec": { 526 | "display_name": "Python 3", 527 | "language": "python", 528 | "name": "python3" 529 | }, 530 | "language_info": { 531 | "codemirror_mode": { 532 | "name": "ipython", 533 | "version": 3 534 | }, 535 | "file_extension": ".py", 536 | "mimetype": "text/x-python", 537 | "name": "python", 538 | "nbconvert_exporter": "python", 539 | "pygments_lexer": "ipython3", 540 | "version": "3.7.4" 541 | } 542 | }, 543 | "nbformat": 4, 544 | "nbformat_minor": 4 545 | } 546 | -------------------------------------------------------------------------------- /legacy/src/embedding.rs: -------------------------------------------------------------------------------- 1 | use crate::configuration::Configuration; 2 | use crate::persistence::embedding::EmbeddingPersistor; 3 | use crate::persistence::entity::EntityMappingPersistor; 4 | use crate::sparse_matrix::SparseMatrixReader; 5 | use log::{info, warn}; 6 | use memmap::MmapMut; 7 | use rayon::prelude::*; 8 | use std::collections::hash_map::DefaultHasher; 9 | use std::collections::HashSet; 10 | use std::fs; 11 | use std::fs::OpenOptions; 12 | use std::hash::Hasher; 13 | use std::marker::PhantomData; 14 | use std::sync::Arc; 15 | use uuid::Uuid; 16 | 17 | /// Number of broken entities (those with errors during writing to the file) which are logged. 18 | /// There can be much more but log the first few. 19 | const LOGGED_NUMBER_OF_BROKEN_ENTITIES: usize = 20; 20 | 21 | /// Used during matrix initialization. No specific requirement (ca be lower as well). 
22 | const MAX_HASH_I64: i64 = 8 * 1024 * 1024; 23 | const MAX_HASH_F32: f32 = MAX_HASH_I64 as f32; 24 | 25 | /// Wrapper for different types of matrix structures such as 2-dim vectors or memory-mapped files 26 | trait MatrixWrapper { 27 | /// Initializing a matrix with values from its dimensions and the hash values from the sparse matrix 28 | fn init_with_hashes( 29 | rows: usize, 30 | cols: usize, 31 | fixed_random_value: i64, 32 | sparse_matrix_reader: Arc, 33 | ) -> Self; 34 | 35 | /// Returns value for specific coordinates 36 | fn get_value(&self, row: usize, col: usize) -> f32; 37 | 38 | /// Normalizing a matrix by rows sum 39 | fn normalize(&mut self); 40 | 41 | /// Multiplies sparse matrix by the matrix 42 | fn multiply( 43 | sparse_matrix_reader: Arc, 44 | other: Self, 45 | ) -> Self; 46 | } 47 | 48 | /// Two dimensional vectors as matrix representation 49 | struct TwoDimVectorMatrix { 50 | rows: usize, 51 | cols: usize, 52 | matrix: Vec>, 53 | } 54 | 55 | impl MatrixWrapper for TwoDimVectorMatrix { 56 | fn init_with_hashes( 57 | rows: usize, 58 | cols: usize, 59 | fixed_random_value: i64, 60 | sparse_matrix_reader: Arc, 61 | ) -> Self { 62 | let result: Vec> = (0..cols) 63 | .into_par_iter() 64 | .map(|i| { 65 | let mut col: Vec = Vec::with_capacity(rows); 66 | for hsh in sparse_matrix_reader.iter_hashes() { 67 | let col_value = init_value(i, hsh.value, fixed_random_value); 68 | col.push(col_value); 69 | } 70 | col 71 | }) 72 | .collect(); 73 | Self { 74 | rows, 75 | cols, 76 | matrix: result, 77 | } 78 | } 79 | 80 | #[inline] 81 | fn get_value(&self, row: usize, col: usize) -> f32 { 82 | let column: &Vec = self.matrix.get(col).unwrap(); 83 | column[row] 84 | } 85 | 86 | fn normalize(&mut self) { 87 | let mut row_sum = vec![0f32; self.rows]; 88 | 89 | for col in self.matrix.iter() { 90 | for (j, sum) in row_sum.iter_mut().enumerate() { 91 | let value = col[j]; 92 | *sum += value.powi(2) 93 | } 94 | } 95 | 96 | let row_sum = Arc::new(row_sum); 97 | self.matrix.par_iter_mut().for_each(|col| { 98 | for (j, value) in col.iter_mut().enumerate() { 99 | let sum = row_sum[j]; 100 | *value /= sum.sqrt(); 101 | } 102 | }); 103 | } 104 | 105 | fn multiply( 106 | sparse_matrix_reader: Arc, 107 | other: Self, 108 | ) -> Self { 109 | let rnew = zero_2d(other.rows, other.cols); 110 | 111 | let result: Vec> = other 112 | .matrix 113 | .into_par_iter() 114 | .zip(rnew) 115 | .update(|data| { 116 | let (res_col, rnew_col) = data; 117 | for entry in sparse_matrix_reader.iter_entries() { 118 | let elem = rnew_col.get_mut(entry.row as usize).unwrap(); 119 | let value = res_col[entry.col as usize]; 120 | *elem += value * entry.value; 121 | } 122 | }) 123 | .map(|data| data.1) 124 | .collect(); 125 | 126 | Self { 127 | rows: other.rows, 128 | cols: other.cols, 129 | matrix: result, 130 | } 131 | } 132 | } 133 | 134 | fn init_value(col: usize, hsh: u64, fixed_random_value: i64) -> f32 { 135 | ((hash((hsh as i64) + (col as i64) + fixed_random_value) % MAX_HASH_I64) as f32) / MAX_HASH_F32 136 | } 137 | 138 | fn hash(num: i64) -> i64 { 139 | let mut hasher = DefaultHasher::new(); 140 | hasher.write_i64(num); 141 | hasher.finish() as i64 142 | } 143 | 144 | fn zero_2d(row: usize, col: usize) -> Vec> { 145 | let mut res: Vec> = Vec::with_capacity(col); 146 | for _i in 0..col { 147 | let col = vec![0f32; row]; 148 | res.push(col); 149 | } 150 | res 151 | } 152 | 153 | /// Memory-mapped file as matrix representation. Every column of the matrix is placed side by side in the file. 
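// Layout note: the backing file created by `create_mmap` holds rows * cols f32
// values (4 bytes each) in column-major order, so cell (row, col) starts at byte
// offset ((col * rows) + row) * 4, as computed in `get_value` below. The file is
// deleted again in the `Drop` impl once the matrix goes out of scope.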
154 | struct MMapMatrix { 155 | rows: usize, 156 | cols: usize, 157 | file_name: String, 158 | matrix: MmapMut, 159 | } 160 | 161 | impl MatrixWrapper for MMapMatrix { 162 | fn init_with_hashes( 163 | rows: usize, 164 | cols: usize, 165 | fixed_random_value: i64, 166 | sparse_matrix_reader: Arc, 167 | ) -> Self { 168 | let uuid = Uuid::new_v4(); 169 | let file_name = format!("{}_matrix_{}", sparse_matrix_reader.get_id(), uuid); 170 | let mut mmap = create_mmap(rows, cols, file_name.as_str()); 171 | 172 | mmap.par_chunks_mut(rows * 4) 173 | .enumerate() 174 | .for_each(|(i, chunk)| { 175 | // i - number of dimension 176 | // chunk - column/vector of bytes 177 | for (j, hsh) in sparse_matrix_reader.iter_hashes().enumerate() { 178 | let col_value = init_value(i, hsh.value, fixed_random_value); 179 | MMapMatrix::update_column(j, chunk, |value| unsafe { *value = col_value }); 180 | } 181 | }); 182 | 183 | mmap.flush() 184 | .expect("Can't flush memory map modifications to disk"); 185 | 186 | Self { 187 | rows, 188 | cols, 189 | file_name, 190 | matrix: mmap, 191 | } 192 | } 193 | 194 | #[inline] 195 | fn get_value(&self, row: usize, col: usize) -> f32 { 196 | let start_idx = ((col * self.rows) + row) * 4; 197 | let end_idx = start_idx + 4; 198 | let pointer: *const u8 = (&self.matrix[start_idx..end_idx]).as_ptr(); 199 | unsafe { 200 | let value = pointer as *const f32; 201 | *value 202 | } 203 | } 204 | 205 | fn normalize(&mut self) { 206 | let entities_count = self.rows; 207 | let mut row_sum = vec![0f32; entities_count]; 208 | 209 | for i in 0..(self.cols as usize) { 210 | for (j, sum) in row_sum.iter_mut().enumerate() { 211 | let value = self.get_value(j, i); 212 | *sum += value.powi(2) 213 | } 214 | } 215 | 216 | let row_sum = Arc::new(row_sum); 217 | self.matrix 218 | .par_chunks_mut(entities_count * 4) 219 | .enumerate() 220 | .for_each(|(_i, chunk)| { 221 | // i - number of dimension 222 | // chunk - column/vector of bytes 223 | for (j, &sum) in row_sum.iter().enumerate() { 224 | MMapMatrix::update_column(j, chunk, |value| unsafe { *value /= sum.sqrt() }); 225 | } 226 | }); 227 | 228 | self.matrix 229 | .flush() 230 | .expect("Can't flush memory map modifications to disk"); 231 | } 232 | 233 | fn multiply( 234 | sparse_matrix_reader: Arc, 235 | other: Self, 236 | ) -> Self { 237 | let rows = other.rows; 238 | let cols = other.cols; 239 | 240 | let uuid = Uuid::new_v4(); 241 | let file_name = format!("{}_matrix_{}", sparse_matrix_reader.get_id(), uuid); 242 | let mut mmap_output = create_mmap(rows, cols, file_name.as_str()); 243 | 244 | let input = Arc::new(other); 245 | mmap_output 246 | .par_chunks_mut(rows * 4) 247 | .enumerate() 248 | .for_each_with(input, |input, (i, chunk)| { 249 | for entry in sparse_matrix_reader.iter_entries() { 250 | let input_value = input.get_value(entry.col as usize, i); 251 | MMapMatrix::update_column(entry.row as usize, chunk, |value| unsafe { 252 | *value += input_value * entry.value 253 | }); 254 | } 255 | }); 256 | 257 | mmap_output 258 | .flush() 259 | .expect("Can't flush memory map modifications to disk"); 260 | 261 | Self { 262 | rows, 263 | cols, 264 | file_name, 265 | matrix: mmap_output, 266 | } 267 | } 268 | } 269 | 270 | /// Creates memory-mapped file with allocated number of bytes 271 | fn create_mmap(rows: usize, cols: usize, file_name: &str) -> MmapMut { 272 | let number_of_bytes = (rows * cols * 4) as u64; 273 | let file = OpenOptions::new() 274 | .read(true) 275 | .write(true) 276 | .create(true) 277 | .open(file_name) 278 | .expect("Can't 
create new set of options for memory mapped file"); 279 | file.set_len(number_of_bytes).unwrap_or_else(|_| { 280 | panic!( 281 | "Can't update the size of {} file to {} bytes", 282 | file_name, number_of_bytes 283 | ) 284 | }); 285 | unsafe { 286 | MmapMut::map_mut(&file).unwrap_or_else(|_| { 287 | panic!( 288 | "Can't create memory mapped file for the underlying file {}", 289 | file_name 290 | ) 291 | }) 292 | } 293 | } 294 | 295 | /// Used to remove memory-mapped file after processing 296 | impl Drop for MMapMatrix { 297 | fn drop(&mut self) { 298 | fs::remove_file(self.file_name.as_str()).unwrap_or_else(|_| { 299 | warn!( 300 | "File {} can't be removed after work. Remove the file in order to save disk space.", 301 | self.file_name.as_str() 302 | ) 303 | }); 304 | } 305 | } 306 | 307 | impl MMapMatrix { 308 | #[inline] 309 | fn update_column(col: usize, chunk: &mut [u8], func: F) 310 | where 311 | F: Fn(*mut f32), 312 | { 313 | let start_idx = col * 4; 314 | let end_idx = start_idx + 4; 315 | let pointer: *mut u8 = (&mut chunk[start_idx..end_idx]).as_mut_ptr(); 316 | let value = pointer as *mut f32; 317 | func(value); 318 | } 319 | } 320 | 321 | /// Calculate embeddings in memory. 322 | pub fn calculate_embeddings( 323 | config: Arc, 324 | sparse_matrix_reader: Arc, 325 | entity_mapping_persistor: Arc, 326 | embedding_persistor: &mut dyn EmbeddingPersistor, 327 | ) where 328 | T1: SparseMatrixReader + Sync + Send, 329 | T2: EntityMappingPersistor, 330 | { 331 | let mult = MatrixMultiplicator::new(config.clone(), sparse_matrix_reader); 332 | let init: TwoDimVectorMatrix = mult.initialize(); 333 | let res = mult.propagate(config.max_number_of_iteration, init); 334 | mult.persist(res, entity_mapping_persistor, embedding_persistor); 335 | 336 | info!("Finalizing embeddings calculations!") 337 | } 338 | 339 | /// Provides matrix multiplication based on sparse matrix data. 340 | #[derive(Debug)] 341 | struct MatrixMultiplicator { 342 | dimension: usize, 343 | number_of_entities: usize, 344 | fixed_random_value: i64, 345 | sparse_matrix_reader: Arc, 346 | _marker: PhantomData, 347 | } 348 | 349 | impl MatrixMultiplicator 350 | where 351 | T: SparseMatrixReader + Sync + Send, 352 | M: MatrixWrapper, 353 | { 354 | fn new(config: Arc, sparse_matrix_reader: Arc) -> Self { 355 | let rand_value = config.seed.map(hash).unwrap_or(0); 356 | Self { 357 | dimension: config.embeddings_dimension as usize, 358 | number_of_entities: sparse_matrix_reader.get_number_of_entities() as usize, 359 | fixed_random_value: rand_value, 360 | sparse_matrix_reader, 361 | _marker: PhantomData, 362 | } 363 | } 364 | 365 | /// Initialize a matrix 366 | fn initialize(&self) -> M { 367 | info!( 368 | "Start initialization. Dims: {}, entities: {}.", 369 | self.dimension, self.number_of_entities 370 | ); 371 | 372 | let result = M::init_with_hashes( 373 | self.number_of_entities, 374 | self.dimension, 375 | self.fixed_random_value, 376 | self.sparse_matrix_reader.clone(), 377 | ); 378 | 379 | info!( 380 | "Done initializing. Dims: {}, entities: {}.", 381 | self.dimension, self.number_of_entities 382 | ); 383 | result 384 | } 385 | 386 | /// The sparse matrix is multiplied by a freshly initialized matrix M. 387 | /// Multiplication is done against each column of matrix M in a separate thread. 388 | /// The obtained columns of the new matrix are subsequently merged into the full matrix. 389 | /// The matrix is L2-normalized, again in a multithreaded fashion across matrix columns. 
390 | /// Finally, depending on the target iteration number, the matrix is either returned 391 | /// or fed for next iterations of multiplication against the sparse matrix. 392 | fn propagate(&self, max_iter: u8, res: M) -> M { 393 | info!("Start propagating. Number of iterations: {}.", max_iter); 394 | 395 | let mut new_res = res; 396 | for i in 0..max_iter { 397 | let mut next = M::multiply(self.sparse_matrix_reader.clone(), new_res); 398 | next.normalize(); 399 | new_res = next; 400 | 401 | info!( 402 | "Done iter: {}. Dims: {}, entities: {}, num data points: {}.", 403 | i, 404 | self.dimension, 405 | self.number_of_entities, 406 | self.sparse_matrix_reader.get_number_of_entries() 407 | ); 408 | } 409 | 410 | info!("Done propagating."); 411 | new_res 412 | } 413 | 414 | /// Saves results to output such as textfile, numpy etc 415 | fn persist( 416 | &self, 417 | res: M, 418 | entity_mapping_persistor: Arc, 419 | embedding_persistor: &mut dyn EmbeddingPersistor, 420 | ) where 421 | T1: EntityMappingPersistor, 422 | { 423 | info!("Start saving embeddings."); 424 | 425 | embedding_persistor 426 | .put_metadata(self.number_of_entities as u32, self.dimension as u16) 427 | .unwrap_or_else(|_| { 428 | // if can't write first data to the file, probably further is the same 429 | panic!( 430 | "Can't write metadata. Entities: {}. Dimension: {}.", 431 | self.number_of_entities, self.dimension 432 | ) 433 | }); 434 | 435 | // entities which can't be written to the file (error occurs) 436 | let mut broken_entities = HashSet::new(); 437 | for (i, hash) in self.sparse_matrix_reader.iter_hashes().enumerate() { 438 | let entity_name_opt = entity_mapping_persistor.get_entity(hash.value); 439 | if let Some(entity_name) = entity_name_opt { 440 | let mut embedding: Vec = Vec::with_capacity(self.dimension); 441 | for j in 0..self.dimension { 442 | let value = res.get_value(i, j); 443 | embedding.push(value); 444 | } 445 | embedding_persistor 446 | .put_data(&entity_name, hash.occurrence, embedding) 447 | .unwrap_or_else(|_| { 448 | broken_entities.insert(entity_name); 449 | }); 450 | }; 451 | } 452 | 453 | if !broken_entities.is_empty() { 454 | log_broken_entities(broken_entities); 455 | } 456 | 457 | embedding_persistor 458 | .finish() 459 | .unwrap_or_else(|_| warn!("Can't finish writing to the file.")); 460 | 461 | info!("Done saving embeddings."); 462 | } 463 | } 464 | 465 | fn log_broken_entities(broken_entities: HashSet) { 466 | let num_of_broken_entities = broken_entities.len(); 467 | let few_broken_entities: HashSet<_> = broken_entities 468 | .into_iter() 469 | .take(LOGGED_NUMBER_OF_BROKEN_ENTITIES) 470 | .collect(); 471 | warn!( 472 | "Number of entities which can't be written to the file: {}. First {} broken entities: {:?}.", 473 | num_of_broken_entities, LOGGED_NUMBER_OF_BROKEN_ENTITIES, few_broken_entities 474 | ); 475 | } 476 | 477 | /// Calculate embeddings with memory-mapped files. 
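// Same pipeline as `calculate_embeddings` above, but the intermediate matrices are
// `MMapMatrix` instances backed by temporary on-disk files (removed on drop) instead
// of the in-memory `Vec<Vec<f32>>` columns of `TwoDimVectorMatrix`, which keeps peak
// RAM usage low when embedding very large graphs.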
478 | pub fn calculate_embeddings_mmap( 479 | config: Arc, 480 | sparse_matrix_reader: Arc, 481 | entity_mapping_persistor: Arc, 482 | embedding_persistor: &mut dyn EmbeddingPersistor, 483 | ) where 484 | T1: SparseMatrixReader + Sync + Send, 485 | T2: EntityMappingPersistor, 486 | { 487 | let mult = MatrixMultiplicator::new(config.clone(), sparse_matrix_reader); 488 | let init: MMapMatrix = mult.initialize(); 489 | let res = mult.propagate(config.max_number_of_iteration, init); 490 | mult.persist(res, entity_mapping_persistor, embedding_persistor); 491 | 492 | info!("Finalizing embeddings calculations!") 493 | } 494 | -------------------------------------------------------------------------------- /legacy/src/entity.rs: -------------------------------------------------------------------------------- 1 | use crate::configuration::{Column, Configuration}; 2 | use crate::persistence::entity::EntityMappingPersistor; 3 | use smallvec::{smallvec, SmallVec}; 4 | use std::hash::Hasher; 5 | use std::sync::Arc; 6 | use twox_hash::XxHash64; 7 | 8 | /// Indicates how many elements in a vector can be placed on Stack (used by smallvec crate). The rest 9 | /// of the vector is placed on Heap. 10 | pub const SMALL_VECTOR_SIZE: usize = 8; 11 | 12 | /// Marker for elements in a vector. Let's say that we have `vec![1, 2, 3, 4]` 13 | /// and `LengthAndOffset { length: 2, offset : 1 }`. Offset points to the second element in the vector 14 | /// and length tell us how many elements we should take (in that case 2 elements: 2 and 3). 15 | #[derive(Clone, Copy)] 16 | struct LengthAndOffset { 17 | length: u32, 18 | offset: u32, 19 | } 20 | 21 | struct CartesianProduct { 22 | has_next: bool, 23 | lengths_and_offsets: SmallVec<[LengthAndOffset; SMALL_VECTOR_SIZE]>, 24 | indices: SmallVec<[u32; SMALL_VECTOR_SIZE]>, 25 | } 26 | 27 | impl CartesianProduct { 28 | fn new( 29 | lengths_and_offsets: SmallVec<[LengthAndOffset; SMALL_VECTOR_SIZE]>, 30 | ) -> CartesianProduct { 31 | let indices: SmallVec<[u32; SMALL_VECTOR_SIZE]> = lengths_and_offsets 32 | .iter() 33 | .map(|length_and_offset| length_and_offset.offset) 34 | .collect(); 35 | CartesianProduct { 36 | has_next: true, 37 | lengths_and_offsets, 38 | indices, 39 | } 40 | } 41 | } 42 | 43 | impl Iterator for CartesianProduct { 44 | /// The type of the elements being iterated over. 45 | type Item = SmallVec<[u32; SMALL_VECTOR_SIZE]>; 46 | 47 | /// Advances the iterator and returns the next value - cartesian product. 
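// Worked example: for columns of sizes [2, 1, 3] (offsets [0, 2, 3]) the iterator
// yields [0, 2, 3], [0, 2, 4], [0, 2, 5], [1, 2, 3], [1, 2, 4], [1, 2, 5], i.e. the
// rightmost index advances fastest; the same sequence is asserted in the
// `generate_cartesian_product_indices` test at the bottom of this file.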
48 | #[inline(always)] 49 | fn next(&mut self) -> Option { 50 | if !self.has_next { 51 | return None; 52 | } 53 | 54 | let len = self.indices.len(); 55 | let result: SmallVec<[u32; SMALL_VECTOR_SIZE]> = SmallVec::from_slice(&self.indices); 56 | for i in (0..len).rev() { 57 | let LengthAndOffset { length, offset } = self.lengths_and_offsets[i]; 58 | let last_index = length + offset; 59 | if self.indices[i] == (last_index - 1) { 60 | self.indices[i] = offset; 61 | if i == 0 { 62 | self.has_next = false; 63 | } 64 | } else { 65 | self.indices[i] += 1; 66 | break; 67 | } 68 | } 69 | Some(result) 70 | } 71 | } 72 | 73 | pub struct EntityProcessor<'a, T, F> 74 | where 75 | T: EntityMappingPersistor, 76 | F: FnMut(SmallVec<[u64; SMALL_VECTOR_SIZE]>), 77 | { 78 | config: &'a Configuration, 79 | field_hashes: SmallVec<[u64; SMALL_VECTOR_SIZE]>, 80 | not_ignored_columns_count: u16, 81 | columns_count: u16, 82 | entity_mapping_persistor: Arc, 83 | hashes_handler: F, 84 | } 85 | 86 | impl<'a, T, F> EntityProcessor<'a, T, F> 87 | where 88 | T: EntityMappingPersistor, 89 | F: FnMut(SmallVec<[u64; SMALL_VECTOR_SIZE]>), 90 | { 91 | pub fn new( 92 | config: &'a Configuration, 93 | persistor: Arc, 94 | hashes_handler: F, 95 | ) -> EntityProcessor<'a, T, F> { 96 | let columns = &config.columns; 97 | // hashes for column names are used to differentiate entities with the same name 98 | // from different columns 99 | let field_hashes_vec: Vec = columns.iter().map(|c| hash(&c.name)).collect(); 100 | let field_hashes: SmallVec<[u64; SMALL_VECTOR_SIZE]> = SmallVec::from_vec(field_hashes_vec); 101 | let not_ignored_cols = config.not_ignored_columns(); 102 | let mut not_ignored_columns_count = 0; 103 | let mut reflexive_columns_count = 0; 104 | for &col in ¬_ignored_cols { 105 | not_ignored_columns_count += 1; 106 | if col.reflexive { 107 | reflexive_columns_count += 1 108 | }; 109 | } 110 | 111 | let columns_count = not_ignored_columns_count + reflexive_columns_count; 112 | 113 | EntityProcessor { 114 | config, 115 | field_hashes, 116 | not_ignored_columns_count, 117 | columns_count, 118 | entity_mapping_persistor: persistor, 119 | hashes_handler, 120 | } 121 | } 122 | 123 | /// Every row can create few combinations (cartesian products) which are hashed and provided for sparse matrix creation. 124 | /// `row` - array of strings such as: ("userId1", "productId1 productId2", "brandId1"). 
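// In short: every entity of a non-ignored column is hashed as
// field_hash(column) ^ hash(entity), reflexive columns get their length/offset entry
// duplicated at the end of the buffer, and one hash row per Cartesian combination
// (prefixed with the total combination count) is handed to `hashes_handler` for
// sparse matrix construction.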
125 | pub fn process_row>(&mut self, row: &[SmallVec<[S; SMALL_VECTOR_SIZE]>]) { 126 | let mut hashes: SmallVec<[u64; SMALL_VECTOR_SIZE]> = 127 | SmallVec::with_capacity(self.not_ignored_columns_count as usize); 128 | let mut lens_and_offsets: SmallVec<[LengthAndOffset; SMALL_VECTOR_SIZE]> = 129 | smallvec![LengthAndOffset{ length: 0, offset: 0}; self.columns_count as usize]; 130 | let mut reflexive_count = 0; 131 | let mut current_offset = 0u32; 132 | 133 | let mut idx = 0; 134 | for (i, column_entities) in row.iter().enumerate() { 135 | let column = &self.config.columns[i]; 136 | if !column.ignored { 137 | if column.complex { 138 | for entity in column_entities { 139 | let hash = self.field_hashes[i] ^ hash(entity.as_ref()); 140 | hashes.push(hash); 141 | self.update_entity_mapping(entity.as_ref(), hash, column); 142 | } 143 | let length = column_entities.len() as u32; 144 | lens_and_offsets[idx] = LengthAndOffset { 145 | length, 146 | offset: current_offset, 147 | }; 148 | if column.reflexive { 149 | // put reflexive column data to the end of the buffers 150 | let reflexive_id = 151 | (self.not_ignored_columns_count + reflexive_count) as usize; 152 | lens_and_offsets[reflexive_id] = LengthAndOffset { 153 | length, 154 | offset: current_offset, 155 | }; 156 | reflexive_count += 1; 157 | } 158 | current_offset += length; 159 | } else { 160 | let entity = column_entities.get(0).unwrap().as_ref(); 161 | let hash = self.field_hashes[i] ^ hash(entity); 162 | hashes.push(hash); 163 | self.update_entity_mapping(entity, hash, column); 164 | let length = 1u32; 165 | lens_and_offsets[idx] = LengthAndOffset { 166 | length, 167 | offset: current_offset, 168 | }; 169 | current_offset += length; 170 | } 171 | idx += 1; 172 | } 173 | } 174 | 175 | let hash_rows = self.generate_combinations_with_length(hashes, lens_and_offsets); 176 | for hash_row in hash_rows { 177 | (self.hashes_handler)(hash_row); 178 | } 179 | } 180 | 181 | #[inline(always)] 182 | fn update_entity_mapping(&self, entity: &str, hash: u64, column: &Column) { 183 | if !column.transient && !self.entity_mapping_persistor.contains(hash) { 184 | let entry = if self.config.prepend_field { 185 | let mut entry = column.name.clone(); 186 | entry.push_str("__"); 187 | entry.push_str(entity); 188 | entry 189 | } else { 190 | entity.to_string() 191 | }; 192 | self.entity_mapping_persistor.put_data(hash, entry); 193 | } 194 | } 195 | 196 | /// It creates Cartesian Product for incoming data. 197 | /// Let's say that we have such columns: 198 | /// customers | products | brands 199 | /// incoming data: 200 | /// userId1 | productId1, productId2 | brandId1, brandId2 201 | /// Total number of combinations is equal to 4 (1 * 2 * 2) based on: 202 | /// number of entities in customers column * number of entities in products column * number of entities in brands column 203 | /// Cartesian Products for our data: 204 | /// (userId1, productId1, brandId1), (userId1, productId1, brandId2), (userId1, productId2, brandId1), (userId1, productId2, brandId2) 205 | /// `hashes` - entity hashes 206 | /// `lens_and_offsets` - number of entities per column 207 | /// return entity hashes Cartesian Products. 
Size of the array (matrix) is equal to number of combinations x number of columns (including reflexive column) 208 | #[inline(always)] 209 | fn generate_combinations_with_length( 210 | &self, 211 | hashes: SmallVec<[u64; SMALL_VECTOR_SIZE]>, 212 | lens_and_offsets: SmallVec<[LengthAndOffset; SMALL_VECTOR_SIZE]>, 213 | ) -> impl Iterator> { 214 | let row_length = lens_and_offsets.len(); 215 | let mut total_combinations = 1; 216 | for len_and_offset in &lens_and_offsets { 217 | total_combinations *= len_and_offset.length; 218 | } 219 | 220 | let cartesian = CartesianProduct::new(lens_and_offsets); 221 | 222 | cartesian.map(move |indices| { 223 | let mut arr: SmallVec<[u64; SMALL_VECTOR_SIZE]> = 224 | SmallVec::with_capacity(row_length + 1); 225 | arr.push(total_combinations as u64); 226 | for i in indices { 227 | let value = hashes[i as usize]; 228 | arr.push(value); 229 | } 230 | arr 231 | }) 232 | } 233 | } 234 | 235 | #[inline(always)] 236 | fn hash(entity: &str) -> u64 { 237 | let mut hasher = XxHash64::default(); 238 | hasher.write(entity.as_bytes()); 239 | hasher.finish() 240 | } 241 | 242 | #[cfg(test)] 243 | mod tests { 244 | use crate::configuration::{Column, Configuration}; 245 | use crate::entity::{ 246 | hash, CartesianProduct, EntityProcessor, LengthAndOffset, SMALL_VECTOR_SIZE, 247 | }; 248 | use crate::persistence::entity::InMemoryEntityMappingPersistor; 249 | use smallvec::{smallvec, SmallVec}; 250 | use std::sync::Arc; 251 | 252 | fn prepare_lengths_and_offsets( 253 | entities_per_column: &[u32], 254 | ) -> SmallVec<[LengthAndOffset; SMALL_VECTOR_SIZE]> { 255 | let mut lens_and_offsets: SmallVec<[LengthAndOffset; SMALL_VECTOR_SIZE]> = 256 | SmallVec::with_capacity(entities_per_column.len()); 257 | let mut offset = 0; 258 | for &num_of_entities in entities_per_column { 259 | lens_and_offsets.push(LengthAndOffset { 260 | length: num_of_entities, 261 | offset, 262 | }); 263 | offset += num_of_entities; 264 | } 265 | lens_and_offsets 266 | } 267 | 268 | fn prepare_hashes( 269 | total_combination: u64, 270 | entities: &[&str], 271 | field_hashes: &[u64], 272 | ) -> SmallVec<[u64; SMALL_VECTOR_SIZE]> { 273 | let mut hashes: SmallVec<[u64; SMALL_VECTOR_SIZE]> = SmallVec::new(); 274 | hashes.push(total_combination); 275 | for (i, &entity) in entities.iter().enumerate() { 276 | let hash = field_hashes[i] ^ hash(entity); 277 | hashes.push(hash); 278 | } 279 | hashes 280 | } 281 | 282 | #[test] 283 | fn generate_cartesian_product_indices() { 284 | let lengths_and_offsets = prepare_lengths_and_offsets(&[2, 1, 3]); 285 | 286 | let cartesian_product = CartesianProduct::new(lengths_and_offsets); 287 | let mut iter = cartesian_product.into_iter(); 288 | 289 | assert_eq!(Some(smallvec![0, 2, 3]), iter.next()); 290 | assert_eq!(Some(smallvec![0, 2, 4]), iter.next()); 291 | assert_eq!(Some(smallvec![0, 2, 5]), iter.next()); 292 | assert_eq!(Some(smallvec![1, 2, 3]), iter.next()); 293 | assert_eq!(Some(smallvec![1, 2, 4]), iter.next()); 294 | assert_eq!(Some(smallvec![1, 2, 5]), iter.next()); 295 | 296 | assert_eq!(None, iter.next()); 297 | } 298 | 299 | #[test] 300 | fn generate_cartesian_product_hashes() { 301 | let dummy_config = Configuration::default(String::from(""), vec![]); 302 | 303 | // hashes for entities in every column 304 | // column_1: 1 entity 305 | // column_2: 2 entities 306 | // column_3: 3 entities 307 | let lengths_and_offsets = prepare_lengths_and_offsets(&[1, 2, 3]); 308 | let hashes: SmallVec<[u64; SMALL_VECTOR_SIZE]> = smallvec![10, 20, 30, 40, 50, 60]; 309 | let mut 
total_combinations = 1u64; 310 | for len_and_offset in &lengths_and_offsets { 311 | total_combinations *= len_and_offset.length as u64; 312 | } 313 | 314 | let in_memory_entity_mapping_persistor = InMemoryEntityMappingPersistor::default(); 315 | let in_memory_entity_mapping_persistor = Arc::new(in_memory_entity_mapping_persistor); 316 | let entity_processor = EntityProcessor::new( 317 | &dummy_config, 318 | in_memory_entity_mapping_persistor.clone(), 319 | |_hashes| {}, 320 | ); 321 | 322 | let combinations: Vec<_> = entity_processor 323 | .generate_combinations_with_length(hashes, lengths_and_offsets) 324 | .collect(); 325 | assert_eq!( 326 | &SmallVec::from([total_combinations, 10, 20, 40]), 327 | combinations.get(0).unwrap() 328 | ); 329 | assert_eq!( 330 | &SmallVec::from([total_combinations, 10, 20, 50]), 331 | combinations.get(1).unwrap() 332 | ); 333 | assert_eq!( 334 | &SmallVec::from([total_combinations, 10, 20, 60]), 335 | combinations.get(2).unwrap() 336 | ); 337 | assert_eq!( 338 | &SmallVec::from([total_combinations, 10, 30, 40]), 339 | combinations.get(3).unwrap() 340 | ); 341 | assert_eq!( 342 | &SmallVec::from([total_combinations, 10, 30, 50]), 343 | combinations.get(4).unwrap() 344 | ); 345 | assert_eq!( 346 | &SmallVec::from([total_combinations, 10, 30, 60]), 347 | combinations.get(5).unwrap() 348 | ); 349 | assert_eq!(None, combinations.get(6)); 350 | } 351 | 352 | #[test] 353 | fn process_row_and_handle_hashes() { 354 | let columns = vec![ 355 | Column { 356 | name: String::from("column_1"), 357 | transient: false, 358 | complex: false, 359 | reflexive: false, 360 | ignored: true, 361 | }, 362 | Column { 363 | name: String::from("column_2"), 364 | transient: true, 365 | complex: false, 366 | reflexive: false, 367 | ignored: false, 368 | }, 369 | Column { 370 | name: String::from("column_3"), 371 | transient: false, 372 | complex: true, 373 | reflexive: true, 374 | ignored: false, 375 | }, 376 | Column { 377 | name: String::from("column_4"), 378 | transient: false, 379 | complex: false, 380 | reflexive: false, 381 | ignored: false, 382 | }, 383 | ]; 384 | // columns configuration: ignored::column_1 transient::column_2 complex::reflexive::column3 column_4 385 | // first column is ignored - we don't process entities from that column 386 | // third column is reflexive so we put it at the end 387 | let column_names = vec![ 388 | columns[1].name.clone(), 389 | columns[2].name.clone(), 390 | columns[3].name.clone(), 391 | columns[2].name.clone(), 392 | ]; 393 | // hashes for column names are used to differentiate entities with the same name 394 | // from different columns 395 | let field_hashes: Vec = column_names.iter().map(|name| hash(name)).collect(); 396 | 397 | // columns are most important, the rest can be omitted 398 | let dummy_config = Configuration::default(String::from(""), columns); 399 | 400 | let in_memory_entity_mapping_persistor = InMemoryEntityMappingPersistor::default(); 401 | let in_memory_entity_mapping_persistor = Arc::new(in_memory_entity_mapping_persistor); 402 | let mut result: SmallVec<[SmallVec<[u64; SMALL_VECTOR_SIZE]>; SMALL_VECTOR_SIZE]> = 403 | SmallVec::new(); 404 | let mut entity_processor = EntityProcessor::new( 405 | &dummy_config, 406 | in_memory_entity_mapping_persistor.clone(), 407 | |hashes| { 408 | result.push(hashes); 409 | }, 410 | ); 411 | 412 | let row = vec![ 413 | smallvec!["a"], 414 | smallvec!["bb"], 415 | smallvec!["ccc", "ddd"], 416 | smallvec!["eeee"], 417 | ]; 418 | entity_processor.process_row(&row); 419 | 420 | // first 
column is ignored, third one is reflexive so the entities go at the end 421 | // input: "bb", "ccc ddd", "eeee", "ccc ddd" 422 | // number of cartesian products from the above entities 423 | assert_eq!(4, result.len()); 424 | assert_eq!( 425 | prepare_hashes(4, &["bb", "ccc", "eeee", "ccc"], &field_hashes), 426 | result[0] 427 | ); 428 | assert_eq!( 429 | prepare_hashes(4, &["bb", "ccc", "eeee", "ddd"], &field_hashes), 430 | result[1] 431 | ); 432 | assert_eq!( 433 | prepare_hashes(4, &["bb", "ddd", "eeee", "ccc"], &field_hashes), 434 | result[2] 435 | ); 436 | assert_eq!( 437 | prepare_hashes(4, &["bb", "ddd", "eeee", "ddd"], &field_hashes), 438 | result[3] 439 | ); 440 | } 441 | } 442 | --------------------------------------------------------------------------------