├── pycleora ├── py.typed ├── .gitignore ├── __init__.py └── pycleora.pyi ├── legacy ├── docs │ ├── requirements.txt │ ├── source │ │ ├── _static │ │ │ ├── cleora_logo.png │ │ │ ├── cleora-columns.png │ │ │ ├── cleora-sparse-matrix.png │ │ │ ├── hypergraph-expansion.png │ │ │ └── hypergraph-expansion-for-each-hyperedge.png │ │ ├── examples.rst │ │ ├── index.rst │ │ ├── algorithms.rst │ │ ├── conf.py │ │ ├── why_cleora.rst │ │ ├── graph_creation.rst │ │ └── running.rst │ ├── Makefile │ └── make.bat ├── files │ ├── samples │ │ ├── edgelist_2.tsv │ │ ├── edgelist_1.tsv │ │ └── edgelist_2.json │ └── images │ │ ├── cleora.png │ │ └── figure_1.png ├── .cargo │ └── config ├── src │ ├── lib.rs │ ├── configuration.rs │ ├── pipeline.rs │ ├── main.rs │ ├── persistence.rs │ ├── embedding.rs │ └── entity.rs ├── Cargo.toml ├── .github │ └── workflows │ │ ├── ci.yml │ │ └── release.yml ├── CHANGELOG.md ├── tests │ └── snapshot.rs ├── benches │ └── cleora_benchmark.rs └── example_classification.ipynb ├── files ├── samples │ ├── edgelist_2.tsv │ ├── edgelist_1.tsv │ └── edgelist_2.json └── images │ ├── cleora.png │ └── figure_1.png ├── .cargo └── config ├── .gitignore ├── examples ├── predefined_cleora_loop.py ├── column_indices.py ├── graph_pickle.py ├── cleora_loop.py └── from_iterator.py ├── pyproject.toml ├── Cargo.toml ├── LICENSE ├── src ├── embedding.rs ├── configuration.rs ├── sparse_matrix.rs ├── entity.rs ├── pipeline.rs ├── lib.rs └── sparse_matrix_builder.rs ├── CHANGELOG.md ├── tests └── snapshot.rs ├── benches └── cleora_benchmark.rs ├── .github └── workflows │ └── CI.yml └── README.md /pycleora/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /legacy/docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx==3.5.3 2 | -------------------------------------------------------------------------------- /files/samples/edgelist_2.tsv: -------------------------------------------------------------------------------- 1 | u1 p1 p2 b1 b2 2 | u2 p2 p3 p4 b1 -------------------------------------------------------------------------------- /legacy/files/samples/edgelist_2.tsv: -------------------------------------------------------------------------------- 1 | u1 p1 p2 b1 b2 2 | u2 p2 p3 p4 b1 -------------------------------------------------------------------------------- /pycleora/.gitignore: -------------------------------------------------------------------------------- 1 | cleora.cpython-310-x86_64-linux-gnu.so 2 | __pycache__ -------------------------------------------------------------------------------- /.cargo/config: -------------------------------------------------------------------------------- 1 | [target.x86_64-unknown-linux-musl] 2 | linker = "x86_64-linux-musl-gcc" 3 | -------------------------------------------------------------------------------- /files/samples/edgelist_1.tsv: -------------------------------------------------------------------------------- 1 | a ba bac abb r rrr rr 2 | a ab bca bcc rr r 3 | ba ab a aa abb r rrr -------------------------------------------------------------------------------- /legacy/.cargo/config: -------------------------------------------------------------------------------- 1 | [target.x86_64-unknown-linux-musl] 2 | linker = "x86_64-linux-musl-gcc" 3 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | .DS_Store 2 | /target 3 | **/*.rs.bk 4 | .idea 5 | cleora-light-rust.iml 6 | *.out 7 | *.so -------------------------------------------------------------------------------- /files/images/cleora.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BaseModelAI/cleora/HEAD/files/images/cleora.png -------------------------------------------------------------------------------- /legacy/files/samples/edgelist_1.tsv: -------------------------------------------------------------------------------- 1 | a ba bac abb r rrr rr 2 | a ab bca bcc rr r 3 | ba ab a aa abb r rrr -------------------------------------------------------------------------------- /files/images/figure_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BaseModelAI/cleora/HEAD/files/images/figure_1.png -------------------------------------------------------------------------------- /legacy/files/images/cleora.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BaseModelAI/cleora/HEAD/legacy/files/images/cleora.png -------------------------------------------------------------------------------- /legacy/files/images/figure_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BaseModelAI/cleora/HEAD/legacy/files/images/figure_1.png -------------------------------------------------------------------------------- /legacy/docs/source/_static/cleora_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BaseModelAI/cleora/HEAD/legacy/docs/source/_static/cleora_logo.png -------------------------------------------------------------------------------- /legacy/docs/source/_static/cleora-columns.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BaseModelAI/cleora/HEAD/legacy/docs/source/_static/cleora-columns.png -------------------------------------------------------------------------------- /legacy/docs/source/_static/cleora-sparse-matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BaseModelAI/cleora/HEAD/legacy/docs/source/_static/cleora-sparse-matrix.png -------------------------------------------------------------------------------- /legacy/docs/source/_static/hypergraph-expansion.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BaseModelAI/cleora/HEAD/legacy/docs/source/_static/hypergraph-expansion.png -------------------------------------------------------------------------------- /legacy/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod configuration; 2 | pub mod embedding; 3 | pub mod entity; 4 | pub mod persistence; 5 | pub mod pipeline; 6 | pub mod sparse_matrix; 7 | -------------------------------------------------------------------------------- /files/samples/edgelist_2.json: -------------------------------------------------------------------------------- 1 | {"users": "u1", "products": ["p1", "p2"], "brands": ["b1", "b2"]} 2 | {"users": "u2", "products": ["p2", "p3", "p4"], "brands": ["b1"]} 3 | 
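A minimal usage sketch (not a file from the repository): it shows how hyperedge lines shaped like the files/samples/edgelist_2.tsv sample could be embedded with the pycleora API declared in pycleora/pycleora.pyi. Only the users and products fields are used so that the column specification defines a single relation, and the dimension, seed and iteration values are illustrative assumptions.

```python
# Illustrative sketch based on examples/cleora_loop.py and examples/column_indices.py.
import numpy as np
from pycleora import SparseMatrix

# Tab-separated columns; "products" is a complex column (space-separated identifiers).
hyperedges = [
    "u1\tp1 p2",
    "u2\tp2 p3 p4",
]

graph = SparseMatrix.from_iterator((e for e in hyperedges), "users complex::products")

embeddings = graph.initialize_deterministically(feature_dim=8, seed=0)
for _ in range(3):
    embeddings = graph.left_markov_propagate(embeddings)
    # L2-normalize after every propagation step, as the bundled examples do.
    embeddings /= np.linalg.norm(embeddings, ord=2, axis=-1, keepdims=True)

print(graph.entity_ids)
print(embeddings.shape)
```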
-------------------------------------------------------------------------------- /legacy/files/samples/edgelist_2.json: -------------------------------------------------------------------------------- 1 | {"users": "u1", "products": ["p1", "p2"], "brands": ["b1", "b2"]} 2 | {"users": "u2", "products": ["p2", "p3", "p4"], "brands": ["b1"]} 3 | -------------------------------------------------------------------------------- /legacy/docs/source/_static/hypergraph-expansion-for-each-hyperedge.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BaseModelAI/cleora/HEAD/legacy/docs/source/_static/hypergraph-expansion-for-each-hyperedge.png -------------------------------------------------------------------------------- /pycleora/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .pycleora import SparseMatrix 4 | 5 | def embed_using_baseline_cleora(graph, feature_dim: int, iter: int): 6 | embeddings = graph.initialize_deterministically(feature_dim) 7 | for i in range(iter): 8 | embeddings = graph.left_markov_propagate(embeddings) 9 | embeddings /= np.linalg.norm(embeddings, ord=2, axis=-1, keepdims=True) 10 | return embeddings -------------------------------------------------------------------------------- /examples/predefined_cleora_loop.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | from pycleora import embed_using_baseline_cleora, SparseMatrix 4 | 5 | start_time = time.time() 6 | graph = SparseMatrix.from_files(["perf_inputs/0.tsv", "perf_inputs/1.tsv", "perf_inputs/2.tsv", "perf_inputs/3.tsv", "perf_inputs/4.tsv", "perf_inputs/5.tsv", "perf_inputs/6.tsv", "perf_inputs/7.tsv"], "complex::reflexive::name") 7 | embeddings = embed_using_baseline_cleora(graph, 128, 3) 8 | print(f"Took {time.time() - start_time} seconds ") 9 | -------------------------------------------------------------------------------- /examples/column_indices.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pycleora import SparseMatrix 3 | 4 | hyperedges = [ 5 | 'a\t1', 6 | 'a\t2', 7 | 'b\t5', 8 | 'b\t2', 9 | 'c\t8', 10 | ] 11 | 12 | graph = SparseMatrix.from_iterator((e for e in hyperedges), "char num") 13 | 14 | entity_ids = np.array(graph.entity_ids) 15 | print(entity_ids) 16 | print(graph.entity_degrees) 17 | 18 | print(graph.get_entity_column_mask('char')) 19 | print(graph.get_entity_column_mask('num')) 20 | 21 | print(entity_ids[graph.get_entity_column_mask('char')]) 22 | print(entity_ids[graph.get_entity_column_mask('num')]) -------------------------------------------------------------------------------- /legacy/docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /examples/graph_pickle.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import numpy as np 4 | from pycleora import SparseMatrix 5 | 6 | import pickle 7 | 8 | start_time = time.time() 9 | 10 | graph = SparseMatrix.from_files(["perf_inputs/0.tsv"], "complex::reflexive::name") 11 | 12 | print("Entities n", len(graph.entity_ids)) 13 | print(graph.entity_ids[:10]) 14 | 15 | with open('graph.pkl', 'wb') as f: 16 | pickle.dump(graph, f) 17 | 18 | with open('graph.pkl', 'rb') as f: 19 | graph_reread = pickle.load(f) 20 | 21 | print(graph.entity_ids[:10]) 22 | print(graph_reread.entity_ids[:10]) 23 | 24 | embeddings = graph_reread.initialize_deterministically(feature_dim=128, seed=0) 25 | embeddings = graph_reread.left_markov_propagate(embeddings) 26 | 27 | print(embeddings) -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["maturin>=1.2.3"] 3 | build-backend = "maturin" 4 | 5 | [project] 6 | name = "pycleora" 7 | requires-python = ">=3.7" 8 | classifiers = [ 9 | "Programming Language :: Rust", 10 | "Programming Language :: Python :: Implementation :: CPython", 11 | "Programming Language :: Python :: Implementation :: PyPy", 12 | ] 13 | version = "2.1.0" 14 | description = "Sparse hypergraph structure and Markov propagation for node embeddings, exposed via Python bindings." 15 | readme = { file = "README.md", content-type = "text/markdown" } 16 | authors = [ 17 | { name = "Jacek Dabrowski", email = "jack.dabrowski@synerise.com" } 18 | ] 19 | license = { file = "LICENSE" } 20 | 21 | 22 | [tool.maturin] 23 | features = ["pyo3/extension-module"] -------------------------------------------------------------------------------- /legacy/docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | 	set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | 	echo. 18 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | 	echo.installed, then set the SPHINXBUILD environment variable to point 20 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | 	echo.may add the Sphinx directory to PATH. 22 | 	echo.
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /examples/cleora_loop.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import numpy as np 4 | from pycleora import SparseMatrix 5 | 6 | start_time = time.time() 7 | 8 | # graph = SparseMatrix.from_files(["zaba30_large_5m.tsv"], "basket complex::product", hyperedge_trim_n=16) 9 | graph = SparseMatrix.from_files(["perf_inputs/0.tsv", "perf_inputs/1.tsv", "perf_inputs/2.tsv", "perf_inputs/3.tsv", "perf_inputs/4.tsv", "perf_inputs/5.tsv", "perf_inputs/6.tsv", "perf_inputs/7.tsv"], "complex::reflexive::name") 10 | 11 | print("Entities n", len(graph.entity_ids)) 12 | # embeddings = np.random.randn(len(graph.entity_ids), 128).astype(np.float32) 13 | embeddings = graph.initialize_deterministically(feature_dim=128, seed=0) 14 | 15 | for i in range(3): 16 | embeddings = graph.left_markov_propagate(embeddings) 17 | # embeddings = graph.symmetric_markov_propagate(embeddings) 18 | 19 | embeddings /= np.linalg.norm(embeddings, ord=2, axis=-1, keepdims=True) 20 | print(f"Iter {i} finished") 21 | 22 | print(graph.entity_ids[:10]) 23 | 24 | print(f"Took {time.time() - start_time} seconds ") 25 | -------------------------------------------------------------------------------- /examples/from_iterator.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import numpy as np 4 | from pycleora import SparseMatrix 5 | 6 | start_time = time.time() 7 | 8 | def edges_iterator(): 9 | lines = [] 10 | 11 | files = ["perf_inputs/0.tsv", "perf_inputs/1.tsv", "perf_inputs/2.tsv", "perf_inputs/3.tsv", "perf_inputs/4.tsv", "perf_inputs/5.tsv", "perf_inputs/6.tsv", "perf_inputs/7.tsv"] 12 | for file in files: 13 | with open(file, 'rt') as f: 14 | lines.extend(f) 15 | 16 | iteration_start_time = time.time() 17 | for line in lines: 18 | yield line 19 | print(f"Iteration took {time.time() - iteration_start_time} seconds ") 20 | 21 | graph = SparseMatrix.from_iterator(edges_iterator(), "complex::reflexive::product") 22 | 23 | print("Entities n", len(graph.entity_ids)) 24 | print(graph.entity_ids[:10]) 25 | 26 | embeddings = np.random.randn(len(graph.entity_ids), 256).astype(np.float32) 27 | 28 | for i in range(3): 29 | embeddings = graph.left_markov_propagate(embeddings) 30 | embeddings /= np.linalg.norm(embeddings, ord=2, axis=-1, keepdims=True) 31 | print(f"Iter {i} finished") 32 | 33 | print(f"Took {time.time() - start_time} seconds ") -------------------------------------------------------------------------------- /legacy/docs/source/examples.rst: -------------------------------------------------------------------------------- 1 | .. _examples: 2 | 3 | Use cases examples 4 | ===================== 5 | 6 | .. 
list-table:: 7 | :widths: 40 80 80 8 | :header-rows: 1 9 | 10 | * - Examples 11 | - Description 12 | - Dataset 13 | * - `Classification `_ 14 | - Synerise Cleora Classification Example for Facebook Large Page-Page Network 15 | - `Facebook Large Page-Page Network `_ 16 | * - `Link Prediction `_ 17 | - Synerise Cleora Link Prediction Example for Facebook Large Page-Page Network 18 | - `Facebook Large Page-Page Network `_ 19 | * - `Link Prediction `_ 20 | - Synerise Cleora Link Prediction Example for The Complete Journey 21 | - `The Complete Journey `_ -------------------------------------------------------------------------------- /legacy/docs/source/index.rst: -------------------------------------------------------------------------------- 1 | :github_url: https://github.com/Synerise/cleora 2 | 3 | Synerise Cleora AI Documentation 4 | ===================================== 5 | 6 | **Synerise Cleora AI** is a general-purpose model for efficient, scalable learning of stable and inductive entity embeddings for heterogeneous relational data. **Cleora** embeds entities in n-dimensional spherical spaces utilizing extremely fast, stable, iterative random projections, which allows for unparalleled performance and scalability. 7 | 8 | Types of data which can be embedded include, for example: 9 | 10 | - heterogeneous undirected graphs 11 | - heterogeneous undirected hypergraphs 12 | - text and other categorical array data 13 | - any combination of the above 14 | 15 | Read the whitepaper `Cleora: A Simple, Strong and Scalable Graph Embedding Scheme `_ 16 | 17 | =========================================== 18 | 19 | .. toctree:: 20 | :maxdepth: 2 21 | :caption: Contents: 22 | 23 | graph_creation 24 | algorithms 25 | running 26 | examples 27 | why_cleora 28 | 29 | 30 | 31 | Indices and tables 32 | ================== 33 | 34 | * :ref:`genindex` 35 | * :ref:`modindex` 36 | * :ref:`search` 37 | -------------------------------------------------------------------------------- /legacy/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "cleora" 3 | version = "1.2.3" 4 | authors = ["Piotr Babel ", "Jacek Dabrowski ", "Konrad Goluchowski "] 5 | edition = "2018" 6 | license-file = "LICENSE" 7 | readme = "README.md" 8 | description = """ 9 | Cleora is a general-purpose model for efficient, scalable learning of stable and inductive entity embeddings for heterogeneous relational data.
10 | """ 11 | 12 | [build] 13 | rustflags = ["-C", "target-cpu=native"] 14 | 15 | [dependencies] 16 | bus = "2.2.4" 17 | clap = { version = "3.1.8", features = ["cargo"] } 18 | env_logger = "0.9.0" 19 | log = "0.4.17" 20 | memmap = "0.7.0" 21 | rayon = "1.5.3" 22 | rustc-hash = "1.1.0" 23 | smallvec = "1.8.1" 24 | twox-hash = "1.6.3" 25 | simdjson-rust = {git = "https://github.com/SunDoge/simdjson-rust"} 26 | ryu = "1.0.10" 27 | ndarray = "0.15.4" 28 | ndarray-npy = "0.8.1" 29 | serde_json = "1.0.81" 30 | uuid = { version = "1.1.2", features = ["v4"] } 31 | 32 | [dev-dependencies] 33 | criterion = "0.3.3" 34 | insta = "1.3.0" 35 | 36 | [[bench]] 37 | name = "cleora_benchmark" 38 | harness = false 39 | 40 | [profile.release] 41 | opt-level = 3 42 | lto = true 43 | codegen-units = 1 44 | -------------------------------------------------------------------------------- /pycleora/pycleora.pyi: -------------------------------------------------------------------------------- 1 | from typing import Any, Iterable, Optional, Self 2 | 3 | import numpy as np 4 | from numpy.typing import NDArray 5 | 6 | 7 | class SparseMatrix: 8 | def __new__(cls, *args: Any) -> Self: 9 | pass 10 | 11 | @classmethod 12 | def from_iterator( 13 | cls, hyperedges: Iterable[str], columns: str, hyperedge_trim_n: int = 16, num_workers: Optional[int] = None 14 | ) -> Self: 15 | pass 16 | 17 | @classmethod 18 | def from_files( 19 | cls, filepaths: list[str], columns: str, hyperedge_trim_n: int = 16, num_workers: Optional[int] = None 20 | ) -> Self: 21 | pass 22 | 23 | def left_markov_propagate(self, x: NDArray[np.float32], num_workers: Optional[int] = None) -> NDArray[np.float32]: 24 | pass 25 | 26 | def symmetric_markov_propagate( 27 | self, x: NDArray[np.float32], num_workers: Optional[int] = None 28 | ) -> NDArray[np.float32]: 29 | pass 30 | 31 | def get_entity_column_mask(self, column_name: str) -> NDArray[np.bool]: 32 | pass 33 | 34 | def entity_degrees(self) -> NDArray[np.float32]: 35 | pass 36 | 37 | def initialize_deterministically(self, feature_dim: int, seed: int = 0) -> NDArray[np.float32]: 38 | pass 39 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "pycleora" 3 | version = "2.1.0" 4 | edition = "2018" 5 | license-file = "LICENSE" 6 | readme = "README.md" 7 | documentation = "https://github.com/synerise/cleora" 8 | homepage = "https://github.com/synerise/cleora" 9 | repository = "https://github.com/synerise/cleora" 10 | description = """ 11 | Sparse hypergraph structure and markov-propagation for node embeddings embeddings exposed via Python bindings. 
12 | """ 13 | 14 | [lib] 15 | crate-type = ["cdylib", "rlib"] 16 | 17 | [build] 18 | rustflags = ["-C", "target-cpu=native"] 19 | 20 | [dependencies] 21 | log = "0.4.17" 22 | rayon = "1.5.3" 23 | rustc-hash = "1.1.0" 24 | smallvec = "1.8.1" 25 | twox-hash = "1.6.3" 26 | ndarray = { version = "0.15.4", features = ["rayon"] } 27 | ndarray-npy = "0.8.1" 28 | uuid = { version = "1.1.2", features = ["v4"] } 29 | crossbeam = "0.8.1" 30 | dashmap = { version = "5.3.4", features = ["rayon"] } 31 | num_cpus = "1.13.1" 32 | itertools = "0.10.3" 33 | serde = { version = "1.0.163", features = ["derive"] } 34 | bincode = "1.3.3" 35 | 36 | pyo3 = "0.18.1" 37 | numpy = "0.18" 38 | 39 | [dev-dependencies] 40 | criterion = "0.3.3" 41 | insta = "1.3.0" 42 | ndarray-rand = "0.14.0" 43 | 44 | [[bench]] 45 | name = "cleora_benchmark" 46 | harness = false 47 | 48 | [profile.release] 49 | opt-level = 3 50 | lto = true 51 | codegen-units = 1 52 | 53 | [target.aarch64-apple-darwin] 54 | linker = "aarch64-apple-darwin21.4-clang" 55 | ar = "aarch64-apple-darwin21.4-ar" 56 | rustflags = [ 57 | "-C", "link-arg=-undefined", 58 | "-C", "link-arg=dynamic_lookup", 59 | ] 60 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | SOFTWARE LICENSING 2 | 3 | You are licensed to use Synerise Cleora produced by Synerise SA. under an MIT LICENSE 4 | 5 | Synerise Cleora MIT License 6 | 7 | Copyright (c) 2020 Synerise SA - [entered into the Register of 8 | Entrepreneurs of the National Court Register maintained by the 9 | District Court for Kraków-Śródmieście in Kraków, XI Commercial Division 10 | of the National Court Register with the KRS number 0000468034, 11 | NIP (tax identification number) number 679309 32 92, 12 | share capital in the amount of PLN 556 150,00. paid up in full.] 13 | 14 | Permission is hereby granted, free of charge, to any person obtaining 15 | a copy of this software and associated documentation files (the 16 | "Software"), to deal in the Software without restriction, including 17 | without limitation the rights to use, copy, modify, merge, publish, 18 | distribute, sublicense, and/or sell copies of the Software, and to 19 | permit persons to whom the Software is furnished to do so, subject to 20 | the following conditions: 21 | 22 | The above copyright notice and this permission notice shall be included 23 | in all copies or substantial portions of the Software. 24 | 25 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 26 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 27 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 28 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 29 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 30 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 31 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /legacy/docs/source/algorithms.rst: -------------------------------------------------------------------------------- 1 | .. _algorithms: 2 | 3 | Hypergraph expansion methods 4 | ============================================================= 5 | 6 | 7 | .. 
figure:: _static/hypergraph-expansion.png 8 | :figwidth: 100 % 9 | :width: 100 % 10 | :align: center 11 | :alt: hypergraph expansion 12 | 13 | **Hypergraph Expansion** - Cleora needs to break down all existing hyperedges into edges, as the algorithm relies on the pairwise notion of node transition. Hypergraph-to-graph expansion is done using one of two alternative strategies: 14 | 15 | 16 | Clique Expansion 17 | --------------------------- 18 | 19 | - Each hyperedge is transformed into a clique - a subgraph where each pair of nodes is connected with an edge. Space/time complexity of this approach is: 20 | 21 | .. math:: 22 | 23 | O(|V| \times d + |E| \times k^2) 24 | 25 | where |E| is the number of hyperedges and k is the maximal hyperedge width. 26 | 27 | With cliques the number of created edges can be significant, but this approach guarantees better fidelity to the original hyperedge relationship. We apply this scheme to smaller graphs. 28 | 29 | 30 | .. figure:: _static/hypergraph-expansion-for-each-hyperedge.png 31 | :figwidth: 100 % 32 | :width: 100 % 33 | :align: center 34 | :alt: hypergraph expansion for each hyperedge 35 | 36 | 37 | Star Expansion 38 | --------------------------- 39 | - An extra node is introduced which links to the original nodes contained in a hyperedge. Space/time complexity of this approach is: 40 | 41 | .. math:: 42 | 43 | O((|V| + |E|) \times d + |E| \times k) 44 | 45 | Here we must also count the time and space needed to embed an extra entity for each hyperedge, but we save on the number of created edges, which would be only k for each hyperedge. This approach is suited for large graphs. 46 | -------------------------------------------------------------------------------- /legacy/.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | env: 10 | RUST_BACKTRACE: 1 11 | 12 | jobs: 13 | check: 14 | name: Check 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v2 18 | - uses: actions-rs/toolchain@v1 19 | with: 20 | profile: minimal 21 | toolchain: stable 22 | override: true 23 | - uses: actions-rs/cargo@v1 24 | with: 25 | command: check 26 | 27 | test: 28 | name: Test Suite 29 | runs-on: ubuntu-latest 30 | steps: 31 | - uses: actions/checkout@v2 32 | - uses: actions-rs/toolchain@v1 33 | with: 34 | profile: minimal 35 | toolchain: stable 36 | override: true 37 | - uses: actions-rs/cargo@v1 38 | with: 39 | command: test 40 | 41 | fmt: 42 | name: Rustfmt 43 | runs-on: ubuntu-latest 44 | steps: 45 | - uses: actions/checkout@v2 46 | - uses: actions-rs/toolchain@v1 47 | with: 48 | profile: minimal 49 | toolchain: stable 50 | override: true 51 | - run: rustup component add rustfmt 52 | - uses: actions-rs/cargo@v1 53 | with: 54 | command: fmt 55 | args: --all -- --check 56 | 57 | clippy: 58 | name: Clippy 59 | runs-on: ubuntu-latest 60 | steps: 61 | - uses: actions/checkout@v2 62 | - uses: actions-rs/toolchain@v1 63 | with: 64 | profile: minimal 65 | toolchain: stable 66 | override: true 67 | - run: rustup component add clippy 68 | - uses: actions-rs/cargo@v1 69 | with: 70 | command: clippy 71 | args: -- -D warnings 72 | -------------------------------------------------------------------------------- /legacy/docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder.
2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'Synerise Cleora' 21 | copyright = '2021, Synerise' 22 | author = 'Synerise' 23 | 24 | # The full version, including alpha/beta/rc tags 25 | release = '1.1.0' 26 | 27 | 28 | # -- General configuration --------------------------------------------------- 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 32 | # ones. 33 | extensions = [ 34 | "sphinx.ext.intersphinx", 35 | "sphinx.ext.autodoc", 36 | "sphinx.ext.mathjax", 37 | "sphinx.ext.viewcode", 38 | ] 39 | 40 | 41 | # Add any paths that contain templates here, relative to this directory. 42 | templates_path = ['_templates'] 43 | 44 | # List of patterns, relative to source directory, that match files and 45 | # directories to ignore when looking for source files. 46 | # This pattern also affects html_static_path and html_extra_path. 47 | exclude_patterns = [] 48 | 49 | 50 | # -- Options for HTML output ------------------------------------------------- 51 | 52 | # The theme to use for HTML and HTML Help pages. See the documentation for 53 | # a list of builtin themes. 54 | # 55 | html_theme = 'alabaster' 56 | # Add any paths that contain custom static files (such as style sheets) here, 57 | # relative to this directory. They are copied after the builtin static files, 58 | # so a file named "default.css" will overwrite the builtin "default.css". 
59 | 60 | # sets the darker appearance 61 | html_theme_options = { 62 | 'style': 'darker' 63 | } 64 | 65 | html_static_path = ['_static'] 66 | html_logo = '_static/cleora_logo.png' 67 | -------------------------------------------------------------------------------- /src/embedding.rs: -------------------------------------------------------------------------------- 1 | use crate::sparse_matrix::Edge; 2 | use crate::sparse_matrix::SparseMatrix; 3 | use ndarray::{Array, Array1, Array2, ArrayView2, Axis}; 4 | use rayon::prelude::*; 5 | use rayon::ThreadPoolBuilder; 6 | 7 | pub enum MarkovType { 8 | Left, 9 | Symmetric, 10 | } 11 | 12 | pub struct NdArrayMatrix; 13 | 14 | impl NdArrayMatrix { 15 | pub fn multiply( 16 | sparse_matrix_reader: &SparseMatrix, 17 | other: ArrayView2<f32>, 18 | markov_type: MarkovType, 19 | num_workers: usize, 20 | ) -> Array2<f32> { 21 | let mut new_matrix: Array2<f32> = Array::zeros(other.raw_dim()); 22 | ThreadPoolBuilder::new() 23 | .num_threads(num_workers) 24 | .build() 25 | .unwrap() 26 | .install(|| { 27 | new_matrix 28 | .axis_iter_mut(Axis(0)) 29 | .into_par_iter() 30 | .zip(sparse_matrix_reader.slices.par_iter()) 31 | .for_each(|(mut row, (start, end))| { 32 | let edges = &sparse_matrix_reader.edges[*start..*end]; 33 | 34 | let new_row: Array1<f32> = edges 35 | .par_iter() 36 | .fold( 37 | || Array1::zeros(other.shape()[1]), 38 | |mut row, edge| { 39 | let Edge { 40 | left_markov_value, 41 | symmetric_markov_value, 42 | other_entity_ix, 43 | } = edge; 44 | let value = match markov_type { 45 | MarkovType::Left => left_markov_value, 46 | MarkovType::Symmetric => symmetric_markov_value, 47 | }; 48 | let other_row = &other.row(*other_entity_ix as usize); 49 | row.scaled_add(*value, other_row); 50 | row 51 | }, 52 | ) 53 | .reduce_with(|v1, v2| v1 + v2) 54 | .expect("Must have at least one edge"); 55 | 56 | row.assign(&new_row); 57 | }); 58 | }); 59 | new_matrix 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /legacy/docs/source/why_cleora.rst: -------------------------------------------------------------------------------- 1 | .. _why-cleora: 2 | 3 | Why is Synerise Cleora worth using? 4 | =============================================== 5 | 6 | Key technical features of Cleora embeddings 7 | -------------------------------------------------------------------------------------------- 8 | 9 | The embeddings produced by Cleora differ from those produced by Node2vec, Word2vec, DeepWalk or other systems in this class in a number of key properties: 10 | 11 | - **efficiency** - Cleora is two orders of magnitude faster than Node2Vec or DeepWalk 12 | - **inductivity** - as Cleora embeddings of an entity are defined only by interactions with other entities, vectors for new entities can be computed on-the-fly 13 | - **updatability** - refreshing a Cleora embedding for an entity is a very fast operation allowing for real-time updates without retraining 14 | - **stability** - all starting vectors for entities are deterministic, which means that Cleora embeddings on similar datasets will end up being similar. Methods like Word2vec, Node2vec or DeepWalk return different results with every run. 15 | - **cross-dataset compositionality** - thanks to the stability of Cleora embeddings, embeddings of the same entity on multiple datasets can be combined by averaging, yielding meaningful vectors 16 | - **dim-wise independence** - thanks to the process producing Cleora embeddings, every dimension is independent of others.
This property allows for efficient and low-parameter method for combining multi-view embeddings with Conv1d layers. 17 | - **extreme parallelism and performance** - Cleora is written in Rust utilizing thread-level parallelism for all calculations except input file loading. In practice this means that the embedding process is often faster than loading the input data. 18 | 19 | Key usability features of Cleora embeddings 20 | -------------------------------------------------------------------------------------------- 21 | 22 | The technical properties described above imply good production-readiness of Cleora, which from the end-user perspective can be summarized as follows: 23 | 24 | - heterogeneous relational tables can be embedded without any artificial data pre-processing 25 | - mixed interaction + text datasets can be embedded with ease 26 | - cold start problem for new entities is non-existent 27 | - real-time updates of the embeddings do not require any separate solutions 28 | - multi-view embeddings work out of the box 29 | - temporal, incremental embeddings are stable out of the box, with no need for re-alignment, rotations or other methods 30 | - extremely large datasets are supported and can be embedded within seconds / minutes 31 | -------------------------------------------------------------------------------- /legacy/.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*' 7 | 8 | jobs: 9 | unix-release: 10 | name: ${{ matrix.target }} 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | matrix: 14 | include: 15 | - os: ubuntu-18.04 16 | target: x86_64-unknown-linux-gnu 17 | 18 | - os: ubuntu-18.04 19 | target: x86_64-unknown-linux-musl 20 | 21 | - os: macos-latest 22 | target: x86_64-apple-darwin 23 | 24 | steps: 25 | - name: Checkout repository 26 | uses: actions/checkout@v2 27 | 28 | - name: Set the version 29 | id: version 30 | run: echo ::set-output name=VERSION::${GITHUB_REF#refs/tags/} 31 | 32 | - name: Install Rust 33 | uses: actions-rs/toolchain@v1 34 | with: 35 | toolchain: stable 36 | profile: minimal 37 | override: true 38 | target: ${{ matrix.target }} 39 | 40 | - name: Build 41 | run: cargo build --release --locked 42 | 43 | - name: Strip binary 44 | run: strip target/release/cleora 45 | 46 | - name: Upload binaries to release 47 | uses: svenstaro/upload-release-action@v1-release 48 | with: 49 | repo_token: ${{ secrets.GITHUB_TOKEN }} 50 | file: target/release/cleora 51 | asset_name: cleora-${{ steps.version.outputs.VERSION }}-${{ matrix.target }} 52 | tag: ${{ github.ref }} 53 | 54 | windows-release: 55 | name: ${{ matrix.target }} 56 | runs-on: ${{ matrix.os }} 57 | strategy: 58 | matrix: 59 | include: 60 | - os: windows-latest 61 | target: x86_64-pc-windows-msvc 62 | 63 | steps: 64 | - name: Checkout repository 65 | uses: actions/checkout@v2 66 | 67 | - name: Set the version 68 | id: version 69 | run: | 70 | $TAG=${env:GITHUB_REF} -replace 'refs/tags/', '' 71 | echo "::set-output name=VERSION::$TAG" 72 | 73 | - name: Install Rust 74 | uses: actions-rs/toolchain@v1 75 | with: 76 | toolchain: stable 77 | profile: minimal 78 | override: true 79 | target: ${{ matrix.target }} 80 | 81 | - name: Build 82 | run: cargo build --release --locked 83 | 84 | - name: Upload binaries to release 85 | uses: svenstaro/upload-release-action@v1-release 86 | with: 87 | repo_token: ${{ secrets.GITHUB_TOKEN }} 88 | file: target/release/cleora.exe 89 | asset_name: cleora-${{ 
steps.version.outputs.VERSION }}-${{ matrix.target }} 90 | tag: ${{ github.ref }} -------------------------------------------------------------------------------- /src/configuration.rs: -------------------------------------------------------------------------------- 1 | use crate::sparse_matrix::SparseMatrixDescriptor; 2 | 3 | #[derive(Debug)] 4 | pub struct Configuration { 5 | pub seed: Option, 6 | pub matrix_desc: SparseMatrixDescriptor, 7 | pub columns: Vec, 8 | pub hyperedge_trim_n: usize, 9 | pub num_workers_graph_building: usize, 10 | } 11 | 12 | #[derive(Debug, Default)] 13 | pub struct Column { 14 | /// Name, header of the column 15 | pub name: String, 16 | 17 | /// The field is composite, containing multiple entity identifiers separated by space 18 | pub complex: bool, 19 | 20 | /// The field is reflexive, which means that it interacts with itself, additional output file is written for every such field 21 | pub reflexive: bool, 22 | } 23 | 24 | /// Extract columns config based on raw strings. 25 | pub fn parse_fields(columns: &str) -> Result, String> { 26 | let cols: Vec<&str> = columns.split(' ').collect(); 27 | 28 | let mut columns: Vec = Vec::new(); 29 | for col in cols { 30 | let parts: Vec<&str> = col.split("::").collect(); 31 | 32 | let column_name: &str; 33 | let mut complex = false; 34 | let mut reflexive = false; 35 | 36 | let parts_len = parts.len(); 37 | if parts_len > 1 { 38 | column_name = *parts.last().unwrap(); 39 | let column_name_idx = parts_len - 1; 40 | for &part in &parts[..column_name_idx] { 41 | if part.eq_ignore_ascii_case("complex") { 42 | complex = true; 43 | } else if part.eq_ignore_ascii_case("reflexive") { 44 | reflexive = true; 45 | } else { 46 | let message = format!("Unrecognized column field modifier: {}", part); 47 | return Err(message); 48 | } 49 | } 50 | } else { 51 | column_name = col; 52 | } 53 | let column = Column { 54 | name: column_name.to_string(), 55 | complex, 56 | reflexive, 57 | }; 58 | columns.push(column); 59 | } 60 | 61 | let columns = validate_column_modifiers(columns)?; 62 | Ok(columns) 63 | } 64 | 65 | fn validate_column_modifiers(cols: Vec) -> Result, String> { 66 | for col in &cols { 67 | // transient::reflexive - this would generate no output 68 | // transient::reflexive::complex - this would generate no output 69 | if col.reflexive && !col.complex { 70 | let message = format!( 71 | "A field cannot be REFLEXIVE but NOT COMPLEX. It does not make sense: {}", 72 | col.name 73 | ); 74 | return Err(message); 75 | } 76 | } 77 | Ok(cols) 78 | } 79 | -------------------------------------------------------------------------------- /legacy/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # 1.2.3 (June 29, 2022) 2 | 3 | ### Changed 4 | - Bump libs ([#60]). 5 | 6 | [#60]: https://github.com/Synerise/cleora/pull/60 7 | 8 | ### Fixed 9 | - Check for malformed lines in input ([#59]). 10 | 11 | [#59]: https://github.com/Synerise/cleora/pull/59 12 | 13 | 14 | # 1.2.2 (June 24, 2022) 15 | 16 | ### Changed 17 | - Allow cleora to accept multiple input files as positional args. Named argument 'input' is getting deprecated. 18 | 19 | [#55]: https://github.com/Synerise/cleora/pull/55 20 | 21 | 22 | # 1.2.1 (April 13, 2022) 23 | 24 | ### Changed 25 | - Optimize "--output-format numpy" mode, so it doesn't require additional memory when writing output file ([#50]). 26 | - Bump libs ([#52]). 
27 | 28 | [#50]: https://github.com/Synerise/cleora/pull/50 29 | [#52]: https://github.com/Synerise/cleora/pull/52 30 | 31 | 32 | # 1.2.0 (March 17, 2022) 33 | 34 | ### Added 35 | - Use default hasher for vector init ([#47]). 36 | 37 | [#47]: https://github.com/Synerise/cleora/pull/47 38 | 39 | 40 | # 1.1.1 (May 14, 2021) 41 | 42 | ### Added 43 | - Init embedding with seed during training ([#27]). 44 | 45 | [#27]: https://github.com/Synerise/cleora/pull/27 46 | 47 | 48 | # 1.1.0 (December 23, 2020) 49 | 50 | ### Changed 51 | - Bumped `env_logger` to `0.8.2`, `smallvec` to `1.5.1`, removed `fnv` hasher ([#11]). 52 | 53 | [#11]: https://github.com/Synerise/cleora/pull/11 54 | 55 | ### Added 56 | - Tests (snapshots) for in-memory and memory-mapped files calculations of embeddings ([#12]). 57 | - Support for `NumPy` output format (available via `--output-format` program argument) ([#15]). 58 | - Jupyter notebooks with experiments ([#16]). 59 | 60 | [#12]: https://github.com/Synerise/cleora/pull/12 61 | [#15]: https://github.com/Synerise/cleora/pull/15 62 | [#16]: https://github.com/Synerise/cleora/pull/16 63 | 64 | ### Improved 65 | - Used `vector` for `hash_to_id` mappings, non-allocating cartesian product, `ryu` crate for faster write ([#13]). 66 | - Sparse Matrix refactor (cleanup, simplification, using iter, speedup). Use Cargo.toml data for clap crate ([#17]). 67 | - Unify and simplify embeddings calculation for in-memory and mmap matrices ([#18]). 68 | 69 | [#13]: https://github.com/Synerise/cleora/pull/13 70 | [#17]: https://github.com/Synerise/cleora/pull/17 71 | [#18]: https://github.com/Synerise/cleora/pull/18 72 | 73 | 74 | # 1.0.1 (November 23, 2020) 75 | 76 | ### Fixed 77 | - Skip reading invalid UTF-8 line ([#8]). 78 | - Fix clippy warnings ([#7]). 79 | 80 | [#8]: https://github.com/Synerise/cleora/pull/8 81 | [#7]: https://github.com/Synerise/cleora/pull/7 82 | 83 | ### Added 84 | - JSON support ([#3]). 85 | - Snapshot testing ([#5]). 86 | 87 | [#3]: https://github.com/Synerise/cleora/pull/3 88 | [#5]: https://github.com/Synerise/cleora/pull/5 89 | 90 | 91 | # 1.0.0 (November 6, 2020) 92 | 93 | - Initial release. -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # 2.0.0 (Nov 24, 2024) 2 | 3 | ### New version released 4 | - Python package with rust bindings 5 | - Improved performance, memory usage and concurrency 6 | - Allowing external embedding as seed 7 | - Interoperable with numpy 8 | - Not a standalone console application any more 9 | - old version available in legacy/ folder will not be maintained 10 | 11 | # 1.2.3 (June 29, 2022) 12 | 13 | ### Changed 14 | - Bump libs ([#60]). 15 | 16 | [#60]: https://github.com/Synerise/cleora/pull/60 17 | 18 | ### Fixed 19 | - Check for malformed lines in input ([#59]). 20 | 21 | [#59]: https://github.com/Synerise/cleora/pull/59 22 | 23 | 24 | # 1.2.2 (June 24, 2022) 25 | 26 | ### Changed 27 | - Allow cleora to accept multiple input files as positional args. Named argument 'input' is getting deprecated. 28 | 29 | [#55]: https://github.com/Synerise/cleora/pull/55 30 | 31 | 32 | # 1.2.1 (April 13, 2022) 33 | 34 | ### Changed 35 | - Optimize "--output-format numpy" mode, so it doesn't require additional memory when writing output file ([#50]). 36 | - Bump libs ([#52]). 
37 | 38 | [#50]: https://github.com/Synerise/cleora/pull/50 39 | [#52]: https://github.com/Synerise/cleora/pull/52 40 | 41 | 42 | # 1.2.0 (March 17, 2022) 43 | 44 | ### Added 45 | - Use default hasher for vector init ([#47]). 46 | 47 | [#47]: https://github.com/Synerise/cleora/pull/47 48 | 49 | 50 | # 1.1.1 (May 14, 2021) 51 | 52 | ### Added 53 | - Init embedding with seed during training ([#27]). 54 | 55 | [#27]: https://github.com/Synerise/cleora/pull/27 56 | 57 | 58 | # 1.1.0 (December 23, 2020) 59 | 60 | ### Changed 61 | - Bumped `env_logger` to `0.8.2`, `smallvec` to `1.5.1`, removed `fnv` hasher ([#11]). 62 | 63 | [#11]: https://github.com/Synerise/cleora/pull/11 64 | 65 | ### Added 66 | - Tests (snapshots) for in-memory and memory-mapped files calculations of embeddings ([#12]). 67 | - Support for `NumPy` output format (available via `--output-format` program argument) ([#15]). 68 | - Jupyter notebooks with experiments ([#16]). 69 | 70 | [#12]: https://github.com/Synerise/cleora/pull/12 71 | [#15]: https://github.com/Synerise/cleora/pull/15 72 | [#16]: https://github.com/Synerise/cleora/pull/16 73 | 74 | ### Improved 75 | - Used `vector` for `hash_to_id` mappings, non-allocating cartesian product, `ryu` crate for faster write ([#13]). 76 | - Sparse Matrix refactor (cleanup, simplification, using iter, speedup). Use Cargo.toml data for clap crate ([#17]). 77 | - Unify and simplify embeddings calculation for in-memory and mmap matrices ([#18]). 78 | 79 | [#13]: https://github.com/Synerise/cleora/pull/13 80 | [#17]: https://github.com/Synerise/cleora/pull/17 81 | [#18]: https://github.com/Synerise/cleora/pull/18 82 | 83 | 84 | # 1.0.1 (November 23, 2020) 85 | 86 | ### Fixed 87 | - Skip reading invalid UTF-8 line ([#8]). 88 | - Fix clippy warnings ([#7]). 89 | 90 | [#8]: https://github.com/Synerise/cleora/pull/8 91 | [#7]: https://github.com/Synerise/cleora/pull/7 92 | 93 | ### Added 94 | - JSON support ([#3]). 95 | - Snapshot testing ([#5]). 96 | 97 | [#3]: https://github.com/Synerise/cleora/pull/3 98 | [#5]: https://github.com/Synerise/cleora/pull/5 99 | 100 | 101 | # 1.0.0 (November 6, 2020) 102 | 103 | - Initial release. -------------------------------------------------------------------------------- /src/sparse_matrix.rs: -------------------------------------------------------------------------------- 1 | use crate::configuration::Column; 2 | use pyo3::pyclass; 3 | use serde::{Deserialize, Serialize}; 4 | 5 | pub fn create_sparse_matrix_descriptor( 6 | colums: &Vec, 7 | ) -> Result { 8 | let mut matrices_descs = create_sparse_matrices_descriptors(colums); 9 | if matrices_descs.len() != 1 { 10 | return Err("More than one relation! Adjust your columns so there is only one relation."); 11 | } 12 | Ok(matrices_descs.remove(0)) 13 | } 14 | 15 | /// Creates combinations of column pairs as sparse matrices. 16 | /// Let's say that we have such columns configuration: complex::a reflexive::complex::b c. This is provided 17 | /// as `&[Column]` after parsing the config. 18 | /// The allowed column modifiers are: 19 | /// - transient - the field is virtual - it is considered during embedding process, no entity is written for the column, 20 | /// - complex - the field is composite, containing multiple entity identifiers separated by space, 21 | /// - reflexive - the field is reflexive, which means that it interacts with itself, additional output file is written for every such field. 22 | /// We create sparse matrix for every columns relations (based on column modifiers). 
23 | /// For our example we have: 24 | /// - sparse matrix for column a and b, 25 | /// - sparse matrix for column a and c, 26 | /// - sparse matrix for column b and c, 27 | /// - sparse matrix for column b and b (reflexive column). 28 | /// Apart from column names in sparse matrix we provide indices for incoming data. We have 3 columns such as a, b and c 29 | /// but column b is reflexive so we need to include this column. The result is: (a, b, c, b). 30 | /// The rule is that every reflexive column is append with the order of occurrence to the end of constructed array. 31 | pub fn create_sparse_matrices_descriptors(cols: &Vec) -> Vec { 32 | let mut sparse_matrix_builders: Vec = Vec::new(); 33 | let num_fields = cols.len(); 34 | let mut reflexive_count = 0; 35 | 36 | for i in 0..num_fields { 37 | for j in i..num_fields { 38 | let col_i = &cols[i]; 39 | let col_j = &cols[j]; 40 | if i < j { 41 | let sm = SparseMatrixDescriptor::new( 42 | i as u8, 43 | col_i.name.clone(), 44 | j as u8, 45 | col_j.name.clone(), 46 | ); 47 | sparse_matrix_builders.push(sm); 48 | } else if i == j && col_i.reflexive { 49 | let new_j = num_fields + reflexive_count; 50 | reflexive_count += 1; 51 | let sm = SparseMatrixDescriptor::new( 52 | i as u8, 53 | col_i.name.clone(), 54 | new_j as u8, 55 | col_j.name.clone(), 56 | ); 57 | sparse_matrix_builders.push(sm); 58 | } 59 | } 60 | } 61 | sparse_matrix_builders 62 | } 63 | 64 | #[derive(Debug, Clone, Eq, PartialEq, Serialize, Deserialize)] 65 | pub struct SparseMatrixDescriptor { 66 | /// First column index for which we creates subgraph 67 | pub col_a_id: u8, 68 | 69 | /// First column name 70 | pub col_a_name: String, 71 | 72 | /// Second column index for which we creates subgraph 73 | pub col_b_id: u8, 74 | 75 | /// Second column name 76 | pub col_b_name: String, 77 | } 78 | 79 | #[pyclass(name = "SparseMatrix", module = "cleora")] 80 | #[derive(Debug, Serialize, Deserialize)] 81 | pub struct SparseMatrix { 82 | pub descriptor: SparseMatrixDescriptor, 83 | #[pyo3(get, set)] 84 | pub entity_ids: Vec, 85 | pub entities: Vec, 86 | pub edges: Vec, 87 | /// Maps entities to its edges 88 | /// I-th slice represent edges going out of ith node 89 | /// Example: 90 | /// Given slices=[(0, 4), (4, 10), (10, 11)] 91 | /// edges[0..4] are outgoing edges for entity=0 92 | /// edges[4..10] are outgoing edges for entity=1 93 | /// edges[10..11] are outgoing edges for entity=2 94 | pub slices: Vec<(usize, usize)>, 95 | pub column_ids: Vec, 96 | } 97 | 98 | #[derive(Debug, Clone, Copy, Serialize, Deserialize)] 99 | pub struct Entity { 100 | pub row_sum: f32, 101 | } 102 | 103 | #[derive(Debug, Serialize, Deserialize)] 104 | pub struct Edge { 105 | pub other_entity_ix: u32, 106 | pub left_markov_value: f32, 107 | pub symmetric_markov_value: f32, 108 | } 109 | -------------------------------------------------------------------------------- /tests/snapshot.rs: -------------------------------------------------------------------------------- 1 | #[cfg(test)] 2 | mod tests { 3 | use insta::assert_debug_snapshot; 4 | use ndarray; 5 | use ndarray::{Array, Array2, ArrayBase, Dim, Ix, OwnedRepr}; 6 | use ndarray_rand::rand::rngs::StdRng; 7 | use ndarray_rand::rand::{RngCore, SeedableRng}; 8 | use ndarray_rand::rand_distr::Uniform; 9 | use ndarray_rand::RandomExt; 10 | 11 | use cleora::embedding::{MarkovType, NdArrayMatrix}; 12 | use cleora::sparse_matrix::SparseMatrix; 13 | 14 | fn round(arr: Array2) -> Array2 { 15 | arr.map(|v| (v * 1000.) 
as i32) 16 | } 17 | 18 | #[test] 19 | fn test_markov_left_01() { 20 | let (graph, embeddings) = create_graph_embeddings_complex_reflexive(); 21 | let embedding_out = NdArrayMatrix::multiply(&graph, embeddings.view(), MarkovType::Left, 8); 22 | let embedding_out = round(embedding_out); 23 | assert_debug_snapshot!(embedding_out); 24 | } 25 | 26 | #[test] 27 | fn test_markov_left_02() { 28 | let (graph, embeddings) = create_graph_embeddings_complex_complex(); 29 | let embedding_out = NdArrayMatrix::multiply(&graph, embeddings.view(), MarkovType::Left, 8); 30 | let embedding_out = round(embedding_out); 31 | assert_debug_snapshot!(embedding_out); 32 | } 33 | 34 | #[test] 35 | fn test_markov_sym_01() { 36 | let (graph, embeddings) = create_graph_embeddings_complex_reflexive(); 37 | let embedding_out = 38 | NdArrayMatrix::multiply(&graph, embeddings.view(), MarkovType::Symmetric, 8); 39 | let embedding_out = round(embedding_out); 40 | assert_debug_snapshot!(embedding_out); 41 | } 42 | 43 | #[test] 44 | fn test_markov_sym_02() { 45 | let (graph, embeddings) = create_graph_embeddings_complex_complex(); 46 | let embedding_out = 47 | NdArrayMatrix::multiply(&graph, embeddings.view(), MarkovType::Symmetric, 8); 48 | let embedding_out = round(embedding_out); 49 | assert_debug_snapshot!(embedding_out); 50 | } 51 | 52 | fn create_graph_embeddings_complex_complex( 53 | ) -> (SparseMatrix, ArrayBase, Dim<[Ix; 2]>>) { 54 | let num_embeddings: usize = 100; 55 | let mut rng: StdRng = SeedableRng::seed_from_u64(21_37); 56 | 57 | let mut edges: Vec<_> = vec![]; 58 | for _ in 0..1000 { 59 | let col_1_node_1 = rng.next_u32() % (num_embeddings as u32); 60 | let col_1_node_2 = rng.next_u32() % (num_embeddings as u32); 61 | 62 | let col_2_node_1 = rng.next_u32() % (num_embeddings as u32); 63 | let col_2_node_2 = rng.next_u32() % (num_embeddings as u32); 64 | 65 | edges.push(format!( 66 | "{} {}\t{} {}", 67 | col_1_node_1, col_1_node_2, col_2_node_1, col_2_node_2 68 | )) 69 | } 70 | let edges_ref: Vec<&str> = edges.iter().map(|s| s.as_ref()).collect(); 71 | let graph = SparseMatrix::from_rust_iterator( 72 | "complex::entity_a complex::entity_b", 73 | 16, 74 | edges_ref.into_iter(), 75 | None, 76 | ) 77 | .unwrap(); 78 | 79 | let feature_dim: usize = 32; 80 | 81 | let embeddings = Array::random_using( 82 | (num_embeddings, feature_dim), 83 | Uniform::new(0., 10.), 84 | &mut rng, 85 | ); 86 | (graph, embeddings) 87 | } 88 | 89 | fn create_graph_embeddings_complex_reflexive( 90 | ) -> (SparseMatrix, ArrayBase, Dim<[Ix; 2]>>) { 91 | let num_embeddings: usize = 100; 92 | let mut rng: StdRng = SeedableRng::seed_from_u64(21_37); 93 | 94 | let mut edges: Vec<_> = vec![]; 95 | for _ in 0..1000 { 96 | let node_a = rng.next_u32() % (num_embeddings as u32); 97 | let node_b = rng.next_u32() % (num_embeddings as u32); 98 | edges.push(format!("{} {}", node_a, node_b)) 99 | } 100 | let edges_ref: Vec<&str> = edges.iter().map(|s| s.as_ref()).collect(); 101 | let graph = SparseMatrix::from_rust_iterator( 102 | "reflexive::complex::entity_id", 103 | 16, 104 | edges_ref.into_iter(), 105 | None, 106 | ) 107 | .unwrap(); 108 | 109 | let feature_dim: usize = 32; 110 | 111 | let embeddings = Array::random_using( 112 | (num_embeddings, feature_dim), 113 | Uniform::new(0., 10.), 114 | &mut rng, 115 | ); 116 | (graph, embeddings) 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /legacy/docs/source/graph_creation.rst: 
-------------------------------------------------------------------------------- 1 | .. _graph-creation: 2 | 3 | Graph Creation 4 | ========================= 5 | 6 | Cleora as a tool 7 | ---------------------- 8 | 9 | Cleora is built as a multi-purpose "just embed it" tool, suitable for many different data types and formats. Our tool ingests a relational table of rows representing a typed and undirected heterogeneous hypergraph, which can contain multiple: 10 | 11 | - typed categorical columns 12 | - typed categorical array columns 13 | 14 | Based on the column format specification, Cleora performs: 15 | 16 | - Star decomposition of hyper-edges 17 | - Creation of pairwise graphs for all pairs of entity types 18 | - Embedding of each graph 19 | - The final output of Cleora consists of multiple files for each (undirected) pair of entity types in the table. 20 | 21 | Those embeddings can then be utilized in a novel way thanks to their dim-wise independence property, which is described further below. 22 | 23 | Graph construction 24 | ------------------------ 25 | 26 | 27 | .. figure:: _static/cleora-sparse-matrix.png 28 | :figwidth: 100 % 29 | :width: 100 % 30 | :align: center 31 | :alt: Sparse Matrix 32 | 33 | 34 | 35 | Graph construction starts with the creation of a helper matrix *P* object as a regular 2-D Rust array, which is built according to the selected 36 | expansion method. An example involving clique expansion is presented in Figure - a Cartesian product (all combinations) of all columns is created. 37 | Each entity identifier from the original input file is hashed with `xxhash `_ - a fast and efficient hashing method. 38 | We hash the identifiers to store them in a unified, small data format. From the first line of our example: 39 | 40 | .. math:: 41 | 42 | U1\:P1\:P2\:B1\:B2 43 | 44 | we get 4 combinations produced by the Cartesian product: 45 | 46 | .. math:: 47 | 48 | [4,\:U1hash,\:P1hash,\:B1hash] \\ 49 | [4,\:U1hash,\:P1hash,\:B2hash] \\ 50 | [4,\:U1hash,\:P2hash,\:B1hash] \\ 51 | [4,\:U1hash,\:P2hash,\:B2hash] 52 | 53 | At the beginning we insert the total number of combinations (in this case 4). Then we add another 3 rows representing combinations from the second row of the input. 54 | 55 | Subsequently, for each relation pair from matrix `P` we create a separate matrix `M` as a `SparseMatrix` struct (the matrices `M` will usually hold mostly zeros). 56 | Each matrix `M` object is produced in a separate thread in a stepwise fashion. The rows of matrix `P` object are broadcasted to all matrix `M` objects, 57 | and each matrix `M` object reads the buffer selecting the appropriate values, updating its content. 58 | For example, M3 (users and products) reads the hashes from indexes 1 and 2. After reading the first vector: 59 | 60 | .. math:: 61 | 62 | [4,\:U1hash,\:P1hash,\:B1hash] 63 | 64 | the edge value for **U1hash <-> P1hash** equals 1/4 (1 divided by the total number of Cartesian products). After reading the next vector: 65 | 66 | .. math:: 67 | 68 | [4,\:U1hash,\:P1hash,\:B2hash] 69 | 70 | 71 | the edge value for **U1hash <-> P1hash** updates to 1/2 (1/4 + 1/4). After reading the next two, we finally have: 72 | 73 | **U1hash <-> P1hash** = 1/2 74 | 75 | **U1hash <-> P2hash** = 1/2 76 | 77 | Sparse Matrix 78 | --------------------- 79 | 80 | For maximum efficiency we created a custom implementation of a sparse matrix data structure - the SparseMatrix struct. It follows the sparse matrix coordinate format (COO). 
Its purpose is to save space by holding only the coordinates and values of nonzero entities. 81 | 82 | Embedding is done in 2 basic steps: graph construction and training. 83 | 84 | Let's assume that the basic configuration of the program looks like this: 85 | 86 | .. code-block:: bash 87 | 88 | --input files/samples/edgelist_2.tsv 89 | --columns="users complex::products complex::brands" 90 | --dimension 3 91 | --number-of-iterations 4 92 | 93 | Every SparseMatrix is created based on the program argument **--columns**. For our example, there will be three SparseMatrix'es that will only read data from the columns: 94 | 95 | - users and brands by M1 96 | - products and brands by M2 97 | - users and products by M3 98 | 99 | 100 | Memory consumption 101 | ------------------- 102 | 103 | Every **SparseMatrix** object allocates space for: 104 | 105 | - **|V|** objects, each occupying 40 bytes, 106 | - **2 x |E|** objects (in undirected graphs we need to count an edge in both directions), each occupying 24 bytes. 107 | 108 | 109 | During training we need an additional 110 | 111 | .. math:: 112 | 113 | 114 | 2 \times d \times |V| 115 | 116 | objects, each occupying 4 bytes (this can be avoided by using memory-mapped files, see the `--in-memory-embedding-calculation` argument of the program). 117 | -------------------------------------------------------------------------------- /legacy/docs/source/running.rst: -------------------------------------------------------------------------------- 1 | .. _running: 2 | 3 | Running configuration 4 | ====================== 5 | 6 | This page details how to use the Cleora run command to define the embedding resources at runtime. 7 | 8 | Synopsis 9 | -------- 10 | 11 | **cleora** [*options 1*] <*params 1*> [*options 2*] <*params 2*> ... 12 | 13 | Run options 14 | -------------- 15 | 16 | - input 17 | 18 | Using input param: *--input* or *-i* 19 | 20 | Param description: A parameter that defines the path to the input file. Both absolute and relative paths are accepted. 21 | 22 | 23 | - file type 24 | 25 | Using file type param: *--type* or *-t* 26 | 27 | Param description: This parameter defines the input file format. Cleora supports two kinds of input files: .tsv (tab-separated values) and .json. 28 | 29 | - dimension 30 | 31 | Using dimension param: *--dimension* or *-d* 32 | 33 | Param description: Embedding dimension size. 34 | 35 | - number of iterations 36 | 37 | Using number of iterations param: *--number-of-iterations* or *-n* 38 | 39 | Param description: Set the maximum number of iterations. 40 | 41 | - columns 42 | 43 | Using columns param: *--columns* or *-c* 44 | 45 | Param description: Set column names (max. 12), with modifiers from the list: [transient::, reflexive::, complex::] 46 | 47 | ..
list-table:: 48 | :widths: 20 80 49 | :header-rows: 1 50 | 51 | * - Modifiers 52 | - Description 53 | * - transient 54 | - The field is virtual - it is considered during the embedding process, but no entity is written for the column 55 | * - complex 56 | - The field is composite, containing multiple entity identifiers separated by space in TSV or an array in JSON 57 | * - reflexive 58 | - The field is reflexive, which means that it interacts with itself; an additional output file is written for every such field 59 | * - ignore 60 | - The field is ignored, no output file is written for the field 61 | 62 | 63 | Allowed combinations of modifiers are: 64 | - `transient` 65 | - `complex` 66 | - `transient::complex` 67 | - `reflexive::complex` 68 | 69 | 70 | 71 | For TSV datasets containing composite fields (categorical arrays), multiple items within a field are separated by space. 72 | 73 | The specification of an input format is as follows: 74 | 75 | .. code-block:: none 76 | 77 | --columns="[column modifiers, ::] ..." 78 | 79 | 80 | Combinations which don't make sense are: 81 | 82 | .. list-table:: 83 | :widths: 40 80 84 | :header-rows: 1 85 | 86 | * - Modifiers 87 | - Description 88 | * - reflexive 89 | - This would represent an identity relation 90 | * - transient::reflexive 91 | - This would generate no output 92 | * - reflexive::transient::complex 93 | - This would generate no output 94 | 95 | The picture below shows how the column modifiers work: 96 | 97 | .. figure:: _static/cleora-columns.png 98 | :figwidth: 100 % 99 | :width: 60 % 100 | :align: center 101 | :alt: example use cases of column modifiers 102 | 103 | 104 | - relation name 105 | 106 | Using relation name param: *--relation-name* or *-r* 107 | 108 | Param description: Name of the relation, for output filename generation. 109 | 110 | - prepend field name 111 | 112 | Using prepend field name param: *--prepend-field-name* or *-p* 113 | 114 | Param description: Parameter responsible for prepending the field name to each entity in the output. 115 | 116 | - log every n 117 | 118 | Using log every n param: *--log-every-n* or *-l* 119 | 120 | Param description: Log progress output every N lines. 121 | 122 | - in memory embedding calculation 123 | 124 | Using in memory embedding calculation param: *--in-memory-embedding-calculation* or *-e* 125 | 126 | Param description: Parameter that controls whether embeddings are calculated in memory or with memory-mapped files. In-memory calculation is on by default (*-e 1*); use *-e 0* to switch to memory-mapped files. 127 | 128 | - output dir 129 | 130 | Using output dir param: *--output-dir* or *-o* 131 | 132 | Param description: Set the output directory for files with embeddings. 133 | 134 | - output format 135 | 136 | Using output format param: *-f* 137 | 138 | Param description: A parameter that defines the format of the output file. Possible output formats are textfile (.txt) and numpy (.npy) 139 | 140 | 141 | Example Cleora run configuration 142 | --------------------------------- 143 | 144 | Before running Cleora training for the first time (after downloading the binary from the repository), remember to set the execute permission using *chmod +x*: 145 | 146 | ..
code-block:: bash 147 | 148 | 149 | chmod +x cleora 150 | ./cleora -i files/samples/edgelist_sample.tsv 151 | --columns="complex::reflexive::a b complex::c" 152 | -d 128 153 | -n 5 154 | --relation-name=test_realation_name 155 | -p 0 156 | 157 | 158 | -------------------------------------------------------------------------------- /legacy/tests/snapshot.rs: -------------------------------------------------------------------------------- 1 | use cleora::configuration::{Column, Configuration, FileType, OutputFormat}; 2 | use cleora::embedding::{calculate_embeddings, calculate_embeddings_mmap}; 3 | use cleora::persistence::embedding::EmbeddingPersistor; 4 | use cleora::persistence::entity::InMemoryEntityMappingPersistor; 5 | use cleora::pipeline::build_graphs; 6 | use insta::assert_debug_snapshot; 7 | use std::io; 8 | use std::sync::Arc; 9 | 10 | /// This test performs work for sample case and saves snapshot file. 11 | /// Snapshot testing takes advantage of deterministic character of Cleora. 12 | /// Any discrepancies between original snapshot results and current ones can be then 13 | /// reviewed along with the code which introduced discrepancy. 14 | /// 15 | /// Differing snapshot has to be renamed by removing .new from the name. 16 | /// For more information, please review https://crates.io/crates/insta 17 | /// 18 | /// Code executed performs roughly the same work as: 19 | /// ./cleora -i files/samples/edgelist_1.tsv --columns="complex::reflexive::a b complex::c" 20 | /// -d 128 -n 4 --relation-name=R1 -p 0 21 | #[test] 22 | fn test_build_graphs_and_create_embeddings() { 23 | let config = prepare_config(); 24 | 25 | let in_memory_entity_mapping_persistor = InMemoryEntityMappingPersistor::default(); 26 | let in_memory_entity_mapping_persistor = Arc::new(in_memory_entity_mapping_persistor); 27 | 28 | // build sparse matrices 29 | let sparse_matrices = build_graphs(&config, in_memory_entity_mapping_persistor.clone()); 30 | 31 | let config = Arc::new(config); 32 | 33 | // embeddings for in-memory and mmap files calculation should be the same 34 | for sparse_matrix in sparse_matrices.into_iter() { 35 | let sparse_matrix = Arc::new(sparse_matrix); 36 | let snapshot_name = format!( 37 | "embeddings_{}_{}", 38 | sparse_matrix.col_a_name, sparse_matrix.col_b_name 39 | ); 40 | 41 | let mut in_memory_embedding_persistor = InMemoryEmbeddingPersistor::default(); 42 | // calculate embeddings in memory 43 | calculate_embeddings( 44 | config.clone(), 45 | sparse_matrix.clone(), 46 | in_memory_entity_mapping_persistor.clone(), 47 | &mut in_memory_embedding_persistor, 48 | ); 49 | assert_debug_snapshot!(snapshot_name.clone(), in_memory_embedding_persistor); 50 | 51 | let mut in_memory_embedding_persistor = InMemoryEmbeddingPersistor::default(); 52 | // calculate embeddings with mmap files 53 | calculate_embeddings_mmap( 54 | config.clone(), 55 | sparse_matrix.clone(), 56 | in_memory_entity_mapping_persistor.clone(), 57 | &mut in_memory_embedding_persistor, 58 | ); 59 | assert_debug_snapshot!(snapshot_name, in_memory_embedding_persistor); 60 | } 61 | } 62 | 63 | fn prepare_config() -> Configuration { 64 | let columns = vec![ 65 | Column { 66 | name: "a".to_string(), 67 | complex: true, 68 | reflexive: true, 69 | ..Column::default() 70 | }, 71 | Column { 72 | name: "b".to_string(), 73 | ..Column::default() 74 | }, 75 | Column { 76 | name: "c".to_string(), 77 | complex: true, 78 | ..Column::default() 79 | }, 80 | ]; 81 | 82 | let config = Configuration { 83 | produce_entity_occurrence_count: true, 84 | 
embeddings_dimension: 128, 85 | max_number_of_iteration: 4, 86 | seed: None, 87 | prepend_field: false, 88 | log_every_n: 10000, 89 | in_memory_embedding_calculation: true, 90 | input: vec!["files/samples/edgelist_1.tsv".to_string()], 91 | file_type: FileType::Tsv, 92 | output_format: OutputFormat::TextFile, 93 | output_dir: None, 94 | relation_name: "r1".to_string(), 95 | columns, 96 | }; 97 | config 98 | } 99 | 100 | #[derive(Debug, Default)] 101 | struct InMemoryEmbeddingPersistor { 102 | entity_count: u32, 103 | dimenstion: u16, 104 | entities: Vec, 105 | } 106 | 107 | #[derive(Debug)] 108 | struct InMemoryEntity { 109 | entity: String, 110 | occur_count: u32, 111 | vector: Vec, 112 | } 113 | 114 | impl EmbeddingPersistor for InMemoryEmbeddingPersistor { 115 | fn put_metadata(&mut self, entity_count: u32, dimension: u16) -> Result<(), io::Error> { 116 | self.entity_count = entity_count; 117 | self.dimenstion = dimension; 118 | Ok(()) 119 | } 120 | fn put_data( 121 | &mut self, 122 | entity: &str, 123 | occur_count: u32, 124 | vector: Vec, 125 | ) -> Result<(), io::Error> { 126 | let entity = entity.to_string(); 127 | self.entities.push(InMemoryEntity { 128 | entity, 129 | occur_count, 130 | vector, 131 | }); 132 | Ok(()) 133 | } 134 | fn finish(&mut self) -> Result<(), io::Error> { 135 | Ok(()) 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /benches/cleora_benchmark.rs: -------------------------------------------------------------------------------- 1 | use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; 2 | use fnv::FnvHasher; 3 | use std::collections::hash_map::DefaultHasher; 4 | use std::hash::Hasher; 5 | use twox_hash::XxHash64; 6 | 7 | fn default_hash(entity: &str) -> u64 { 8 | let mut hasher = DefaultHasher::new(); 9 | hasher.write(entity.as_bytes()); 10 | hasher.finish() 11 | } 12 | 13 | fn xx_hash(entity: &str) -> u64 { 14 | let mut hasher = XxHash64::default(); 15 | hasher.write(entity.as_bytes()); 16 | hasher.finish() 17 | } 18 | 19 | fn fnv_hash(entity: &str) -> u64 { 20 | let mut hasher = FnvHasher::default(); 21 | hasher.write(entity.as_bytes()); 22 | hasher.finish() 23 | } 24 | 25 | fn hash_benchmark(c: &mut Criterion) { 26 | c.bench_function("hash", |b| b.iter(|| fnv_hash(black_box("cleora")))); 27 | } 28 | 29 | fn bench_hashes(c: &mut Criterion) { 30 | let mut group = c.benchmark_group("Hashing"); 31 | for s in ["Poland", "Germany", "USA", "United Kingdom", "Norway"].iter() { 32 | group.bench_with_input(BenchmarkId::new("Default", s), s, |b, s| { 33 | b.iter(|| default_hash(s)) 34 | }); 35 | group.bench_with_input(BenchmarkId::new("XXHash", s), s, |b, s| { 36 | b.iter(|| xx_hash(s)) 37 | }); 38 | group.bench_with_input(BenchmarkId::new("FnvHash", s), s, |b, s| { 39 | b.iter(|| fnv_hash(s)) 40 | }); 41 | } 42 | group.finish(); 43 | } 44 | 45 | struct CartesianProduct { 46 | lengths: Vec, 47 | indices: Vec, 48 | } 49 | 50 | impl CartesianProduct { 51 | fn new(lengths: Vec) -> CartesianProduct { 52 | let indices = vec![0; lengths.len()]; 53 | CartesianProduct { lengths, indices } 54 | } 55 | } 56 | 57 | impl Iterator for CartesianProduct { 58 | type Item = Vec; 59 | 60 | fn next(&mut self) -> Option { 61 | let result = self.indices.clone(); 62 | let len = self.indices.len(); 63 | for i in (0..len).rev() { 64 | if self.indices[i] == (self.lengths[i] - 1) { 65 | self.indices[i] = 0; 66 | if i == 0 { 67 | return None; 68 | } 69 | } else { 70 | self.indices[i] += 1; 71 | break; 72 | } 73 | 
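                // The loop above advances `indices` like a mixed-radix odometer: the least
                // significant position is bumped first; a position at its maximum wraps to 0
                // and carries to the next one, while `break` stops the scan once a position is
                // bumped. Note that when the most significant position wraps, `None` is returned
                // before the pre-advance snapshot in `result` is yielded, so the very last
                // combination is never produced by this iterator.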
} 74 | Some(result) 75 | } 76 | } 77 | 78 | fn generate_combinations_with_length( 79 | hashes: Vec>, 80 | lens: Vec, 81 | transient_lens: Vec, 82 | ) -> Vec> { 83 | let row_length = lens.len(); 84 | let mut combinations = 1; 85 | for &len in &lens { 86 | combinations *= len; 87 | } 88 | 89 | let mut transient_combinations = 1; 90 | for transient_len in transient_lens { 91 | transient_combinations *= transient_len; 92 | } 93 | 94 | let total_combinations = u64::from(combinations * transient_combinations); 95 | 96 | let mut result: Vec> = Vec::with_capacity(combinations as usize); 97 | let cartesian = CartesianProduct::new(lens); 98 | let mut counter = 0; 99 | 100 | for indices in cartesian { 101 | let mut arr: Vec = Vec::with_capacity(row_length + 1); 102 | arr.push(total_combinations); 103 | let hashes_length = hashes.len(); 104 | for i in 0..hashes_length { 105 | let id = indices[i]; 106 | let value = hashes.get(i).unwrap().get(id as usize).unwrap(); 107 | arr.push(*value); 108 | } 109 | result.insert(counter, arr); 110 | counter += 1; 111 | } 112 | 113 | result 114 | } 115 | 116 | fn generate_combinations_with_length_benchmark(c: &mut Criterion) { 117 | let hashes = vec![ 118 | vec![ 119 | 12528106613309397869, 120 | 9708327007652588651, 121 | 14980293948133487802, 122 | 12266831465718424827, 123 | 17286486014462130850, 124 | 11758309849656381133, 125 | 10347099512938872293, 126 | 804562942093240192, 127 | 3059164883323983321, 128 | ], 129 | vec![ 130 | 12528106613309397869, 131 | 9708327007652588651, 132 | 14980293948133487802, 133 | 12266831465718424827, 134 | 17286486014462130850, 135 | 11758309849656381133, 136 | 10347099512938872293, 137 | 804562942093240192, 138 | 3059164883323983321, 139 | ], 140 | ]; 141 | let lens = vec![9, 9]; 142 | let transient_lens = vec![1]; 143 | c.bench_function("generate_combinations_with_length", |b| { 144 | b.iter(|| { 145 | generate_combinations_with_length( 146 | black_box(hashes.clone()), 147 | black_box(lens.clone()), 148 | black_box(transient_lens.clone()), 149 | ) 150 | }) 151 | }); 152 | } 153 | 154 | criterion_group!( 155 | benches, 156 | generate_combinations_with_length_benchmark, 157 | bench_hashes 158 | ); 159 | criterion_main!(benches); 160 | -------------------------------------------------------------------------------- /legacy/benches/cleora_benchmark.rs: -------------------------------------------------------------------------------- 1 | use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; 2 | use fnv::FnvHasher; 3 | use std::collections::hash_map::DefaultHasher; 4 | use std::hash::Hasher; 5 | use twox_hash::XxHash64; 6 | 7 | fn default_hash(entity: &str) -> u64 { 8 | let mut hasher = DefaultHasher::new(); 9 | hasher.write(entity.as_bytes()); 10 | hasher.finish() 11 | } 12 | 13 | fn xx_hash(entity: &str) -> u64 { 14 | let mut hasher = XxHash64::default(); 15 | hasher.write(entity.as_bytes()); 16 | hasher.finish() 17 | } 18 | 19 | fn fnv_hash(entity: &str) -> u64 { 20 | let mut hasher = FnvHasher::default(); 21 | hasher.write(entity.as_bytes()); 22 | hasher.finish() 23 | } 24 | 25 | fn hash_benchmark(c: &mut Criterion) { 26 | c.bench_function("hash", |b| b.iter(|| fnv_hash(black_box("cleora")))); 27 | } 28 | 29 | fn bench_hashes(c: &mut Criterion) { 30 | let mut group = c.benchmark_group("Hashing"); 31 | for s in ["Poland", "Germany", "USA", "United Kingdom", "Norway"].iter() { 32 | group.bench_with_input(BenchmarkId::new("Default", s), s, |b, s| { 33 | b.iter(|| default_hash(s)) 34 | }); 35 | 
group.bench_with_input(BenchmarkId::new("XXHash", s), s, |b, s| { 36 | b.iter(|| xx_hash(s)) 37 | }); 38 | group.bench_with_input(BenchmarkId::new("FnvHash", s), s, |b, s| { 39 | b.iter(|| fnv_hash(s)) 40 | }); 41 | } 42 | group.finish(); 43 | } 44 | 45 | struct CartesianProduct { 46 | lengths: Vec, 47 | indices: Vec, 48 | } 49 | 50 | impl CartesianProduct { 51 | fn new(lengths: Vec) -> CartesianProduct { 52 | let indices = vec![0; lengths.len()]; 53 | CartesianProduct { lengths, indices } 54 | } 55 | } 56 | 57 | impl Iterator for CartesianProduct { 58 | type Item = Vec; 59 | 60 | fn next(&mut self) -> Option { 61 | let result = self.indices.clone(); 62 | let len = self.indices.len(); 63 | for i in (0..len).rev() { 64 | if self.indices[i] == (self.lengths[i] - 1) { 65 | self.indices[i] = 0; 66 | if i == 0 { 67 | return None; 68 | } 69 | } else { 70 | self.indices[i] += 1; 71 | break; 72 | } 73 | } 74 | Some(result) 75 | } 76 | } 77 | 78 | fn generate_combinations_with_length( 79 | hashes: Vec>, 80 | lens: Vec, 81 | transient_lens: Vec, 82 | ) -> Vec> { 83 | let row_length = lens.len(); 84 | let mut combinations = 1; 85 | for &len in &lens { 86 | combinations *= len; 87 | } 88 | 89 | let mut transient_combinations = 1; 90 | for transient_len in transient_lens { 91 | transient_combinations *= transient_len; 92 | } 93 | 94 | let total_combinations = u64::from(combinations * transient_combinations); 95 | 96 | let mut result: Vec> = Vec::with_capacity(combinations as usize); 97 | let cartesian = CartesianProduct::new(lens); 98 | let mut counter = 0; 99 | 100 | for indices in cartesian { 101 | let mut arr: Vec = Vec::with_capacity(row_length + 1); 102 | arr.push(total_combinations); 103 | let hashes_length = hashes.len(); 104 | for i in 0..hashes_length { 105 | let id = indices[i]; 106 | let value = hashes.get(i).unwrap().get(id as usize).unwrap(); 107 | arr.push(*value); 108 | } 109 | result.insert(counter, arr); 110 | counter += 1; 111 | } 112 | 113 | result 114 | } 115 | 116 | fn generate_combinations_with_length_benchmark(c: &mut Criterion) { 117 | let hashes = vec![ 118 | vec![ 119 | 12528106613309397869, 120 | 9708327007652588651, 121 | 14980293948133487802, 122 | 12266831465718424827, 123 | 17286486014462130850, 124 | 11758309849656381133, 125 | 10347099512938872293, 126 | 804562942093240192, 127 | 3059164883323983321, 128 | ], 129 | vec![ 130 | 12528106613309397869, 131 | 9708327007652588651, 132 | 14980293948133487802, 133 | 12266831465718424827, 134 | 17286486014462130850, 135 | 11758309849656381133, 136 | 10347099512938872293, 137 | 804562942093240192, 138 | 3059164883323983321, 139 | ], 140 | ]; 141 | let lens = vec![9, 9]; 142 | let transient_lens = vec![1]; 143 | c.bench_function("generate_combinations_with_length", |b| { 144 | b.iter(|| { 145 | generate_combinations_with_length( 146 | black_box(hashes.clone()), 147 | black_box(lens.clone()), 148 | black_box(transient_lens.clone()), 149 | ) 150 | }) 151 | }); 152 | } 153 | 154 | criterion_group!( 155 | benches, 156 | generate_combinations_with_length_benchmark, 157 | bench_hashes 158 | ); 159 | criterion_main!(benches); 160 | -------------------------------------------------------------------------------- /src/entity.rs: -------------------------------------------------------------------------------- 1 | use itertools::{Itertools, Product}; 2 | use std::hash::Hasher; 3 | use std::ops::Range; 4 | use std::sync::Arc; 5 | 6 | use smallvec::{IntoIter, SmallVec}; 7 | use twox_hash::XxHash64; 8 | 9 | use 
crate::configuration::Configuration; 10 | use crate::sparse_matrix_builder::NodeIndexerBuilder; 11 | 12 | /// Indicates how many elements in a vector can be placed on Stack (used by smallvec crate). The rest 13 | /// of the vector is placed on Heap. 14 | pub const SMALL_VECTOR_SIZE: usize = 8; 15 | 16 | #[derive(Debug, Clone)] 17 | pub struct Hyperedge { 18 | hashes: SmallVec<[u64; SMALL_VECTOR_SIZE]>, 19 | slices: [Range; 2], 20 | } 21 | 22 | impl Hyperedge { 23 | #[inline] 24 | pub fn nodes(&self, column_id: usize) -> SmallVec<[u64; SMALL_VECTOR_SIZE]> { 25 | let slice = self.slices.get(column_id).unwrap(); 26 | let mut v = SmallVec::with_capacity(slice.len()); 27 | for ix in slice.start..slice.end { 28 | v.push(self.hashes[ix as usize]) 29 | } 30 | v 31 | } 32 | 33 | #[inline(always)] 34 | pub fn edges_iter( 35 | &self, 36 | col_id_a: u8, 37 | col_id_b: u8, 38 | ) -> Product, IntoIter<[u64; 8]>> { 39 | let nodes_a = self.nodes(col_id_a as usize); 40 | let nodes_b = self.nodes(col_id_b as usize); 41 | nodes_a.into_iter().cartesian_product(nodes_b) 42 | } 43 | 44 | pub fn edges_num(&self, col_id_a: u8, col_id_b: u8) -> usize { 45 | self.slices[col_id_a as usize].len() * self.slices[col_id_b as usize].len() 46 | } 47 | } 48 | 49 | pub struct EntityProcessor<'a, S: NodeIndexerBuilder> { 50 | config: &'a Configuration, 51 | not_ignored_columns_count: u16, 52 | node_indexer: Arc, 53 | } 54 | 55 | impl<'a, S: NodeIndexerBuilder> EntityProcessor<'a, S> { 56 | pub fn new(config: &'a Configuration, node_indexer: Arc) -> EntityProcessor<'a, S> { 57 | let not_ignored_columns_count = config.columns.len() as u16; 58 | EntityProcessor { 59 | config, 60 | not_ignored_columns_count, 61 | node_indexer, 62 | } 63 | } 64 | 65 | /// Every row can create few combinations (cartesian products) which are hashed and provided for sparse matrix creation. 66 | /// `row` - array of strings such as: ("userId1", "productId1 productId2", "brandId1"). 
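    /// Returns a `Hyperedge` with the xxhash of every entity in the row plus per-column index
    /// ranges (`slices`), so that `edges_iter` can later enumerate the cartesian product of any
    /// two columns without re-hashing. For reflexive columns an extra slice covering the same
    /// hash range is recorded, and every hash is also registered with the `NodeIndexerBuilder`.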
67 | pub fn process_row_and_get_edges( 68 | &self, 69 | row: &[SmallVec<[&str; SMALL_VECTOR_SIZE]>], 70 | ) -> Hyperedge { 71 | let mut hashes: SmallVec<[u64; SMALL_VECTOR_SIZE]> = 72 | SmallVec::with_capacity(self.not_ignored_columns_count as usize); 73 | let mut slices: [Range; 2] = [0..0, 0..0]; 74 | let mut reflexive_count = 0; 75 | let mut current_offset = 0u32; 76 | 77 | for (i, column_entities) in row.iter().enumerate() { 78 | let column = &self.config.columns[i]; 79 | let column_id = i as u8; 80 | if column.complex { 81 | for entity in column_entities { 82 | let hash = hash_entity(entity); 83 | hashes.push(hash); 84 | self.node_indexer.process(hash, entity, column_id); 85 | } 86 | let length = column_entities.len() as u32; 87 | slices[i] = current_offset..(current_offset + length); 88 | if column.reflexive { 89 | // put reflexive column data to the end of the buffers 90 | let reflexive_id = (self.not_ignored_columns_count + reflexive_count) as usize; 91 | slices[reflexive_id] = current_offset..(current_offset + length); 92 | reflexive_count += 1; 93 | } 94 | current_offset += length; 95 | } else { 96 | let entity = column_entities.first().unwrap(); 97 | let hash = hash_entity(entity); 98 | hashes.push(hash); 99 | self.node_indexer.process(hash, entity, column_id); 100 | let length = 1u32; 101 | slices[i] = current_offset..(current_offset + length); 102 | current_offset += length; 103 | } 104 | } 105 | Hyperedge { hashes, slices } 106 | } 107 | } 108 | 109 | #[inline(always)] 110 | pub fn hash_entity(entity: &str) -> u64 { 111 | let mut hasher = XxHash64::default(); 112 | hasher.write(entity.as_bytes()); 113 | hasher.finish() 114 | } 115 | 116 | #[cfg(test)] 117 | mod tests { 118 | use smallvec::{smallvec, SmallVec}; 119 | 120 | use crate::entity::{Hyperedge, SMALL_VECTOR_SIZE}; 121 | 122 | #[test] 123 | fn generate_cartesian_product_hashes() { 124 | // hashes for entities in every column 125 | // column_1: 1 entity 126 | // column_2: 2 entities 127 | // column_3: 3 entities 128 | let slices = [0..2, 2..5]; 129 | let hashes: SmallVec<[u64; SMALL_VECTOR_SIZE]> = smallvec![10, 20, 30, 40, 50]; 130 | let hyperedge = Hyperedge { hashes, slices }; 131 | let combinations: Vec<_> = hyperedge.edges_iter(0, 1).collect(); 132 | assert_eq!((10, 30), *combinations.get(0).unwrap()); 133 | assert_eq!((10, 40), *combinations.get(1).unwrap()); 134 | assert_eq!((10, 50), *combinations.get(2).unwrap()); 135 | assert_eq!((20, 30), *combinations.get(3).unwrap()); 136 | assert_eq!((20, 40), *combinations.get(4).unwrap()); 137 | assert_eq!((20, 50), *combinations.get(5).unwrap()); 138 | assert_eq!(None, combinations.get(6)); 139 | } 140 | } 141 | -------------------------------------------------------------------------------- /legacy/src/configuration.rs: -------------------------------------------------------------------------------- 1 | #[derive(Debug)] 2 | pub enum FileType { 3 | Json, 4 | Tsv, 5 | } 6 | 7 | #[derive(Debug)] 8 | pub enum OutputFormat { 9 | TextFile, 10 | Numpy, 11 | } 12 | 13 | /// Pipeline configuration 14 | #[derive(Debug)] 15 | pub struct Configuration { 16 | /// Produce or not entity counter to the output file 17 | pub produce_entity_occurrence_count: bool, 18 | 19 | /// Dimension of the embedding 20 | pub embeddings_dimension: u16, 21 | 22 | /// Maximum number of iteration for training 23 | pub max_number_of_iteration: u8, 24 | 25 | /// Seed for embedding initialization 26 | pub seed: Option, 27 | 28 | /// Prepend field name to entity in the output file. 
It differentiates entities with the same 29 | /// name from different columns 30 | pub prepend_field: bool, 31 | 32 | /// After how many lines we log the progress 33 | pub log_every_n: u32, 34 | 35 | /// Calculate embeddings in memory or with memory-mapped files. If we don't have enough 36 | /// RAM we can support training with mmap files 37 | pub in_memory_embedding_calculation: bool, 38 | 39 | /// Paths to the input files 40 | pub input: Vec, 41 | 42 | /// Type of the input file 43 | pub file_type: FileType, 44 | 45 | /// Output directory for files with embeddings 46 | pub output_dir: Option, 47 | 48 | /// Output format 49 | pub output_format: OutputFormat, 50 | 51 | /// Name of the relation, for output filename generation 52 | pub relation_name: String, 53 | 54 | /// Columns configuration 55 | pub columns: Vec, 56 | } 57 | 58 | /// Column configuration 59 | #[derive(Debug, Default)] 60 | pub struct Column { 61 | /// Name, header of the column 62 | pub name: String, 63 | 64 | /// The field is virtual - it is considered during embedding process, no entity is written for the column 65 | pub transient: bool, 66 | 67 | /// The field is composite, containing multiple entity identifiers separated by space 68 | pub complex: bool, 69 | 70 | /// The field is reflexive, which means that it interacts with itself, additional output file is written for every such field 71 | pub reflexive: bool, 72 | 73 | /// The field is ignored, no output file is written for the field 74 | pub ignored: bool, 75 | } 76 | 77 | impl Configuration { 78 | /// Create default configuration with specified input file path and columns. 79 | pub fn default(input: String, columns: Vec) -> Configuration { 80 | Configuration { 81 | produce_entity_occurrence_count: true, 82 | embeddings_dimension: 128, 83 | max_number_of_iteration: 4, 84 | seed: None, 85 | prepend_field: true, 86 | log_every_n: 1000, 87 | in_memory_embedding_calculation: true, 88 | file_type: FileType::Tsv, 89 | input: vec![input], 90 | output_dir: None, 91 | output_format: OutputFormat::TextFile, 92 | relation_name: String::from("emb"), 93 | columns, 94 | } 95 | } 96 | 97 | /// Filter out ignored columns. Entities from such columns are omitted. 98 | pub fn not_ignored_columns(&self) -> Vec<&Column> { 99 | self.columns.iter().filter(|&c| !c.ignored).collect() 100 | } 101 | } 102 | 103 | /// Extract columns config based on raw strings. 
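/// For example, `["complex::reflexive::a", "b", "transient::c"]` yields three `Column`s:
/// "a" with `complex` and `reflexive` set, a plain "b", and "c" with `transient` set.
/// Modifiers are matched case-insensitively; an unrecognized modifier returns an `Err`.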
104 | pub fn extract_fields(cols: Vec<&str>) -> Result, String> { 105 | let mut columns: Vec = Vec::new(); 106 | 107 | for col in cols { 108 | let parts: Vec<&str> = col.split("::").collect(); 109 | 110 | let column_name: &str; 111 | let mut transient = false; 112 | let mut complex = false; 113 | let mut reflexive = false; 114 | let mut ignored = false; 115 | 116 | let parts_len = parts.len(); 117 | if parts_len > 1 { 118 | column_name = *parts.last().unwrap(); 119 | let column_name_idx = parts_len - 1; 120 | for &part in &parts[..column_name_idx] { 121 | if part.eq_ignore_ascii_case("transient") { 122 | transient = true; 123 | } else if part.eq_ignore_ascii_case("complex") { 124 | complex = true; 125 | } else if part.eq_ignore_ascii_case("reflexive") { 126 | reflexive = true; 127 | } else if part.eq_ignore_ascii_case("ignore") { 128 | ignored = true; 129 | } else { 130 | let message = format!("Unrecognized column field modifier: {}", part); 131 | return Err(message); 132 | } 133 | } 134 | } else { 135 | column_name = col; 136 | } 137 | let column = Column { 138 | name: column_name.to_string(), 139 | transient, 140 | complex, 141 | reflexive, 142 | ignored, 143 | }; 144 | columns.push(column); 145 | } 146 | Ok(columns) 147 | } 148 | 149 | /// Validate column modifiers. 150 | pub fn validate_fields(cols: Vec) -> Result, String> { 151 | for col in &cols { 152 | // transient::reflexive - this would generate no output 153 | // transient::reflexive::complex - this would generate no output 154 | if col.reflexive && col.transient { 155 | let message = format!("A field cannot be REFLEXIVE and simultaneously TRANSIENT. It does not make sense: {}", col.name); 156 | return Err(message); 157 | } 158 | if col.reflexive && !col.complex { 159 | let message = format!( 160 | "A field cannot be REFLEXIVE but NOT COMPLEX. 
It does not make sense: {}", 161 | col.name 162 | ); 163 | return Err(message); 164 | } 165 | } 166 | Ok(cols) 167 | } 168 | -------------------------------------------------------------------------------- /.github/workflows/CI.yml: -------------------------------------------------------------------------------- 1 | # This file is autogenerated by maturin v1.7.4 2 | # To update, run 3 | # 4 | # maturin generate-ci github 5 | # 6 | name: CI 7 | 8 | on: 9 | push: 10 | branches: 11 | - main 12 | - master 13 | tags: 14 | - '*' 15 | pull_request: 16 | workflow_dispatch: 17 | 18 | permissions: 19 | contents: read 20 | 21 | jobs: 22 | linux: 23 | runs-on: ${{ matrix.platform.runner }} 24 | strategy: 25 | matrix: 26 | platform: 27 | - runner: ubuntu-latest 28 | target: x86_64 29 | - runner: ubuntu-latest 30 | target: x86 31 | - runner: ubuntu-latest 32 | target: aarch64 33 | # - runner: ubuntu-latest 34 | # target: armv7 35 | # - runner: ubuntu-latest 36 | # target: s390x 37 | # - runner: ubuntu-latest 38 | # target: ppc64le 39 | steps: 40 | - uses: actions/checkout@v4 41 | - name: Install Dependencies 42 | run: | 43 | sudo apt-get update 44 | sudo apt-get install -y musl-tools gcc musl-dev 45 | - uses: actions/setup-python@v5 46 | with: 47 | python-version: 3.x 48 | - name: Build wheels 49 | uses: PyO3/maturin-action@v1 50 | with: 51 | target: ${{ matrix.platform.target }} 52 | args: --release --out dist --find-interpreter 53 | sccache: 'true' 54 | manylinux: auto 55 | - name: Upload wheels 56 | uses: actions/upload-artifact@v4 57 | with: 58 | name: wheels-linux-${{ matrix.platform.target }} 59 | path: dist 60 | 61 | # musllinux: 62 | # runs-on: ${{ matrix.platform.runner }} 63 | # strategy: 64 | # matrix: 65 | # platform: 66 | # - runner: ubuntu-latest 67 | # target: x86_64 68 | # - runner: ubuntu-latest 69 | # target: x86 70 | # - runner: ubuntu-latest 71 | # target: aarch64 72 | # - runner: ubuntu-latest 73 | # target: armv7 74 | # steps: 75 | # - uses: actions/checkout@v4 76 | # - name: Install Dependencies 77 | # run: | 78 | # sudo apt-get update 79 | # sudo apt-get install -y musl-tools gcc musl-dev 80 | # - uses: actions/setup-python@v5 81 | # with: 82 | # python-version: 3.x 83 | # - name: Build wheels 84 | # uses: PyO3/maturin-action@v1 85 | # with: 86 | # target: ${{ matrix.platform.target }} 87 | # args: --release --out dist --find-interpreter 88 | # sccache: 'true' 89 | # manylinux: musllinux_1_2 90 | # - name: Upload wheels 91 | # uses: actions/upload-artifact@v4 92 | # with: 93 | # name: wheels-musllinux-${{ matrix.platform.target }} 94 | # path: dist 95 | 96 | windows: 97 | runs-on: ${{ matrix.platform.runner }} 98 | strategy: 99 | matrix: 100 | platform: 101 | - runner: windows-latest 102 | target: x64 103 | - runner: windows-latest 104 | target: x86 105 | steps: 106 | - uses: actions/checkout@v4 107 | - uses: actions/setup-python@v5 108 | with: 109 | python-version: 3.x 110 | architecture: ${{ matrix.platform.target }} 111 | - name: Build wheels 112 | uses: PyO3/maturin-action@v1 113 | with: 114 | target: ${{ matrix.platform.target }} 115 | args: --release --out dist --find-interpreter 116 | sccache: 'true' 117 | - name: Upload wheels 118 | uses: actions/upload-artifact@v4 119 | with: 120 | name: wheels-windows-${{ matrix.platform.target }} 121 | path: dist 122 | 123 | macos: 124 | runs-on: ${{ matrix.platform.runner }} 125 | strategy: 126 | matrix: 127 | platform: 128 | # - runner: macos-12 129 | # target: x86_64 130 | - runner: macos-14 131 | target: aarch64 132 | steps: 133 | 
- uses: actions/checkout@v4 134 | - uses: actions/setup-python@v5 135 | with: 136 | python-version: 3.x 137 | - name: Build wheels 138 | uses: PyO3/maturin-action@v1 139 | with: 140 | target: ${{ matrix.platform.target }} 141 | args: --release --out dist --find-interpreter 142 | sccache: 'true' 143 | - name: Upload wheels 144 | uses: actions/upload-artifact@v4 145 | with: 146 | name: wheels-macos-${{ matrix.platform.target }} 147 | path: dist 148 | 149 | sdist: 150 | runs-on: ubuntu-latest 151 | steps: 152 | - uses: actions/checkout@v4 153 | - name: Build sdist 154 | uses: PyO3/maturin-action@v1 155 | with: 156 | command: sdist 157 | args: --out dist 158 | - name: Upload sdist 159 | uses: actions/upload-artifact@v4 160 | with: 161 | name: wheels-sdist 162 | path: dist 163 | 164 | release: 165 | name: Release 166 | runs-on: ubuntu-latest 167 | if: ${{ startsWith(github.ref, 'refs/tags/') || github.event_name == 'workflow_dispatch' }} 168 | needs: [linux, windows, macos, sdist] 169 | permissions: 170 | # Use to sign the release artifacts 171 | id-token: write 172 | # Used to upload release artifacts 173 | contents: write 174 | # Used to generate artifact attestation 175 | attestations: write 176 | steps: 177 | - uses: actions/download-artifact@v4 178 | - name: Generate artifact attestation 179 | uses: actions/attest-build-provenance@v1 180 | with: 181 | subject-path: 'wheels-*/*' 182 | - name: Publish to PyPI 183 | if: "startsWith(github.ref, 'refs/tags/')" 184 | uses: PyO3/maturin-action@v1 185 | env: 186 | MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }} 187 | with: 188 | command: upload 189 | args: --non-interactive --skip-existing wheels-*/* -------------------------------------------------------------------------------- /legacy/src/pipeline.rs: -------------------------------------------------------------------------------- 1 | use std::fs::File; 2 | use std::io::{BufRead, BufReader}; 3 | 4 | use crate::configuration::{Column, Configuration, FileType, OutputFormat}; 5 | use crate::embedding::{calculate_embeddings, calculate_embeddings_mmap}; 6 | use crate::entity::{EntityProcessor, SMALL_VECTOR_SIZE}; 7 | use crate::persistence::embedding::{EmbeddingPersistor, NpyPersistor, TextFileVectorPersistor}; 8 | use crate::persistence::entity::InMemoryEntityMappingPersistor; 9 | use crate::sparse_matrix::{create_sparse_matrices, SparseMatrix}; 10 | use bus::Bus; 11 | use log::{error, info, warn}; 12 | use simdjson_rust::dom; 13 | use smallvec::{smallvec, SmallVec}; 14 | use std::sync::Arc; 15 | use std::thread; 16 | 17 | /// Create SparseMatrix'es based on columns config. Every SparseMatrix operates in separate 18 | /// thread. EntityProcessor reads data in main thread and broadcast cartesian products 19 | /// to SparseMatrix'es. 
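/// Each matrix thread consumes hashed rows from a broadcast `Bus`, updates its own matrix via
/// `handle_pair`, and finishes once the bus is dropped; the joined threads then return the
/// completed `SparseMatrix` values, which are handed back to the caller for training.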
20 | pub fn build_graphs( 21 | config: &Configuration, 22 | in_memory_entity_mapping_persistor: Arc, 23 | ) -> Vec { 24 | let sparse_matrices = create_sparse_matrices(&config.columns); 25 | dbg!(&sparse_matrices); 26 | 27 | let mut bus: Bus> = Bus::new(128); 28 | let mut sparse_matrix_threads = Vec::new(); 29 | for mut sparse_matrix in sparse_matrices { 30 | let rx = bus.add_rx(); 31 | let handle = thread::spawn(move || { 32 | for received in rx { 33 | sparse_matrix.handle_pair(&received); 34 | } 35 | sparse_matrix.finish(); 36 | sparse_matrix 37 | }); 38 | sparse_matrix_threads.push(handle); 39 | } 40 | 41 | for input in config.input.iter() { 42 | let mut entity_processor = EntityProcessor::new( 43 | config, 44 | in_memory_entity_mapping_persistor.clone(), 45 | |hashes| { 46 | bus.broadcast(hashes); 47 | }, 48 | ); 49 | 50 | match &config.file_type { 51 | FileType::Json => { 52 | let mut parser = dom::Parser::default(); 53 | read_file(input, config.log_every_n as u64, move |line| { 54 | let row = parse_json_line(line, &mut parser, &config.columns); 55 | entity_processor.process_row(&row); 56 | }); 57 | } 58 | FileType::Tsv => { 59 | let config_col_num = config.columns.len(); 60 | read_file(input, config.log_every_n as u64, move |line| { 61 | let row = parse_tsv_line(line); 62 | let line_col_num = row.len(); 63 | if line_col_num == config_col_num { 64 | entity_processor.process_row(&row); 65 | } else { 66 | warn!("Wrong number of columns (expected: {}, provided: {}). The line [{}] is skipped.", config_col_num, line_col_num, line); 67 | } 68 | }); 69 | } 70 | } 71 | } 72 | 73 | drop(bus); 74 | 75 | let mut sparse_matrices = vec![]; 76 | for join_handle in sparse_matrix_threads { 77 | let sparse_matrix = join_handle 78 | .join() 79 | .expect("Couldn't join on the associated thread"); 80 | sparse_matrices.push(sparse_matrix); 81 | } 82 | 83 | sparse_matrices 84 | } 85 | 86 | /// Read file line by line. Pass every valid line to handler for parsing. 87 | fn read_file(filepath: &str, log_every: u64, mut line_handler: F) 88 | where 89 | F: FnMut(&str), 90 | { 91 | let input_file = File::open(filepath).expect("Can't open file"); 92 | let mut buffered = BufReader::new(input_file); 93 | 94 | let mut line_number = 1u64; 95 | let mut line = String::new(); 96 | loop { 97 | match buffered.read_line(&mut line) { 98 | Ok(bytes_read) => { 99 | // EOF 100 | if bytes_read == 0 { 101 | break; 102 | } 103 | 104 | line_handler(&line); 105 | } 106 | Err(err) => { 107 | error!("Can't read line number: {}. Error: {}.", line_number, err); 108 | } 109 | }; 110 | 111 | // clear to reuse the buffer 112 | line.clear(); 113 | 114 | if line_number % log_every == 0 { 115 | info!("Number of lines processed: {}", line_number); 116 | } 117 | 118 | line_number += 1; 119 | } 120 | } 121 | 122 | /// Parse a line of JSON and read its columns into a vector for processing. 
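/// Simple columns must hold a single value and complex columns a JSON array; for columns
/// `a` and `complex::b`, the line `{"a": "x", "b": ["y", "z"]}` is read as the row
/// `[["x"], ["y", "z"]]`. Non-string values are minified back to their JSON text.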
123 | fn parse_json_line( 124 | line: &str, 125 | parser: &mut dom::Parser, 126 | columns: &[Column], 127 | ) -> Vec> { 128 | let parsed = parser.parse(line).unwrap(); 129 | columns 130 | .iter() 131 | .map(|c| { 132 | if !c.complex { 133 | let elem = parsed.at_key(&c.name).unwrap(); 134 | let value = match elem.get_type() { 135 | dom::element::ElementType::String => elem.get_string().unwrap(), 136 | _ => elem.minify(), 137 | }; 138 | smallvec![value] 139 | } else { 140 | parsed 141 | .at_key(&c.name) 142 | .unwrap() 143 | .get_array() 144 | .expect("Values for complex columns must be arrays") 145 | .into_iter() 146 | .map(|v| match v.get_type() { 147 | dom::element::ElementType::String => v.get_string().unwrap(), 148 | _ => v.minify(), 149 | }) 150 | .collect() 151 | } 152 | }) 153 | .collect() 154 | } 155 | 156 | /// Parse a line of TSV and read its columns into a vector for processing. 157 | fn parse_tsv_line(line: &str) -> Vec> { 158 | let values = line.trim().split('\t'); 159 | values.map(|c| c.split(' ').collect()).collect() 160 | } 161 | 162 | /// Train SparseMatrix'es (graphs) in separated threads. 163 | pub fn train( 164 | config: Configuration, 165 | in_memory_entity_mapping_persistor: Arc, 166 | sparse_matrices: Vec, 167 | ) { 168 | let config = Arc::new(config); 169 | let mut embedding_threads = Vec::new(); 170 | for sparse_matrix in sparse_matrices { 171 | let sparse_matrix = Arc::new(sparse_matrix); 172 | let config = config.clone(); 173 | let in_memory_entity_mapping_persistor = in_memory_entity_mapping_persistor.clone(); 174 | let handle = thread::spawn(move || { 175 | let directory = match config.output_dir.as_ref() { 176 | Some(out) => format!("{}/", out.clone()), 177 | None => String::from(""), 178 | }; 179 | let ofp = format!( 180 | "{}{}__{}__{}.out", 181 | directory, 182 | config.relation_name, 183 | sparse_matrix.col_a_name.as_str(), 184 | sparse_matrix.col_b_name.as_str() 185 | ); 186 | 187 | let mut persistor: Box = match &config.output_format { 188 | OutputFormat::TextFile => Box::new(TextFileVectorPersistor::new( 189 | ofp, 190 | config.produce_entity_occurrence_count, 191 | )), 192 | OutputFormat::Numpy => Box::new(NpyPersistor::new( 193 | ofp, 194 | config.produce_entity_occurrence_count, 195 | )), 196 | }; 197 | if config.in_memory_embedding_calculation { 198 | calculate_embeddings( 199 | config.clone(), 200 | sparse_matrix.clone(), 201 | in_memory_entity_mapping_persistor, 202 | persistor.as_mut(), 203 | ); 204 | } else { 205 | calculate_embeddings_mmap( 206 | config.clone(), 207 | sparse_matrix.clone(), 208 | in_memory_entity_mapping_persistor, 209 | persistor.as_mut(), 210 | ); 211 | } 212 | }); 213 | embedding_threads.push(handle); 214 | } 215 | 216 | for join_handle in embedding_threads { 217 | join_handle 218 | .join() 219 | .expect("Couldn't join on the associated thread"); 220 | } 221 | } 222 | -------------------------------------------------------------------------------- /src/pipeline.rs: -------------------------------------------------------------------------------- 1 | use std::cmp::min; 2 | use std::fs::File; 3 | use std::io::{BufRead, BufReader}; 4 | use std::sync::Arc; 5 | use std::time::Instant; 6 | 7 | use crossbeam::channel; 8 | use crossbeam::channel::{Receiver, Sender}; 9 | use crossbeam::thread as cb_thread; 10 | use crossbeam::thread::{Scope, ScopedJoinHandle}; 11 | use itertools::Itertools; 12 | use log::{error, info, warn}; 13 | use smallvec::SmallVec; 14 | 15 | use crate::configuration::Configuration; 16 | use 
crate::entity::{EntityProcessor, Hyperedge, SMALL_VECTOR_SIZE}; 17 | use crate::sparse_matrix::SparseMatrix; 18 | use crate::sparse_matrix_builder::NodeIndexerBuilder; 19 | use crate::sparse_matrix_builder::{ 20 | AsyncNodeIndexerBuilder, NodeIndexer, SparseMatrixBuffer, SparseMatrixBuffersReducer, 21 | SyncNodeIndexerBuilder, 22 | }; 23 | 24 | pub fn build_graph_from_iterator<'a>( 25 | config: &Configuration, 26 | hyperedges: impl Iterator, 27 | ) -> SparseMatrix { 28 | cb_thread::scope(|s| { 29 | let (hyperedges_s, hyperedges_r) = channel::bounded(64 * config.num_workers_graph_building); 30 | 31 | // Consumer first, producer second to avoid deadlock 32 | let matrix_buffer = make_consumer(hyperedges_r, config, s); 33 | let node_indexer = make_producer_from_iterator(config, hyperedges, hyperedges_s); 34 | 35 | let buffers = matrix_buffer 36 | .into_iter() 37 | .map(|h| h.join().unwrap()) 38 | .collect_vec(); 39 | SparseMatrixBuffersReducer::new(node_indexer, buffers, config.num_workers_graph_building) 40 | .reduce() 41 | }) 42 | .expect("All work in thread scope finished") 43 | } 44 | 45 | fn make_producer_from_iterator<'a>( 46 | config: &Configuration, 47 | hyperedges: impl Iterator, 48 | hyperedges_s: Sender, 49 | ) -> NodeIndexer { 50 | let node_indexer_builder: Arc = Default::default(); 51 | let entity_processor = EntityProcessor::new(config, node_indexer_builder.clone()); 52 | for line in hyperedges { 53 | consume_line(config, &hyperedges_s, &entity_processor, line); 54 | } 55 | drop(entity_processor); 56 | let node_indexer_builder = 57 | Arc::try_unwrap(node_indexer_builder).expect("All other references should be dropped"); 58 | node_indexer_builder.finish() 59 | } 60 | 61 | fn consume_line( 62 | config: &Configuration, 63 | hyperedges_s: &Sender, 64 | entity_processor: &EntityProcessor, 65 | line: &str, 66 | ) { 67 | let row = parse_tsv_line(line); 68 | let line_col_num = row.len(); 69 | if line_col_num == config.columns.len() { 70 | let hyperedge = entity_processor.process_row_and_get_edges(&row); 71 | hyperedges_s.send(hyperedge).unwrap(); 72 | } else { 73 | warn!( 74 | "Wrong number of columns (expected: {}, provided: {}). 
The line [{}] is skipped.", 75 | config.columns.len(), 76 | line_col_num, 77 | line 78 | ); 79 | } 80 | } 81 | 82 | pub fn build_graph_from_files(config: &Configuration, input_files: Vec) -> SparseMatrix { 83 | let processing_worker_num = config.num_workers_graph_building; 84 | cb_thread::scope(|s| { 85 | let (hyperedges_s, hyperedges_r) = channel::bounded(processing_worker_num * 64); 86 | 87 | // Consumer first, producer second to avoid deadlock 88 | let matrix_buffers: Vec<_> = make_consumer(hyperedges_r, config, s); 89 | let node_indexer = make_producer_from_files(config, &input_files, s, hyperedges_s); 90 | 91 | let buffers = matrix_buffers 92 | .into_iter() 93 | .map(|h| h.join().unwrap()) 94 | .collect_vec(); 95 | 96 | let merging_start_time = Instant::now(); 97 | let result = 98 | SparseMatrixBuffersReducer::new(node_indexer, buffers, processing_worker_num).reduce(); 99 | info!( 100 | "Merging finished in {} sec", 101 | merging_start_time.elapsed().as_secs() 102 | ); 103 | result 104 | }) 105 | .expect("Threads finished work") 106 | } 107 | 108 | fn make_producer_from_files<'c: 'e, 'e: 's, 's>( 109 | config: &'c Configuration, 110 | input_files: &'c Vec, 111 | s: &'s Scope<'e>, 112 | hyperedges_s: Sender, 113 | ) -> NodeIndexer { 114 | let (files_s, files_r) = channel::unbounded(); 115 | 116 | for input in input_files { 117 | files_s.send(input).unwrap() 118 | } 119 | drop(files_s); 120 | 121 | let max_file_reading_worker_num = min(config.num_workers_graph_building, 4); 122 | let file_reading_worker_num = min(max_file_reading_worker_num, input_files.len()); 123 | 124 | let log_every_n = 10000; 125 | 126 | if file_reading_worker_num == 1 { 127 | let node_indexer_builder: Arc = Default::default(); 128 | let entity_processor = EntityProcessor::new(config, node_indexer_builder.clone()); 129 | consume_files(config, hyperedges_s, files_r, log_every_n, entity_processor); 130 | let node_indexer_builder = 131 | Arc::try_unwrap(node_indexer_builder).expect("All other references should be dropped"); 132 | node_indexer_builder.finish() 133 | } else { 134 | let node_indexer_builder: Arc = Default::default(); 135 | let producers = (0..file_reading_worker_num) 136 | .map(|_| { 137 | let hyperedges_s = hyperedges_s.clone(); 138 | let files_r = files_r.clone(); 139 | let entity_processor = EntityProcessor::new(config, node_indexer_builder.clone()); 140 | 141 | s.spawn(move |_| { 142 | consume_files(config, hyperedges_s, files_r, log_every_n, entity_processor); 143 | }) 144 | }) 145 | .collect_vec(); 146 | drop(hyperedges_s); // hyperedges_s got distributed among producers, drop seed object 147 | drop(files_r); 148 | 149 | producers.into_iter().for_each(|h| h.join().unwrap()); 150 | let node_indexer_builder = 151 | Arc::try_unwrap(node_indexer_builder).expect("All other references should be dropped"); 152 | node_indexer_builder.finish() 153 | } 154 | } 155 | 156 | fn consume_files( 157 | config: &Configuration, 158 | hyperedges_s: Sender, 159 | files_r: Receiver<&String>, 160 | log_every_n: u64, 161 | entity_processor: EntityProcessor, 162 | ) { 163 | for input in files_r { 164 | read_file(input, log_every_n, |line| { 165 | consume_line(config, &hyperedges_s, &entity_processor, line); 166 | }); 167 | } 168 | } 169 | 170 | fn make_consumer<'s, 'a: 'a>( 171 | hyperedges_r: Receiver, 172 | config: &'a Configuration, 173 | s: &'s Scope<'a>, 174 | ) -> Vec> { 175 | (0..config.num_workers_graph_building) 176 | .map(|_| { 177 | let hyperedges_r = hyperedges_r.clone(); 178 | let sparse_matrices = 
config.matrix_desc.clone(); 179 | 180 | s.spawn(move |_| { 181 | let mut buffer = sparse_matrices.make_buffer(config.hyperedge_trim_n); 182 | for hyperedge in hyperedges_r { 183 | buffer.handle_hyperedge(&hyperedge); 184 | } 185 | buffer 186 | }) 187 | }) 188 | .collect() 189 | } 190 | 191 | /// Read file line by line. Pass every valid line to handler for parsing. 192 | fn read_file(filepath: &str, log_every: u64, mut line_handler: F) 193 | where 194 | F: FnMut(&str), 195 | { 196 | let input_file = File::open(filepath).expect("Can't open file"); 197 | let mut buffered = BufReader::new(input_file); 198 | 199 | let mut line_number = 1u64; 200 | let mut line = String::new(); 201 | loop { 202 | match buffered.read_line(&mut line) { 203 | Ok(bytes_read) => { 204 | // EOF 205 | if bytes_read == 0 { 206 | break; 207 | } 208 | 209 | line_handler(&line); 210 | } 211 | Err(err) => { 212 | error!("Can't read line number: {}. Error: {}.", line_number, err); 213 | } 214 | }; 215 | 216 | // clear to reuse the buffer 217 | line.clear(); 218 | 219 | if line_number % log_every == 0 { 220 | info!("Number of lines processed: {}", line_number); 221 | } 222 | 223 | line_number += 1; 224 | } 225 | } 226 | 227 | /// Parse a line of TSV and read its columns into a vector for processing. 228 | fn parse_tsv_line(line: &str) -> Vec> { 229 | let values = line.trim().split('\t'); 230 | values.map(|c| c.split(' ').collect()).collect() 231 | } 232 | -------------------------------------------------------------------------------- /legacy/src/main.rs: -------------------------------------------------------------------------------- 1 | use std::time::Instant; 2 | 3 | use clap::{crate_authors, crate_description, crate_name, crate_version, Arg, Command}; 4 | use cleora::configuration; 5 | use cleora::configuration::Configuration; 6 | use cleora::configuration::OutputFormat; 7 | use cleora::persistence::entity::InMemoryEntityMappingPersistor; 8 | use cleora::pipeline::{build_graphs, train}; 9 | use env_logger::Env; 10 | use std::fs; 11 | use std::sync::Arc; 12 | 13 | #[macro_use] 14 | extern crate log; 15 | 16 | fn main() { 17 | let env = Env::default() 18 | .filter_or("MY_LOG_LEVEL", "info") 19 | .write_style_or("MY_LOG_STYLE", "always"); 20 | env_logger::init_from_env(env); 21 | 22 | let now = Instant::now(); 23 | 24 | let matches = Command::new(crate_name!()) 25 | .version(crate_version!()) 26 | .author(crate_authors!()) 27 | .about(crate_description!()) 28 | .arg( 29 | Arg::new("inputs") 30 | .multiple_values(true) 31 | .help("Input files paths") 32 | .takes_value(true), 33 | ) 34 | .arg( 35 | Arg::new("input") 36 | .short('i') 37 | .long("input") 38 | .help("Deprecated. 
Use positional args for input files") 39 | .takes_value(true), 40 | ) 41 | .arg( 42 | Arg::new("file-type") 43 | .short('t') 44 | .long("type") 45 | .possible_values(&["tsv", "json"]) 46 | .help("Input file type") 47 | .takes_value(true), 48 | ) 49 | .arg( 50 | Arg::new("output-dir") 51 | .short('o') 52 | .long("output-dir") 53 | .help("Output directory for files with embeddings") 54 | .takes_value(true), 55 | ) 56 | .arg( 57 | Arg::new("dimension") 58 | .short('d') 59 | .long("dimension") 60 | .required(true) 61 | .help("Embedding dimension size") 62 | .takes_value(true), 63 | ) 64 | .arg( 65 | Arg::new("number-of-iterations") 66 | .short('n') 67 | .long("number-of-iterations") 68 | .required(true) 69 | .help("Max number of iterations") 70 | .takes_value(true), 71 | ) 72 | .arg( 73 | Arg::new("seed") 74 | .short('s') 75 | .long("seed") 76 | .help("Seed (integer) for embedding initialization") 77 | .takes_value(true), 78 | ) 79 | .arg( 80 | Arg::new("columns") 81 | .short('c') 82 | .long("columns") 83 | .required(true) 84 | .help( 85 | "Column names (max 12), with modifiers: [transient::, reflexive::, complex::]", 86 | ) 87 | .takes_value(true), 88 | ) 89 | .arg( 90 | Arg::new("relation-name") 91 | .short('r') 92 | .long("relation-name") 93 | .default_value("emb") 94 | .help("Name of the relation, for output filename generation") 95 | .takes_value(true), 96 | ) 97 | .arg( 98 | Arg::new("prepend-field-name") 99 | .short('p') 100 | .long("prepend-field-name") 101 | .possible_values(&["0", "1"]) 102 | .default_value("0") 103 | .help("Prepend field name to entity in output") 104 | .takes_value(true), 105 | ) 106 | .arg( 107 | Arg::new("log-every-n") 108 | .short('l') 109 | .long("log-every-n") 110 | .default_value("10000") 111 | .help("Log output every N lines") 112 | .takes_value(true), 113 | ) 114 | .arg( 115 | Arg::new("in-memory-embedding-calculation") 116 | .short('e') 117 | .long("in-memory-embedding-calculation") 118 | .possible_values(&["0", "1"]) 119 | .default_value("1") 120 | .help("Calculate embeddings in memory or with memory-mapped files") 121 | .takes_value(true), 122 | ) 123 | .arg( 124 | Arg::new("output-format") 125 | .short('f') 126 | .help("Output format. 
One of: textfile|numpy") 127 | .possible_values(&["textfile", "numpy"]) 128 | .default_value("textfile") 129 | .takes_value(true), 130 | ) 131 | .get_matches(); 132 | 133 | info!("Reading args..."); 134 | 135 | let input: Vec = { 136 | let named_arg = matches.value_of("input"); 137 | let position_args = match matches.values_of("inputs") { 138 | None => vec![], 139 | Some(values) => values.into_iter().collect(), 140 | }; 141 | position_args 142 | .into_iter() 143 | .chain(named_arg.into_iter()) 144 | .map(|s| s.to_string()) 145 | .collect() 146 | }; 147 | if input.is_empty() { 148 | panic!("Missing input files") 149 | } 150 | 151 | let file_type = match matches.value_of("file-type") { 152 | Some(type_name) => match type_name { 153 | "tsv" => configuration::FileType::Tsv, 154 | "json" => configuration::FileType::Json, 155 | _ => panic!("Invalid file type {}", type_name), 156 | }, 157 | None => configuration::FileType::Tsv, 158 | }; 159 | let output_dir = matches.value_of("output-dir").map(|s| s.to_string()); 160 | // try to create output directory for files with embeddings 161 | if let Some(output_dir) = output_dir.as_ref() { 162 | fs::create_dir_all(output_dir).expect("Can't create output directory"); 163 | } 164 | let dimension: u16 = matches.value_of("dimension").unwrap().parse().unwrap(); 165 | let max_iter: u8 = matches 166 | .value_of("number-of-iterations") 167 | .unwrap() 168 | .parse() 169 | .unwrap(); 170 | let seed: Option = matches.value_of("seed").map(|s| s.parse().unwrap()); 171 | let relation_name = matches.value_of("relation-name").unwrap(); 172 | let prepend_field_name = { 173 | let value: u8 = matches 174 | .value_of("prepend-field-name") 175 | .unwrap() 176 | .parse() 177 | .unwrap(); 178 | value == 1 179 | }; 180 | let log_every: u32 = matches.value_of("log-every-n").unwrap().parse().unwrap(); 181 | let in_memory_embedding_calculation = { 182 | let value: u8 = matches 183 | .value_of("in-memory-embedding-calculation") 184 | .unwrap() 185 | .parse() 186 | .unwrap(); 187 | value == 1 188 | }; 189 | let columns = { 190 | let cols_str = matches.value_of("columns").unwrap(); 191 | let cols_str_separated: Vec<&str> = cols_str.split(' ').collect(); 192 | match configuration::extract_fields(cols_str_separated) { 193 | Ok(cols) => match configuration::validate_fields(cols) { 194 | Ok(validated_cols) => validated_cols, 195 | Err(msg) => panic!("Invalid column fields. Message: {}", msg), 196 | }, 197 | Err(msg) => panic!("Parsing problem. 
Message: {}", msg), 198 | } 199 | }; 200 | 201 | let output_format = match matches.value_of("output-format").unwrap() { 202 | "textfile" => OutputFormat::TextFile, 203 | "numpy" => OutputFormat::Numpy, 204 | _ => panic!("unsupported output format"), 205 | }; 206 | 207 | let config = Configuration { 208 | produce_entity_occurrence_count: true, 209 | embeddings_dimension: dimension, 210 | max_number_of_iteration: max_iter, 211 | seed, 212 | prepend_field: prepend_field_name, 213 | log_every_n: log_every, 214 | in_memory_embedding_calculation, 215 | input, 216 | file_type, 217 | output_dir, 218 | output_format, 219 | relation_name: relation_name.to_string(), 220 | columns, 221 | }; 222 | dbg!(&config); 223 | 224 | info!("Starting calculation..."); 225 | let in_memory_entity_mapping_persistor = InMemoryEntityMappingPersistor::default(); 226 | let in_memory_entity_mapping_persistor = Arc::new(in_memory_entity_mapping_persistor); 227 | 228 | let sparse_matrices = build_graphs(&config, in_memory_entity_mapping_persistor.clone()); 229 | info!( 230 | "Finished Sparse Matrices calculation in {} sec", 231 | now.elapsed().as_secs() 232 | ); 233 | 234 | train(config, in_memory_entity_mapping_persistor, sparse_matrices); 235 | info!("Finished in {} sec", now.elapsed().as_secs()); 236 | } 237 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | use std::cmp::min; 2 | use std::collections::hash_map::DefaultHasher; 3 | use std::collections::HashMap; 4 | use std::hash::Hasher; 5 | 6 | use bincode::{deserialize, serialize}; 7 | use ndarray::{Array1, Array2, ArrayViewMut2, Axis, Ix1, Ix2}; 8 | use numpy::{PyArray, PyArray2, ToPyArray}; 9 | use pyo3::exceptions::PyValueError; 10 | use pyo3::prelude::*; 11 | use pyo3::types::{PyBytes, PyIterator, PyString, PyTuple}; 12 | use rayon::iter::IndexedParallelIterator; 13 | use rayon::iter::ParallelIterator; 14 | use rayon::iter::{IntoParallelIterator, IntoParallelRefIterator}; 15 | 16 | use crate::configuration::Configuration; 17 | use crate::embedding::{MarkovType, NdArrayMatrix}; 18 | use crate::entity::hash_entity; 19 | use crate::pipeline::{build_graph_from_files, build_graph_from_iterator}; 20 | use crate::sparse_matrix::{create_sparse_matrix_descriptor, SparseMatrix, SparseMatrixDescriptor}; 21 | 22 | pub mod configuration; 23 | pub mod embedding; 24 | pub mod entity; 25 | pub mod pipeline; 26 | pub mod sparse_matrix; 27 | pub mod sparse_matrix_builder; 28 | 29 | // Methods not exposed to python 30 | impl SparseMatrix { 31 | fn markov_propagate<'py>( 32 | &self, 33 | x: &'py PyArray2, 34 | markov_type: MarkovType, 35 | num_workers: Option, 36 | ) -> &'py PyArray { 37 | let array = unsafe { x.as_array() }; 38 | let multiplication_workers: usize = num_workers.unwrap_or_else(num_cpus::get); 39 | let propagated = NdArrayMatrix::multiply(self, array, markov_type, multiplication_workers); 40 | propagated.to_pyarray(x.py()) 41 | } 42 | 43 | pub fn from_rust_iterator<'a>( 44 | columns: &str, 45 | hyperedge_trim_n: usize, 46 | hyperedges: impl Iterator, 47 | num_workers: Option, 48 | ) -> Result { 49 | let columns = configuration::parse_fields(columns).expect("Columns should be valid"); 50 | let matrix_desc = create_sparse_matrix_descriptor(&columns)?; 51 | let config = Configuration { 52 | seed: None, 53 | columns, 54 | matrix_desc, 55 | hyperedge_trim_n, 56 | num_workers_graph_building: num_workers.unwrap_or_else(|| min(num_cpus::get(), 8)), 
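            // When `num_workers` is not given, default to the CPU count capped at 8 workers.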
57 | }; 58 | 59 | Ok(build_graph_from_iterator(&config, hyperedges)) 60 | } 61 | 62 | fn initialize_deterministically_rust(&self, mut vectors: ArrayViewMut2, seed: i64) { 63 | vectors 64 | .axis_iter_mut(Axis(0)) 65 | .into_par_iter() 66 | .enumerate() 67 | .for_each(|(entity_ix, mut row)| { 68 | let entity_id_hash = hash_entity(self.entity_ids[entity_ix].as_str()); 69 | row.indexed_iter_mut().for_each(|(col_ix, v)| { 70 | let value = init_value(col_ix, entity_id_hash, seed); 71 | *v = value 72 | }); 73 | }); 74 | } 75 | } 76 | 77 | #[pymethods] 78 | impl SparseMatrix { 79 | #[pyo3(signature = (x, num_workers = None))] 80 | pub fn left_markov_propagate<'py>( 81 | &self, 82 | x: &'py PyArray2, 83 | num_workers: Option, 84 | ) -> &'py PyArray { 85 | self.markov_propagate(x, MarkovType::Left, num_workers) 86 | } 87 | 88 | #[pyo3(signature = (x, num_workers = None))] 89 | fn symmetric_markov_propagate<'py>( 90 | &self, 91 | x: &'py PyArray2, 92 | num_workers: Option, 93 | ) -> &'py PyArray { 94 | self.markov_propagate(x, MarkovType::Symmetric, num_workers) 95 | } 96 | 97 | #[staticmethod] 98 | #[pyo3(signature = (hyperedges, columns, hyperedge_trim_n = 16, num_workers = None))] 99 | fn from_iterator( 100 | hyperedges: &PyIterator, 101 | columns: &str, 102 | hyperedge_trim_n: usize, 103 | num_workers: Option, 104 | ) -> PyResult { 105 | let hyperedges = hyperedges.map(|line| { 106 | let line = line.expect("Should be proper line"); 107 | let line: &PyString = line 108 | .downcast() 109 | .expect("Iterator elements should be strings"); 110 | let line = line.to_str().expect("Should be proper UTF-8 string"); 111 | line 112 | }); 113 | SparseMatrix::from_rust_iterator(columns, hyperedge_trim_n, hyperedges, num_workers) 114 | .map_err(PyValueError::new_err) 115 | } 116 | 117 | #[staticmethod] 118 | #[pyo3(signature = (filepaths, columns, hyperedge_trim_n = 16, num_workers = None))] 119 | fn from_files( 120 | filepaths: Vec, 121 | columns: &str, 122 | hyperedge_trim_n: usize, 123 | num_workers: Option, 124 | ) -> PyResult { 125 | for filepath in filepaths.iter() { 126 | if !filepath.ends_with(".tsv") { 127 | return Err(PyValueError::new_err("Only .tsv files are supported")); 128 | } 129 | } 130 | 131 | let columns = configuration::parse_fields(columns).expect("Columns should be valid"); 132 | let matrix_desc = 133 | create_sparse_matrix_descriptor(&columns).map_err(PyValueError::new_err)?; 134 | 135 | let config = Configuration { 136 | seed: None, 137 | matrix_desc, 138 | columns, 139 | hyperedge_trim_n, 140 | // TODO consider limiting to some maximum no of workers 141 | num_workers_graph_building: num_workers.unwrap_or_else(num_cpus::get), 142 | }; 143 | Ok(build_graph_from_files(&config, filepaths)) 144 | } 145 | 146 | fn get_entity_column_mask<'py>( 147 | &self, 148 | py: Python<'py>, 149 | column_name: String, 150 | ) -> PyResult<&'py PyArray> { 151 | let column_id_by_name = HashMap::from([ 152 | (&self.descriptor.col_a_name, self.descriptor.col_a_id), 153 | (&self.descriptor.col_b_name, self.descriptor.col_b_id), 154 | ]); 155 | let column_id = column_id_by_name 156 | .get(&column_name) 157 | .ok_or(PyValueError::new_err("Column name invalid"))?; 158 | 159 | let mask: Vec = self 160 | .column_ids 161 | .par_iter() 162 | .map(|id| *id == *column_id) 163 | .collect(); 164 | let mask = Array1::from_vec(mask); 165 | Ok(mask.to_pyarray(py)) 166 | } 167 | 168 | #[getter] 169 | fn entity_degrees<'py>(&self, py: Python<'py>) -> &'py PyArray { 170 | let entity_degrees: Vec = 
self.entities.par_iter().map(|e| e.row_sum).collect(); 171 | Array1::from_vec(entity_degrees).to_pyarray(py) 172 | } 173 | 174 | #[pyo3(signature = (feature_dim, seed = 0))] 175 | fn initialize_deterministically<'py>( 176 | &self, 177 | py: Python<'py>, 178 | feature_dim: usize, 179 | seed: i64, 180 | ) -> &'py PyArray { 181 | let mut vectors = Array2::zeros([self.entity_ids.len(), feature_dim]); 182 | self.initialize_deterministically_rust(vectors.view_mut(), seed); 183 | vectors.to_pyarray(py) 184 | } 185 | 186 | // Stuff needed for pickle to work (new, getstate, setstate) 187 | #[new] 188 | #[pyo3(signature = (*args))] 189 | fn new(args: &PyTuple) -> Self { 190 | match args.len() { 191 | 0 => SparseMatrix { 192 | descriptor: SparseMatrixDescriptor { 193 | col_a_id: 0, 194 | col_a_name: "".to_string(), 195 | col_b_id: 0, 196 | col_b_name: "".to_string(), 197 | }, 198 | entity_ids: vec![], 199 | entities: vec![], 200 | edges: vec![], 201 | slices: vec![], 202 | column_ids: vec![], 203 | }, 204 | _ => panic!("SparseMatrix::new never meant to be called by user. Only 0-arg implementation provided to make pickle happy"), 205 | } 206 | } 207 | 208 | pub fn __getstate__(&self, py: Python) -> PyResult { 209 | Ok(PyBytes::new(py, &serialize(self).unwrap()).to_object(py)) 210 | } 211 | 212 | pub fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> { 213 | match state.extract::<&PyBytes>(py) { 214 | Ok(s) => { 215 | let sm: SparseMatrix = deserialize(s.as_bytes()).unwrap(); 216 | *self = sm; 217 | Ok(()) 218 | } 219 | Err(e) => Err(e), 220 | } 221 | } 222 | } 223 | 224 | fn init_value(col: usize, hsh: u64, fixed_random_value: i64) -> f32 { 225 | let hash = |num: i64| { 226 | let mut hasher = DefaultHasher::new(); 227 | hasher.write_i64(num); 228 | hasher.finish() as i64 229 | }; 230 | 231 | const MAX_HASH_I64: i64 = 8 * 1024 * 1024; 232 | const MAX_HASH_F32: f32 = MAX_HASH_I64 as f32; 233 | ((hash((hsh as i64) + (col as i64) + fixed_random_value) % MAX_HASH_I64) as f32) / MAX_HASH_F32 234 | } 235 | 236 | #[pymodule] 237 | #[pyo3(name = "pycleora")] 238 | fn pycleora(_py: Python, m: &PyModule) -> PyResult<()> { 239 | m.add_class::()?; 240 | Ok(()) 241 | } 242 | -------------------------------------------------------------------------------- /legacy/src/persistence.rs: -------------------------------------------------------------------------------- 1 | pub mod entity { 2 | use rustc_hash::FxHashMap; 3 | use std::sync::RwLock; 4 | 5 | pub trait EntityMappingPersistor { 6 | fn get_entity(&self, hash: u64) -> Option; 7 | fn put_data(&self, hash: u64, entity: String); 8 | fn contains(&self, hash: u64) -> bool; 9 | } 10 | 11 | #[derive(Debug, Default)] 12 | pub struct InMemoryEntityMappingPersistor { 13 | entity_mappings: RwLock>, 14 | } 15 | 16 | impl EntityMappingPersistor for InMemoryEntityMappingPersistor { 17 | fn get_entity(&self, hash: u64) -> Option { 18 | let entity_mappings_read = self.entity_mappings.read().unwrap(); 19 | entity_mappings_read.get(&hash).map(|s| s.to_string()) 20 | } 21 | 22 | fn put_data(&self, hash: u64, entity: String) { 23 | let mut entity_mappings_write = self.entity_mappings.write().unwrap(); 24 | entity_mappings_write.insert(hash, entity); 25 | } 26 | 27 | fn contains(&self, hash: u64) -> bool { 28 | let entity_mappings_read = self.entity_mappings.read().unwrap(); 29 | entity_mappings_read.contains_key(&hash) 30 | } 31 | } 32 | } 33 | 34 | pub mod embedding { 35 | use crate::persistence::embedding::memmap::OwnedMmapArrayViewMut; 36 | use ndarray::{s, 
Array}; 37 | use ndarray_npy::write_zeroed_npy; 38 | use std::fs::File; 39 | use std::io; 40 | use std::io::{BufWriter, Error, ErrorKind, Write}; 41 | 42 | pub trait EmbeddingPersistor { 43 | fn put_metadata(&mut self, entity_count: u32, dimension: u16) -> Result<(), io::Error>; 44 | fn put_data( 45 | &mut self, 46 | entity: &str, 47 | occur_count: u32, 48 | vector: Vec, 49 | ) -> Result<(), io::Error>; 50 | fn finish(&mut self) -> Result<(), io::Error>; 51 | } 52 | 53 | pub struct TextFileVectorPersistor { 54 | buf_writer: BufWriter, 55 | produce_entity_occurrence_count: bool, 56 | } 57 | 58 | impl TextFileVectorPersistor { 59 | pub fn new(filename: String, produce_entity_occurrence_count: bool) -> Self { 60 | let msg = format!("Unable to create file: {}", filename); 61 | let file = File::create(filename).expect(&msg); 62 | TextFileVectorPersistor { 63 | buf_writer: BufWriter::new(file), 64 | produce_entity_occurrence_count, 65 | } 66 | } 67 | } 68 | 69 | impl EmbeddingPersistor for TextFileVectorPersistor { 70 | fn put_metadata(&mut self, entity_count: u32, dimension: u16) -> Result<(), io::Error> { 71 | write!(&mut self.buf_writer, "{} {}", entity_count, dimension)?; 72 | Ok(()) 73 | } 74 | 75 | fn put_data( 76 | &mut self, 77 | entity: &str, 78 | occur_count: u32, 79 | vector: Vec, 80 | ) -> Result<(), io::Error> { 81 | self.buf_writer.write_all(b"\n")?; 82 | self.buf_writer.write_all(entity.as_bytes())?; 83 | 84 | if self.produce_entity_occurrence_count { 85 | write!(&mut self.buf_writer, " {}", occur_count)?; 86 | } 87 | 88 | for &v in &vector { 89 | self.buf_writer.write_all(b" ")?; 90 | let mut buf = ryu::Buffer::new(); // cheap op 91 | self.buf_writer.write_all(buf.format_finite(v).as_bytes())?; 92 | } 93 | 94 | Ok(()) 95 | } 96 | 97 | fn finish(&mut self) -> Result<(), io::Error> { 98 | self.buf_writer.write_all(b"\n")?; 99 | Ok(()) 100 | } 101 | } 102 | 103 | mod memmap { 104 | use memmap::MmapMut; 105 | use ndarray::ArrayViewMut2; 106 | use std::fs::OpenOptions; 107 | use std::io; 108 | use std::io::{Error, ErrorKind}; 109 | use std::ptr::drop_in_place; 110 | 111 | pub struct OwnedMmapArrayViewMut { 112 | mmap_ptr: *mut MmapMut, 113 | mmap_data: Option>, 114 | } 115 | 116 | impl OwnedMmapArrayViewMut { 117 | pub fn new(filename: &str) -> Result { 118 | use ndarray_npy::ViewMutNpyExt; 119 | 120 | let file = OpenOptions::new().read(true).write(true).open(filename)?; 121 | let mmap = unsafe { MmapMut::map_mut(&file)? }; 122 | let mmap = Box::new(mmap); 123 | let mmap = Box::leak(mmap); 124 | let mmap_ptr: *mut MmapMut = mmap as *mut _; 125 | 126 | let mmap_data = ArrayViewMut2::<'static, f32>::view_mut_npy(mmap) 127 | .map_err(|_| Error::new(ErrorKind::Other, "Mmap view error"))?; 128 | 129 | Ok(Self { 130 | mmap_ptr, 131 | mmap_data: Some(mmap_data), 132 | }) 133 | } 134 | 135 | pub fn data_view<'a>(&'a mut self) -> &'a mut ArrayViewMut2<'a, f32> { 136 | let view = self 137 | .mmap_data 138 | .as_mut() 139 | .expect("Should be always defined. None only used in Drop"); 140 | 141 | // SAFETY: shortening lifetime from 'static to 'a is safe because underlying buffer won't be dropped until view is borrowed 142 | unsafe { 143 | core::mem::transmute::< 144 | &mut ArrayViewMut2<'static, f32>, 145 | &mut ArrayViewMut2<'a, f32>, 146 | >(view) 147 | } 148 | } 149 | } 150 | 151 | impl Drop for OwnedMmapArrayViewMut { 152 | fn drop(&mut self) { 153 | // Unwind references with reverse order. 
154 | // First remove view that points to mmap_ptr 155 | self.mmap_data = None; 156 | // And now drop mmap_ptr 157 | // SAFETY: safe because pointer leaked in constructor. 158 | unsafe { drop_in_place(self.mmap_ptr) } 159 | } 160 | } 161 | } 162 | 163 | pub struct NpyPersistor { 164 | entities: Vec, 165 | occurences: Vec, 166 | array_file_name: String, 167 | array_file: File, 168 | array_write_context: Option, 169 | occurences_buf: Option>, 170 | entities_buf: BufWriter, 171 | } 172 | 173 | impl NpyPersistor { 174 | pub fn new(filename: String, produce_entity_occurrence_count: bool) -> Self { 175 | let entities_filename = format!("{}.entities", &filename); 176 | let entities_buf = BufWriter::new( 177 | File::create(&entities_filename) 178 | .unwrap_or_else(|_| panic!("Unable to create file: {}", &entities_filename)), 179 | ); 180 | 181 | let occurences_filename = format!("{}.occurences", &filename); 182 | let occurences_buf = if produce_entity_occurrence_count { 183 | Some(BufWriter::new( 184 | File::create(&occurences_filename).unwrap_or_else(|_| { 185 | panic!("Unable to create file: {}", &occurences_filename) 186 | }), 187 | )) 188 | } else { 189 | None 190 | }; 191 | 192 | let array_file_name = format!("{}.npy", &filename); 193 | let array_file = File::create(&array_file_name) 194 | .unwrap_or_else(|_| panic!("Unable to create file: {}", &array_file_name)); 195 | 196 | Self { 197 | entities: vec![], 198 | occurences: vec![], 199 | array_file_name, 200 | array_file, 201 | array_write_context: None, 202 | occurences_buf, 203 | entities_buf, 204 | } 205 | } 206 | } 207 | 208 | impl EmbeddingPersistor for NpyPersistor { 209 | fn put_metadata(&mut self, entity_count: u32, dimension: u16) -> Result<(), io::Error> { 210 | write_zeroed_npy::( 211 | &self.array_file, 212 | [entity_count as usize, dimension as usize], 213 | ) 214 | .map_err(|_| Error::new(ErrorKind::Other, "Write zeroed npy error"))?; 215 | self.array_write_context = Some(OwnedMmapArrayViewMut::new(&self.array_file_name)?); 216 | Ok(()) 217 | } 218 | 219 | fn put_data( 220 | &mut self, 221 | entity: &str, 222 | occur_count: u32, 223 | vector: Vec, 224 | ) -> Result<(), io::Error> { 225 | let array = &mut self 226 | .array_write_context 227 | .as_mut() 228 | .expect("Should be defined. Was put_metadata not called?") 229 | .data_view(); 230 | 231 | array 232 | .slice_mut(s![self.entities.len(), ..]) 233 | .assign(&Array::from(vector)); 234 | self.entities.push(entity.to_owned()); 235 | self.occurences.push(occur_count); 236 | Ok(()) 237 | } 238 | 239 | fn finish(&mut self) -> Result<(), io::Error> { 240 | use ndarray_npy::WriteNpyExt; 241 | 242 | serde_json::to_writer_pretty(&mut self.entities_buf, &self.entities)?; 243 | 244 | if let Some(occurences_buf) = self.occurences_buf.as_mut() { 245 | let occur = ndarray::ArrayView1::from(&self.occurences); 246 | occur.write_npy(occurences_buf).map_err(|e| { 247 | Error::new( 248 | ErrorKind::Other, 249 | format!("Could not save occurences: {}", e), 250 | ) 251 | })?; 252 | } 253 | 254 | Ok(()) 255 | } 256 | } 257 | } 258 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | ![Cleora logo](files/images/cleora.png) 4 | 5 |

6 | 7 | ## Achievements 8 | 9 | :one:st place at [SIGIR eCom Challenge 2020](https://sigir-ecom.github.io/ecom20DCPapers/SIGIR_eCom20_DC_paper_1.pdf) 10 | 11 | :two:nd place and Best Paper Award at [WSDM Booking.com Challenge 2021](http://ceur-ws.org/Vol-2855/challenge_short_3.pdf) 12 | 13 | :two:nd place at [Twitter Recsys Challenge 2021](https://recsys-twitter.com/competition_leaderboard/latest) 14 | 15 | :three:rd place at [KDD Cup 2021](https://ogb.stanford.edu/paper/kddcup2021/mag240m_SyneriseAI.pdf) 16 | 17 | 18 | # Cleora 19 | 20 | _**Cleora** is a genus of moths in the family **Geometridae**. Their scientific name derives from the Ancient Greek geo γῆ or γαῖα "the earth", and metron μέτρον "measure" in reference to the way their larvae, or "inchworms", appear to "**measure the earth**" as they move along in a looping fashion._ 21 | 22 | Cleora is a general-purpose model for efficient, scalable learning of stable and inductive entity embeddings for heterogeneous relational data. 23 | 24 | # Introducing Cleora 2.0.0 - Python native 25 | 26 | **Installation** 27 | ``` 28 | pip install pycleora 29 | ``` 30 | 31 | **Build instructions** 32 | ``` 33 | # prepare python env 34 | pip install maturin 35 | 36 | # Install pycleora in current env (meant for development) 37 | maturin develop 38 | 39 | # Usage example below. More examples in examples/ folder. 40 | ``` 41 | ## Changelog 42 | 43 | **Cleora** is now available as a Python package `pycleora`. Key improvements compared to the previous version: 44 | * _performance optimizations_: ~10x faster embedding times 45 | * _performance optimizations_: significantly reduced memory usage 46 | * _latest research_: improved embedding quality 47 | * _new feature_: can create graphs from a Python `iterators` in addition to `tsv` files 48 | * _new feature_: seamless integration with `NumPy` 49 | * _new feature_: item attributes support via custom embeddings initialization 50 | * _new feature_: adjustable vector projection / normalization after each propagation step 51 | 52 | **Breaking changes:** 53 | * _transient_ modifier not supported any more - creating `complex::reflexive` columns for hypergraph embeddings, _grouped by_ the transient entity gives better results. 54 | 55 | 56 | # Usage example: 57 | 58 | ``` 59 | from pycleora import SparseMatrix 60 | import numpy as np 61 | import pandas as pd 62 | import random 63 | 64 | # Generate example data 65 | customers = [f"Customer_{i}" for i in range(1, 20)] 66 | products = [f"Product_{j}" for j in range(1, 20)] 67 | 68 | data = { 69 | "customer": random.choices(customers, k=100), 70 | "product": random.choices(products, k=100), 71 | } 72 | 73 | # Create DataFrame 74 | df = pd.DataFrame(data) 75 | 76 | # Create hyperedges 77 | customer_products = df.groupby('customer')['product'].apply(list).values 78 | 79 | # Convert to Cleora input format 80 | cleora_input = map(lambda x: ' '.join(x), customer_products) 81 | 82 | # Create Markov transition matrix for the hypergraph 83 | mat = SparseMatrix.from_iterator(cleora_input, columns='complex::reflexive::product') 84 | 85 | # Look at entity ids in the matrix, corresponding to embedding vectors 86 | print(mat.entity_ids) 87 | # ['Product_5', 'Product_3', 'Product_2', 'Product_4', 'Product_1'] 88 | 89 | # Initialize embedding vectors externally, using text, image, random vectors 90 | # embeddings = ... 
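# A minimal sketch of external initialization (commented out): `load_features_for` and the
# projection below are hypothetical stand-ins, not part of pycleora. Feature rows must be
# aligned with mat.entity_ids, and the result should be a float32 array.
# features = load_features_for(mat.entity_ids)              # e.g. text/image/numeric features, shape (n_entities, 256)
# projection = np.random.randn(256, 1024) / np.sqrt(256)    # random projection up to the target dimension
# embeddings = (features @ projection).astype(np.float32)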
 91 | 92 | # Or use built-in random deterministic initialization 93 | embeddings = mat.initialize_deterministically(1024) 94 | 95 | # Perform Markov random walk, then normalize however many times we want 96 | 97 | NUM_WALKS = 3 # The optimal number depends on the graph, typically between 3 and 7 yields good results 98 | # lower values tend to capture co-occurrence, higher iterations capture substitutability in a context 99 | 100 | for i in range(NUM_WALKS): 101 | # Can propagate with a symmetric matrix as well, but left Markov is a great default 102 | embeddings = mat.left_markov_propagate(embeddings) 103 | # Normalize with L2 norm by default, for the embeddings to reside on a hypersphere. Can use standardization instead. 104 | embeddings /= np.linalg.norm(embeddings, ord=2, axis=-1, keepdims=True) 105 | 106 | # We're done, here are our embeddings 107 | 108 | for entity, embedding in zip(mat.entity_ids, embeddings): 109 | print(entity, embedding) 110 | 111 | # We can now compare our embeddings with dot product (since they are L2 normalized) 112 | 113 | print(np.dot(embeddings[0], embeddings[1])) 114 | print(np.dot(embeddings[0], embeddings[2])) 115 | print(np.dot(embeddings[0], embeddings[3])) 116 | ``` 117 | # FAQ 118 | 119 | **Q: What should I embed?** 120 | 121 | A: Any entities that interact with each other, co-occur or can be said to be present together in a given context. Examples can include: products in a shopping basket, locations frequented by the same people at similar times, employees collaborating together, chemical molecules being present in specific circumstances, proteins produced by the same bacteria, drug interactions, co-authors of the same academic papers, companies occurring together in the same LinkedIn profiles. 122 | 123 | **Q: How should I construct the input?** 124 | 125 | A: What works best is grouping entities co-occurring in a similar context and feeding them in as whitespace-separated lines, using the `complex::reflexive` modifier. E.g. if you have product data, you can group the products by shopping baskets or by users. If you have URLs, you can group them by browser sessions, or by (user, time window) pairs. Check out the usage example above. Grouping products by customers is just one possibility. 126 | 127 | **Q: Can I embed users and products simultaneously, to compare them with cosine similarity?** 128 | 129 | A: No, this is a methodologically wrong approach, stemming from outdated matrix factorization approaches. What you should do is come up with good product embeddings first, then create user embeddings from them. Feeding two columns, e.g. `user product`, into Cleora will result in a bipartite graph. Similar products will be close to each other, similar users will be close to each other, but users and products will not necessarily be similar to each other. 130 | 131 | **Q: What embedding dimensionality should I use?** 132 | 133 | A: The more, the better, but we typically work with _1024_ to _4096_ dimensions. Memory is cheap and machines are powerful, so don't skimp on embedding size. 134 | 135 | **Q: How many iterations of Markov propagation should I use?** 136 | 137 | A: It depends on what you want to achieve. Low iteration counts (3) tend to approximate the co-occurrence matrix, while high iteration counts (7+) tend to give contextual similarity (think skip-gram, but much more accurate and faster). 138 | 139 | **Q: How do I incorporate external information, e.g. 
entity metadata, images, texts into the embeddings?** 140 | 141 | A: Just initialize the embedding matrix with your own vectors coming from a ViT, sentence-transformers, or a random projection of your numeric features. In that scenario low numbers of Markov iterations (1 to 3) tend to work best. 142 | 143 | **Q: My embeddings don't fit in memory, what do I do?** 144 | 145 | A: Cleora operates on dimensions independently. Initialize your embeddings with a smaller number of dimensions, run Cleora, persist to disk, then repeat. You can concatenate the resulting embedding vectors afterwards, but remember to normalize them again after concatenation! 146 | 147 | **Q: Is there a minimum number of entity occurrences?** 148 | 149 | A: No, an entity `A` co-occurring just once with some other entity `B` will get a proper embedding, i.e. `B` will be the most similar to `A`. The other way around, `A` will be highly ranked among the nearest neighbors of `B`, which may or may not be desirable, depending on your use case. Feel free to prune your input to Cleora to eliminate low-frequency items. 150 | 151 | **Q: Are there any edge cases where Cleora can fail?** 152 | 153 | A: Cleora works best for relatively sparse hypergraphs. If all your hyperedges contain some very common entity `X`, e.g. a _shopping bag_, then it will degrade the quality of embeddings by degenerating shortest paths in the random walk. It is good practice to remove such entities from the hypergraph. 154 | 155 | **Q: How can Cleora be so fast and accurate at the same time?** 156 | 157 | A: Not using negative sampling is a great boon. By constructing the (sparse) Markov transition matrix, Cleora explicitly performs all possible random walks in a hypergraph in one big step (a single matrix multiplication). That's what we call a single _iteration_. We perform 3+ such iterations. Thanks to a highly efficient implementation in Rust, with special care for concurrency, memory layout and cache coherence, it is blazingly fast. Negative sampling or randomly selected random walks tend to introduce a lot of noise - Cleora is free of those burdens. 158 | 159 | # Science 160 | 161 | **Read the whitepaper ["Cleora: A Simple, Strong and Scalable Graph Embedding Scheme"](https://arxiv.org/abs/2102.02302)** 162 | 163 | Cleora embeds entities in *n-dimensional spherical spaces* utilizing extremely fast, stable, iterative random projections, which allows for unparalleled performance and scalability. 164 | 165 | Types of data which can be embedded include, for example: 166 | - heterogeneous undirected graphs 167 | - heterogeneous undirected hypergraphs 168 | - text and other categorical array data 169 | - any combination of the above 170 | 171 | **!!! 
Disclaimer: the numbers below are for Cleora 1.x; the new version is significantly faster, but we have yet to re-run the benchmarks** 172 | 173 | Key competitive advantages of Cleora: 174 | * more than **197x faster than DeepWalk** 175 | * **~4x-8x faster than [PyTorch-BigGraph](https://ai.facebook.com/blog/open-sourcing-pytorch-biggraph-for-faster-embeddings-of-extremely-large-graphs/)** (depends on use case) 176 | * star expansion, clique expansion, and no expansion support for hypergraphs 177 | * **quality of results outperforming or competitive** with other embedding frameworks like [PyTorch-BigGraph](https://ai.facebook.com/blog/open-sourcing-pytorch-biggraph-for-faster-embeddings-of-extremely-large-graphs/), GOSH, DeepWalk, LINE 178 | * can embed extremely large graphs & hypergraphs on a single machine 179 | 180 | Embedding times - example: 181 | 
| Algorithm | FB dataset | RoadNet dataset | LiveJournal dataset |
|---|---|---|---|
| Cleora | 00:00:43 h | 00:21:59 h | 01:31:42 h |
| PyTorch-BigGraph | 00:04.33 h | 00:31:11 h | 07:10:00 h |

Link Prediction results - example:

| Algorithm | FB dataset MRR | FB dataset HitRate@10 | RoadNet dataset MRR | RoadNet dataset HitRate@10 | LiveJournal dataset MRR | LiveJournal dataset HitRate@10 |
|---|---|---|---|---|---|---|
| Cleora | 0.072 | 0.172 | 0.929 | 0.942 | 0.586 | 0.627 |
| PyTorch-BigGraph | 0.035 | 0.072 | 0.850 | 0.866 | 0.565 | 0.672 |
253 | 254 | ## Cleora design principles 255 | Cleora is built as a multi-purpose "just embed it" tool, suitable for many different data types and formats. 256 | 257 | Cleora ingests a relational table of rows representing a typed and undirected heterogeneous hypergraph, which can contain multiple: 258 | - typed categorical columns 259 | - typed categorical array columns 260 | 261 | For example a relational table representing shopping baskets may have the following columns: 262 | 263 | user <\t> product <\t> store 264 | 265 | With the input file containing values: 266 | 267 | user_id <\t> product_id product_id product_id <\t> store_id 268 | 269 | Every column has a type, which is used to determine whether spaces of identifiers between different columns are shared or distinct. It is possible for two columns to share a type, which is the case for homogeneous graphs: 270 | 271 | user <\t> user 272 | 273 | Based on the column format specification, Cleora performs: 274 | - Star decomposition of hyper-edges 275 | - Creation of pairwise graphs for all pairs of entity types 276 | - Embedding of each graph 277 | 278 | The final output of Cleora consists of multiple files for each (undirected) pair of entity types in the table. 279 | 280 | Those embeddings can then be utilized in a novel way thanks to their dim-wise independence property, which is described further below. 281 | 282 | ## Key technical features of Cleora embeddings 283 | The embeddings produced by Cleora are different from those produced by Node2vec, Word2vec, DeepWalk or other systems in this class by a number of key properties: 284 | - **efficiency** - Cleora is two orders of magnitude faster than Node2Vec or DeepWalk 285 | - **inductivity** - as Cleora embeddings of an entity are defined only by interactions with other entities, vectors for new entities can be computed on-the-fly 286 | - **updatability** - refreshing a Cleora embedding for an entity is a very fast operation allowing for real-time updates without retraining 287 | - **stability** - all starting vectors for entities are deterministic, which means that Cleora embeddings on similar datasets will end up being similar. Methods like Word2vec, Node2vec or DeepWalk return different results with every run. 288 | - **cross-dataset compositionality** - thanks to stability of Cleora embeddings, embeddings of the same entity on multiple datasets can be combined by averaging, yielding meaningful vectors 289 | - **dim-wise independence** - thanks to the process producing Cleora embeddings, every dimension is independent of others. This property allows for efficient and low-parameter method for combining multi-view embeddings with Conv1d layers. 290 | - **extreme parallelism and performance** - Cleora is written in Rust utilizing thread-level parallelism for all calculations except input file loading. In practice this means that the embedding process is often faster than loading the input data. 
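The dim-wise independence property also has a practical consequence mentioned in the FAQ above: embeddings can be computed one slice of dimensions at a time and concatenated afterwards. Below is a minimal sketch of that workflow, not an official recipe; the toy hyperedges, chunk sizes and walk count are arbitrary assumptions, and a different seed is used per chunk so that the slices are not identical.

```
from pycleora import SparseMatrix
import numpy as np

hyperedges = ["p1 p2 p3", "p2 p3 p4", "p1 p4"]  # toy whitespace-separated hyperedges (assumption)
mat = SparseMatrix.from_iterator(iter(hyperedges), columns='complex::reflexive::product')

NUM_WALKS = 3
TOTAL_DIM = 2048
CHUNK_DIM = 512  # each slice of dimensions is embedded independently

chunks = []
for chunk_ix in range(TOTAL_DIM // CHUNK_DIM):
    # Deterministic initialization; vary the seed so the chunks differ
    emb = mat.initialize_deterministically(CHUNK_DIM, seed=chunk_ix)
    for _ in range(NUM_WALKS):
        emb = mat.left_markov_propagate(emb)
        emb /= np.linalg.norm(emb, ord=2, axis=-1, keepdims=True)
    chunks.append(emb)  # in a memory-constrained setting, persist each chunk to disk instead

# Concatenate the independently computed slices and normalize again
embeddings = np.concatenate(chunks, axis=-1)
embeddings /= np.linalg.norm(embeddings, ord=2, axis=-1, keepdims=True)
```
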
291 | 292 | ## Key usability features of Cleora embeddings 293 | 294 | The technical properties described above imply good production-readiness of Cleora, which from the end-user perspective can be summarized as follows: 295 | - heterogeneous relational tables can be embedded without any artificial data pre-processing 296 | - mixed interaction + text datasets can be embedded with ease 297 | - cold start problem for new entities is non-existent 298 | - real-time updates of the embeddings do not require any separate solutions 299 | - multi-view embeddings work out of the box 300 | - temporal, incremental embeddings are stable out of the box, with no need for re-alignment, rotations or other methods 301 | - extremely large datasets are supported and can be embedded within seconds / minutes 302 | 303 | ## Documentation 304 | 305 | **!!! Disclaimer the documentation below is for Cleora 1.x, to be updated for 2.x** 306 | 307 | More information can be found in [the full documentation](https://cleora.readthedocs.io/). 308 | 309 | For details contact us at cleora@synerise.com 310 | 311 | ## Cite 312 | 313 | Please cite [our paper](https://arxiv.org/abs/2102.02302) (and the respective papers of the methods used) if you use this code in your own work: 314 | 315 | ``` 316 | @article{DBLP:journals/corr/abs-2102-02302, 317 | author = {Barbara Rychalska, Piotr Babel, Konrad Goluchowski, Andrzej Michalowski, Jacek Dabrowski}, 318 | title = {Cleora: {A} Simple, Strong and Scalable Graph Embedding Scheme}, 319 | journal = {CoRR}, 320 | year = {2021} 321 | } 322 | ``` 323 | 324 | ## License 325 | 326 | Synerise Cleora is MIT licensed, as found in the [LICENSE](LICENSE) file. 327 | 328 | 329 | ## How to Contribute 330 | 331 | Pull requests are welcome. 332 | -------------------------------------------------------------------------------- /src/sparse_matrix_builder.rs: -------------------------------------------------------------------------------- 1 | use std::cell::RefCell; 2 | use std::cmp::Reverse; 3 | use std::collections::HashMap; 4 | use std::hash::BuildHasherDefault; 5 | use std::ptr; 6 | use std::sync::atomic::{AtomicUsize, Ordering}; 7 | 8 | use dashmap::DashMap; 9 | use itertools::Itertools; 10 | use rayon::iter::IntoParallelIterator; 11 | use rayon::iter::IntoParallelRefIterator; 12 | use rayon::iter::ParallelDrainFull; 13 | use rayon::iter::ParallelIterator; 14 | use rayon::prelude::ParallelSliceMut; 15 | use rayon::ThreadPoolBuilder; 16 | use rustc_hash::FxHasher; 17 | use smallvec::SmallVec; 18 | 19 | use crate::entity::{Hyperedge, SMALL_VECTOR_SIZE}; 20 | use crate::sparse_matrix::{Edge, Entity, SparseMatrix, SparseMatrixDescriptor}; 21 | 22 | #[derive(Debug, Default)] 23 | struct Row { 24 | occurrence: u32, 25 | row_sum: f32, 26 | } 27 | 28 | /// Data locality plays huge role in propagation phase performance 29 | /// We want connected nodes to have similar indices, as they will get updated together. 30 | /// NodeIndexer assigns successive indices to nodes connected via hyper-edges. 31 | /// Such ordering yields significant performance boost in propagation phase. 
32 | #[derive(Debug, Default)] 33 | pub struct NodeIndexer { 34 | pub key_2_index: HashMap>, 35 | pub index_2_key: Vec, 36 | pub index_2_entity_id: Vec, 37 | pub index_2_column_id: Vec, 38 | } 39 | 40 | pub trait NodeIndexerBuilder { 41 | fn process(&self, key: u64, entity_id: &str, column_id: u8); 42 | fn finish(self) -> NodeIndexer; 43 | } 44 | 45 | #[derive(Debug)] 46 | pub struct SyncNodeIndexerBuilder { 47 | node_indexer: RefCell, 48 | } 49 | 50 | impl Default for SyncNodeIndexerBuilder { 51 | fn default() -> Self { 52 | SyncNodeIndexerBuilder { 53 | node_indexer: RefCell::new(NodeIndexer { 54 | key_2_index: Default::default(), 55 | index_2_key: vec![], 56 | index_2_column_id: vec![], 57 | index_2_entity_id: vec![], 58 | }), 59 | } 60 | } 61 | } 62 | 63 | impl NodeIndexerBuilder for SyncNodeIndexerBuilder { 64 | fn process(&self, key: u64, entity_id: &str, column_id: u8) { 65 | let mut node_indexer = self.node_indexer.borrow_mut(); 66 | 67 | if node_indexer.key_2_index.contains_key(&key) { 68 | return; 69 | } 70 | let index = node_indexer.key_2_index.len(); 71 | node_indexer.key_2_index.insert(key, index); 72 | node_indexer.index_2_key.push(key); 73 | node_indexer.index_2_entity_id.push(entity_id.to_string()); 74 | node_indexer.index_2_column_id.push(column_id); 75 | } 76 | 77 | fn finish(self) -> NodeIndexer { 78 | self.node_indexer.into_inner() 79 | } 80 | } 81 | 82 | #[derive(Debug)] 83 | pub struct IndexedEntity { 84 | index: usize, 85 | id: String, 86 | column_id: u8, 87 | } 88 | 89 | #[derive(Debug, Default)] 90 | pub struct AsyncNodeIndexerBuilder { 91 | key_2_entity: DashMap>, 92 | next_index: AtomicUsize, 93 | } 94 | 95 | impl NodeIndexerBuilder for AsyncNodeIndexerBuilder { 96 | fn process(&self, key: u64, entity_id: &str, column_id: u8) { 97 | self.key_2_entity.entry(key).or_insert_with(|| { 98 | let index = self.next_index.fetch_add(1, Ordering::Relaxed); 99 | let id = entity_id.to_string(); 100 | IndexedEntity { 101 | index, 102 | id, 103 | column_id, 104 | } 105 | }); 106 | } 107 | 108 | fn finish(self) -> NodeIndexer { 109 | // Thin wrappers over pointer to make it Send/Sync 110 | // https://stackoverflow.com/a/70848420 111 | 112 | #[derive(Copy, Clone)] 113 | struct PointerU64(*mut u64); 114 | unsafe impl Send for PointerU64 {} 115 | unsafe impl Sync for PointerU64 {} 116 | 117 | #[derive(Copy, Clone)] 118 | struct PointerString(*mut String); 119 | unsafe impl Send for PointerString {} 120 | unsafe impl Sync for PointerString {} 121 | 122 | #[derive(Copy, Clone)] 123 | struct PointerU8(*mut u8); 124 | unsafe impl Send for PointerU8 {} 125 | unsafe impl Sync for PointerU8 {} 126 | 127 | let numel = self.next_index.into_inner(); 128 | let mut index_2_key: Vec = vec![0; numel]; 129 | let mut index_2_entity_id = vec![String::new(); numel]; 130 | let mut index_2_column_id = vec![0; numel]; 131 | 132 | let index_2_key_ptr = PointerU64(index_2_key.as_mut_ptr()); 133 | let index_2_entity_id_ptr = PointerString(index_2_entity_id.as_mut_ptr()); 134 | let index_2_column_id_ptr = PointerU8(index_2_column_id.as_mut_ptr()); 135 | 136 | let key_2_index = self 137 | .key_2_entity 138 | .into_par_iter() 139 | .map(|(key, indexed_entity)| { 140 | let IndexedEntity { 141 | index, 142 | id: entity_id, 143 | column_id, 144 | } = indexed_entity; 145 | unsafe { 146 | ptr::write(index_2_key_ptr.0.add(index), key); 147 | ptr::write(index_2_entity_id_ptr.0.add(index), entity_id); 148 | ptr::write(index_2_column_id_ptr.0.add(index), column_id); 149 | } 150 | (key, index) 151 | }) 152 | 
.collect(); 153 | 154 | NodeIndexer { 155 | key_2_index, 156 | index_2_key, 157 | index_2_entity_id, 158 | index_2_column_id, 159 | } 160 | } 161 | } 162 | 163 | impl SparseMatrixDescriptor { 164 | pub fn new(col_a_id: u8, col_a_name: String, col_b_id: u8, col_b_name: String) -> Self { 165 | Self { 166 | col_a_id, 167 | col_a_name, 168 | col_b_id, 169 | col_b_name, 170 | } 171 | } 172 | 173 | pub fn make_buffer(&self, hyperedge_trim_n: usize) -> SparseMatrixBuffer { 174 | SparseMatrixBuffer { 175 | descriptor: self.clone(), 176 | edge_count: 0, 177 | hash_2_row: Default::default(), 178 | hashes_2_edge: Default::default(), 179 | hyperedge_trim_n, 180 | } 181 | } 182 | } 183 | 184 | #[derive(Debug)] 185 | pub struct SparseMatrixBuffer { 186 | pub descriptor: SparseMatrixDescriptor, 187 | pub edge_count: u32, 188 | hash_2_row: HashMap>, 189 | hashes_2_edge: HashMap<(u64, u64), f32, BuildHasherDefault>, 190 | hyperedge_trim_n: usize, 191 | } 192 | 193 | impl SparseMatrixBuffer { 194 | pub fn handle_hyperedge(&mut self, hyperedge: &Hyperedge) { 195 | let SparseMatrixDescriptor { 196 | col_a_id, col_b_id, .. 197 | } = self.descriptor; 198 | let total_combinations = hyperedge.edges_num(col_a_id, col_b_id) as u32; 199 | 200 | let mut nodes_a = hyperedge.nodes(col_a_id as usize); 201 | let mut nodes_b = hyperedge.nodes(col_b_id as usize); 202 | 203 | for hash in &nodes_a { 204 | self.update_row(*hash, nodes_b.len() as u32); 205 | } 206 | for hash in &nodes_b { 207 | self.update_row(*hash, nodes_a.len() as u32); 208 | } 209 | 210 | let value = 1f32 / (total_combinations as f32); 211 | 212 | let (nodes_a_high, nodes_a_low) = self.get_high_low_nodes(&mut nodes_a); 213 | let (nodes_b_high, nodes_b_low) = self.get_high_low_nodes(&mut nodes_b); 214 | self.handle_combinations(nodes_a_high, nodes_b_high, value); 215 | self.handle_combinations(nodes_a_high, nodes_b_low, value); 216 | self.handle_combinations(nodes_a_low, nodes_b_high, value); 217 | // Ignore 'low-to-low' combinations 218 | } 219 | 220 | fn get_high_low_nodes<'a>( 221 | &self, 222 | nodes: &'a mut SmallVec<[u64; SMALL_VECTOR_SIZE]>, 223 | ) -> (&'a [u64], &'a [u64]) { 224 | if nodes.len() > self.hyperedge_trim_n { 225 | nodes.select_nth_unstable_by_key(self.hyperedge_trim_n, |h| { 226 | Reverse(self.hash_2_row.get(h).map_or(0, |r| r.occurrence)) 227 | }); 228 | nodes.split_at(self.hyperedge_trim_n) 229 | } else { 230 | (nodes, &[]) 231 | } 232 | } 233 | 234 | fn handle_combinations(&mut self, a_hashes: &[u64], b_hashes: &[u64], value: f32) { 235 | for a_hash in a_hashes { 236 | for b_hash in b_hashes { 237 | self.add_pair_symmetric(*a_hash, *b_hash, value); 238 | } 239 | } 240 | } 241 | 242 | /// It creates sparse matrix for two columns in the incoming data. 243 | /// Let's say that we have such columns: 244 | /// customers | products | brands 245 | /// incoming data: 246 | /// userId1 | productId1, productId2 | brandId1, brandId2 247 | /// userId2 | productId1 | brandId3, brandId4, brandId5 248 | /// etc. 249 | /// One of the sparse matrices could represent customers and products relation (products and brands relation, customers and brands relation). 250 | /// This sparse matrix (customers and products relation) handles every combination in these columns according to 251 | /// total combinations in a row. 
252 | /// The first row in the incoming data produces two combinations according to 4 total combinations: 253 | /// userId1, productId1 and userId1, productId2 254 | /// The second row produces one combination userId2, productId1 according to 3 total combinations. 255 | /// `a_hash` - hash of a entity for a column A 256 | /// `b_hash` - hash of a entity for a column B 257 | /// `count` - total number of combinations in a row 258 | fn add_pair_symmetric(&mut self, a_hash: u64, b_hash: u64, value: f32) { 259 | self.edge_count += 1; 260 | self.update_edge(a_hash, b_hash, value); 261 | self.update_edge(b_hash, a_hash, value); 262 | } 263 | 264 | fn update_row(&mut self, hash: u64, count: u32) { 265 | let val = 1f32 / (count as f32); 266 | let e = self.hash_2_row.entry(hash).or_default(); 267 | e.occurrence += count; 268 | e.row_sum += val 269 | } 270 | 271 | fn update_edge(&mut self, a_hash: u64, b_hash: u64, val: f32) { 272 | let e = self.hashes_2_edge.entry((a_hash, b_hash)).or_default(); 273 | *e += val; 274 | } 275 | } 276 | 277 | #[derive(Debug)] 278 | pub struct SparseMatrixBuffersReducer { 279 | descriptor: SparseMatrixDescriptor, 280 | buffers: Vec, 281 | node_indexer: NodeIndexer, 282 | num_workers: usize, 283 | } 284 | 285 | pub struct EdgeEntry { 286 | pub row: u32, 287 | pub col: u32, 288 | pub value: f32, 289 | } 290 | 291 | impl SparseMatrixBuffersReducer { 292 | pub fn new( 293 | node_indexer: NodeIndexer, 294 | buffers: Vec, 295 | num_workers: usize, 296 | ) -> Self { 297 | if buffers.is_empty() { 298 | panic!("Cannot reduce 0 buffers") 299 | } 300 | 301 | let descriptor = buffers[0].descriptor.clone(); 302 | for buffer in &buffers { 303 | if descriptor != buffer.descriptor { 304 | panic!("Can only reduce buffers with the same sparse matrix description") 305 | } 306 | } 307 | 308 | Self { 309 | descriptor, 310 | buffers, 311 | node_indexer, 312 | num_workers, 313 | } 314 | } 315 | 316 | pub fn reduce(self) -> SparseMatrix { 317 | ThreadPoolBuilder::new() 318 | .num_threads(self.num_workers) 319 | .build() 320 | .unwrap() 321 | .install(|| { 322 | let node_indexer = self.node_indexer; 323 | 324 | // Extract buffers so their fields can be moved to reducing functions 325 | let (hash_2_row_maps, hashes_2_edge_map): (Vec<_>, Vec<_>) = self 326 | .buffers 327 | .into_iter() 328 | .map(|b| (b.hash_2_row, b.hashes_2_edge)) 329 | .unzip(); 330 | let entities = 331 | SparseMatrixBuffersReducer::reduce_to_entities(&node_indexer, hash_2_row_maps); 332 | let mut edges: Vec<_> = 333 | SparseMatrixBuffersReducer::reduce_to_edges(&node_indexer, hashes_2_edge_map); 334 | edges.par_sort_by_key(|entry| (entry.row, entry.col)); 335 | 336 | let slices: Vec<_> = edges 337 | .iter() 338 | .enumerate() 339 | .group_by(|(_, entry)| entry.row) 340 | .into_iter() 341 | .map(|(_, mut group)| { 342 | let first = group.next().expect("Group have at least one element"); 343 | let last = group.last().unwrap_or(first); 344 | (first.0, last.0 + 1) 345 | }) 346 | .collect(); 347 | 348 | let mut edges: Vec<_> = edges 349 | .into_par_iter() 350 | .map(|entry| Edge { 351 | other_entity_ix: entry.col, 352 | // use this field for different purpose to avoid reallocation 353 | left_markov_value: entry.value, 354 | symmetric_markov_value: 0.0, 355 | }) 356 | .collect(); 357 | 358 | slices 359 | .iter() 360 | .enumerate() 361 | .for_each(|(row_ix, (start_ix, end_ix))| { 362 | let row_sum = entities[row_ix].row_sum; 363 | let slice = &mut edges[(*start_ix)..(*end_ix)]; 364 | slice.iter_mut().for_each(|edge| { 365 | let 
value = edge.left_markov_value; 366 | 367 | let left_markov_normalization = row_sum; 368 | let symmetric_markov_normalization = { 369 | let col_sum = entities[edge.other_entity_ix as usize].row_sum; 370 | (row_sum * col_sum).sqrt() 371 | }; 372 | edge.left_markov_value = value / left_markov_normalization; 373 | edge.symmetric_markov_value = value / symmetric_markov_normalization; 374 | }) 375 | }); 376 | 377 | SparseMatrix { 378 | descriptor: self.descriptor, 379 | entity_ids: node_indexer.index_2_entity_id, 380 | column_ids: node_indexer.index_2_column_id, 381 | entities, 382 | edges, 383 | slices, 384 | } 385 | }) 386 | } 387 | 388 | fn reduce_to_entities( 389 | node_indexer: &NodeIndexer, 390 | entity_maps: Vec>>, 391 | ) -> Vec { 392 | node_indexer 393 | .index_2_key 394 | .par_iter() 395 | .map(|hash| { 396 | let mut entity_agg = Entity { row_sum: 0.0 }; 397 | for entity_map in entity_maps.iter() { 398 | if let Some(entity) = entity_map.get(hash) { 399 | entity_agg.row_sum += entity.row_sum; 400 | } 401 | } 402 | entity_agg 403 | }) 404 | .collect() 405 | } 406 | 407 | fn reduce_to_edges( 408 | node_indexer: &NodeIndexer, 409 | edge_maps: Vec>>, 410 | ) -> Vec { 411 | // Dashmap to have concurrent write access with par_drain 412 | // par_drain is recommended to not increase peak memory usage 413 | let reduced_edge_map: DashMap<(u64, u64), f32, BuildHasherDefault> = 414 | Default::default(); 415 | for mut edge_map in edge_maps.into_iter() { 416 | edge_map.par_drain().for_each(|(k, v)| { 417 | reduced_edge_map 418 | .entry(k) 419 | .and_modify(|rv| *rv += v) 420 | .or_insert(v); 421 | }) 422 | } 423 | reduced_edge_map 424 | .into_par_iter() 425 | .map(|((row_hash, col_hash), value)| { 426 | let row = *node_indexer 427 | .key_2_index 428 | .get(&row_hash) 429 | .expect("Hash value was indexed") as u32; 430 | let col = *node_indexer 431 | .key_2_index 432 | .get(&col_hash) 433 | .expect("Hash value was indexed") as u32; 434 | EdgeEntry { row, col, value } 435 | }) 436 | .collect() 437 | } 438 | } 439 | -------------------------------------------------------------------------------- /legacy/example_classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 145, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "from sklearn.model_selection import train_test_split\n", 11 | "from sklearn.linear_model import SGDClassifier\n", 12 | "from sklearn.utils import shuffle\n", 13 | "from tqdm import tqdm\n", 14 | "import pickle as pkl\n", 15 | "import pandas as pd\n", 16 | "import random\n", 17 | "import sys\n", 18 | "import os\n", 19 | "from sklearn.metrics import f1_score" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 146, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "random.seed(0)\n", 29 | "np.random.seed(0)" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 147, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "config = {\n", 39 | " #embedding computation\n", 40 | " 'cleora_n_iter': 5,\n", 41 | " 'cleora_dim': 1024,\n", 42 | " \n", 43 | " #dataset preparation\n", 44 | " 'train_test_split': 0.2,\n", 45 | " \n", 46 | " #training classification\n", 47 | " 'input_embeddings': [\n", 48 | " 'output/emb__cluster_id__StarNode.out',\n", 49 | " 'output/emb__CliqueNode__CliqueNode.out',\n", 50 | " ],\n", 51 | " 'batch_size': 256,\n", 52 | " 'test_batch_size': 1000,\n", 53 | 
" 'epochs': [20],\n", 54 | "}" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "# Dataset preparation" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "1. Download the Facebook dataset from SNAP: https://snap.stanford.edu/data/facebook-large-page-page-network.html\n", 69 | "2. Extract the dataset to ./facebook_large/\n", 70 | "3. Compute Cleora embeddings as shown in \"Cleora training\" section in `example_link_prediction.ipynb`" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 148, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "df_cleora = pd.read_csv(\"./facebook_large/musae_facebook_edges.csv\")" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 149, 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "data": { 89 | "text/html": [ 90 | "
\n", 91 | "\n", 104 | "\n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | "
id_1id_2
0018427
1121708
2122208
3122171
416829
\n", 140 | "
" 141 | ], 142 | "text/plain": [ 143 | " id_1 id_2\n", 144 | "0 0 18427\n", 145 | "1 1 21708\n", 146 | "2 1 22208\n", 147 | "3 1 22171\n", 148 | "4 1 6829" 149 | ] 150 | }, 151 | "execution_count": 149, 152 | "metadata": {}, 153 | "output_type": "execute_result" 154 | } 155 | ], 156 | "source": [ 157 | "df_cleora.head()" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 150, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "train_cleora, test_cleora = train_test_split(df_cleora, test_size=config['train_test_split'])" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 151, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "fb_cleora_input_clique_filename = \"fb_cleora_input_clique.txt\"\n", 176 | "fb_cleora_input_star_filename = \"fb_cleora_input_star.txt\"\n", 177 | "output_dir = 'output'" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 152, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "with open(fb_cleora_input_clique_filename, \"w\") as f_cleora_clique, open(fb_cleora_input_star_filename, \"w\") as f_cleora_star:\n", 187 | " grouped_train = train_cleora.groupby('id_1')\n", 188 | " for n, (name, group) in enumerate(grouped_train):\n", 189 | " group_list = group['id_2'].tolist()\n", 190 | " group_elems = list(map(str, group_list))\n", 191 | " f_cleora_clique.write(\"{} {}\\n\".format(name, ' '.join(group_elems)))\n", 192 | " f_cleora_star.write(\"{}\\t{}\\n\".format(n, name))\n", 193 | " for elem in group_elems:\n", 194 | " f_cleora_star.write(\"{}\\t{}\\n\".format(n, elem))" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 153, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "df = pd.read_csv(\"facebook_large/musae_facebook_target.csv\")" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 154, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [ 219 | "classes = df['page_type'].unique()\n", 220 | "class_ids = list(range(0, len(classes)))\n", 221 | "class_dict = {k:v for k,v in zip(classes, class_ids)}\n", 222 | "df['page_type'] = [class_dict[item] for item in df['page_type']] " 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 155, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "train_filename = \"fb_classification_train.txt\"\n", 232 | "test_filename = \"fb_classification_test.txt\"" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 156, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "train, test = train_test_split(df, test_size=config['train_test_split'])" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 157, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "with open(train_filename, \"w\") as f_train:\n", 251 | " for index, row in train.iterrows():\n", 252 | " f_train.write(\"{} {}\\n\".format(row['id'], row['page_type']))" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 158, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "with open(test_filename, \"w\") as f_test:\n", 262 | " for index, row in test.iterrows():\n", 263 | " f_test.write(\"{} {}\\n\".format(row['id'], row['page_type']))" 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | 
"metadata": {}, 269 | "source": [ 270 | "# Cleora training" 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": {}, 276 | "source": [ 277 | "Download an appropriate binary Cleora release from: https://github.com/Synerise/cleora/releases . \n", 278 | "\n", 279 | "A Linux GNU version is assumed in this example, but any other will do." 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 159, 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [ 288 | "import subprocess\n", 289 | "\n", 290 | "\n", 291 | "def columns2output_filename(output_dir, columns):\n", 292 | " columns_split = columns.split()\n", 293 | " if len(columns_split) == 1 and 'reflexive' in columns:\n", 294 | " column_name = columns.split('::')[-1]\n", 295 | " return os.path.join(output_dir, f'emb__{column_name}__{column_name}.out')\n", 296 | "\n", 297 | " column_names = [i.split('::')[-1] for i in columns_split]\n", 298 | " return os.path.join(output_dir, 'emb__' + '__'.join(column_names) + '.out')\n", 299 | "\n", 300 | "\n", 301 | "def train_cleora(dim, n_iter, columns, input_filename, output_dir):\n", 302 | " command = ['./cleora-v1.0.1-x86_64-unknown-linux-gnu',\n", 303 | " '--columns', columns,\n", 304 | " '--dimension', str(dim), \n", 305 | " '-n', str(n_iter), \n", 306 | " '--input', input_filename, \n", 307 | " '-o', output_dir]\n", 308 | " subprocess.run(command, check=True)\n", 309 | " return columns2output_filename(output_dir, columns)" 310 | ] 311 | }, 312 | { 313 | "cell_type": "markdown", 314 | "metadata": {}, 315 | "source": [ 316 | "## Star expansion\n", 317 | "\n", 318 | "In the `fb_cleora_input_star.txt` file the first column is a virtual node. The parameter `-c \"transient::cluster_id node\"` means that embeddings will not be created for nodes from this column. This translates to star expansion scheme." 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 160, 324 | "metadata": {}, 325 | "outputs": [ 326 | { 327 | "name": "stdout", 328 | "output_type": "stream", 329 | "text": [ 330 | "CPU times: user 1.37 ms, sys: 8.1 ms, total: 9.47 ms\n", 331 | "Wall time: 8.59 s\n" 332 | ] 333 | } 334 | ], 335 | "source": [ 336 | "%%time\n", 337 | "cleora_output_star_filename = train_cleora(config['cleora_dim'], config['cleora_n_iter'], \"transient::cluster_id StarNode\", fb_cleora_input_star_filename, output_dir)" 338 | ] 339 | }, 340 | { 341 | "cell_type": "markdown", 342 | "metadata": {}, 343 | "source": [ 344 | "## Clique expansion\n", 345 | "\n", 346 | "The `fb_cleora_input_clique.txt` file has the structure of adjacency list. The parameter `-c \"complex::reflexive::node\"` means that edges will be created for all cominations of nodes from each line. This translates to clique expansion scheme." 
347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 161, 352 | "metadata": {}, 353 | "outputs": [ 354 | { 355 | "name": "stdout", 356 | "output_type": "stream", 357 | "text": [ 358 | "CPU times: user 4.42 ms, sys: 8.34 ms, total: 12.8 ms\n", 359 | "Wall time: 13.7 s\n" 360 | ] 361 | } 362 | ], 363 | "source": [ 364 | "%%time\n", 365 | "cleora_output_clique_filename = train_cleora(config['cleora_dim'], config['cleora_n_iter'], \"complex::reflexive::CliqueNode\", fb_cleora_input_clique_filename, output_dir)" 366 | ] 367 | }, 368 | { 369 | "cell_type": "markdown", 370 | "metadata": {}, 371 | "source": [ 372 | "## No expansion\n", 373 | "\n", 374 | "You can also compute Cleora without any expansion scheme by providing an input file in the edgelist format (single pair of nodes per line). Run with a simple parameter: `-c \"node1 node2\"`." 375 | ] 376 | }, 377 | { 378 | "cell_type": "markdown", 379 | "metadata": {}, 380 | "source": [ 381 | "# Classification\n", 382 | "\n", 383 | "We train a simple multiclass Logistic Regression classifier to predict the class of node based on its embedding. We assess the quality of the classifier with of 2 metrics: micro-F1 and macro-F1." 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 162, 389 | "metadata": {}, 390 | "outputs": [], 391 | "source": [ 392 | "def read_embeddings(input_file):\n", 393 | " df_full = pd.read_csv(input_file, delimiter = \" \", skiprows=[0], header=None, \n", 394 | " index_col=0)\n", 395 | " df_full = df_full.drop([1], axis=1)\n", 396 | "\n", 397 | " return df_full" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": 163, 403 | "metadata": {}, 404 | "outputs": [], 405 | "source": [ 406 | "def read_train_test(embeddings):\n", 407 | " valid_idx = embeddings.index.to_numpy()\n", 408 | " \n", 409 | " train = np.loadtxt(train_filename, delimiter=\" \", dtype=np.int) \n", 410 | " test = np.loadtxt(test_filename, delimiter=\" \", dtype=np.int)\n", 411 | " \n", 412 | " train = train[np.isin(train[:,0], valid_idx) & np.isin(train[:,1], valid_idx)]\n", 413 | " test = [t for t in test if (t[0] in valid_idx) and (t[1] in valid_idx)] \n", 414 | " \n", 415 | " train = np.array(train)\n", 416 | " test = np.array(test)\n", 417 | " \n", 418 | " return train,test" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": 164, 424 | "metadata": {}, 425 | "outputs": [], 426 | "source": [ 427 | "batch_size = config['batch_size']\n", 428 | "test_batch_size = config['test_batch_size']" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": 165, 434 | "metadata": { 435 | "scrolled": true 436 | }, 437 | "outputs": [ 438 | { 439 | "name": "stderr", 440 | "output_type": "stream", 441 | "text": [ 442 | "100%|██████████| 20/20 [00:15<00:00, 1.29it/s]" 443 | ] 444 | }, 445 | { 446 | "name": "stdout", 447 | "output_type": "stream", 448 | "text": [ 449 | "algo: output/emb__cluster_id__StarNode.out epochs: 20, micro f1: 0.9093110871905274, macro f1:0.9094875754311472\n" 450 | ] 451 | }, 452 | { 453 | "name": "stderr", 454 | "output_type": "stream", 455 | "text": [ 456 | "\n", 457 | "100%|██████████| 20/20 [00:15<00:00, 1.33it/s]" 458 | ] 459 | }, 460 | { 461 | "name": "stdout", 462 | "output_type": "stream", 463 | "text": [ 464 | "algo: output/emb__CliqueNode__CliqueNode.out epochs: 20, micro f1: 0.9171151776103337, macro f1:0.9169262311726959\n" 465 | ] 466 | }, 467 | { 468 | "name": "stderr", 469 | "output_type": "stream", 470 | "text": [ 471 | "\n" 
472 | ] 473 | } 474 | ], 475 | "source": [ 476 | "for algo in config['input_embeddings']:\n", 477 | " embeddings = read_embeddings(algo)\n", 478 | " train,test = read_train_test(embeddings)\n", 479 | " \n", 480 | " y_train = train[:, 1]\n", 481 | " y_test = test[:, 1]\n", 482 | "\n", 483 | " clf = SGDClassifier(random_state=0, loss='log', alpha=0.0001)\n", 484 | " for e in tqdm(range(0, max(config['epochs']))):\n", 485 | " for idx in range(0,train.shape[0],batch_size):\n", 486 | " ex=train[idx:min(idx+batch_size,train.shape[0]),:]\n", 487 | "\n", 488 | " ex_emb_in = embeddings.loc[ex[:,0]].to_numpy()\n", 489 | " ex_y = y_train[idx:min(idx+batch_size,train.shape[0])]\n", 490 | " \n", 491 | " clf.partial_fit(ex_emb_in, ex_y, classes=[0,1,2,3])\n", 492 | " \n", 493 | " if e+1 in config['epochs']:\n", 494 | " acc = 0.0\n", 495 | " y_pred = []\n", 496 | " for n, idx in enumerate(range(0,test.shape[0],test_batch_size)):\n", 497 | " ex=test[idx:min(idx+test_batch_size,train.shape[0]),:]\n", 498 | " ex_emb_in = embeddings.loc[ex[:,0]].to_numpy()\n", 499 | " pred = clf.predict_proba(ex_emb_in)\n", 500 | " \n", 501 | " classes = np.argmax(pred, axis=1)\n", 502 | " y_pred.extend(classes)\n", 503 | "\n", 504 | " f1_micro = f1_score(y_test, y_pred, average='micro')\n", 505 | " f1_macro = f1_score(y_test, y_pred, average='macro')\n", 506 | " print('algo: {} epochs: {}, micro f1: {}, macro f1:{}'.format(algo, e+1, f1_micro, f1_macro))\n" 507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "execution_count": null, 512 | "metadata": {}, 513 | "outputs": [], 514 | "source": [] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": null, 519 | "metadata": {}, 520 | "outputs": [], 521 | "source": [] 522 | } 523 | ], 524 | "metadata": { 525 | "kernelspec": { 526 | "display_name": "Python 3", 527 | "language": "python", 528 | "name": "python3" 529 | }, 530 | "language_info": { 531 | "codemirror_mode": { 532 | "name": "ipython", 533 | "version": 3 534 | }, 535 | "file_extension": ".py", 536 | "mimetype": "text/x-python", 537 | "name": "python", 538 | "nbconvert_exporter": "python", 539 | "pygments_lexer": "ipython3", 540 | "version": "3.7.4" 541 | } 542 | }, 543 | "nbformat": 4, 544 | "nbformat_minor": 4 545 | } 546 | -------------------------------------------------------------------------------- /legacy/src/embedding.rs: -------------------------------------------------------------------------------- 1 | use crate::configuration::Configuration; 2 | use crate::persistence::embedding::EmbeddingPersistor; 3 | use crate::persistence::entity::EntityMappingPersistor; 4 | use crate::sparse_matrix::SparseMatrixReader; 5 | use log::{info, warn}; 6 | use memmap::MmapMut; 7 | use rayon::prelude::*; 8 | use std::collections::hash_map::DefaultHasher; 9 | use std::collections::HashSet; 10 | use std::fs; 11 | use std::fs::OpenOptions; 12 | use std::hash::Hasher; 13 | use std::marker::PhantomData; 14 | use std::sync::Arc; 15 | use uuid::Uuid; 16 | 17 | /// Number of broken entities (those with errors during writing to the file) which are logged. 18 | /// There can be much more but log the first few. 19 | const LOGGED_NUMBER_OF_BROKEN_ENTITIES: usize = 20; 20 | 21 | /// Used during matrix initialization. No specific requirement (ca be lower as well). 
22 | const MAX_HASH_I64: i64 = 8 * 1024 * 1024; 23 | const MAX_HASH_F32: f32 = MAX_HASH_I64 as f32; 24 | 25 | /// Wrapper for different types of matrix structures such as 2-dim vectors or memory-mapped files 26 | trait MatrixWrapper { 27 | /// Initializing a matrix with values from its dimensions and the hash values from the sparse matrix 28 | fn init_with_hashes( 29 | rows: usize, 30 | cols: usize, 31 | fixed_random_value: i64, 32 | sparse_matrix_reader: Arc, 33 | ) -> Self; 34 | 35 | /// Returns value for specific coordinates 36 | fn get_value(&self, row: usize, col: usize) -> f32; 37 | 38 | /// Normalizing a matrix by rows sum 39 | fn normalize(&mut self); 40 | 41 | /// Multiplies sparse matrix by the matrix 42 | fn multiply( 43 | sparse_matrix_reader: Arc, 44 | other: Self, 45 | ) -> Self; 46 | } 47 | 48 | /// Two dimensional vectors as matrix representation 49 | struct TwoDimVectorMatrix { 50 | rows: usize, 51 | cols: usize, 52 | matrix: Vec>, 53 | } 54 | 55 | impl MatrixWrapper for TwoDimVectorMatrix { 56 | fn init_with_hashes( 57 | rows: usize, 58 | cols: usize, 59 | fixed_random_value: i64, 60 | sparse_matrix_reader: Arc, 61 | ) -> Self { 62 | let result: Vec> = (0..cols) 63 | .into_par_iter() 64 | .map(|i| { 65 | let mut col: Vec = Vec::with_capacity(rows); 66 | for hsh in sparse_matrix_reader.iter_hashes() { 67 | let col_value = init_value(i, hsh.value, fixed_random_value); 68 | col.push(col_value); 69 | } 70 | col 71 | }) 72 | .collect(); 73 | Self { 74 | rows, 75 | cols, 76 | matrix: result, 77 | } 78 | } 79 | 80 | #[inline] 81 | fn get_value(&self, row: usize, col: usize) -> f32 { 82 | let column: &Vec = self.matrix.get(col).unwrap(); 83 | column[row] 84 | } 85 | 86 | fn normalize(&mut self) { 87 | let mut row_sum = vec![0f32; self.rows]; 88 | 89 | for col in self.matrix.iter() { 90 | for (j, sum) in row_sum.iter_mut().enumerate() { 91 | let value = col[j]; 92 | *sum += value.powi(2) 93 | } 94 | } 95 | 96 | let row_sum = Arc::new(row_sum); 97 | self.matrix.par_iter_mut().for_each(|col| { 98 | for (j, value) in col.iter_mut().enumerate() { 99 | let sum = row_sum[j]; 100 | *value /= sum.sqrt(); 101 | } 102 | }); 103 | } 104 | 105 | fn multiply( 106 | sparse_matrix_reader: Arc, 107 | other: Self, 108 | ) -> Self { 109 | let rnew = zero_2d(other.rows, other.cols); 110 | 111 | let result: Vec> = other 112 | .matrix 113 | .into_par_iter() 114 | .zip(rnew) 115 | .update(|data| { 116 | let (res_col, rnew_col) = data; 117 | for entry in sparse_matrix_reader.iter_entries() { 118 | let elem = rnew_col.get_mut(entry.row as usize).unwrap(); 119 | let value = res_col[entry.col as usize]; 120 | *elem += value * entry.value; 121 | } 122 | }) 123 | .map(|data| data.1) 124 | .collect(); 125 | 126 | Self { 127 | rows: other.rows, 128 | cols: other.cols, 129 | matrix: result, 130 | } 131 | } 132 | } 133 | 134 | fn init_value(col: usize, hsh: u64, fixed_random_value: i64) -> f32 { 135 | ((hash((hsh as i64) + (col as i64) + fixed_random_value) % MAX_HASH_I64) as f32) / MAX_HASH_F32 136 | } 137 | 138 | fn hash(num: i64) -> i64 { 139 | let mut hasher = DefaultHasher::new(); 140 | hasher.write_i64(num); 141 | hasher.finish() as i64 142 | } 143 | 144 | fn zero_2d(row: usize, col: usize) -> Vec> { 145 | let mut res: Vec> = Vec::with_capacity(col); 146 | for _i in 0..col { 147 | let col = vec![0f32; row]; 148 | res.push(col); 149 | } 150 | res 151 | } 152 | 153 | /// Memory-mapped file as matrix representation. Every column of the matrix is placed side by side in the file. 
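// Layout note: the backing file created by `create_mmap` holds rows * cols f32
// values (4 bytes each) in column-major order, so cell (row, col) starts at byte
// offset ((col * rows) + row) * 4, as computed in `get_value` below. The file is
// deleted again in the `Drop` impl once the matrix goes out of scope.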
154 | struct MMapMatrix { 155 | rows: usize, 156 | cols: usize, 157 | file_name: String, 158 | matrix: MmapMut, 159 | } 160 | 161 | impl MatrixWrapper for MMapMatrix { 162 | fn init_with_hashes( 163 | rows: usize, 164 | cols: usize, 165 | fixed_random_value: i64, 166 | sparse_matrix_reader: Arc, 167 | ) -> Self { 168 | let uuid = Uuid::new_v4(); 169 | let file_name = format!("{}_matrix_{}", sparse_matrix_reader.get_id(), uuid); 170 | let mut mmap = create_mmap(rows, cols, file_name.as_str()); 171 | 172 | mmap.par_chunks_mut(rows * 4) 173 | .enumerate() 174 | .for_each(|(i, chunk)| { 175 | // i - number of dimension 176 | // chunk - column/vector of bytes 177 | for (j, hsh) in sparse_matrix_reader.iter_hashes().enumerate() { 178 | let col_value = init_value(i, hsh.value, fixed_random_value); 179 | MMapMatrix::update_column(j, chunk, |value| unsafe { *value = col_value }); 180 | } 181 | }); 182 | 183 | mmap.flush() 184 | .expect("Can't flush memory map modifications to disk"); 185 | 186 | Self { 187 | rows, 188 | cols, 189 | file_name, 190 | matrix: mmap, 191 | } 192 | } 193 | 194 | #[inline] 195 | fn get_value(&self, row: usize, col: usize) -> f32 { 196 | let start_idx = ((col * self.rows) + row) * 4; 197 | let end_idx = start_idx + 4; 198 | let pointer: *const u8 = (&self.matrix[start_idx..end_idx]).as_ptr(); 199 | unsafe { 200 | let value = pointer as *const f32; 201 | *value 202 | } 203 | } 204 | 205 | fn normalize(&mut self) { 206 | let entities_count = self.rows; 207 | let mut row_sum = vec![0f32; entities_count]; 208 | 209 | for i in 0..(self.cols as usize) { 210 | for (j, sum) in row_sum.iter_mut().enumerate() { 211 | let value = self.get_value(j, i); 212 | *sum += value.powi(2) 213 | } 214 | } 215 | 216 | let row_sum = Arc::new(row_sum); 217 | self.matrix 218 | .par_chunks_mut(entities_count * 4) 219 | .enumerate() 220 | .for_each(|(_i, chunk)| { 221 | // i - number of dimension 222 | // chunk - column/vector of bytes 223 | for (j, &sum) in row_sum.iter().enumerate() { 224 | MMapMatrix::update_column(j, chunk, |value| unsafe { *value /= sum.sqrt() }); 225 | } 226 | }); 227 | 228 | self.matrix 229 | .flush() 230 | .expect("Can't flush memory map modifications to disk"); 231 | } 232 | 233 | fn multiply( 234 | sparse_matrix_reader: Arc, 235 | other: Self, 236 | ) -> Self { 237 | let rows = other.rows; 238 | let cols = other.cols; 239 | 240 | let uuid = Uuid::new_v4(); 241 | let file_name = format!("{}_matrix_{}", sparse_matrix_reader.get_id(), uuid); 242 | let mut mmap_output = create_mmap(rows, cols, file_name.as_str()); 243 | 244 | let input = Arc::new(other); 245 | mmap_output 246 | .par_chunks_mut(rows * 4) 247 | .enumerate() 248 | .for_each_with(input, |input, (i, chunk)| { 249 | for entry in sparse_matrix_reader.iter_entries() { 250 | let input_value = input.get_value(entry.col as usize, i); 251 | MMapMatrix::update_column(entry.row as usize, chunk, |value| unsafe { 252 | *value += input_value * entry.value 253 | }); 254 | } 255 | }); 256 | 257 | mmap_output 258 | .flush() 259 | .expect("Can't flush memory map modifications to disk"); 260 | 261 | Self { 262 | rows, 263 | cols, 264 | file_name, 265 | matrix: mmap_output, 266 | } 267 | } 268 | } 269 | 270 | /// Creates memory-mapped file with allocated number of bytes 271 | fn create_mmap(rows: usize, cols: usize, file_name: &str) -> MmapMut { 272 | let number_of_bytes = (rows * cols * 4) as u64; 273 | let file = OpenOptions::new() 274 | .read(true) 275 | .write(true) 276 | .create(true) 277 | .open(file_name) 278 | .expect("Can't 
create new set of options for memory mapped file"); 279 | file.set_len(number_of_bytes).unwrap_or_else(|_| { 280 | panic!( 281 | "Can't update the size of {} file to {} bytes", 282 | file_name, number_of_bytes 283 | ) 284 | }); 285 | unsafe { 286 | MmapMut::map_mut(&file).unwrap_or_else(|_| { 287 | panic!( 288 | "Can't create memory mapped file for the underlying file {}", 289 | file_name 290 | ) 291 | }) 292 | } 293 | } 294 | 295 | /// Used to remove memory-mapped file after processing 296 | impl Drop for MMapMatrix { 297 | fn drop(&mut self) { 298 | fs::remove_file(self.file_name.as_str()).unwrap_or_else(|_| { 299 | warn!( 300 | "File {} can't be removed after work. Remove the file in order to save disk space.", 301 | self.file_name.as_str() 302 | ) 303 | }); 304 | } 305 | } 306 | 307 | impl MMapMatrix { 308 | #[inline] 309 | fn update_column(col: usize, chunk: &mut [u8], func: F) 310 | where 311 | F: Fn(*mut f32), 312 | { 313 | let start_idx = col * 4; 314 | let end_idx = start_idx + 4; 315 | let pointer: *mut u8 = (&mut chunk[start_idx..end_idx]).as_mut_ptr(); 316 | let value = pointer as *mut f32; 317 | func(value); 318 | } 319 | } 320 | 321 | /// Calculate embeddings in memory. 322 | pub fn calculate_embeddings( 323 | config: Arc, 324 | sparse_matrix_reader: Arc, 325 | entity_mapping_persistor: Arc, 326 | embedding_persistor: &mut dyn EmbeddingPersistor, 327 | ) where 328 | T1: SparseMatrixReader + Sync + Send, 329 | T2: EntityMappingPersistor, 330 | { 331 | let mult = MatrixMultiplicator::new(config.clone(), sparse_matrix_reader); 332 | let init: TwoDimVectorMatrix = mult.initialize(); 333 | let res = mult.propagate(config.max_number_of_iteration, init); 334 | mult.persist(res, entity_mapping_persistor, embedding_persistor); 335 | 336 | info!("Finalizing embeddings calculations!") 337 | } 338 | 339 | /// Provides matrix multiplication based on sparse matrix data. 340 | #[derive(Debug)] 341 | struct MatrixMultiplicator { 342 | dimension: usize, 343 | number_of_entities: usize, 344 | fixed_random_value: i64, 345 | sparse_matrix_reader: Arc, 346 | _marker: PhantomData, 347 | } 348 | 349 | impl MatrixMultiplicator 350 | where 351 | T: SparseMatrixReader + Sync + Send, 352 | M: MatrixWrapper, 353 | { 354 | fn new(config: Arc, sparse_matrix_reader: Arc) -> Self { 355 | let rand_value = config.seed.map(hash).unwrap_or(0); 356 | Self { 357 | dimension: config.embeddings_dimension as usize, 358 | number_of_entities: sparse_matrix_reader.get_number_of_entities() as usize, 359 | fixed_random_value: rand_value, 360 | sparse_matrix_reader, 361 | _marker: PhantomData, 362 | } 363 | } 364 | 365 | /// Initialize a matrix 366 | fn initialize(&self) -> M { 367 | info!( 368 | "Start initialization. Dims: {}, entities: {}.", 369 | self.dimension, self.number_of_entities 370 | ); 371 | 372 | let result = M::init_with_hashes( 373 | self.number_of_entities, 374 | self.dimension, 375 | self.fixed_random_value, 376 | self.sparse_matrix_reader.clone(), 377 | ); 378 | 379 | info!( 380 | "Done initializing. Dims: {}, entities: {}.", 381 | self.dimension, self.number_of_entities 382 | ); 383 | result 384 | } 385 | 386 | /// The sparse matrix is multiplied by a freshly initialized matrix M. 387 | /// Multiplication is done against each column of matrix M in a separate thread. 388 | /// The obtained columns of the new matrix are subsequently merged into the full matrix. 389 | /// The matrix is L2-normalized, again in a multithreaded fashion across matrix columns. 
390 | /// Finally, depending on the target iteration number, the matrix is either returned 391 | /// or fed for next iterations of multiplication against the sparse matrix. 392 | fn propagate(&self, max_iter: u8, res: M) -> M { 393 | info!("Start propagating. Number of iterations: {}.", max_iter); 394 | 395 | let mut new_res = res; 396 | for i in 0..max_iter { 397 | let mut next = M::multiply(self.sparse_matrix_reader.clone(), new_res); 398 | next.normalize(); 399 | new_res = next; 400 | 401 | info!( 402 | "Done iter: {}. Dims: {}, entities: {}, num data points: {}.", 403 | i, 404 | self.dimension, 405 | self.number_of_entities, 406 | self.sparse_matrix_reader.get_number_of_entries() 407 | ); 408 | } 409 | 410 | info!("Done propagating."); 411 | new_res 412 | } 413 | 414 | /// Saves results to output such as textfile, numpy etc 415 | fn persist( 416 | &self, 417 | res: M, 418 | entity_mapping_persistor: Arc, 419 | embedding_persistor: &mut dyn EmbeddingPersistor, 420 | ) where 421 | T1: EntityMappingPersistor, 422 | { 423 | info!("Start saving embeddings."); 424 | 425 | embedding_persistor 426 | .put_metadata(self.number_of_entities as u32, self.dimension as u16) 427 | .unwrap_or_else(|_| { 428 | // if can't write first data to the file, probably further is the same 429 | panic!( 430 | "Can't write metadata. Entities: {}. Dimension: {}.", 431 | self.number_of_entities, self.dimension 432 | ) 433 | }); 434 | 435 | // entities which can't be written to the file (error occurs) 436 | let mut broken_entities = HashSet::new(); 437 | for (i, hash) in self.sparse_matrix_reader.iter_hashes().enumerate() { 438 | let entity_name_opt = entity_mapping_persistor.get_entity(hash.value); 439 | if let Some(entity_name) = entity_name_opt { 440 | let mut embedding: Vec = Vec::with_capacity(self.dimension); 441 | for j in 0..self.dimension { 442 | let value = res.get_value(i, j); 443 | embedding.push(value); 444 | } 445 | embedding_persistor 446 | .put_data(&entity_name, hash.occurrence, embedding) 447 | .unwrap_or_else(|_| { 448 | broken_entities.insert(entity_name); 449 | }); 450 | }; 451 | } 452 | 453 | if !broken_entities.is_empty() { 454 | log_broken_entities(broken_entities); 455 | } 456 | 457 | embedding_persistor 458 | .finish() 459 | .unwrap_or_else(|_| warn!("Can't finish writing to the file.")); 460 | 461 | info!("Done saving embeddings."); 462 | } 463 | } 464 | 465 | fn log_broken_entities(broken_entities: HashSet) { 466 | let num_of_broken_entities = broken_entities.len(); 467 | let few_broken_entities: HashSet<_> = broken_entities 468 | .into_iter() 469 | .take(LOGGED_NUMBER_OF_BROKEN_ENTITIES) 470 | .collect(); 471 | warn!( 472 | "Number of entities which can't be written to the file: {}. First {} broken entities: {:?}.", 473 | num_of_broken_entities, LOGGED_NUMBER_OF_BROKEN_ENTITIES, few_broken_entities 474 | ); 475 | } 476 | 477 | /// Calculate embeddings with memory-mapped files. 
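// Same pipeline as `calculate_embeddings` above, but the intermediate matrices are
// `MMapMatrix` instances backed by temporary on-disk files (removed on drop) instead
// of the in-memory `Vec<Vec<f32>>` columns of `TwoDimVectorMatrix`, which keeps peak
// RAM usage low when embedding very large graphs.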
478 | pub fn calculate_embeddings_mmap( 479 | config: Arc, 480 | sparse_matrix_reader: Arc, 481 | entity_mapping_persistor: Arc, 482 | embedding_persistor: &mut dyn EmbeddingPersistor, 483 | ) where 484 | T1: SparseMatrixReader + Sync + Send, 485 | T2: EntityMappingPersistor, 486 | { 487 | let mult = MatrixMultiplicator::new(config.clone(), sparse_matrix_reader); 488 | let init: MMapMatrix = mult.initialize(); 489 | let res = mult.propagate(config.max_number_of_iteration, init); 490 | mult.persist(res, entity_mapping_persistor, embedding_persistor); 491 | 492 | info!("Finalizing embeddings calculations!") 493 | } 494 | -------------------------------------------------------------------------------- /legacy/src/entity.rs: -------------------------------------------------------------------------------- 1 | use crate::configuration::{Column, Configuration}; 2 | use crate::persistence::entity::EntityMappingPersistor; 3 | use smallvec::{smallvec, SmallVec}; 4 | use std::hash::Hasher; 5 | use std::sync::Arc; 6 | use twox_hash::XxHash64; 7 | 8 | /// Indicates how many elements in a vector can be placed on Stack (used by smallvec crate). The rest 9 | /// of the vector is placed on Heap. 10 | pub const SMALL_VECTOR_SIZE: usize = 8; 11 | 12 | /// Marker for elements in a vector. Let's say that we have `vec![1, 2, 3, 4]` 13 | /// and `LengthAndOffset { length: 2, offset : 1 }`. Offset points to the second element in the vector 14 | /// and length tell us how many elements we should take (in that case 2 elements: 2 and 3). 15 | #[derive(Clone, Copy)] 16 | struct LengthAndOffset { 17 | length: u32, 18 | offset: u32, 19 | } 20 | 21 | struct CartesianProduct { 22 | has_next: bool, 23 | lengths_and_offsets: SmallVec<[LengthAndOffset; SMALL_VECTOR_SIZE]>, 24 | indices: SmallVec<[u32; SMALL_VECTOR_SIZE]>, 25 | } 26 | 27 | impl CartesianProduct { 28 | fn new( 29 | lengths_and_offsets: SmallVec<[LengthAndOffset; SMALL_VECTOR_SIZE]>, 30 | ) -> CartesianProduct { 31 | let indices: SmallVec<[u32; SMALL_VECTOR_SIZE]> = lengths_and_offsets 32 | .iter() 33 | .map(|length_and_offset| length_and_offset.offset) 34 | .collect(); 35 | CartesianProduct { 36 | has_next: true, 37 | lengths_and_offsets, 38 | indices, 39 | } 40 | } 41 | } 42 | 43 | impl Iterator for CartesianProduct { 44 | /// The type of the elements being iterated over. 45 | type Item = SmallVec<[u32; SMALL_VECTOR_SIZE]>; 46 | 47 | /// Advances the iterator and returns the next value - cartesian product. 
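// Worked example: for columns of sizes [2, 1, 3] (offsets [0, 2, 3]) the iterator
// yields [0, 2, 3], [0, 2, 4], [0, 2, 5], [1, 2, 3], [1, 2, 4], [1, 2, 5], i.e. the
// rightmost index advances fastest; the same sequence is asserted in the
// `generate_cartesian_product_indices` test at the bottom of this file.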
48 | #[inline(always)] 49 | fn next(&mut self) -> Option { 50 | if !self.has_next { 51 | return None; 52 | } 53 | 54 | let len = self.indices.len(); 55 | let result: SmallVec<[u32; SMALL_VECTOR_SIZE]> = SmallVec::from_slice(&self.indices); 56 | for i in (0..len).rev() { 57 | let LengthAndOffset { length, offset } = self.lengths_and_offsets[i]; 58 | let last_index = length + offset; 59 | if self.indices[i] == (last_index - 1) { 60 | self.indices[i] = offset; 61 | if i == 0 { 62 | self.has_next = false; 63 | } 64 | } else { 65 | self.indices[i] += 1; 66 | break; 67 | } 68 | } 69 | Some(result) 70 | } 71 | } 72 | 73 | pub struct EntityProcessor<'a, T, F> 74 | where 75 | T: EntityMappingPersistor, 76 | F: FnMut(SmallVec<[u64; SMALL_VECTOR_SIZE]>), 77 | { 78 | config: &'a Configuration, 79 | field_hashes: SmallVec<[u64; SMALL_VECTOR_SIZE]>, 80 | not_ignored_columns_count: u16, 81 | columns_count: u16, 82 | entity_mapping_persistor: Arc, 83 | hashes_handler: F, 84 | } 85 | 86 | impl<'a, T, F> EntityProcessor<'a, T, F> 87 | where 88 | T: EntityMappingPersistor, 89 | F: FnMut(SmallVec<[u64; SMALL_VECTOR_SIZE]>), 90 | { 91 | pub fn new( 92 | config: &'a Configuration, 93 | persistor: Arc, 94 | hashes_handler: F, 95 | ) -> EntityProcessor<'a, T, F> { 96 | let columns = &config.columns; 97 | // hashes for column names are used to differentiate entities with the same name 98 | // from different columns 99 | let field_hashes_vec: Vec = columns.iter().map(|c| hash(&c.name)).collect(); 100 | let field_hashes: SmallVec<[u64; SMALL_VECTOR_SIZE]> = SmallVec::from_vec(field_hashes_vec); 101 | let not_ignored_cols = config.not_ignored_columns(); 102 | let mut not_ignored_columns_count = 0; 103 | let mut reflexive_columns_count = 0; 104 | for &col in ¬_ignored_cols { 105 | not_ignored_columns_count += 1; 106 | if col.reflexive { 107 | reflexive_columns_count += 1 108 | }; 109 | } 110 | 111 | let columns_count = not_ignored_columns_count + reflexive_columns_count; 112 | 113 | EntityProcessor { 114 | config, 115 | field_hashes, 116 | not_ignored_columns_count, 117 | columns_count, 118 | entity_mapping_persistor: persistor, 119 | hashes_handler, 120 | } 121 | } 122 | 123 | /// Every row can create few combinations (cartesian products) which are hashed and provided for sparse matrix creation. 124 | /// `row` - array of strings such as: ("userId1", "productId1 productId2", "brandId1"). 
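// In short: every entity of a non-ignored column is hashed as
// field_hash(column) ^ hash(entity), reflexive columns get their length/offset entry
// duplicated at the end of the buffer, and one hash row per Cartesian combination
// (prefixed with the total combination count) is handed to `hashes_handler` for
// sparse matrix construction.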
125 | pub fn process_row>(&mut self, row: &[SmallVec<[S; SMALL_VECTOR_SIZE]>]) { 126 | let mut hashes: SmallVec<[u64; SMALL_VECTOR_SIZE]> = 127 | SmallVec::with_capacity(self.not_ignored_columns_count as usize); 128 | let mut lens_and_offsets: SmallVec<[LengthAndOffset; SMALL_VECTOR_SIZE]> = 129 | smallvec![LengthAndOffset{ length: 0, offset: 0}; self.columns_count as usize]; 130 | let mut reflexive_count = 0; 131 | let mut current_offset = 0u32; 132 | 133 | let mut idx = 0; 134 | for (i, column_entities) in row.iter().enumerate() { 135 | let column = &self.config.columns[i]; 136 | if !column.ignored { 137 | if column.complex { 138 | for entity in column_entities { 139 | let hash = self.field_hashes[i] ^ hash(entity.as_ref()); 140 | hashes.push(hash); 141 | self.update_entity_mapping(entity.as_ref(), hash, column); 142 | } 143 | let length = column_entities.len() as u32; 144 | lens_and_offsets[idx] = LengthAndOffset { 145 | length, 146 | offset: current_offset, 147 | }; 148 | if column.reflexive { 149 | // put reflexive column data to the end of the buffers 150 | let reflexive_id = 151 | (self.not_ignored_columns_count + reflexive_count) as usize; 152 | lens_and_offsets[reflexive_id] = LengthAndOffset { 153 | length, 154 | offset: current_offset, 155 | }; 156 | reflexive_count += 1; 157 | } 158 | current_offset += length; 159 | } else { 160 | let entity = column_entities.get(0).unwrap().as_ref(); 161 | let hash = self.field_hashes[i] ^ hash(entity); 162 | hashes.push(hash); 163 | self.update_entity_mapping(entity, hash, column); 164 | let length = 1u32; 165 | lens_and_offsets[idx] = LengthAndOffset { 166 | length, 167 | offset: current_offset, 168 | }; 169 | current_offset += length; 170 | } 171 | idx += 1; 172 | } 173 | } 174 | 175 | let hash_rows = self.generate_combinations_with_length(hashes, lens_and_offsets); 176 | for hash_row in hash_rows { 177 | (self.hashes_handler)(hash_row); 178 | } 179 | } 180 | 181 | #[inline(always)] 182 | fn update_entity_mapping(&self, entity: &str, hash: u64, column: &Column) { 183 | if !column.transient && !self.entity_mapping_persistor.contains(hash) { 184 | let entry = if self.config.prepend_field { 185 | let mut entry = column.name.clone(); 186 | entry.push_str("__"); 187 | entry.push_str(entity); 188 | entry 189 | } else { 190 | entity.to_string() 191 | }; 192 | self.entity_mapping_persistor.put_data(hash, entry); 193 | } 194 | } 195 | 196 | /// It creates Cartesian Product for incoming data. 197 | /// Let's say that we have such columns: 198 | /// customers | products | brands 199 | /// incoming data: 200 | /// userId1 | productId1, productId2 | brandId1, brandId2 201 | /// Total number of combinations is equal to 4 (1 * 2 * 2) based on: 202 | /// number of entities in customers column * number of entities in products column * number of entities in brands column 203 | /// Cartesian Products for our data: 204 | /// (userId1, productId1, brandId1), (userId1, productId1, brandId2), (userId1, productId2, brandId1), (userId1, productId2, brandId2) 205 | /// `hashes` - entity hashes 206 | /// `lens_and_offsets` - number of entities per column 207 | /// return entity hashes Cartesian Products. 
Size of the array (matrix) is equal to number of combinations x number of columns (including reflexive column) 208 | #[inline(always)] 209 | fn generate_combinations_with_length( 210 | &self, 211 | hashes: SmallVec<[u64; SMALL_VECTOR_SIZE]>, 212 | lens_and_offsets: SmallVec<[LengthAndOffset; SMALL_VECTOR_SIZE]>, 213 | ) -> impl Iterator> { 214 | let row_length = lens_and_offsets.len(); 215 | let mut total_combinations = 1; 216 | for len_and_offset in &lens_and_offsets { 217 | total_combinations *= len_and_offset.length; 218 | } 219 | 220 | let cartesian = CartesianProduct::new(lens_and_offsets); 221 | 222 | cartesian.map(move |indices| { 223 | let mut arr: SmallVec<[u64; SMALL_VECTOR_SIZE]> = 224 | SmallVec::with_capacity(row_length + 1); 225 | arr.push(total_combinations as u64); 226 | for i in indices { 227 | let value = hashes[i as usize]; 228 | arr.push(value); 229 | } 230 | arr 231 | }) 232 | } 233 | } 234 | 235 | #[inline(always)] 236 | fn hash(entity: &str) -> u64 { 237 | let mut hasher = XxHash64::default(); 238 | hasher.write(entity.as_bytes()); 239 | hasher.finish() 240 | } 241 | 242 | #[cfg(test)] 243 | mod tests { 244 | use crate::configuration::{Column, Configuration}; 245 | use crate::entity::{ 246 | hash, CartesianProduct, EntityProcessor, LengthAndOffset, SMALL_VECTOR_SIZE, 247 | }; 248 | use crate::persistence::entity::InMemoryEntityMappingPersistor; 249 | use smallvec::{smallvec, SmallVec}; 250 | use std::sync::Arc; 251 | 252 | fn prepare_lengths_and_offsets( 253 | entities_per_column: &[u32], 254 | ) -> SmallVec<[LengthAndOffset; SMALL_VECTOR_SIZE]> { 255 | let mut lens_and_offsets: SmallVec<[LengthAndOffset; SMALL_VECTOR_SIZE]> = 256 | SmallVec::with_capacity(entities_per_column.len()); 257 | let mut offset = 0; 258 | for &num_of_entities in entities_per_column { 259 | lens_and_offsets.push(LengthAndOffset { 260 | length: num_of_entities, 261 | offset, 262 | }); 263 | offset += num_of_entities; 264 | } 265 | lens_and_offsets 266 | } 267 | 268 | fn prepare_hashes( 269 | total_combination: u64, 270 | entities: &[&str], 271 | field_hashes: &[u64], 272 | ) -> SmallVec<[u64; SMALL_VECTOR_SIZE]> { 273 | let mut hashes: SmallVec<[u64; SMALL_VECTOR_SIZE]> = SmallVec::new(); 274 | hashes.push(total_combination); 275 | for (i, &entity) in entities.iter().enumerate() { 276 | let hash = field_hashes[i] ^ hash(entity); 277 | hashes.push(hash); 278 | } 279 | hashes 280 | } 281 | 282 | #[test] 283 | fn generate_cartesian_product_indices() { 284 | let lengths_and_offsets = prepare_lengths_and_offsets(&[2, 1, 3]); 285 | 286 | let cartesian_product = CartesianProduct::new(lengths_and_offsets); 287 | let mut iter = cartesian_product.into_iter(); 288 | 289 | assert_eq!(Some(smallvec![0, 2, 3]), iter.next()); 290 | assert_eq!(Some(smallvec![0, 2, 4]), iter.next()); 291 | assert_eq!(Some(smallvec![0, 2, 5]), iter.next()); 292 | assert_eq!(Some(smallvec![1, 2, 3]), iter.next()); 293 | assert_eq!(Some(smallvec![1, 2, 4]), iter.next()); 294 | assert_eq!(Some(smallvec![1, 2, 5]), iter.next()); 295 | 296 | assert_eq!(None, iter.next()); 297 | } 298 | 299 | #[test] 300 | fn generate_cartesian_product_hashes() { 301 | let dummy_config = Configuration::default(String::from(""), vec![]); 302 | 303 | // hashes for entities in every column 304 | // column_1: 1 entity 305 | // column_2: 2 entities 306 | // column_3: 3 entities 307 | let lengths_and_offsets = prepare_lengths_and_offsets(&[1, 2, 3]); 308 | let hashes: SmallVec<[u64; SMALL_VECTOR_SIZE]> = smallvec![10, 20, 30, 40, 50, 60]; 309 | let mut 
total_combinations = 1u64; 310 | for len_and_offset in &lengths_and_offsets { 311 | total_combinations *= len_and_offset.length as u64; 312 | } 313 | 314 | let in_memory_entity_mapping_persistor = InMemoryEntityMappingPersistor::default(); 315 | let in_memory_entity_mapping_persistor = Arc::new(in_memory_entity_mapping_persistor); 316 | let entity_processor = EntityProcessor::new( 317 | &dummy_config, 318 | in_memory_entity_mapping_persistor.clone(), 319 | |_hashes| {}, 320 | ); 321 | 322 | let combinations: Vec<_> = entity_processor 323 | .generate_combinations_with_length(hashes, lengths_and_offsets) 324 | .collect(); 325 | assert_eq!( 326 | &SmallVec::from([total_combinations, 10, 20, 40]), 327 | combinations.get(0).unwrap() 328 | ); 329 | assert_eq!( 330 | &SmallVec::from([total_combinations, 10, 20, 50]), 331 | combinations.get(1).unwrap() 332 | ); 333 | assert_eq!( 334 | &SmallVec::from([total_combinations, 10, 20, 60]), 335 | combinations.get(2).unwrap() 336 | ); 337 | assert_eq!( 338 | &SmallVec::from([total_combinations, 10, 30, 40]), 339 | combinations.get(3).unwrap() 340 | ); 341 | assert_eq!( 342 | &SmallVec::from([total_combinations, 10, 30, 50]), 343 | combinations.get(4).unwrap() 344 | ); 345 | assert_eq!( 346 | &SmallVec::from([total_combinations, 10, 30, 60]), 347 | combinations.get(5).unwrap() 348 | ); 349 | assert_eq!(None, combinations.get(6)); 350 | } 351 | 352 | #[test] 353 | fn process_row_and_handle_hashes() { 354 | let columns = vec![ 355 | Column { 356 | name: String::from("column_1"), 357 | transient: false, 358 | complex: false, 359 | reflexive: false, 360 | ignored: true, 361 | }, 362 | Column { 363 | name: String::from("column_2"), 364 | transient: true, 365 | complex: false, 366 | reflexive: false, 367 | ignored: false, 368 | }, 369 | Column { 370 | name: String::from("column_3"), 371 | transient: false, 372 | complex: true, 373 | reflexive: true, 374 | ignored: false, 375 | }, 376 | Column { 377 | name: String::from("column_4"), 378 | transient: false, 379 | complex: false, 380 | reflexive: false, 381 | ignored: false, 382 | }, 383 | ]; 384 | // columns configuration: ignored::column_1 transient::column_2 complex::reflexive::column3 column_4 385 | // first column is ignored - we don't process entities from that column 386 | // third column is reflexive so we put it at the end 387 | let column_names = vec![ 388 | columns[1].name.clone(), 389 | columns[2].name.clone(), 390 | columns[3].name.clone(), 391 | columns[2].name.clone(), 392 | ]; 393 | // hashes for column names are used to differentiate entities with the same name 394 | // from different columns 395 | let field_hashes: Vec = column_names.iter().map(|name| hash(name)).collect(); 396 | 397 | // columns are most important, the rest can be omitted 398 | let dummy_config = Configuration::default(String::from(""), columns); 399 | 400 | let in_memory_entity_mapping_persistor = InMemoryEntityMappingPersistor::default(); 401 | let in_memory_entity_mapping_persistor = Arc::new(in_memory_entity_mapping_persistor); 402 | let mut result: SmallVec<[SmallVec<[u64; SMALL_VECTOR_SIZE]>; SMALL_VECTOR_SIZE]> = 403 | SmallVec::new(); 404 | let mut entity_processor = EntityProcessor::new( 405 | &dummy_config, 406 | in_memory_entity_mapping_persistor.clone(), 407 | |hashes| { 408 | result.push(hashes); 409 | }, 410 | ); 411 | 412 | let row = vec![ 413 | smallvec!["a"], 414 | smallvec!["bb"], 415 | smallvec!["ccc", "ddd"], 416 | smallvec!["eeee"], 417 | ]; 418 | entity_processor.process_row(&row); 419 | 420 | // first 
column is ignored, third one is reflexive so the entities go at the end 421 | // input: "bb", "ccc ddd", "eeee", "ccc ddd" 422 | // number of cartesian products from the above entities 423 | assert_eq!(4, result.len()); 424 | assert_eq!( 425 | prepare_hashes(4, &["bb", "ccc", "eeee", "ccc"], &field_hashes), 426 | result[0] 427 | ); 428 | assert_eq!( 429 | prepare_hashes(4, &["bb", "ccc", "eeee", "ddd"], &field_hashes), 430 | result[1] 431 | ); 432 | assert_eq!( 433 | prepare_hashes(4, &["bb", "ddd", "eeee", "ccc"], &field_hashes), 434 | result[2] 435 | ); 436 | assert_eq!( 437 | prepare_hashes(4, &["bb", "ddd", "eeee", "ddd"], &field_hashes), 438 | result[3] 439 | ); 440 | } 441 | } 442 | --------------------------------------------------------------------------------