├── .circleci
│   └── config.yml
├── .flake8
├── .gitignore
├── .pre-commit-config.yaml
├── .readthedocs.yml
├── CODE_OF_CONDUCT.md
├── Dockerfile
├── LICENSE
├── README.md
├── docs
│   ├── api
│   │   ├── modules.rst
│   │   ├── rexify.cli.rst
│   │   ├── rexify.constants.rst
│   │   ├── rexify.exceptions.rst
│   │   ├── rexify.exceptions.schema.rst
│   │   ├── rexify.features.base.rst
│   │   ├── rexify.features.dataset.rst
│   │   ├── rexify.features.extractor.rst
│   │   ├── rexify.features.pipelines.rst
│   │   ├── rexify.features.rst
│   │   ├── rexify.models.candidate.rst
│   │   ├── rexify.models.query.rst
│   │   ├── rexify.models.recommender.rst
│   │   ├── rexify.models.rst
│   │   ├── rexify.models.tower.rst
│   │   ├── rexify.pipeline.rst
│   │   ├── rexify.rst
│   │   └── rexify.utils.rst
│   ├── conf.py
│   ├── genindex.rst
│   ├── index.rst
│   ├── overview
│   │   ├── architecture.md
│   │   ├── inputs.md
│   │   └── overview.md
│   ├── requirements.txt
│   └── tutorials
│       ├── configure_pipeline.ipynb
│       ├── prebuilt_pipeline.ipynb
│       └── quickstart.ipynb
├── pyproject.toml
├── rexify
│   ├── __init__.py
│   ├── data
│   │   ├── __init__.py
│   │   ├── base.py
│   │   ├── input.py
│   │   └── output.py
│   ├── features
│   │   ├── __init__.py
│   │   ├── base.py
│   │   ├── extractor.py
│   │   └── transform
│   │       ├── __init__.py
│   │       ├── category.py
│   │       ├── custom.py
│   │       ├── entity.py
│   │       ├── event.py
│   │       ├── id.py
│   │       ├── number.py
│   │       └── sequence.py
│   ├── models
│   │   ├── __init__.py
│   │   ├── base.py
│   │   ├── callbacks
│   │   │   ├── __init__.py
│   │   │   ├── index.py
│   │   │   └── mlflow.py
│   │   ├── index.py
│   │   ├── lookup.py
│   │   ├── ranking
│   │   │   ├── __init__.py
│   │   │   ├── base.py
│   │   │   ├── event.py
│   │   │   └── ranking.py
│   │   ├── recommender.py
│   │   ├── retrieval
│   │   │   ├── __init__.py
│   │   │   ├── candidate.py
│   │   │   ├── query.py
│   │   │   ├── retrieval.py
│   │   │   └── tower.py
│   │   └── sequential.py
│   ├── pipeline
│   │   ├── __init__.py
│   │   ├── __main__.py
│   │   └── components
│   │       ├── __init__.py
│   │       ├── load.py
│   │       └── train.py
│   ├── schema.py
│   └── utils.py
└── tests
    ├── test_extractor.py
    ├── test_schema.py
    └── test_utils.py
/.circleci/config.yml:
--------------------------------------------------------------------------------
1 | version: 2.1
2 |
3 | orbs:
4 | python: circleci/python@2.1.1
5 |
6 | jobs:
7 |
8 | test:
9 | docker:
10 | - image: cimg/python:3.10
11 | steps:
12 | - checkout
13 | - python/install-packages:
14 | pre-install-steps: []
15 | pkg-manager: poetry
16 | - run:
17 | name: Run tests
18 | command: |
19 | poetry run pytest
20 |
21 | publish:
22 | docker:
23 | - image: cimg/python:3.10
24 | steps:
25 | - checkout
26 | - run:
27 | name: Build and publish
28 | command: |
29 |             poetry version $(git describe --tags --abbrev=0)
30 |             poetry build
31 |             poetry publish --username $PYPI_USERNAME --password $PYPI_PASSWORD
32 |
33 | docker:
34 | docker:
35 | - image: cimg/base:2023.04
36 | environment:
37 | IMAGE_URI: joseprsm/rexify
38 | steps:
39 | - checkout
40 | - setup_remote_docker
41 | - run:
42 | name: Build Docker image
43 | command: docker build . -t $IMAGE_URI
44 | - run:
45 | name: Push Docker image
46 | command: |
47 | echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin
48 | docker push $IMAGE_URI
49 |
50 | workflows:
51 | test_only:
52 | jobs:
53 | - test
54 |
55 | test_and_build:
56 | jobs:
57 | - test: &tags_only
58 | filters:
59 | branches:
60 | ignore: /.*/
61 | tags:
62 | only: /^\d+\.\d+\.\d+$/
63 | - publish:
64 | <<: *tags_only
65 | requires:
66 | - test
67 | - request_docker:
68 | <<: *tags_only
69 | type: approval
70 | requires:
71 | - test
72 | - docker:
73 | <<: *tags_only
74 | requires:
75 | - request_docker
76 | - publish
77 |
--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 88
3 | select = C,E,F,W,B,B9
4 | ignore = E203, E501, W503
5 | exclude =
6 | docs/conf.py
7 | __init__.py
8 | build
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /build
2 | /dist
3 |
4 | .idea
5 | .env
6 | .coverage
7 | .pytest_cache
8 |
9 | */__pycache__/*
10 | */.ipynb_checkpoints/
11 | *.egg-info/
12 |
13 | outputs
14 |
15 | /docs/_build/
16 | /docs/api
17 | /docs/reference
18 |
19 | .DS_Store
20 |
21 | *.pyc
22 | .vscode/settings.json
23 |
24 | mlruns
25 |
26 | /*.json
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: local
3 | hooks:
4 | - id: black
5 | name: black
6 | language: system
7 | entry: black
8 | types: [ python ]
9 | require_serial: true
10 | - id: flake8
11 | name: flake8
12 | entry: flake8
13 | language: system
14 | types: [ python ]
15 | require_serial: true
16 | - id: isort
17 | name: isort
18 | entry: isort
19 | require_serial: true
20 | language: system
21 | types_or: [cython, pyi, python]
22 |
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | build:
4 | os: ubuntu-20.04
5 | tools:
6 | python: "3.10"
7 |
8 | sphinx:
9 | builder: html
10 | configuration: docs/conf.py
11 |
12 | python:
13 | install:
14 | - requirements: docs/requirements.txt
15 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | We as members, contributors, and leaders pledge to make participation in our
6 | community a harassment-free experience for everyone, regardless of age, body
7 | size, visible or invisible disability, ethnicity, sex characteristics, gender
8 | identity and expression, level of experience, education, socio-economic status,
9 | nationality, personal appearance, race, religion, or sexual identity
10 | and orientation.
11 |
12 | We pledge to act and interact in ways that contribute to an open, welcoming,
13 | diverse, inclusive, and healthy community.
14 |
15 | ## Our Standards
16 |
17 | Examples of behavior that contributes to a positive environment for our
18 | community include:
19 |
20 | * Demonstrating empathy and kindness toward other people
21 | * Being respectful of differing opinions, viewpoints, and experiences
22 | * Giving and gracefully accepting constructive feedback
23 | * Accepting responsibility and apologizing to those affected by our mistakes,
24 | and learning from the experience
25 | * Focusing on what is best not just for us as individuals, but for the
26 | overall community
27 |
28 | Examples of unacceptable behavior include:
29 |
30 | * The use of sexualized language or imagery, and sexual attention or
31 | advances of any kind
32 | * Trolling, insulting or derogatory comments, and personal or political attacks
33 | * Public or private harassment
34 | * Publishing others' private information, such as a physical or email
35 | address, without their explicit permission
36 | * Other conduct which could reasonably be considered inappropriate in a
37 | professional setting
38 |
39 | ## Enforcement Responsibilities
40 |
41 | Community leaders are responsible for clarifying and enforcing our standards of
42 | acceptable behavior and will take appropriate and fair corrective action in
43 | response to any behavior that they deem inappropriate, threatening, offensive,
44 | or harmful.
45 |
46 | Community leaders have the right and responsibility to remove, edit, or reject
47 | comments, commits, code, wiki edits, issues, and other contributions that are
48 | not aligned to this Code of Conduct, and will communicate reasons for moderation
49 | decisions when appropriate.
50 |
51 | ## Scope
52 |
53 | This Code of Conduct applies within all community spaces, and also applies when
54 | an individual is officially representing the community in public spaces.
55 | Examples of representing our community include using an official e-mail address,
56 | posting via an official social media account, or acting as an appointed
57 | representative at an online or offline event.
58 |
59 | ## Enforcement
60 |
61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
62 | reported to the community leaders responsible for enforcement at
63 | joseprsm@gmail.com.
64 | All complaints will be reviewed and investigated promptly and fairly.
65 |
66 | All community leaders are obligated to respect the privacy and security of the
67 | reporter of any incident.
68 |
69 | ## Enforcement Guidelines
70 |
71 | Community leaders will follow these Community Impact Guidelines in determining
72 | the consequences for any action they deem in violation of this Code of Conduct:
73 |
74 | ### 1. Correction
75 |
76 | **Community Impact**: Use of inappropriate language or other behavior deemed
77 | unprofessional or unwelcome in the community.
78 |
79 | **Consequence**: A private, written warning from community leaders, providing
80 | clarity around the nature of the violation and an explanation of why the
81 | behavior was inappropriate. A public apology may be requested.
82 |
83 | ### 2. Warning
84 |
85 | **Community Impact**: A violation through a single incident or series
86 | of actions.
87 |
88 | **Consequence**: A warning with consequences for continued behavior. No
89 | interaction with the people involved, including unsolicited interaction with
90 | those enforcing the Code of Conduct, for a specified period of time. This
91 | includes avoiding interactions in community spaces as well as external channels
92 | like social media. Violating these terms may lead to a temporary or
93 | permanent ban.
94 |
95 | ### 3. Temporary Ban
96 |
97 | **Community Impact**: A serious violation of community standards, including
98 | sustained inappropriate behavior.
99 |
100 | **Consequence**: A temporary ban from any sort of interaction or public
101 | communication with the community for a specified period of time. No public or
102 | private interaction with the people involved, including unsolicited interaction
103 | with those enforcing the Code of Conduct, is allowed during this period.
104 | Violating these terms may lead to a permanent ban.
105 |
106 | ### 4. Permanent Ban
107 |
108 | **Community Impact**: Demonstrating a pattern of violation of community
109 | standards, including sustained inappropriate behavior, harassment of an
110 | individual, or aggression toward or disparagement of classes of individuals.
111 |
112 | **Consequence**: A permanent ban from any sort of public interaction within
113 | the community.
114 |
115 | ## Attribution
116 |
117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage],
118 | version 2.0, available at
119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
120 |
121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct
122 | enforcement ladder](https://github.com/mozilla/diversity).
123 |
124 | [homepage]: https://www.contributor-covenant.org
125 |
126 | For answers to common questions about this code of conduct, see the FAQ at
127 | https://www.contributor-covenant.org/faq. Translations are available at
128 | https://www.contributor-covenant.org/translations.
129 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | ARG python="3.10"
2 | ARG filesystem="gcs"
3 |
4 | FROM python:${python} AS base
5 |
6 | RUN case "$(uname -m)" in *arm*) ;; *) pip install scann==1.2.3 ;; esac
7 |
8 | RUN pip install pandas numpy scikit-learn fsspec rexify
9 |
10 | FROM base AS fs-s3
11 |
12 | RUN pip install s3fs
13 |
14 | FROM base AS fs-gcs
15 |
16 | RUN pip install gcsfs
17 |
18 | FROM fs-${filesystem} AS final
19 |
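# Usage sketch (an assumption, not part of the original file): the `filesystem`
# build arg picks which fsspec backend stage becomes the final image, e.g.
#   docker build --build-arg filesystem=s3 -t rexify .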
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 José Medeiros
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Rexify
2 |
22 | Rexify is a library to streamline recommender systems model development.
23 |
24 | In essence, Rexify adapts dynamically to your data and outputs high-performing TensorFlow
25 | models that may be used wherever you want, independently of your data. Rexify also includes
26 | modules that handle feature engineering as Scikit-Learn Transformers and Pipelines.
27 |
28 | With Rexify, users can easily train Recommender System models just by specifying what their
29 | data looks like. Rexify also comes equipped with pre-built machine learning pipelines that can
30 | be run serverlessly.
31 |
32 | ## What is Rexify?
33 |
34 | Rexify is a low-code personalization tool, that makes use of traditional machine learning
35 | frameworks, such as Scikit-Learn and TensorFlow, to create scalable Recommender Systems
36 | workflows that anyone can use.
37 |
38 | ### Who is it for?
39 |
40 | Rexify is a project that simplifies and standardizes the workflow of recommender systems. It is
41 | mostly geared towards people with little to no machine learning knowledge who want to implement
42 | somewhat scalable Recommender Systems in their applications.
43 |
44 | ## Installation
45 |
46 | The easiest way to install Rexify is via `pip`:
47 |
48 | ```shell
49 | pip install rexify
50 | ```
51 |
52 | ## Quick Tour
53 |
54 | Rexify is meant to be usable right out of the box. All you need to set up your model is interaction
55 | data - something that looks like this:
56 |
57 | | user_id | item_id | timestamp | event_type |
58 | |---------|---------|------------|-------------|
59 | | 22 | 67 | 2021/05/13 | Purchase |
60 | | 37 | 9 | 2021/04/11 | Page View |
61 | | 22 | 473 | 2021/04/11 | Add to Cart |
62 | | ... | ... | ... | ... |
63 | | 358 | 51 | 2021/04/11 | Purchase |
64 |
65 | Additionally, we'll need to have a schema configured for the data.
66 | This schema is what allows Rexify to generate a dynamic model and preprocessing steps.
67 | The schema should consist of two dictionaries (`user`, `item`) and two key-value
68 | pairs: `event_type` (which should point to the event type column) and `timestamp`
69 | (which should point to the timestamp column).
70 |
71 | Each of these dictionaries should map features to internal data types,
72 | such as `id`, `category`, or `number`. More data types will be available
73 | in the future.
74 |
75 | ```json
76 | {
77 | "user": {
78 | "user_id": "id",
79 | "age": "number"
80 | },
81 | "item": {
82 | "item_id": "id",
83 | "category": "category"
84 | },
85 |   "timestamp": "timestamp",
86 | "event_type": "event_type"
87 | }
88 | ```
89 |
90 | Essentially, what Rexify will do is take the schema, and dynamically adapt to the data.
91 |
92 | There are two main components in Rexify workflows: `FeatureExtractor` and `Recommender`.
93 |
94 | The `FeatureExtractor` is a scikit-learn Transformer that basically takes the schema of
95 | the data, and transforms the event data accordingly. Another method `.make_dataset()`,
96 | converts the transformed data into a `tf.data.Dataset`, all correctly configured to be fed
97 | to the `Recommender` model.
98 |
99 | `Recommender` is a `tfrs.Model` that basically implements the Query and Candidate towers.
100 | During training, the Query tower will take the user ID, user features, and context, to
101 | learn an embedding; the Candidate tower will do the same for the item ID and its features.
102 |
103 | More information about how the `FeatureExtractor` and the `Recommender` works can be found
104 | [here](https://rexify.readthedocs.io/en/latest/overview/architecture.html).
105 |
106 | A sample Rexify workflow looks like this:
107 |
108 | ````python
109 |
110 | import pandas as pd
111 |
112 | from rexify import Schema, FeatureExtractor, Recommender
113 |
114 | events = pd.read_csv('path/to/events/data')
115 | schema = Schema.load('path/to/schema')
116 |
117 | fe = FeatureExtractor(schema, users='path/to/users/data', items='path/to/items/data', return_dataset=True)
118 | x = fe.fit(events).transform(events)
119 |
120 | model = Recommender(**fe.model_params)
121 | model.compile()
122 | model.fit(x, batch_size=512)
123 | ````
124 |
125 | When training is complete, you'll have a trained `tf.keras.Model` ready to be used, as
126 | you normally would.
127 |
128 | Alternatively, you can also run:
129 |
130 | ```shell
131 | python -m rexify.pipeline -p events=$EVENTS_PATH -p users=$USER_PATH -p items=$ITEMS_PATH -p schema=$SCHEMA_PATH
132 | ```
133 |
134 | This will generate a `pipeline.json` file that you can use on Kubeflow Pipelines (or Vertex AI Pipelines).
135 |
136 | ## License
137 |
138 | [MIT](https://github.com/joseprsm/rexify/blob/main/LICENSE)
139 |
--------------------------------------------------------------------------------
/docs/api/modules.rst:
--------------------------------------------------------------------------------
1 | rexify
2 | ======
3 |
4 | .. toctree::
5 | :maxdepth: 6
6 |
7 | rexify
8 |
--------------------------------------------------------------------------------
/docs/api/rexify.cli.rst:
--------------------------------------------------------------------------------
1 | rexify.cli module
2 | =================
3 |
4 | .. automodule:: rexify.cli
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/docs/api/rexify.constants.rst:
--------------------------------------------------------------------------------
1 | rexify.constants module
2 | =======================
3 |
4 | .. automodule:: rexify.constants
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/docs/api/rexify.exceptions.rst:
--------------------------------------------------------------------------------
1 | rexify.exceptions package
2 | =========================
3 |
4 | .. automodule:: rexify.exceptions
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
9 | Submodules
10 | ----------
11 |
12 | .. toctree::
13 | :maxdepth: 6
14 |
15 | rexify.exceptions.schema
16 |
--------------------------------------------------------------------------------
/docs/api/rexify.exceptions.schema.rst:
--------------------------------------------------------------------------------
1 | rexify.exceptions.schema module
2 | ===============================
3 |
4 | .. automodule:: rexify.exceptions.schema
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/docs/api/rexify.features.base.rst:
--------------------------------------------------------------------------------
1 | rexify.features.base module
2 | ===========================
3 |
4 | .. automodule:: rexify.features.base
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/docs/api/rexify.features.dataset.rst:
--------------------------------------------------------------------------------
1 | rexify.features.dataset module
2 | ==============================
3 |
4 | .. automodule:: rexify.features.dataset
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/docs/api/rexify.features.extractor.rst:
--------------------------------------------------------------------------------
1 | rexify.features.extractor module
2 | ================================
3 |
4 | .. automodule:: rexify.features.extractor
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/docs/api/rexify.features.pipelines.rst:
--------------------------------------------------------------------------------
1 | rexify.features.pipelines module
2 | ================================
3 |
4 | .. automodule:: rexify.features.pipelines
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/docs/api/rexify.features.rst:
--------------------------------------------------------------------------------
1 | rexify.features package
2 | =======================
3 |
4 | .. automodule:: rexify.features
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
9 | Submodules
10 | ----------
11 |
12 | .. toctree::
13 | :maxdepth: 6
14 |
15 | rexify.features.base
16 | rexify.features.dataset
17 | rexify.features.extractor
18 | rexify.features.pipelines
19 |
--------------------------------------------------------------------------------
/docs/api/rexify.models.candidate.rst:
--------------------------------------------------------------------------------
1 | rexify.models.candidate module
2 | ==============================
3 |
4 | .. automodule:: rexify.models.candidate
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/docs/api/rexify.models.query.rst:
--------------------------------------------------------------------------------
1 | rexify.models.query module
2 | ==========================
3 |
4 | .. automodule:: rexify.models.query
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/docs/api/rexify.models.recommender.rst:
--------------------------------------------------------------------------------
1 | rexify.models.recommender module
2 | ================================
3 |
4 | .. automodule:: rexify.models.recommender
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/docs/api/rexify.models.rst:
--------------------------------------------------------------------------------
1 | rexify.models package
2 | =====================
3 |
4 | .. automodule:: rexify.models
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
9 | Submodules
10 | ----------
11 |
12 | .. toctree::
13 | :maxdepth: 6
14 |
15 | rexify.models.candidate
16 | rexify.models.query
17 | rexify.models.ranking
18 | rexify.models.recommender
19 | rexify.models.retrieval
20 | rexify.models.tower
21 |
--------------------------------------------------------------------------------
/docs/api/rexify.models.tower.rst:
--------------------------------------------------------------------------------
1 | rexify.models.tower module
2 | ==========================
3 |
4 | .. automodule:: rexify.models.tower
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/docs/api/rexify.pipeline.rst:
--------------------------------------------------------------------------------
1 | rexify.pipeline module
2 | ======================
3 |
4 | .. automodule:: rexify.pipeline
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/docs/api/rexify.rst:
--------------------------------------------------------------------------------
1 | rexify package
2 | ==============
3 |
4 | .. automodule:: rexify
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
9 | Subpackages
10 | -----------
11 |
12 | .. toctree::
13 | :maxdepth: 6
14 |
15 | rexify.exceptions
16 | rexify.features
17 | rexify.models
18 |
19 | Submodules
20 | ----------
21 |
22 | .. toctree::
23 | :maxdepth: 6
24 |
25 | rexify.cli
26 | rexify.constants
27 | rexify.pipeline
28 | rexify.utils
29 |
--------------------------------------------------------------------------------
/docs/api/rexify.utils.rst:
--------------------------------------------------------------------------------
1 | rexify.utils module
2 | ===================
3 |
4 | .. automodule:: rexify.utils
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | import sphinx_material
2 |
3 |
4 | project = "Rexify"
5 | html_title = "Rexify"
6 |
7 | html_theme = "sphinx_material"
8 |
9 | extensions = [
10 | "sphinx.ext.autodoc",
11 | "sphinx.ext.githubpages",
12 | "m2r2",
13 | "sphinx.ext.napoleon",
14 | "sphinx_search.extension",
15 | "sphinxcontrib.apidoc",
16 | "nbsphinx",
17 | ]
18 | source_suffix = [".rst", ".md"]
19 |
20 | napoleon_google_docstring = True
21 | napoleon_numpy_docstring = True
22 | napoleon_include_init_with_doc = True
23 | napoleon_include_private_with_doc = False
24 | napoleon_include_special_with_doc = True
25 | napoleon_use_admonition_for_examples = False
26 | napoleon_use_admonition_for_notes = False
27 | napoleon_use_admonition_for_references = False
28 | napoleon_use_ivar = False
29 | napoleon_use_param = True
30 | napoleon_use_rtype = False
31 |
32 | apidoc_module_dir = "../rexify"
33 | apidoc_output_dir = "api"
34 | apidoc_excluded_paths = ["**/*test*"]
35 | apidoc_module_first = True
36 | apidoc_separate_modules = True
37 | apidoc_extra_args = ["-d 6"]
38 |
39 | html_theme_options = {
40 | "color_primary": "cyan",
41 | "color_accent": "light-blue",
42 | "repo_url": "https://github.com/joseprsm/rexify",
43 | "repo_name": "Rexify",
44 | "globaltoc_depth": 2,
45 | "globaltoc_collapse": False,
46 | "globaltoc_includehidden": False,
47 | "repo_type": "github",
48 | }
49 |
50 | extensions.append("sphinx_material")
51 | html_theme_path = sphinx_material.html_theme_path()
52 | html_context = sphinx_material.get_html_context()
53 |
54 | html_sidebars = {
55 | "**": ["logo-text.html", "globaltoc.html", "localtoc.html", "searchbox.html"]
56 | }
57 |
58 | nbsphinx_allow_errors = True
59 |
--------------------------------------------------------------------------------
/docs/genindex.rst:
--------------------------------------------------------------------------------
1 | Main Index
2 | ==========
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. toctree::
2 | :hidden:
3 |
4 | genindex
5 |
6 |
7 | .. toctree::
8 | :titlesonly:
9 |
10 |    Rexify <overview/overview>
11 |    Architecture <overview/architecture>
12 |    Inputs <overview/inputs>
13 |
14 | .. toctree::
15 | :titlesonly:
16 | :caption: Guides and Examples
17 |
18 |    Quickstart <tutorials/quickstart>
19 |    Using a pre-built pipeline <tutorials/prebuilt_pipeline>
20 |    Configuring your own Kubeflow pipeline <tutorials/configure_pipeline>
21 |
22 | .. toctree::
23 | :maxdepth: 1
24 | :caption: API reference
25 |
26 |    API reference <api/modules>
27 |
28 | .. mdinclude:: ../README.md
--------------------------------------------------------------------------------
/docs/overview/architecture.md:
--------------------------------------------------------------------------------
1 | # Architecture
2 |
3 | Rexify has two main components: the `FeatureExtractor` and the `Recommender`.
4 |
5 | The former takes the original data and learns all the transformations
6 | that need to be applied to the dataset. The output is a `tf.data.Dataset` with the
7 | right structure to be passed on to the `Recommender` model.
8 |
9 | This `Recommender` is a TensorFlow model with a dynamic architecture, which adapts
10 | itself according to the schema fed to the `FeatureExtractor`.
11 |
12 | ## Feature Extractor
13 |
14 | The `FeatureExtractor` is a scikit-learn Transformer. It implements a `.fit()`
15 | and a `.transform()` method that apply a set of transformations on the data.
16 |
17 | Essentially, it has a `_ppl` attribute, a `sklearn.pipeline.Pipeline`; its steps,
18 | which are scikit-learn Transformers themselves, are set according to the `schema`
19 | passed during instantiation.
20 |
21 | For example, an attribute classified as `id` would create a pipeline step with a
22 | `sklearn.compose.ColumnTransformer`, composed of a single `sklearn.preprocessing.OrdinalEncoder`
23 | Transformer.
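
A minimal sketch of that mapping, assuming a toy schema (illustrative only, not Rexify's actual implementation):

```python
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OrdinalEncoder

# hypothetical schema tagging one user and one item column as "id"
schema = {"user": {"user_id": "id"}, "item": {"item_id": "id"}}

# collect every column the schema tags as an "id"
id_columns = [
    feature
    for target in ("user", "item")
    for feature, dtype in schema[target].items()
    if dtype == "id"
]

# a single pipeline step: a ColumnTransformer ordinally encoding the ID columns
encoder = make_column_transformer((OrdinalEncoder(), id_columns))

events = pd.DataFrame({"user_id": [22, 37, 22], "item_id": [67, 9, 473]})
print(encoder.fit_transform(events))  # integer-encoded IDs, shape (3, 2)
```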
24 |
25 | Additionally, it subclasses `rexify.features.TfDatasetGenerator`, which converts
26 | the output of the transformations of the `FeatureExtractor` into a `tf.data.Dataset`,
27 | with a nested structure such as this:
28 |
29 | ```
30 | {
31 | "query": {
32 | "user_id": tf.Tensor([]),
33 | "user_features": tf.Tensor([]),
34 | "context": tf.Tensor([]),
35 | },
36 | "candidate": {
37 | "item_id": tf.Tensor([]),
38 | "item_features": tf.Tensor([])
39 | }
40 | }
41 | ```
42 |
43 | With this structure, the Recommender model can call a different set of layers for
44 | the user and item ID attributes, and the remaining transformed features.
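
As a quick illustration, a dataset with this nested structure can be built straight from nested dictionaries (the values below are made up):

```python
import tensorflow as tf

ds = tf.data.Dataset.from_tensor_slices(
    {
        "query": {
            "user_id": [22, 37],
            "user_features": [[0.1], [0.4]],
            "context": [[1.0], [0.0]],
        },
        "candidate": {
            "item_id": [67, 9],
            "item_features": [[0.3], [0.7]],
        },
    }
)

for batch in ds.batch(2):
    print(batch["query"]["user_id"])      # tf.Tensor([22 37], ...)
    print(batch["candidate"]["item_id"])  # tf.Tensor([67  9], ...)
```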
45 |
46 | ## Recommender
47 |
48 | The `Recommender` is a `tfrs.models.Model`, which subclasses `tf.keras.Model`
49 | and overrides the `.train_step()` method. According to the [TensorFlow Recommenders documentation](https://www.tensorflow.org/recommenders/api_docs/python/tfrs/models/Model):
50 |
51 | > Many recommender models are relatively complex, and do not neatly
52 | > fit into supervised or unsupervised paradigms. This base class makes it easy to
53 | > define custom training and test losses for such complex models.
54 |
55 | In this case, we use the Recommender model to create a two-tower model architecture, as explained [here](https://research.google/pubs/pub48840/).
56 | In short, it's composed of two main models, a Query model and a Candidate model, both of which learn
57 | to represent queries and candidates in the same vector space.
58 |
63 | It takes the `tf.data.Dataset` output by the `FeatureExtractor` and passes it through the
64 | Query and Candidate towers. Thanks to the nested structure of the dataset, each tower can apply
65 | different transformations to its own set of features.
66 |
67 | ### Query Tower
68 |
69 | The Query Tower is responsible for learning a representation of the queries. That representation is a
70 | combination of the user embedding and the features learned from the remaining
71 | user and context attributes.
72 |
73 | Essentially, it takes the user ID attribute and passes it to an Embedding layer. The user and context
74 | features are concatenated and passed to a model composed of Dense layers. The output of that model and
75 | the user embedding are then concatenated and subsequently fed to another set of Dense layers.
76 |
77 | The resulting vector should represent a single query, which can be used to compute the similarity
78 | to the candidate vectors.
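
A minimal Keras sketch of that flow, with assumed (illustrative) layer sizes:

```python
import tensorflow as tf


class QueryTower(tf.keras.Model):
    """Toy version of the Query Tower described above."""

    def __init__(self, n_users: int, embedding_dim: int = 32):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(n_users, embedding_dim)
        # Dense model applied to the concatenated user and context features
        self.feature_model = tf.keras.layers.Dense(64, activation="relu")
        # Dense layers applied after joining the embedding with the learned features
        self.output_model = tf.keras.Sequential(
            [tf.keras.layers.Dense(32, activation="relu"), tf.keras.layers.Dense(16)]
        )

    def call(self, inputs: dict) -> tf.Tensor:
        user_embedding = self.embedding(inputs["user_id"])
        features = self.feature_model(
            tf.concat([inputs["user_features"], inputs["context"]], axis=-1)
        )
        return self.output_model(tf.concat([user_embedding, features], axis=-1))
```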
79 |
80 | ### Candidate Tower
81 |
82 | In essence, the Candidate Tower shares the same behavior as the Query Tower. The key difference is that instead
83 | of using the user ID, user features, and context, it uses only the item ID and the remaining item features.
84 |
85 | On a deeper level, it takes the item ID attribute and passes it to an Embedding layer. The item features are
86 | passed to a set of Dense layers. The outputs of these layers and of the Embedding layer are then concatenated
87 | and passed to another set of Dense layers.
88 |
89 | The resulting vector should represent a single candidate, or item in this case, which can be used to compute
90 | its similarity to a query vector or to other candidate vectors.
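
In a two-tower setup, that similarity is typically a dot product in the shared embedding space; a small sketch, with random vectors standing in for tower outputs:

```python
import tensorflow as tf

query = tf.random.normal((1, 16))         # one query vector from the Query Tower
candidates = tf.random.normal((100, 16))  # candidate vectors from the Candidate Tower

scores = tf.matmul(query, candidates, transpose_b=True)  # shape (1, 100)
top_k = tf.math.top_k(scores, k=10)  # the 10 most similar candidates
print(top_k.indices)
```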
--------------------------------------------------------------------------------
/docs/overview/inputs.md:
--------------------------------------------------------------------------------
1 | # Inputs
2 |
3 | ## Data
4 |
5 | There are three main types of data may be input to Rexify: [Events](#Events), [Users](#Users), and [Items](#Items)
6 |
7 | ### Events
8 |
9 | Events are the main type of data Rexify consumes: interaction data that links users to items, such as purchases or page views.
10 |
11 | ### Users
12 |
13 | ### Items
14 |
15 | ## Schema
--------------------------------------------------------------------------------
/docs/overview/overview.md:
--------------------------------------------------------------------------------
1 | # Rexify
2 |
3 | Rexify is a library to streamline recommender systems model development. It is built on
4 | top of [Tensorflow Recommenders](https://github.com/tensorflow/recommenders) models and
5 | [Kubeflow](https://github.com/kubeflow/pipelines) pipelines.
6 |
7 | In essence, Rexify adapts dynamically to your data and outputs high-performing TensorFlow
8 | models that may be used wherever you want, independently of your data. Rexify also includes
9 | modules that handle feature engineering as Scikit-Learn Transformers and Pipelines.
10 |
11 | ## Who is Rexify for?
12 |
13 | Rexify is a project that simplifies and standardizes the workflow of recommender systems. It is
14 | mostly geared towards people with little to no machine learning knowledge who want to implement
15 | somewhat scalable Recommender Systems in their applications.
16 |
17 | ## Quick Tour
18 |
19 | Rexify is meant to be usable right out of the box. All you need to set up your model is interaction data - something that looks like this:
20 |
21 | | user_id | item_id | timestamp | item_name | event_type |
22 | |---------|---------|------------|-------------|-------------|
23 | | 22 | 67 | 2021/05/13 | Blue Jeans | Purchase |
24 | | 37 | 9 | 2021/04/11 | White Shirt | Page View |
25 | | 22 | 473 | 2021/04/11 | Red Purse | Add to Cart |
26 | | ... | ... | ... | ... | ... |
27 | | 358 | 51 | 2021/04/11 | Bracelet | Purchase |
28 |
29 | Additionally, we'll need to have a schema configured for the data.
30 | This schema is what allows Rexify to generate a dynamic model and preprocessing steps.
31 | The schema should consist of three dictionaries: `user`, `item`, `context`.
32 |
33 | Each of these dictionaries should map features to internal data types,
34 | such as `id`, `categorical`, `timestamp`, or `text`. More data types will be available
35 | in the future.
36 |
37 | ```json
38 | {
39 | "user": {
40 | "user_id": "id"
41 | },
42 | "item": {
43 | "item_id": "id",
44 | "timestamp": "timestamp",
45 | "item_name": "text"
46 | },
47 | "context": {
48 | "event_type": "categorical"
49 | }
50 | }
51 | ```
52 |
53 | Essentially, what Rexify will do is take the schema, and dynamically adapt to the data.
54 |
55 | ### As a package
56 |
57 | There are two main components in Rexify workflows: `FeatureExtractor` and `Recommender`.
58 |
59 | The `FeatureExtractor` is a scikit-learn Transformer that basically takes the schema of the data, and transforms the event data accordingly. Another method `.make_dataset()`, converts the transformed data into a `tf.data.Dataset`, all correctly configured to be fed to the `Recommender` model. You can read more about how the `FeatureExtractor` works here.
60 |
61 | `Recommender` is a `tfrs.Model` that basically implements the Query and Candidate towers. During training, the Query tower will take the user ID, user features, and context, to learn an embedding; the Candidate tower will do the same for the item ID and its features. More information about the `Recommender` model can be found here.
62 |
63 | A sample Rexify workflow looks like this:
64 |
65 | ````python
66 | import json
67 | import pandas as pd
68 |
69 | from rexify.features import FeatureExtractor
70 | from rexify.models import Recommender
71 |
72 | events = pd.read_csv('path/to/events/data')
73 | with open('path/to/schema') as f:
74 | schema = json.load(f)
75 |
76 | feat = FeatureExtractor(schema)
77 | prep_data = feat.fit_transform(events)
78 | ds = feat.make_dataset(prep_data)
79 |
80 | model = Recommender(**feat.model_params)
81 | model.compile()
82 | model.fit(ds)
83 | ````
84 |
85 | When training is complete, you'll have a trained `tf.keras.Model` ready to be used, as you normally would.
86 |
87 | ### As a prebuilt pipeline
88 |
89 | After cloning this project and setting up the necessary environment variables, you can run:
90 |
91 | ```shell
92 | python -m rexify.pipeline
93 | ```
94 |
95 | This should output a `pipeline.json` file. You can then upload this file manually to
96 | either a Kubeflow Pipelines or Vertex AI Pipelines instance, and it should run seamlessly.
97 |
98 | You can also check the [Kubeflow Pipeline](https://kubeflow-pipelines.readthedocs.io/en/latest/source/kfp.client.html#kfp.Client.create_run_from_pipeline_package)
99 | and [Vertex AI](https://cloud.google.com/vertex-ai/docs/pipelines/run-pipeline#create_a_pipeline_run)
100 | documentation to learn how to submit these pipelines programmatically.
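
For example, with the Kubeflow Pipelines SDK (a sketch; the host is a placeholder for your own KFP endpoint):

```python
import kfp

client = kfp.Client(host="<your-kfp-endpoint>")
client.create_run_from_pipeline_package(
    "pipeline.json",  # the spec generated by `python -m rexify.pipeline`
    arguments={},     # pipeline parameters, if any
)
```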
101 |
102 | The prebuilt pipeline consists of 5 components:
103 |
104 | 1. `download`, which downloads the event data from URLs set on the `$INPUT_DATA_URL` and `$SCHEMA_URL` environment variables
105 | 2. `load`, which prepares the data downloaded in the previous step
106 | 3. `train`, which trains a `Recommender` model on the preprocessed data
107 | 4. `index`, which trains a [ScaNN](https://ai.googleblog.com/2020/07/announcing-scann-efficient-vector.html) model to retrieve the nearest neighbors
108 | 5. `retrieval`, which retrieves the _k_ nearest neighbors for each of the known users
109 |
110 |
111 | ### Via the demo application
112 |
113 | After cloning the project, install the demo dependencies and run the Streamlit application:
114 |
115 | ```shell
116 | pip install -r demo/requirements.txt
117 | streamlit run demo/app.py
118 | ```
119 |
120 | Or, if you're using docker:
121 |
122 | ```shell
123 | docker run joseprsm/rexify-demo
124 | ```
125 |
126 | You can then follow the steps here to set up your pipeline.
127 |
128 | During setup, you'll be asked to either input a publicly available dataset URL or use a sample data set.
129 | After that, you'll have a form to help you set up the schema for the data.
130 |
131 | Finally, after hitting "Compile", you'll have your Pipeline Spec ready. The resulting JSON file can then
132 | be uploaded to Vertex AI Pipelines or Kubeflow, seamlessly.
133 |
134 | The key difference between this pipeline and the prebuilt one is that instead of using the `download` component
135 | to download the schema, it will pass it as an argument to the pipeline, and then use a `copy` component to
136 | pass it down as an artifact.
137 |
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx
2 | mock==1.0.1
3 | alabaster>=0.7,<0.8,!=0.7.5
4 | commonmark==0.8.1
5 | recommonmark==0.5.0
6 | sphinx-rtd-theme
7 | readthedocs-sphinx-ext<2.2
8 | sphinx_material==0.0.30
9 | m2r2
10 | breathe
11 | sphinxcontrib-apidoc>=0.3.0
12 | readthedocs-sphinx-search==0.1.0
13 | jinja2==3.1.2
14 | nbsphinx==0.8.2
15 | nbsphinx-link==1.1.0
16 | ipykernel
--------------------------------------------------------------------------------
/docs/tutorials/configure_pipeline.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "8e30dc1e-0237-4a9c-94a7-f3495a608ab7",
6 | "metadata": {},
7 | "source": [
8 | "# Configuring your own pipeline"
9 | ]
10 | }
11 | ],
12 | "metadata": {
13 | "kernelspec": {
14 | "display_name": "Python 3 (ipykernel)",
15 | "language": "python",
16 | "name": "python3"
17 | },
18 | "language_info": {
19 | "codemirror_mode": {
20 | "name": "ipython",
21 | "version": 3
22 | },
23 | "file_extension": ".py",
24 | "mimetype": "text/x-python",
25 | "name": "python",
26 | "nbconvert_exporter": "python",
27 | "pygments_lexer": "ipython3",
28 | "version": "3.9.10"
29 | }
30 | },
31 | "nbformat": 4,
32 | "nbformat_minor": 5
33 | }
34 |
--------------------------------------------------------------------------------
/docs/tutorials/prebuilt_pipeline.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "4d7be883-9e12-4f8c-b3b0-6f0505065da9",
6 | "metadata": {},
7 | "source": [
8 | "# Using the pre-built pipeline"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": null,
14 | "id": "f5ba48c3-1176-4a47-a146-c45e20fb6645",
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "!pip install rexify"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "id": "d0e1f889-7ba1-458c-bbee-240cf0ad3b19",
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "!rexify pipeline create --args"
29 | ]
30 | }
31 | ],
32 | "metadata": {
33 | "kernelspec": {
34 | "display_name": "Python 3 (ipykernel)",
35 | "language": "python",
36 | "name": "python3"
37 | },
38 | "language_info": {
39 | "codemirror_mode": {
40 | "name": "ipython",
41 | "version": 3
42 | },
43 | "file_extension": ".py",
44 | "mimetype": "text/x-python",
45 | "name": "python",
46 | "nbconvert_exporter": "python",
47 | "pygments_lexer": "ipython3",
48 | "version": "3.9.10"
49 | }
50 | },
51 | "nbformat": 4,
52 | "nbformat_minor": 5
53 | }
54 |
--------------------------------------------------------------------------------
/docs/tutorials/quickstart.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "07e2eea0-dc4a-436c-8605-04df80a20d45",
6 | "metadata": {},
7 | "source": [
8 | "# Quickstart\n",
9 | "\n",
10 | "Let's start by installing Rexify"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": null,
16 | "id": "fee1baf9-f430-44d3-a2f0-82f9cb17f107",
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "!pip install rexify"
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "id": "f6ed5c5a-f691-4871-94f3-97895132bf91",
26 | "metadata": {},
27 | "source": [
28 | "Get some data:"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "id": "7e7c8d3a-400c-4a6b-bf1f-171c73793c16",
35 | "metadata": {},
36 | "outputs": [],
37 | "source": [
38 | "!mkdir data\n",
39 | "!curl --get https://storage.googleapis.com/roostr-ratings-matrices/rexify/completions.csv > data/events.csv"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": null,
45 | "id": "e9fbc3cd-e598-4270-a15e-d9a5cfb9ba5f",
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "import pandas as pd"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": null,
55 | "id": "89e2d0b3-f0fd-4094-b64e-ccca7ae24705",
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "events = pd.read_csv('data/events.csv')\n",
60 | "events"
61 | ]
62 | },
63 | {
64 | "cell_type": "markdown",
65 | "id": "47ab6ec6-0d08-40c4-83c6-bd797ae40aca",
66 | "metadata": {},
67 | "source": [
68 | "Next, we need to specify our schema:"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": null,
74 | "id": "09a944b4-045a-49c0-9e6a-efa2f2be14ae",
75 | "metadata": {},
76 | "outputs": [],
77 | "source": [
78 | "schema = {\n",
79 | " \"user\": {\n",
80 | " \"account_id\": \"id\",\n",
81 | " },\n",
82 | " \"item\": {\n",
83 | " \"program_id\": \"id\",\n",
84 | " },\n",
85 | " \"context\": {}\n",
86 | "}"
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "id": "ea75dc34-0aa3-4d2f-a938-12734d57bff9",
92 | "metadata": {},
93 | "source": [
94 | "To preprocess our data, we can use the `FeatureExtractor`"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": null,
100 | "id": "cbb99040-4e6c-42f9-87dc-1cbe033989b6",
101 | "metadata": {},
102 | "outputs": [],
103 | "source": [
104 | "from rexify.features import FeatureExtractor"
105 | ]
106 | },
107 | {
108 | "cell_type": "markdown",
109 | "id": "616e0441-d2ef-4d2d-8524-35635ed310a1",
110 | "metadata": {},
111 | "source": [
112 | "We just need to pass it the schema, and it's ready to roll out."
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": null,
118 | "id": "0198ea5f-bd27-4304-a4ae-9218fcccc7eb",
119 | "metadata": {},
120 | "outputs": [],
121 | "source": [
122 | "feat = FeatureExtractor(schema=schema)"
123 | ]
124 | },
125 | {
126 | "cell_type": "markdown",
127 | "id": "40911616-99d7-4510-8946-7219d507b87b",
128 | "metadata": {},
129 | "source": [
130 | "As a scikit-learn Transformer, it has two main methods: `.fit()` and `.transform()`. What `.fit_transform()` essentially does is: `.fit().transform()`.\n",
131 | "\n",
132 | "During `.fit()`, it will take the schema, and infer what the preprocessing should look like - what transformations it should apply to the data before it's ready to be passed to the model. During `.transform()` it will apply those transformations, resulting in a `numpy.array` with the same number of rows as the original data."
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": null,
138 | "id": "8f12e2f1-a724-4139-9102-009b11cda8df",
139 | "metadata": {},
140 | "outputs": [],
141 | "source": [
142 | "features = feat.fit_transform(events)\n",
143 | "features"
144 | ]
145 | },
146 | {
147 | "cell_type": "markdown",
148 | "id": "011cd59c-d754-4a22-af0a-de65e81b68f3",
149 | "metadata": {},
150 | "source": [
151 | "The `.make_dataset()` method converts the numpy array to a `tf.data.Dataset` with the format it's expecting."
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": null,
157 | "id": "213b3c47-d612-41d1-a2f1-015f6c0b9b92",
158 | "metadata": {},
159 | "outputs": [],
160 | "source": [
161 | "dataset = feat.make_dataset(features).batch(512)"
162 | ]
163 | },
164 | {
165 | "cell_type": "markdown",
166 | "id": "d356f43c-a722-4bfd-bb0c-12a081d39316",
167 | "metadata": {},
168 | "source": [
169 | "We can now take our `Recommender` model and instantiate it.\n",
170 | "\n",
171 | "During `.fit`, our `FeatureExtractor` also learns the right model parameters, so we don't need to worry about them. They're stored in the `model_params` property."
172 | ]
173 | },
174 | {
175 | "cell_type": "code",
176 | "execution_count": null,
177 | "id": "b1826f76-56a2-44a9-bf49-0854ce1c678a",
178 | "metadata": {},
179 | "outputs": [],
180 | "source": [
181 | "from rexify.models import Recommender"
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": null,
187 | "id": "73ff6889-8fc9-4cdf-bf5e-3be307e03235",
188 | "metadata": {},
189 | "outputs": [],
190 | "source": [
191 | "model = Recommender(**feat.model_params)"
192 | ]
193 | },
194 | {
195 | "cell_type": "markdown",
196 | "id": "59a0a545-6e0d-4b3d-927e-0282e7760820",
197 | "metadata": {},
198 | "source": [
199 | "Being a `tensorflow.keras.Model` itself, in order to fit it, we need to first compile it:"
200 | ]
201 | },
202 | {
203 | "cell_type": "code",
204 | "execution_count": null,
205 | "id": "62e89747-42fb-4fee-a49f-56328f208b5c",
206 | "metadata": {},
207 | "outputs": [],
208 | "source": [
209 | "model.compile()"
210 | ]
211 | },
212 | {
213 | "cell_type": "markdown",
214 | "id": "d507a703-afa6-44f9-b24c-7362971da047",
215 | "metadata": {},
216 | "source": [
217 | "To fit it, all we need to do is pass our `tf.data.Dataset`:"
218 | ]
219 | },
220 | {
221 | "cell_type": "code",
222 | "execution_count": null,
223 | "id": "0d1ef245-2b9c-4bd0-a256-60595a0b699f",
224 | "metadata": {},
225 | "outputs": [],
226 | "source": [
227 |     "model.fit(dataset)"
228 | ]
229 | }
230 | ],
231 | "metadata": {
232 | "kernelspec": {
233 | "display_name": "Python 3 (ipykernel)",
234 | "language": "python",
235 | "name": "python3"
236 | },
237 | "language_info": {
238 | "codemirror_mode": {
239 | "name": "ipython",
240 | "version": 3
241 | },
242 | "file_extension": ".py",
243 | "mimetype": "text/x-python",
244 | "name": "python",
245 | "nbconvert_exporter": "python",
246 | "pygments_lexer": "ipython3",
247 | "version": "3.9.10"
248 | }
249 | },
250 | "nbformat": 4,
251 | "nbformat_minor": 5
252 | }
253 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "rexify"
3 | version = "0.0.0"
4 | description = "Streamlined Recommender System workflows with TensorFlow and Kubeflow"
5 | authors = ["José Medeiros <joseprsm@gmail.com>"]
6 | license = "MIT"
7 | readme = "README.md"
8 | documentation = "https://rexify.readthedocs.io"
9 | packages = [{ include = "rexify" }]
10 | classifiers = [
11 | "Development Status :: 3 - Alpha",
12 | "Intended Audience :: Developers",
13 | "Intended Audience :: Information Technology",
14 | "License :: OSI Approved :: MIT License",
15 | "Operating System :: OS Independent",
16 | "Programming Language :: Python :: 3 :: Only",
17 | "Programming Language :: Python :: 3.8",
18 | "Programming Language :: Python :: 3.9",
19 | "Programming Language :: Python :: 3.10",
20 | "Topic :: Software Development",
21 | "Topic :: Software Development :: Libraries",
22 | "Topic :: Software Development :: Libraries :: Python Modules",
23 | "Topic :: Scientific/Engineering :: Artificial Intelligence",
24 | ]
25 |
26 | [tool.poetry.dependencies]
27 | python = ">=3.8,<3.11"
28 | tensorflow = { version = "2.9.0", markers = "sys_platform != 'darwin'" }
29 | tensorflow_metal = { version = "0.5.0", markers = "sys_platform == 'darwin'"}
30 | tensorflow_macos = { version = "2.9.0", markers = "sys_platform == 'darwin'"}
31 | tensorflow_recommenders = ">=0.7.2"
32 | scikit-learn = "1.*"
33 | pandas = "^1.4.0"
34 | numpy = ">=1.22.3"
35 | kfp = { version = "^1.8.0", optional = true }
36 | mlflow = { version = "^2.3.0", optional = true }
37 | scann = { version = "^1.2.3", markers = "sys_platform != 'darwin'", optional = true }
38 | fsspec = { version = "2023.4.0", optional = true }
39 |
40 | [tool.poetry.extras]
41 | mlflow = ["mlflow"]
42 | scann = ["scann"]
43 | kfp = ["kfp", "fsspec"]
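# usage sketch (an assumption, not part of the original file):
#   pip install rexify[kfp] pulls in the optional Kubeflow dependencies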
44 |
45 | [tool.poetry.dev-dependencies]
46 | pytest = "^7.1.2"
47 | flake8 = "^5.0.4"
48 | black = "^22.6.0"
49 | isort = "^5.10.1"
50 | pre-commit = "^2.20.0"
51 | darglint = ">=1.8.1"
52 | coverage = {extras = ["toml"], version = ">=6.2"}
53 | interrogate = "^1.5.0"
54 |
55 | [tool.isort]
56 | profile = "black"
57 | lines_after_imports = 2
58 |
59 | [tool.darglint]
60 | strictness = "long"
61 |
62 | [tool.mypy]
63 | disallow_any_generics = true
64 | disallow_subclassing_any = true
65 | disallow_untyped_calls = true
66 | disallow_untyped_defs = true
67 | disallow_incomplete_defs = true
68 | check_untyped_defs = true
69 | disallow_untyped_decorators = true
70 | no_implicit_optional = true
71 | warn_redundant_casts = true
72 | warn_unused_ignores = true
73 | warn_return_any = true
74 | implicit_reexport = false
75 | strict_equality = true
76 |
77 | [tool.coverage.paths]
78 | source = ["rexify"]
79 |
80 | [tool.coverage.run]
81 | branch = true
82 | source = ["rexify"]
83 |
84 | [tool.coverage.report]
85 | show_missing = true
86 | exclude_lines = ["if __name__ == .__main__.:", "_cmd"]
87 | omit = ["*/__init__.py"]
88 |
89 | [tool.interrogate]
90 | ignore-init-method = true
91 | ignore-init-module = true
92 | ignore-magic = true
93 | ignore-semiprivate = true
94 | ignore-private = true
95 | ignore-module = true
96 | ignore-nested-functions = true
97 | ignore-property-decorators = true
98 | exclude = ["docs", "build", "rexify/pipeline.py", "*/exceptions/*"]
99 | ignore-regex = ["call", "get_config", "compute_loss"]
100 | verbose = 0
101 | omit-covered-files = false
102 | quiet = false
103 | color = false
104 |
105 |
106 | [build-system]
107 | requires = ["poetry-core>=1.0.0"]
108 | build-backend = "poetry.core.masonry.api"
109 |
--------------------------------------------------------------------------------
/rexify/__init__.py:
--------------------------------------------------------------------------------
1 | from .data import Events, Items, Output, Users
2 | from .features.extractor import FeatureExtractor
3 | from .models import Recommender
4 | from .schema import Schema
5 |
6 |
7 | BASE_IMAGE = "joseprsm/rexify"
8 |
--------------------------------------------------------------------------------
/rexify/data/__init__.py:
--------------------------------------------------------------------------------
1 | from .input import Events, Items, Users
2 | from .output import Output
3 |
--------------------------------------------------------------------------------
/rexify/data/base.py:
--------------------------------------------------------------------------------
1 | from abc import abstractmethod
2 | from pathlib import Path
3 |
4 | import pandas as pd
5 |
6 | from rexify.features.base import HasSchemaMixin
7 | from rexify.schema import Schema
8 |
9 |
10 | class BaseDataFrame(pd.DataFrame, HasSchemaMixin):
11 | def __init__(self, data: pd.DataFrame, schema: Schema) -> None:
12 | pd.DataFrame.__init__(self, data)
13 | HasSchemaMixin.__init__(self, schema=schema)
14 |
15 |     @classmethod
16 |     @abstractmethod
17 |     def load(cls, path: str | Path, **kwargs):
18 |         pass
--------------------------------------------------------------------------------
/rexify/data/input.py:
--------------------------------------------------------------------------------
1 | from abc import abstractmethod
2 | from pathlib import Path
3 |
4 | import pandas as pd
5 | from sklearn.model_selection import train_test_split
6 |
7 | from rexify.data.base import BaseDataFrame
8 | from rexify.schema import Schema
9 |
10 |
11 | class Input(BaseDataFrame):
12 | def __init__(self, data: pd.DataFrame, schema: Schema) -> None:
13 | super().__init__(data, schema)
14 |
15 | @classmethod
16 | def load(cls, path: str | Path, load_fn: str = "read_csv", schema: Schema = None):
17 | return cls(data=getattr(pd, load_fn)(path), schema=schema)
18 |
19 | def split(self, **kwargs):
20 | train, val = train_test_split(self, **kwargs)
21 | return self.__class__(train, self.schema), self.__class__(val, self.schema)
22 |
23 |     @classmethod
24 |     @abstractmethod
25 |     def generate(cls, n: int = 100):
26 |         raise NotImplementedError
27 |
28 | class Events(Input):
29 | def __init__(self, data: pd.DataFrame, schema: Schema) -> None:
30 | super().__init__(data, schema)
31 |
32 |
33 | class Users(Input):
34 | def __init__(self, data: pd.DataFrame, schema: Schema) -> None:
35 | super().__init__(data, schema)
36 |
37 |
38 | class Items(Input):
39 | def __init__(self, data: pd.DataFrame, schema: Schema) -> None:
40 | super().__init__(data, schema)
41 |
--------------------------------------------------------------------------------
/rexify/data/output.py:
--------------------------------------------------------------------------------
1 | import json
2 | import warnings
3 | from pathlib import Path
4 |
5 | import numpy as np
6 | import pandas as pd
7 | import tensorflow as tf
8 |
9 | from rexify.data.base import BaseDataFrame
10 | from rexify.schema import Schema
11 | from rexify.utils import get_target_id, make_dirs
12 |
13 |
14 | class Output(BaseDataFrame):
15 | def __init__(
16 | self,
17 | data: pd.DataFrame,
18 | schema: Schema,
19 | ranking_features: list[str] | None = None,
20 | ) -> None:
21 | super().__init__(data, schema)
22 | with warnings.catch_warnings():
23 | warnings.filterwarnings("ignore")
24 | self._ranking_features = ranking_features
25 |
26 | @classmethod
27 | def load(cls, path: str | Path):
28 | path = Path(path)
29 |
30 | history = pd.read_csv(path / "history.csv")
31 | features = pd.read_csv(path / "features.csv")
32 | features["history"] = history.values.tolist()
33 | del history
34 |
35 | schema = Schema.from_json(path / "schema.json")
36 | with open(path / "ranks.json", "r") as f:
37 | ranking_features = json.load(f)
38 |
39 | return cls(features, schema=schema, ranking_features=ranking_features)
40 |
41 | def save(self, path: str | Path, name: str = None):
42 | path = Path(path)
43 | path = path / name if name else path
44 |
45 | history = pd.DataFrame(np.stack(self.loc[:, "history"].values))
46 |
47 | make_dirs(path)
48 | history.to_csv(path / "history.csv", index=None)
49 | self.drop("history", axis=1).to_csv(path / "features.csv", index=None)
50 |
51 | with open(path / "ranks.json", "w") as f:
52 | json.dump(self._ranking_features, f)
53 |
54 | self.schema.save(path / "schema.json")
55 |
56 | def to_dataset(self) -> tf.data.Dataset:
57 | return self._make_dataset().map(self._get_header_fn())
58 |
59 | def _make_dataset(self) -> tf.data.Dataset:
60 | return tf.data.Dataset.zip(
61 | (
62 | self._get_target_vector_dataset(self, self._schema, "user"),
63 | self._get_target_vector_dataset(self, self._schema, "item"),
64 | tf.data.Dataset.from_tensor_slices(
65 | np.stack(self["history"].values).astype(np.int32)
66 | ),
67 | self._get_ranking_dataset(self),
68 | )
69 | )
70 |
71 | @staticmethod
72 | def _get_target_vector_dataset(
73 | data, schema: Schema, target: str
74 | ) -> tf.data.Dataset:
75 | return tf.data.Dataset.from_tensor_slices(
76 | data.loc[:, get_target_id(schema, target)]
77 | .values.reshape(-1)
78 | .astype(np.int32)
79 | )
80 |
81 | @staticmethod
82 | def _get_header_fn():
83 | @tf.autograph.experimental.do_not_convert
84 | def header_fn(user_id, item_id, history, ranks):
85 | return {
86 | "query": {"user_id": user_id, "history": history},
87 | "candidate": {"item_id": item_id},
88 | "rank": ranks,
89 | }
90 |
91 | return header_fn
92 |
93 | def _get_ranking_dataset(self, data) -> tf.data.Dataset:
94 | @tf.autograph.experimental.do_not_convert
95 | def add_header(x):
96 | return {
97 | self._ranking_features[i]: x[i]
98 | for i in range(len(self._ranking_features))
99 | }
100 |
101 | return tf.data.Dataset.from_tensor_slices(
102 | data.loc[:, self._ranking_features].values.astype(np.int32)
103 | ).map(add_header)
104 |
105 | @property
106 | def ranking_features(self):
107 | return self._ranking_features
108 |
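
`save` and `load` round-trip the transformed frame through four sibling artifacts, and `to_dataset` emits the nested structure the `Recommender` model consumes. A sketch of both, given `output`, a transformed `Output` (directory and feature names are illustrative):

# on disk after saving: history.csv, features.csv, ranks.json and schema.json
output.save("outputs", name="train")
restored = Output.load("outputs/train")

# each (unbatched) element of restored.to_dataset() is shaped like:
# {
#     "query": {"user_id": 3, "history": [7, 1, 4]},
#     "candidate": {"item_id": 9},
#     "rank": {"event_type_Purchase": 1.0},  # one entry per ranking feature
# }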
--------------------------------------------------------------------------------
/rexify/features/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joseprsm/rexify/6efb0cbe8ce9e35b58b200fcb95cf8e65c03d2c2/rexify/features/__init__.py
--------------------------------------------------------------------------------
/rexify/features/base.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import re
3 | from pathlib import Path
4 |
5 | from sklearn.base import BaseEstimator, TransformerMixin
6 | from sklearn.compose import make_column_transformer
7 | from sklearn.pipeline import Pipeline
8 |
9 | from rexify.schema import Schema
10 | from rexify.utils import get_target_feature, make_dirs
11 |
12 |
13 | class HasSchemaMixin:
14 | def __init__(self, schema: Schema):
15 | self._schema = schema
16 |
17 | @property
18 | def schema(self):
19 | return self._schema
20 |
21 |
22 | class HasTargetMixin:
23 |
24 | _SUPPORTED_TARGETS = ["user", "item"]
25 |
26 | def __init__(self, target: str):
27 | self._target = target
28 |
29 | @property
30 | def target(self):
31 | return self._target
32 |
33 | @classmethod
34 | def _validate_target(cls, target: str):
35 | if target not in cls._SUPPORTED_TARGETS:
36 | raise ValueError(f"Target {target} not supported")
37 |
38 |
39 | class Serializable:
40 | def save(self, output_dir: str, filename: str = None):
41 | make_dirs(output_dir)
42 | filename = (
43 | filename or self._camel_to_snake_case(self.__class__.__name__) + ".pickle"
44 | )
45 | output_path = Path(output_dir) / filename
46 | with open(output_path, "wb") as f:
47 | pickle.dump(self, f)
48 |
49 | @classmethod
50 | def load(cls, path: Path | str):
51 | with open(path, "rb") as f:
52 | feat = pickle.load(f)
53 | return feat
54 |
55 | @staticmethod
56 | def _camel_to_snake_case(name: str):
57 |         return re.sub(r"(?<!^)(?=[A-Z])", "_", name).lower()
58 |
59 |
60 | class BaseEncoder(HasSchemaMixin, HasTargetMixin):
61 |
62 |     ppl: Pipeline
63 |
64 |     def __init__(self, dtype: str, target: str, schema: Schema):
65 |         HasSchemaMixin.__init__(self, schema=schema)
66 |         HasTargetMixin.__init__(self, target=target)
67 |         self._dtype = dtype
68 |         self._name = f"{target}_{dtype}"
69 |         self._targets = self._get_features(schema, target, dtype)
70 |
71 |     @staticmethod
72 |     def _get_features(schema: Schema, target: str, dtype: str) -> list[str]:
73 | return get_target_feature(schema, target, dtype)
74 |
75 | def __iter__(self):
76 | for x in [self._name, self.ppl, self._targets]:
77 | yield x
78 |
79 | def as_tuple(self):
80 | return tuple(self)
81 |
82 |
83 | class BaseTransformer(BaseEstimator, TransformerMixin):
84 | def __init__(self, transformer: TransformerMixin, target_features: list[str]):
85 | super().__init__()
86 | self.transformer = transformer
87 | self.target_features = target_features
88 |
89 | self._column_transformer = make_column_transformer(
90 | (self.transformer, self.target_features),
91 | )
92 |
93 | def fit(self, X, y=None, **fit_params):
94 | return self
95 |
96 | def transform(self, X):
97 | pass
98 |
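
`Serializable` pickles the whole object under a snake_case file name derived from the class name, so a `FeatureExtractor` lands in `feature_extractor.pickle`. A round-trip sketch (the directory name is an assumption; `fe` stands for a fitted extractor):

from rexify.features.extractor import FeatureExtractor

fe.save("artifacts")  # writes artifacts/feature_extractor.pickle
fe = FeatureExtractor.load("artifacts/feature_extractor.pickle")  # load takes the full file path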
--------------------------------------------------------------------------------
/rexify/features/extractor.py:
--------------------------------------------------------------------------------
1 | from typing import Any
2 |
3 | import numpy as np
4 | import pandas as pd
5 | from sklearn.base import BaseEstimator, TransformerMixin
6 | from sklearn.pipeline import make_pipeline
7 |
8 | from rexify.data import Events, Items, Output, Users
9 | from rexify.features.base import HasSchemaMixin, Serializable
10 | from rexify.features.transform import CustomTransformer, EventEncoder, Sequencer
11 | from rexify.features.transform.entity import EntityTransformer
12 | from rexify.schema import Schema
13 |
14 |
15 | class FeatureExtractor(BaseEstimator, TransformerMixin, HasSchemaMixin, Serializable):
16 |
17 | _model_params: dict[str, Any]
18 | _item_ids: np.ndarray
19 | _user_ids: np.ndarray
20 |
21 | def __init__(
22 | self,
23 | schema: Schema,
24 | users: str = None,
25 | items: str = None,
26 | return_dataset: bool = False,
27 | window_size: int = 3,
28 | custom_transformers: list[CustomTransformer] = None,
29 | ):
30 | HasSchemaMixin.__init__(self, schema)
31 |
32 | self._users = users
33 | self._items = items
34 | self._return_dataset = return_dataset
35 | self._window_size = window_size
36 |
37 | self._timestamp = schema.timestamp
38 | self._custom_transformers = custom_transformers or []
39 |
40 | self._user_transformer = EntityTransformer(
41 | schema, "user", self._custom_transformers
42 | )
43 | self._item_transformer = EntityTransformer(
44 | schema, "item", self._custom_transformers
45 | )
46 |
47 | self._ppl = make_pipeline(
48 | EventEncoder(self._schema),
49 | Sequencer(
50 | self._schema,
51 | timestamp_feature=self._timestamp,
52 | window_size=self._window_size,
53 | ),
54 | )
55 |
56 | def fit(self, X: Events):
57 | self._fit_transformer(Users)
58 | self._fit_transformer(Items)
59 |
60 | x_ = X.copy()
61 | events = self._encode(self._user_transformer, x_)
62 | events = self._encode(self._item_transformer, events)
63 | _ = self._ppl.fit(events)
64 |
65 | self._model_params = self._get_model_params()
66 | return self
67 |
68 | def transform(self, X: Events) -> Output:
69 | x_ = X.copy()
70 | events = self._encode(self._user_transformer, x_)
71 | events = self._encode(self._item_transformer, events)
72 | events = self._ppl.transform(events)
73 | events = self._drop(events, self._user_transformer)
74 | events = self._drop(events, self._item_transformer)
75 | self._model_params["session_history"] = self.history
76 |
77 | transformed = Output(
78 | data=events, schema=self._schema, ranking_features=self.ranking_features
79 | )
80 |
81 | self._user_ids = self._get_ids(transformed, self._user_transformer)
82 | self._item_ids = self._get_ids(transformed, self._item_transformer)
83 |
84 | return transformed.to_dataset() if self._return_dataset else transformed
85 |
86 |     def _fit_transformer(self, inputs: type[Users] | type[Items]):
87 | input_name = inputs.__name__.lower()
88 | input_path: str = getattr(self, f"_{input_name}")
89 | transformer = getattr(self, f"_{input_name[:-1]}_transformer")
90 | x = inputs.load(input_path, schema=self._schema)
91 | transformer.fit(x).transform(x)
92 |
93 | @staticmethod
94 | def _encode(transformer: EntityTransformer, data: pd.DataFrame) -> pd.DataFrame:
95 | encoder, feature_names = transformer.encoder
96 | data[feature_names] = encoder.transform(data[feature_names])
97 | return data
98 |
99 | @staticmethod
100 | def _drop(df: pd.DataFrame, transformer: EntityTransformer):
101 | encoder, id_ = transformer.encoder
102 | return df.loc[df[id_].values.reshape(-1) != encoder.unknown_value, :]
103 |
104 | def _get_model_params(self):
105 | model_params = {}
106 | model_params.update(self._user_transformer.model_params)
107 | model_params.update(self._item_transformer.model_params)
108 | model_params.update({"ranking_features": self.ranking_features})
109 | model_params["window_size"] = self._window_size
110 | return model_params
111 |
112 | @staticmethod
113 | def _get_ids(df: pd.DataFrame, transformer: EntityTransformer):
114 | return df.loc[:, transformer.encoder[1][0]].values.astype(np.int32)
115 |
116 | @property
117 | def users(self):
118 | return self._users
119 |
120 | @property
121 | def items(self):
122 | return self._items
123 |
124 | @property
125 | def model_params(self):
126 | return self._model_params
127 |
128 | @property
129 | def ranking_features(self):
130 | return self._ppl.steps[0][1].ranking_features
131 |
132 | @property
133 | def history(self):
134 | return self._ppl.steps[1][1].history
135 |
136 | @property
137 | def return_dataset(self):
138 | return self._return_dataset
139 |
140 | @property
141 | def window_size(self):
142 | return self._window_size
143 |
144 | @property
145 | def custom_transformers(self):
146 | return self._custom_transformers
147 |
148 | @property
149 | def item_encoder(self):
150 | return self._item_transformer.encoder[0]
151 |
152 | @property
153 | def item_ids(self):
154 | return self._item_ids
155 |
156 | @property
157 | def user_encoder(self):
158 | return self._user_transformer.encoder[0]
159 |
160 | @property
161 | def user_ids(self):
162 | return self._user_ids
163 |
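
The intended flow, mirrored by the pipeline's `load` and `train` components further below: fit on the training events, transform both splits, and hand `model_params` straight to `Recommender`. A condensed sketch (all paths are assumptions):

from rexify import Events, FeatureExtractor, Recommender, Schema

schema = Schema.from_json("schema.json")
train, val = Events.load("events.csv", schema=schema).split(test_size=0.3)

fe = FeatureExtractor(schema, users="users.csv", items="items.csv")
train_ds = fe.fit(train).transform(train).to_dataset()
val_ds = fe.transform(val).to_dataset()

# model_params carries *_dims, *_embeddings, ranking_features, window_size and
# session_history, matching Recommender's constructor
model = Recommender(**fe.model_params)
model.compile()
model.fit(train_ds, batch_size=512, validation_data=val_ds)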
--------------------------------------------------------------------------------
/rexify/features/transform/__init__.py:
--------------------------------------------------------------------------------
1 | from .category import CategoricalEncoder
2 | from .custom import CustomTransformer
3 | from .event import EventEncoder
4 | from .id import IDEncoder
5 | from .number import NumericalEncoder
6 | from .sequence import Sequencer
7 |
--------------------------------------------------------------------------------
/rexify/features/transform/category.py:
--------------------------------------------------------------------------------
1 | from sklearn.pipeline import make_pipeline
2 | from sklearn.preprocessing import OneHotEncoder
3 |
4 | from rexify.features.base import BaseEncoder
5 | from rexify.schema import Schema
6 |
7 |
8 | class CategoricalEncoder(BaseEncoder):
9 | def __init__(self, schema: Schema, target: str):
10 | super().__init__(dtype="category", target=target, schema=schema)
11 | self.ppl = make_pipeline(OneHotEncoder(sparse_output=False))
12 |
--------------------------------------------------------------------------------
/rexify/features/transform/custom.py:
--------------------------------------------------------------------------------
1 | from sklearn.base import TransformerMixin
2 |
3 |
4 | class CustomTransformer(tuple):
5 | def __new__(
6 | cls, target: str, transformer: TransformerMixin, features: list[str]
7 | ) -> tuple:
8 | name = f"{target}_{''.join([f[0] for f in features])}_customTransformer"
9 | return tuple.__new__(CustomTransformer, (name, transformer, features))
10 |
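
A `CustomTransformer` is just a named `(name, transformer, features)` triple; the `target` prefix baked into the name is what `EntityTransformer._filter_custom_transformers` later matches on. For example, with an assumed `price` item feature:

from sklearn.preprocessing import StandardScaler

from rexify.features.transform import CustomTransformer

t = CustomTransformer("item", StandardScaler(), ["price"])
print(t)  # ('item_p_customTransformer', StandardScaler(), ['price'])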
--------------------------------------------------------------------------------
/rexify/features/transform/entity.py:
--------------------------------------------------------------------------------
1 | from typing import Any
2 |
3 | import numpy as np
4 | import pandas as pd
5 | from sklearn.compose import ColumnTransformer
6 | from sklearn.pipeline import Pipeline, make_pipeline
7 |
8 | from rexify.features.base import HasSchemaMixin, HasTargetMixin
9 | from rexify.features.transform import (
10 | CategoricalEncoder,
11 | CustomTransformer,
12 | IDEncoder,
13 | NumericalEncoder,
14 | )
15 | from rexify.schema import Schema
16 | from rexify.utils import get_target_id
17 |
18 |
19 | class _FeatureTransformer(ColumnTransformer, HasSchemaMixin, HasTargetMixin):
20 | def __init__(self, schema: Schema, target: str):
21 | HasSchemaMixin.__init__(self, schema=schema)
22 | HasTargetMixin.__init__(self, target=target)
23 | transformers = self._get_transformers()
24 | ColumnTransformer.__init__(
25 | self, transformers=transformers, remainder="passthrough"
26 | )
27 |
28 | def _get_transformers(self) -> list[tuple[str, Pipeline, list[str]]]:
29 | transformer_list = []
30 |
31 | cat_encoder = CategoricalEncoder(self._schema, self._target).as_tuple()
32 | transformer_list += [cat_encoder] if cat_encoder[-1] != tuple() else []
33 |
34 | num_encoder = NumericalEncoder(self._schema, self._target).as_tuple()
35 | transformer_list += [num_encoder] if num_encoder[-1] != tuple() else []
36 |
37 | return transformer_list
38 |
39 |
40 | class EntityTransformer(ColumnTransformer, HasSchemaMixin, HasTargetMixin):
41 | _features: pd.DataFrame
42 | _model_params: dict[str, Any]
43 |
44 | def __init__(
45 | self,
46 | schema: Schema,
47 | target: str,
48 | custom_transformers: list[CustomTransformer] = None,
49 | ):
50 | HasSchemaMixin.__init__(self, schema)
51 | HasTargetMixin.__init__(self, target)
52 | self._custom_transformers = (
53 | self._filter_custom_transformers(custom_transformers, self._target) or []
54 | )
55 | transformers = [
56 | self._get_feature_pipeline(self._schema, self._target)
57 | ] + self._custom_transformers
58 | ColumnTransformer.__init__(self, transformers)
59 |
60 | def fit(self, X, y=None):
61 | super().fit(X, y)
62 | n_dims = self._get_n_dims(X)
63 | self._model_params = n_dims
64 | return self
65 |
66 | def transform(self, X) -> pd.DataFrame:
67 | self._features = super().transform(X)
68 | self._features = pd.DataFrame(
69 | self._features[:, :-1], index=self._features[:, -1]
70 | )
71 | self._features = pd.concat(
72 | [
73 | self._features,
74 | pd.DataFrame(np.zeros(self._features.shape[1])).transpose(),
75 | ],
76 | ignore_index=True,
77 | )
78 |
79 | self._model_params.update({f"{self._target}_embeddings": self._features})
80 | return self._features
81 |
82 | def _get_n_dims(self, X):
83 | id_col = get_target_id(self._schema, self._target)[0]
84 | input_dims = int(X[id_col].nunique() + 1)
85 | return {f"{self._target}_dims": input_dims}
86 |
87 | @staticmethod
88 | def _filter_custom_transformers(
89 | custom_transformers: list[CustomTransformer], target: str
90 | ):
91 | def target_from_name(x):
92 | return x[0].split("_")[0] == target
93 |
94 | return list(filter(target_from_name, custom_transformers))
95 |
96 | @staticmethod
97 | def _get_feature_pipeline(schema, target) -> tuple[str, Pipeline, list[str]]:
98 | name = f"{target}_featureExtractor"
99 | ppl = make_pipeline(
100 | IDEncoder(schema, target),
101 | _FeatureTransformer(schema, target),
102 | )
103 | target_keys = getattr(schema, target).to_dict()
104 | keys = [target_keys.pop("id")] + list(target_keys.keys())
105 | return name, ppl, keys
106 |
107 | @property
108 | def model_params(self):
109 | return self._model_params
110 |
111 | @property
112 | def identifiers(self):
113 | return self._features.index.values.astype(int)
114 |
115 | @property
116 | def encoder(self):
117 | encoder = self.transformers_[0][1].steps[0][1].transformer.transformers_[0]
118 | return encoder[1], encoder[-1]
119 |
120 | @property
121 | def custom_transformers(self):
122 | return self._custom_transformers
123 |
--------------------------------------------------------------------------------
/rexify/features/transform/event.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from sklearn.base import BaseEstimator, TransformerMixin
3 | from sklearn.compose import make_column_transformer
4 | from sklearn.preprocessing import OneHotEncoder
5 |
6 | from rexify.features.base import HasSchemaMixin
7 | from rexify.schema import Schema
8 |
9 |
10 | class EventEncoder(BaseEstimator, TransformerMixin, HasSchemaMixin):
11 | def __init__(self, schema: Schema):
12 | HasSchemaMixin.__init__(self, schema)
13 | self._event_type = schema.event_type
14 | self._transformer = make_column_transformer(
15 | (OneHotEncoder(), [self._event_type])
16 | )
17 |
18 | def fit(self, X, y=None, **fit_params):
19 | self._transformer.fit(X, y)
20 | return self
21 |
22 | def transform(self, X):
23 | oneh = self._transformer.transform(X)
24 | oneh = pd.DataFrame(oneh, columns=self.transformer.get_feature_names_out())
25 | x = X.drop(self._event_type, axis=1)
26 | return pd.concat([x, oneh], axis=1)
27 |
28 | @property
29 | def transformer(self):
30 | return self._transformer.transformers_[0][1]
31 |
32 | @property
33 | def ranking_features(self):
34 | return self.transformer.get_feature_names_out().tolist()
35 |
--------------------------------------------------------------------------------
/rexify/features/transform/id.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from sklearn.base import BaseEstimator, TransformerMixin
4 | from sklearn.compose import ColumnTransformer, make_column_transformer
5 | from sklearn.preprocessing import OrdinalEncoder
6 |
7 | from rexify.features.base import HasSchemaMixin, HasTargetMixin
8 | from rexify.utils import get_target_id
9 |
10 |
11 | class IDEncoder(BaseEstimator, TransformerMixin, HasSchemaMixin, HasTargetMixin):
12 |
13 | _transformer: ColumnTransformer
14 |
15 | def __init__(self, schema, target):
16 | HasSchemaMixin.__init__(self, schema)
17 | HasTargetMixin.__init__(self, target)
18 |
19 | def fit(self, X: pd.DataFrame, y=None):
20 | target_features = get_target_id(self._schema, self._target)
21 | encoder_args = self._get_encoder_args(X, target_features)
22 | self._transformer = make_column_transformer(
23 | (OrdinalEncoder(**encoder_args), target_features),
24 | remainder="passthrough",
25 | )
26 | self._transformer.fit(X, y)
27 | return self
28 |
29 | def transform(self, X: pd.DataFrame) -> pd.DataFrame:
30 | x = self._transformer.transform(X)
31 | columns = self._get_features_names_out()
32 | return pd.DataFrame(x, columns=columns)
33 |
34 | def _get_features_names_out(self) -> list[str]:
35 | features = self._transformer.get_feature_names_out()
36 | return [name.split("__")[-1] for name in features]
37 |
38 | @staticmethod
39 | def _get_encoder_args(df: pd.DataFrame, target_features: list[str]):
40 | value = df[target_features].nunique().sum()
41 | return {
42 | "dtype": np.int64,
43 | "handle_unknown": "use_encoded_value",
44 | "unknown_value": value,
45 | }
46 |
47 | @property
48 | def transformer(self):
49 | return self._transformer
50 |
51 | @property
52 | def target_feature(self):
53 | return self._transformer.transformers[0][-1][0]
54 |
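
`IDEncoder` ordinal-encodes the ID column and reserves one extra code for IDs unseen at fit time: `unknown_value` is the number of distinct IDs in the fitted frame, which is how `FeatureExtractor._drop` later filters unknown entities out of the events. A small sketch with assumed data:

import pandas as pd

from rexify.features.transform import IDEncoder
from rexify.schema import Schema

schema = Schema(
    user_id="user_id", item_id="item_id", timestamp="timestamp", event_type="event_type"
)
enc = IDEncoder(schema, "item").fit(pd.DataFrame({"item_id": ["a", "b", "c"]}))
enc.transform(pd.DataFrame({"item_id": ["a", "z"]}))  # "a" -> 0, unseen "z" -> 3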
--------------------------------------------------------------------------------
/rexify/features/transform/number.py:
--------------------------------------------------------------------------------
1 | from sklearn.pipeline import make_pipeline
2 | from sklearn.preprocessing import MinMaxScaler
3 |
4 | from rexify.features.base import BaseEncoder
5 | from rexify.schema import Schema
6 |
7 |
8 | class NumericalEncoder(BaseEncoder):
9 | def __init__(self, schema: Schema, target: str):
10 | super().__init__(dtype="number", target=target, schema=schema)
11 | self.ppl = make_pipeline(MinMaxScaler(feature_range=(-1, 1)))
12 |
--------------------------------------------------------------------------------
/rexify/features/transform/sequence.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from sklearn.base import BaseEstimator, TransformerMixin
4 |
5 | from rexify.features.base import HasSchemaMixin
6 | from rexify.schema import Schema
7 | from rexify.utils import get_target_id
8 |
9 |
10 | class Sequencer(BaseEstimator, TransformerMixin, HasSchemaMixin):
11 |
12 | """Transformer responsible for creating sequential data.
13 |
14 | It creates a new column `history` that holds the previous `window_size` event item IDs.
15 |
16 | Args:
17 |         schema (rexify.schema.Schema): the data schema; its `timestamp` field names
18 |             the dataframe's timestamp feature
19 | window_size (int): the size of the sliding window
20 |
21 | Examples:
22 | >>> from rexify.features.transform import Sequencer
23 | >>> sequencer = Sequencer(schema)
24 | >>> sequencer.fit(events)
25 | Sequencer(schema={'context': {'timestamp': 'timestamp'},
26 | 'item': {'item_id': 'id', 'price': 'numerical',
27 | 'type': 'categorical'},
28 | 'rank': [{'name': 'Purchase'}, {'name': 'Add to Cart'},
29 | {'name': 'Page View'}],
30 | 'user': {'age': 'numerical', 'gender': 'categorical',
31 | 'user_id': 'id'}},
32 | timestamp_feature='timestamp', window_size=4)
33 | >>> transformed = sequencer.transform(events)
34 |
35 | """
36 |
37 | _user_id: str
38 | _item_id: str
39 | _columns: list[str]
40 | _padding: list[int]
41 | _history: pd.DataFrame
42 |
43 | def __init__(self, schema: Schema, window_size: int = 3, **kwargs):
44 | super().__init__(schema=schema)
45 | self._timestamp_feature = self._schema.timestamp
46 | self._window_size = window_size + 1
47 |
48 | def fit(self, X: pd.DataFrame, *_):
49 | self._user_id = get_target_id(self.schema, "user")[0]
50 | self._item_id = get_target_id(self.schema, "item")[0]
51 | self._columns = [col for col in X.columns if col != self._user_id]
52 | self._padding = [X[self._item_id].max() + 1] * (self._window_size - 2)
53 | return self
54 |
55 | def transform(self, X: pd.DataFrame):
56 | sequences = self._get_sequences(X)
57 |
58 | res = sequences.drop(self._item_id, axis=1).applymap(self._get_last)
59 | res[self._item_id] = sequences.pop(self._item_id)
60 | res["history"] = sequences.pop("history")
61 | res.reset_index(inplace=True)
62 | res = res.loc[res["history"].map(len) == self._window_size - 1, :]
63 | res = res.loc[~res.loc[:, self._timestamp_feature].isna()]
64 |
65 | self._history = self._get_history(res)
66 |
67 | res.drop(self._timestamp_feature, axis=1, inplace=True)
68 | return res
69 |
70 | def _get_sequences(self, df: pd.DataFrame):
71 | sequences: pd.DataFrame = (
72 | df.sort_values(self._timestamp_feature)
73 | .set_index(self._user_id)
74 | .groupby(level=-1)
75 | .apply(self._mask)
76 | .apply(pd.Series)
77 | .rename(columns=pd.Series(self._columns))
78 | .applymap(self._pad)
79 | .applymap(self._window)
80 | .apply(lambda x: x.explode())
81 | )
82 |
83 | sequences["history"] = sequences[self._item_id].map(lambda x: x[:-1])
84 | sequences[self._item_id] = sequences[self._item_id].map(self._get_last)
85 | return sequences
86 |
87 | def _get_history(self, df: pd.DataFrame):
88 | return (
89 | df.groupby([self._user_id])
90 | .agg({self._timestamp_feature: max, "history": list})
91 | .drop(self._timestamp_feature, axis=1)
92 | .history.map(self._get_last)
93 | )
94 |
95 | def _mask(self, df: pd.DataFrame):
96 | return [list(df[col]) for col in self._columns]
97 |
98 | @staticmethod
99 | def _get_last(lst: list):
100 | return lst[-1]
101 |
102 | def _window(self, sequence):
103 | if len(sequence) >= self._window_size:
104 | sequence = np.array(sequence, dtype=object)
105 |
106 | stack = [
107 | sequence[range(i, i + self._window_size)]
108 | for i in range(len(sequence) - self._window_size + 1)
109 | ]
110 |
111 | if len(stack) > 1:
112 | stack = np.stack(stack)
113 |
114 | return stack
115 | return [sequence]
116 |
117 | def _pad(self, x: list):
118 | return self._padding + x
119 |
120 | @property
121 | def timestamp_feature(self):
122 | return self._timestamp_feature
123 |
124 | @property
125 | def window_size(self):
126 | return self._window_size
127 |
128 | @property
129 | def history(self):
130 | return self._history
131 |
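
Concretely, for `window_size=3` the internal window length is 4 and each user's item sequence is left-padded with two `max(item_id) + 1` markers; every window then contributes its first three IDs as `history` and its last as the target item. A worked trace of `_pad` plus `_window` under those assumptions:

items = [5, 7, 9]          # one user's item IDs, sorted by timestamp; max ID overall is 9
padded = [10, 10] + items  # padding value 10 = max + 1; (window_size + 1) - 2 = 2 markers
windows = [padded[i:i + 4] for i in range(len(padded) - 3)]
# -> [[10, 10, 5, 7], [10, 5, 7, 9]]
# history=[10, 10, 5] predicts item 7; history=[10, 5, 7] predicts item 9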
--------------------------------------------------------------------------------
/rexify/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .recommender import Recommender
2 |
--------------------------------------------------------------------------------
/rexify/models/base.py:
--------------------------------------------------------------------------------
1 | from abc import ABC
2 |
3 | import tensorflow as tf
4 |
5 |
6 | class DenseSetterMixin(ABC):
7 | @staticmethod
8 | def _set_sequential_model(
9 | layer: str | tf.keras.layers.Layer, layer_sizes: list[int], **kwargs
10 | ) -> list[tf.keras.layers.Layer]:
11 |         if isinstance(layer, str):
12 | layer = getattr(tf.keras.layers, layer)
13 | return [layer(num_neurons, **kwargs) for num_neurons in layer_sizes]
14 |
15 | def _set_dense_layers(
16 | self, layer_sizes: list[int], activation: str | None = "relu"
17 | ) -> list[tf.keras.layers.Layer]:
18 | return self._set_sequential_model("Dense", layer_sizes, activation=activation)
19 |
20 | @staticmethod
21 | def _call_layers(layer_list: list[tf.keras.layers.Layer], inputs):
22 | x = inputs
23 | for layer in layer_list:
24 | x = layer(x)
25 | return x
26 |
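
`DenseSetterMixin` builds plain Python lists of Keras layers rather than `tf.keras.Sequential` models, and `_call_layers` threads an input through such a list. A minimal sketch of a model using it (the `MLP` class and its sizes are assumptions):

import tensorflow as tf

from rexify.models.base import DenseSetterMixin


class MLP(tf.keras.Model, DenseSetterMixin):
    def __init__(self):
        super().__init__()
        self.hidden = self._set_dense_layers([64, 32])             # two ReLU Dense layers
        self.head = self._set_dense_layers([1], activation=None)   # linear output

    def call(self, inputs):
        return self._call_layers(self.hidden + self.head, inputs)


print(MLP()(tf.zeros((2, 8))).shape)  # (2, 1)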
--------------------------------------------------------------------------------
/rexify/models/callbacks/__init__.py:
--------------------------------------------------------------------------------
1 | from .index import BruteForceCallback, ScaNNCallback
2 |
3 |
4 | try:
5 | from .mlflow import MlflowCallback
6 | except ImportError:
7 | pass
8 |
--------------------------------------------------------------------------------
/rexify/models/callbacks/index.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | from rexify.models.index import BruteForce, ScaNN
4 |
5 |
6 | class _IndexCallback(tf.keras.callbacks.Callback):
7 |
8 | INDEX: BruteForce | ScaNN
9 |
10 | def __init__(
11 | self,
12 | sample_query: dict[str, tf.Tensor],
13 | query_model: str = "query_model",
14 | batch_size: int = 128,
15 | **index_args,
16 | ):
17 | super().__init__()
18 | self._query_model = query_model
19 | self._batch_size = batch_size
20 | self._sample_query = sample_query
21 | self._index_args = index_args
22 | self._target = "user" if self._query_model == "query_model" else "item"
23 |
24 | def set(self) -> tf.keras.Model:
25 | query_model = getattr(self.model, self._query_model)
26 | return self.INDEX(query_model, self.model.window_size, **self._index_args)
27 |
28 | def on_train_end(self, logs=None):
29 | index = self.set()
30 | index.index_from_dataset(candidates=self._get_candidates_dataset())
31 | _ = index(self._sample_query[f"{self._target}_id"])
32 | setattr(self.model, f"{self._target}_index", index)
33 |
34 | def _get_candidates_dataset(self):
35 | def zip_item_dataset(item):
36 | return (item["item_id"], self.model.candidate_model(item))
37 |
38 | candidates = self._get_candidates().batch(self._batch_size)
39 | return candidates.map(zip_item_dataset)
40 |
41 | def _get_candidates(self):
42 | def header_fn(item_id):
43 | return {"item_id": tf.cast(item_id, tf.int32)}
44 |
45 | return tf.data.Dataset.from_tensor_slices(
46 | self.model.candidate_model.identifiers
47 | ).map(header_fn)
48 |
49 |
50 | class BruteForceCallback(_IndexCallback):
51 |
52 | INDEX = BruteForce
53 |
54 |
55 | class ScaNNCallback(_IndexCallback):
56 |
57 | INDEX = ScaNN
58 |
--------------------------------------------------------------------------------
/rexify/models/callbacks/mlflow.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import mlflow
4 | import tensorflow as tf
5 |
6 |
7 | class MlflowCallback(tf.keras.callbacks.Callback):
8 | def __init__(
9 | self,
10 | tracking_uri: str = os.environ.get("MLFLOW_TRACKING_URI"),
11 | experiment_name: str = os.environ.get("MLFLOW_EXPERIMENT_NAME"),
12 | ):
13 | super().__init__()
14 | if tracking_uri:
15 | mlflow.set_tracking_uri(tracking_uri)
16 | if experiment_name:
17 | mlflow.set_experiment(experiment_name)
18 |
19 | def on_train_begin(self, logs=None):
20 | config = self.model.get_config()
21 |
22 | def parse(value):
23 | if type(value).__name__ == "ListWrapper":
24 | return list(value)
25 | return value
26 |
27 | params = {k: parse(v) for k, v in config.items()}
28 | mlflow.log_params(params)
29 |
30 | def on_epoch_end(self, epoch, logs=None):
31 | mlflow.log_metrics(logs)
32 |
--------------------------------------------------------------------------------
/rexify/models/index.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import tensorflow_recommenders as tfrs
3 |
4 |
5 | class _BaseIndex:
6 | def __init__(self, query_model: tf.keras.Model, window_size: int):
7 | self.query_model = query_model
8 | self._window_size = window_size
9 |
10 | def call(self, queries: tf.Tensor, k: int = None):
11 | queries_shape = queries.shape[0] or 1
12 | inputs = (
13 | {
14 | "user_id": queries,
15 | "history": tf.zeros(
16 | shape=(queries_shape, self._window_size), dtype=tf.int32
17 | ),
18 | }
19 | if self.query_model.name.startswith("query")
20 | else {"item_id": queries}
21 | )
22 | return self.__class__.__bases__[1].call(self, inputs, k)
23 |
24 |
25 | class BruteForce(_BaseIndex, tfrs.layers.factorized_top_k.BruteForce):
26 | def __init__(
27 | self,
28 | query_model: tf.keras.Model,
29 | window_size: int,
30 | k: int = 2,
31 | name: str = None,
32 | ):
33 | tfrs.layers.factorized_top_k.BruteForce.__init__(self, query_model, k, name)
34 | _BaseIndex.__init__(self, query_model, window_size)
35 |
36 |
37 | class ScaNN(_BaseIndex, tfrs.layers.factorized_top_k.ScaNN):
38 | def __init__(
39 | self,
40 | query_model: tf.keras.Model,
41 | window_size: int,
42 | k: int = 10,
43 | distance_measure: str = "dot_product",
44 | num_leaves: int = 100,
45 | num_leaves_to_search: int = 10,
46 | training_iterations: int = 12,
47 | dimensions_per_block: int = 2,
48 | num_reordering_candidates: int = None,
49 | parallelize_batch_searches: bool = True,
50 | name: str = None,
51 | ):
52 | tfrs.layers.factorized_top_k.ScaNN.__init__(
53 | self,
54 | query_model,
55 | k,
56 | distance_measure,
57 | num_leaves,
58 | num_leaves_to_search,
59 | training_iterations,
60 | dimensions_per_block,
61 | num_reordering_candidates,
62 | parallelize_batch_searches,
63 | name,
64 | )
65 | _BaseIndex.__init__(self, query_model, window_size)
66 |
--------------------------------------------------------------------------------
/rexify/models/lookup.py:
--------------------------------------------------------------------------------
1 | from abc import abstractmethod
2 |
3 | import numpy as np
4 | import tensorflow as tf
5 |
6 |
7 | class _BaseLookupModel(tf.keras.Model):
8 | def __init__(self, ids: np.ndarray, values: np.ndarray):
9 | super().__init__()
10 | self._ids = ids
11 | self._values = values
12 |
13 | identifiers_idx = np.arange(0, self._ids.shape[0])
14 | init = tf.lookup.KeyValueTensorInitializer(
15 | keys=self._ids,
16 | values=identifiers_idx,
17 | key_dtype=tf.int32,
18 | value_dtype=tf.int32,
19 | )
20 |
21 | self.token_to_id = tf.lookup.StaticHashTable(init, default_value=len(ids))
22 |
23 | @tf.function(input_signature=[tf.TensorSpec([None], tf.int32)])
24 | def call(self, inputs):
25 | ids = self.token_to_id.lookup(inputs)
26 | return tf.nn.embedding_lookup(params=self._values, ids=ids)
27 |
28 | @abstractmethod
29 | def get_config(self):
30 | pass
31 |
32 |
33 | class EmbeddingLookup(_BaseLookupModel):
34 | def __init__(self, ids: np.ndarray, embeddings: np.ndarray):
35 | super().__init__(ids=ids, values=embeddings)
36 |
37 | def get_config(self):
38 | return {"ids": self._ids, "embeddings": self._values}
39 |
40 |
41 | class SessionLookup(_BaseLookupModel):
42 | def __init__(self, ids: np.ndarray, sessions: np.ndarray):
43 | super().__init__(ids=ids, values=sessions)
44 |
45 | def get_config(self):
46 | return {"ids": self._ids, "sessions": self._values}
47 |
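
`EmbeddingLookup` freezes an ID-to-row mapping into a `tf.lookup.StaticHashTable` whose default value is `len(ids)`, so unknown IDs resolve to an extra fallback row; the embedding matrix therefore needs one more row than there are known IDs (which `EntityTransformer.transform` provides via its appended row of zeros). A sketch with assumed sizes:

import numpy as np
import tensorflow as tf

from rexify.models.lookup import EmbeddingLookup

ids = np.arange(3, dtype=np.int32)
embeddings = np.random.rand(4, 8).astype(np.float32)    # 3 known rows + 1 fallback row

lookup = EmbeddingLookup(ids=ids, embeddings=embeddings)
vecs = lookup(tf.constant([0, 2, 99], dtype=tf.int32))  # 99 is unknown -> row 3
print(vecs.shape)                                       # (3, 8)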
--------------------------------------------------------------------------------
/rexify/models/ranking/__init__.py:
--------------------------------------------------------------------------------
1 | from .ranking import RankingMixin
2 |
--------------------------------------------------------------------------------
/rexify/models/ranking/base.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import tensorflow_recommenders as tfrs
3 |
4 | from rexify.models.base import DenseSetterMixin
5 |
6 |
7 | class BaseRankingModel(tf.keras.Model, DenseSetterMixin):
8 |
9 | output_layer: tf.keras.layers.Dense
10 | task: tfrs.tasks.Ranking
11 |
12 | def __init__(self, layer_sizes: list[int]):
13 | super().__init__()
14 | self._layer_sizes = layer_sizes or [64, 32]
15 | self.hidden_layers = self._set_dense_layers(self._layer_sizes)
16 |
17 | def call(self, inputs, labels):
18 | x = self._call_layers(self.hidden_layers, inputs)
19 | x = self.output_layer(x)
20 | return self.task(labels=labels, predictions=x)
21 |
--------------------------------------------------------------------------------
/rexify/models/ranking/event.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import tensorflow_recommenders as tfrs
3 |
4 | from rexify.models.ranking.base import BaseRankingModel
5 |
6 |
7 | class EventModel(BaseRankingModel):
8 | def __init__(self, layer_sizes: list[int] = None, n_dims: int = 1):
9 | super().__init__(layer_sizes=layer_sizes)
10 | self._n_dims = n_dims
11 | self.output_layer = tf.keras.layers.Dense(self._n_dims, activation="softmax")
12 | self.task = tfrs.tasks.Ranking(loss=tf.keras.losses.CategoricalCrossentropy())
13 |
14 | def get_config(self):
15 | return {
16 | "layer_sizes": self._layer_sizes,
17 | "n_dims": self._n_dims,
18 | }
19 |
--------------------------------------------------------------------------------
/rexify/models/ranking/ranking.py:
--------------------------------------------------------------------------------
1 | from abc import ABC
2 |
3 | import tensorflow as tf
4 | import tensorflow_recommenders as tfrs
5 |
6 | from rexify.models.base import DenseSetterMixin
7 |
8 |
9 | class RankingMixin(tfrs.Model, DenseSetterMixin, ABC):
10 | def __init__(
11 | self,
12 | ranking_features: list[str] = None,
13 | layer_sizes: list[int] = None,
14 | weights: dict[str, float] = None,
15 | ):
16 | super().__init__()
17 | self._ranking_features = ranking_features or []
18 | self._ranking_layers = layer_sizes or [64, 32]
19 |
20 | # todo: validate ranking weights
21 | self._ranking_weights = weights or {
22 | feature: 1.0 for feature in self._ranking_features
23 | }
24 | self._ranking_models = {
25 | feature: self._get_ranking_model() for feature in self._ranking_features
26 | }
27 | self._ranking_tasks = {
28 | feature: tfrs.tasks.Ranking(loss=tf.keras.losses.BinaryCrossentropy())
29 | for feature in self._ranking_features
30 | }
31 |
32 | def get_loss(
33 | self,
34 | query_embeddings: tf.Tensor,
35 | candidate_embeddings: tf.Tensor,
36 | ranks: dict[str, tf.Tensor],
37 | ):
38 | loss = 0
39 | inputs = tf.concat([query_embeddings, candidate_embeddings], axis=1)
40 | for feature, model in self._ranking_models.items():
41 | rating_preds = self._call_layers(model, inputs)
42 | loss += (
43 | self._ranking_tasks[feature](
44 | labels=ranks[feature], predictions=rating_preds
45 | )
46 | * self._ranking_weights[feature]
47 | )
48 | return loss
49 |
50 | def _get_ranking_model(self) -> list[tf.keras.layers.Layer]:
51 | model = self._set_dense_layers(self._ranking_layers)
52 | model.append(tf.keras.layers.Dense(1, activation="sigmoid"))
53 | return model
54 |
--------------------------------------------------------------------------------
/rexify/models/recommender.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import tensorflow as tf
3 |
4 | from rexify.models.callbacks import BruteForceCallback
5 | from rexify.models.ranking import RankingMixin
6 | from rexify.models.retrieval import RetrievalMixin
7 | from rexify.utils import get_sample_query
8 |
9 |
10 | class Recommender(RetrievalMixin, RankingMixin):
11 | """The main Recommender model.
12 |
13 |     It expects a `tf.data.Dataset` whose elements have three keys: "query",
14 |     "candidate" and "rank"; the query part of the dataset has two keys:
15 |
16 |     * `user_id`, a scalar ID feature;
17 |     * `history`, an array with the IDs of the last `window_size` items the user
18 |       interacted with
19 |
20 |     The candidate part of the dataset has a single key:
21 |
22 |     * `item_id`, a scalar ID feature
23 |
24 |     The "rank" part holds one label per ranking feature, consumed by the Ranking model.
25 |
26 |     The query tower passes the user ID through an embedding layer, looks up the
27 |     precomputed user feature embeddings, and encodes the interaction history with a
28 |     sequential model; their outputs are concatenated and passed through a number of
29 |     Dense layers. The candidate tower does the same with the item ID and item features.
30 |
31 |     An optional Ranking model is also included, provided there are `ranking_features`.
32 |
33 | Args:
34 |         user_dims (int): number of possible values for the user ID feature
35 |         item_dims (int): number of possible values for the item ID feature
36 | embedding_dim (int): output dimension of the embedding layer
37 | feature_layers (list): number of neurons in each layer for the feature models
38 | output_layers (list): number of neurons in each layer for the output models
39 |
40 |     Examples:
41 |         >>> import numpy as np
42 |         >>> import pandas as pd
43 |         >>> from rexify.models import Recommender
44 |         >>> model = Recommender(
45 |         ...     user_dims=15, item_dims=15,
46 |         ...     user_embeddings=pd.DataFrame(np.random.rand(16, 8)),
47 |         ...     item_embeddings=pd.DataFrame(np.random.rand(16, 8)),
48 |         ...     session_history=pd.Series([[0, 1, 2]] * 16))
49 |         >>> model.compile()
50 |     """
51 |
52 | def __init__(
53 | self,
54 | user_dims: int,
55 | item_dims: int,
56 | user_embeddings: pd.DataFrame,
57 | item_embeddings: pd.DataFrame,
58 | session_history: pd.DataFrame,
59 | window_size: int = 3,
60 | embedding_dim: int = 32,
61 | feature_layers: list[int] = None,
62 | output_layers: list[int] = None,
63 | ranking_features: list[str] = None,
64 | ranking_layers: list[int] = None,
65 | ranking_weights: dict[str, float] = None,
66 | ):
67 | RetrievalMixin.__init__(
68 | self,
69 | user_dims=user_dims + 1,
70 | item_dims=item_dims + 1,
71 | user_embeddings=user_embeddings,
72 | item_embeddings=item_embeddings,
73 | session_history=session_history,
74 | window_size=window_size,
75 | embedding_dim=embedding_dim,
76 | feature_layers=feature_layers,
77 | output_layers=output_layers,
78 | )
79 |
80 | RankingMixin.__init__(
81 | self,
82 | ranking_features=ranking_features,
83 | layer_sizes=ranking_layers,
84 | weights=ranking_weights,
85 | )
86 |
87 | def compute_loss(self, inputs, training: bool = False) -> tf.Tensor:
88 | embeddings = self(
89 | inputs, training=training
90 | ) # Recommender inherits RetrievalMixin's call method
91 | loss = RetrievalMixin.get_loss(self, *embeddings)
92 | loss += RankingMixin.get_loss(self, *embeddings, inputs["rank"])
93 | return loss
94 |
95 | def fit(
96 | self,
97 | x: tf.data.Dataset,
98 | batch_size: int = None,
99 | epochs: int = 1,
100 | callbacks: list[tf.keras.callbacks.Callback] = None,
101 | validation_data=None,
102 | ):
103 | callbacks = callbacks if callbacks else self._get_callbacks(x, batch_size)
104 | # todo: validate number of index callbacks
105 | # - can't be more than a single index for each model (query, candidate)
106 |
107 | if batch_size:
108 | x = x.batch(batch_size)
109 | if validation_data:
110 | validation_data = validation_data.batch(batch_size)
111 |
112 | return super().fit(
113 | x, epochs=epochs, validation_data=validation_data, callbacks=callbacks
114 | )
115 |
116 | def get_config(self):
117 | return {
118 | "item_dims": self._item_dims,
119 | "user_dims": self._user_dims,
120 | "output_layers": self._output_layers,
121 | "feature_layers": self._feature_layers,
122 | "ranking_layers": self._ranking_layers,
123 | "ranking_features": self._ranking_features,
124 | "ranking_weights": self._ranking_weights,
125 | }
126 |
127 | @classmethod
128 | def load(cls, export_dir: str) -> tf.keras.Model:
129 | return tf.saved_model.load(export_dir)
130 |
131 | @staticmethod
132 | def _get_callbacks(x, batch_size: int = None) -> list[tf.keras.callbacks.Callback]:
133 | # required to set index shapes
134 | sample_query = get_sample_query(x)["query"]
135 |
136 | def get_index_callback():
137 | try:
138 | import scann # noqa: F401
139 |
140 | from rexify.models.callbacks import ScaNNCallback
141 |
142 | return ScaNNCallback(sample_query, batch_size=batch_size)
143 |
144 | except ImportError:
145 | return BruteForceCallback(sample_query, batch_size=batch_size)
146 |
147 | def get_mlflow_callback():
148 | try:
149 | from rexify.models.callbacks import MlflowCallback
150 |
151 | return MlflowCallback()
152 |
153 | except ImportError:
154 | return
155 |
156 | callbacks = [get_index_callback(), get_mlflow_callback()]
157 | callbacks = callbacks[:-1] if callbacks[-1] is None else callbacks
158 |
159 | return callbacks
160 |
--------------------------------------------------------------------------------
/rexify/models/retrieval/__init__.py:
--------------------------------------------------------------------------------
1 | from .retrieval import RetrievalMixin
2 |
--------------------------------------------------------------------------------
/rexify/models/retrieval/candidate.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 |
4 | from rexify.models.retrieval.tower import TowerModel
5 |
6 |
7 | class CandidateModel(TowerModel):
8 | """Tower model responsible for computing the candidate representations
9 |
10 | Args:
11 |         n_items (int): number of possible values for the ID feature
12 | embedding_dim (int): output dimension of the embedding layer
13 | output_layers (list): number of neurons in each layer for the output model
14 | feature_layers (list): number of neurons in each layer for the feature model
15 |
16 |     Examples:
17 |
18 |         >>> import numpy as np
19 |         >>> from rexify.models.retrieval.candidate import CandidateModel
20 |         >>> model = CandidateModel(15, np.arange(15), np.random.rand(15, 8))
21 |         >>> model({"item_id": tf.constant([1], dtype=tf.int32)})
22 |         <tf.Tensor: shape=(1, 32), dtype=float32, numpy=...>
23 |     """
24 |
25 | def __init__(
26 | self,
27 | n_items: int,
28 | identifiers: np.array,
29 | feature_embeddings: np.array,
30 | embedding_dim: int = 32,
31 | output_layers: list[int] = None,
32 | feature_layers: list[int] = None,
33 | ):
34 | super().__init__(
35 | "item_id",
36 | n_items,
37 | identifiers,
38 | feature_embeddings,
39 | embedding_dim,
40 | output_layers,
41 | feature_layers,
42 | )
43 |
44 | def call(self, inputs: dict[str, tf.Tensor], training: bool = None) -> tf.Tensor:
45 | x = self.embedding_layer(inputs[self._id_feature])
46 | features = self.lookup_model(inputs[self._id_feature])
47 | feature_embedding = self._call_layers(self.feature_model, features)
48 | x = tf.concat([x, feature_embedding], axis=1)
49 | x = self._call_layers(self.output_model, x)
50 | return x
51 |
52 | def get_config(self):
53 | config = super().get_config()
54 | config["n_items"] = self._n_dims
55 | return config
56 |
--------------------------------------------------------------------------------
/rexify/models/retrieval/query.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import tensorflow as tf
4 |
5 | from rexify.models.lookup import SessionLookup
6 | from rexify.models.retrieval.tower import TowerModel
7 | from rexify.models.sequential import SequentialModel
8 |
9 |
10 | class QueryModel(TowerModel):
11 | """Tower model responsible for computing the query representations
12 |
13 | Args:
14 |         n_users (int): number of possible values for the ID feature
15 | embedding_dim (int): output dimension of the embedding layer
16 | output_layers (list): number of neurons in each layer for the output model
17 | feature_layers (list): number of neurons in each layer for the feature model
18 |
19 |     Examples:
20 |         >>> import numpy as np, pandas as pd
21 |         >>> from rexify.models.retrieval.query import QueryModel
22 |         >>> model = QueryModel(15, 30, np.arange(15), np.random.rand(15, 8),
23 |         ...     session_history=pd.Series([[0, 1, 2]] * 15))
24 |         >>> model({"user_id": tf.constant([1], dtype=tf.int32)})
25 |         <tf.Tensor: shape=(1, 32), dtype=float32, numpy=...>
26 |     """
27 |
28 | def __init__(
29 | self,
30 | n_users: int,
31 | n_items: int,
32 | identifiers: np.array,
33 | feature_embeddings: np.array,
34 | session_history: pd.DataFrame,
35 | embedding_dim: int = 32,
36 | output_layers: list[int] = None,
37 | feature_layers: list[int] = None,
38 | recurrent_layers: list[int] = None,
39 | sequential_dense_layers: list[int] = None,
40 | ):
41 | super().__init__(
42 | "user_id",
43 | n_users,
44 | identifiers,
45 | feature_embeddings,
46 | embedding_dim,
47 | output_layers,
48 | feature_layers,
49 | )
50 | self._n_items = n_items
51 | self.sequential_model = SequentialModel(
52 | n_dims=n_items,
53 | embedding_dim=self._embedding_dim,
54 | recurrent_layer_sizes=recurrent_layers,
55 | dense_layer_sizes=sequential_dense_layers,
56 | )
57 | self.session_lookup = SessionLookup(
58 | ids=session_history.index.values.astype(int),
59 | sessions=np.stack(session_history.values).astype(int),
60 | )
61 |
62 | def call(self, inputs: dict[str, tf.Tensor], training: bool = None) -> tf.Tensor:
63 | x = self.embedding_layer(inputs[self._id_feature])
64 | features = [self.lookup_model(inputs[self._id_feature])]
65 |
66 | history = (
67 | self.session_lookup(inputs[self._id_feature])
68 | if not training
69 | else inputs["history"]
70 | )
71 |
72 | sequential_embedding = self.sequential_model(history)
73 | x = tf.concat([x, sequential_embedding], axis=1)
74 |
75 | features = tf.concat(features, axis=1) if len(features) > 1 else features[0]
76 | feature_embedding = self._call_layers(self.feature_model, features)
77 | x = tf.concat([x, feature_embedding], axis=1)
78 |
79 | x = self._call_layers(self.output_model, x)
80 | return x
81 |
82 | def get_config(self):
83 | config = super().get_config()
84 | config["user_id"] = self._id_feature
85 | config["n_users"] = self._n_dims
86 | config["n_items"] = self._n_items
87 | return config
88 |
--------------------------------------------------------------------------------
/rexify/models/retrieval/retrieval.py:
--------------------------------------------------------------------------------
1 | from abc import ABC
2 |
3 | import pandas as pd
4 | import tensorflow as tf
5 | import tensorflow_recommenders as tfrs
6 |
7 | from rexify.models.retrieval.candidate import CandidateModel
8 | from rexify.models.retrieval.query import QueryModel
9 |
10 |
11 | class RetrievalMixin(tfrs.Model, ABC):
12 | def __init__(
13 | self,
14 | user_dims: int,
15 | item_dims: int,
16 | user_embeddings: pd.DataFrame,
17 | item_embeddings: pd.DataFrame,
18 | session_history: pd.DataFrame,
19 | window_size: int = 3,
20 | embedding_dim: int = 32,
21 | feature_layers: list[int] = None,
22 | output_layers: list[int] = None,
23 | **kwargs
24 | ):
25 | super().__init__()
26 | self._user_dims = user_dims
27 | self._item_dims = item_dims
28 | self._window_size = window_size
29 | self._embedding_dim = embedding_dim
30 | self._output_layers = output_layers or [64, 32]
31 | self._feature_layers = feature_layers or [64, 32, 16]
32 | joint_args = {
33 | "embedding_dim": self._embedding_dim,
34 | "output_layers": self._output_layers,
35 | "feature_layers": self._feature_layers,
36 | }
37 |
38 | self.query_model = QueryModel(
39 | self._user_dims,
40 | self._item_dims,
41 | identifiers=user_embeddings.index.values.astype(int),
42 | feature_embeddings=user_embeddings.values.astype(float),
43 | session_history=session_history,
44 | **joint_args
45 | )
46 |
47 | self.candidate_model = CandidateModel(
48 | self._item_dims,
49 | identifiers=item_embeddings.index.values.astype(int),
50 | feature_embeddings=item_embeddings.values.astype(float),
51 | **joint_args
52 | )
53 |
54 | self.retrieval_task = tfrs.tasks.Retrieval()
55 |
56 | def call(self, inputs, training: bool = False):
57 | query_embeddings: tf.Tensor = self.query_model(
58 | inputs["query"], training=training
59 | )
60 | candidate_embeddings: tf.Tensor = self.candidate_model(
61 | inputs["candidate"], training=training
62 | )
63 | return query_embeddings, candidate_embeddings
64 |
65 | def get_loss(self, *embeddings):
66 | return self.retrieval_task(*embeddings)
67 |
68 | @property
69 | def window_size(self):
70 | return self._window_size
71 |
--------------------------------------------------------------------------------
/rexify/models/retrieval/tower.py:
--------------------------------------------------------------------------------
1 | from abc import abstractmethod
2 |
3 | import numpy as np
4 | import tensorflow as tf
5 |
6 | from rexify.models.base import DenseSetterMixin
7 | from rexify.models.lookup import EmbeddingLookup
8 |
9 |
10 | class TowerModel(tf.keras.Model, DenseSetterMixin):
11 | """
12 |
13 | Args:
14 | id_feature (str): the ID feature
15 |         n_dims (int): number of possible values for the ID feature
16 | embedding_dim (int): output dimension of the embedding layer
17 | layer_sizes (list): number of neurons in each layer for the output model
18 | feature_layers (list): number of neurons in each layer for the feature model
19 |
20 | Attributes:
21 | embedding_layer (tf.keras.layers.Embedding):
22 | feature_model (list):
23 | output_model (list):
24 | """
25 |
26 | def __init__(
27 | self,
28 | id_feature: str,
29 | n_dims: int,
30 | identifiers: np.array,
31 | feature_embeddings: np.array,
32 | embedding_dim: int = 32,
33 | layer_sizes: list[int] = None,
34 | feature_layers: list[int] = None,
35 | ):
36 | super().__init__()
37 | self._id_feature = id_feature
38 | self._n_dims = n_dims
39 | self._embedding_dim = embedding_dim
40 | self._layer_sizes = layer_sizes or [64, 32]
41 | self._feature_layers = feature_layers or [64, 32, 16]
42 | self._identifiers = identifiers
43 | self._target_features = feature_embeddings
44 |
45 | self.embedding_layer = tf.keras.layers.Embedding(n_dims, embedding_dim)
46 | self.feature_model = self._set_dense_layers(self._feature_layers)
47 | self.lookup_model = EmbeddingLookup(
48 | ids=self._identifiers, embeddings=self._target_features
49 | )
50 | self.output_model = self._set_dense_layers(self._layer_sizes, activation=None)
51 |
52 | @abstractmethod
53 | def call(self, inputs: dict[str, tf.Tensor], training: bool = None):
54 | raise NotImplementedError
55 |
56 | def get_config(self):
57 | return {
58 |             "id_feature": self._id_feature,
59 | "n_dims": self._n_dims,
60 | "embedding_dim": self._embedding_dim,
61 | "layer_sizes": self._layer_sizes,
62 | "feature_layers": self._feature_layers,
63 | "identifiers": self._identifiers,
64 | "feature_embeddings": self._target_features,
65 | }
66 |
67 | @property
68 | def identifiers(self):
69 | return self._identifiers
70 |
--------------------------------------------------------------------------------
/rexify/models/sequential.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | from rexify.models.base import DenseSetterMixin
4 |
5 |
6 | class SequentialModel(tf.keras.Model, DenseSetterMixin):
7 | def __init__(
8 | self,
9 | n_dims: int,
10 | embedding_dim: int,
11 | layer: str = "LSTM",
12 | activation: str = "relu",
13 | recurrent_layer_sizes: list[int] = None,
14 | dense_layer_sizes: list[int] = None,
15 | ):
16 | super().__init__()
17 | self._layer = layer
18 | self._n_dims = n_dims
19 | self._embedding_dim = embedding_dim
20 | self._activation = activation
21 | self._recurrent_layer_sizes = recurrent_layer_sizes or [32] * 2
22 | self._dense_layer_sizes = dense_layer_sizes or [32, 16]
23 |
24 | self.embedding_layer = tf.keras.layers.Embedding(
25 | self._n_dims, self._embedding_dim
26 | )
27 |
28 | self.recurrent_model = self._set_recurrent_model()
29 |
30 | self.output_model = self._set_dense_layers(
31 | layer_sizes=self._dense_layer_sizes[:-1], activation=activation
32 | )
33 | self.output_model.append(tf.keras.layers.Dense(self._dense_layer_sizes[-1]))
34 |
35 | def call(self, inputs: tf.Tensor):
36 | x = tf.cast(inputs, tf.int32)
37 | x = self.embedding_layer(x)
38 | x = self._call_layers(self.recurrent_model, x)
39 | return self._call_layers(self.output_model, x)
40 |
41 | def _set_recurrent_model(self) -> tf.keras.Model:
42 | layer = getattr(tf.keras.layers, self._layer)
43 | layers = self._set_sequential_model(
44 | layer=layer,
45 | layer_sizes=self._recurrent_layer_sizes[:-1],
46 | return_sequences=True,
47 | )
48 | layers.append(layer(self._recurrent_layer_sizes[-1]))
49 | return layers
50 |
51 | def get_config(self):
52 | return {
53 | "n_dims": self._n_dims,
54 | "embedding_dim": self._embedding_dim,
55 | "layer": self._layer,
56 | "activation": self._activation,
57 | "recurrent_layer_sizes": self._recurrent_layer_sizes,
58 | "dense_layer_sizes": self._dense_layer_sizes,
59 | }
60 |
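
`SequentialModel` embeds each item ID in a history and runs the sequence through stacked recurrent layers, all but the last returning sequences, before a small dense head. A sketch with assumed sizes:

import tensorflow as tf

from rexify.models.sequential import SequentialModel

model = SequentialModel(n_dims=50, embedding_dim=16)  # defaults: two LSTM(32), Dense 32 -> 16
history = tf.constant([[3, 1, 4]])                    # batch with one 3-item history
print(model(history).shape)                           # (1, 16), the last dense layer's width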
--------------------------------------------------------------------------------
/rexify/pipeline/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 |
4 | PIPELINE_ROOT = os.environ.get("PIPELINE_ROOT", "outputs")
5 |
--------------------------------------------------------------------------------
/rexify/pipeline/__main__.py:
--------------------------------------------------------------------------------
1 | import warnings
2 |
3 | import typer
4 | from kfp.v2.compiler import Compiler
5 | from kfp.v2.dsl import pipeline
6 |
7 | from rexify.pipeline import PIPELINE_ROOT
8 | from rexify.pipeline.components import load, train
9 |
10 |
11 | @pipeline(name="pipeline", pipeline_root=PIPELINE_ROOT)
12 | def pipeline(
13 | events: str,
14 | users: str,
15 | items: str,
16 | schema: str,
17 | epochs: int = 100,
18 | batch_size: int = 512,
19 | ):
20 |
21 | load_task = load(
22 | events=events,
23 | users=users,
24 | items=items,
25 | schema=schema,
26 | )
27 |
28 | train_task = train( # noqa:F841
29 | feature_extractor=load_task.outputs["feature_extractor"],
30 | train_data=load_task.outputs["train_data"],
31 | validation_data=load_task.outputs["validation_data"],
32 | batch_size=batch_size,
33 | epochs=epochs,
34 | )
35 |
36 |
37 | def compile(
38 | output_path: str = typer.Option(
39 | None, help="Output path for the pipeline definition JSON file"
40 | ),
41 | parameter: list[str] = typer.Option(
42 | None, "--parameter", "-p", help="Pipeline parameter, KEY=VALUE"
43 | ),
44 | ):
45 | output_path = output_path if output_path else "pipeline.json"
46 |
47 | pipeline_parameters = (
48 | {k: v for k, v in [param.split("=") for param in parameter]}
49 | if parameter
50 | else None
51 | )
52 |
53 | with warnings.catch_warnings():
54 | warnings.filterwarnings("ignore")
55 | Compiler().compile(
56 | pipeline_func=pipeline,
57 | package_path=output_path,
58 | pipeline_parameters=pipeline_parameters,
59 | )
60 |
61 |
62 | if __name__ == "__main__":
63 | typer.run(compile)
64 |
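
The compiler is exposed through typer, so `python -m rexify.pipeline --output-path pipeline.json -p events=data/events.csv` writes the pipeline definition JSON; calling `compile` directly is equivalent when both arguments are passed explicitly (the paths here are assumptions):

from rexify.pipeline.__main__ import compile

# same as: python -m rexify.pipeline --output-path pipeline.json -p events=data/events.csv
compile(output_path="pipeline.json", parameter=["events=data/events.csv"])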
--------------------------------------------------------------------------------
/rexify/pipeline/components/__init__.py:
--------------------------------------------------------------------------------
1 | from .load import load
2 | from .train import train
3 |
--------------------------------------------------------------------------------
/rexify/pipeline/components/load.py:
--------------------------------------------------------------------------------
1 | from kfp.v2.dsl import Artifact, Dataset, Output, component
2 |
3 | from rexify import BASE_IMAGE
4 |
5 |
6 | @component(base_image=BASE_IMAGE)
7 | def load(
8 |     events: str,
9 |     users: str,
10 |     items: str,
11 |     schema: str,
12 |     feature_extractor: Output[Artifact],
13 |     train_data: Output[Dataset],
14 |     validation_data: Output[Dataset],
15 |     test_size: float = 0.3,
16 | ):
17 |     import json
18 |
19 |     from rexify import Events, FeatureExtractor, Output, Schema  # rexify's Output shadows kfp's inside this function body only
20 |
21 |     schema = Schema.from_dict(json.loads(schema))
22 |     train, val = Events.load(events, schema=schema).split(test_size=test_size)
23 |
24 |     fe = FeatureExtractor(schema, users, items, return_dataset=False)
25 |     train: Output = fe.fit(train).transform(train)
26 |     val: Output = fe.transform(val)
27 |
28 |     fe.save(feature_extractor.path)
29 |     train.save(train_data.path, "train.csv")
30 |     val.save(validation_data.path, "val.csv")
31 |
--------------------------------------------------------------------------------
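
Note: the component's `schema` argument arrives as a JSON string that `Schema.from_dict` parses after `json.loads`. Judging from the tests further down, a sketch of the expected payload (column names illustrative):

    import json

    schema_str = json.dumps(
        {
            "user": {"user_id": "id", "age": "number", "gender": "category"},
            "item": {"item_id": "id", "price": "number", "category": "category"},
            "timestamp": "timestamp",
            "event_type": "event_type",
        }
    )
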
/rexify/pipeline/components/train.py:
--------------------------------------------------------------------------------
1 | from kfp.v2.dsl import Artifact, Dataset, Input, Model, Output, component
2 |
3 | from rexify import BASE_IMAGE
4 |
5 |
6 | @component(base_image=BASE_IMAGE)
7 | def train(
8 |     feature_extractor: Input[Artifact],
9 |     train_data: Input[Dataset],
10 |     validation_data: Input[Dataset],
11 |     model: Output[Model],
12 |     batch_size: int = 512,
13 |     epochs: int = 10,
14 | ):
15 |     from rexify import DataFrame, FeatureExtractor, Recommender
16 |
17 |     fe = FeatureExtractor.load(feature_extractor.path)
18 |     train_data = DataFrame.load(train_data.path)
19 |     validation_data = DataFrame.load(validation_data.path)
20 |
21 |     fit_params = {"batch_size": batch_size, "epochs": epochs}
22 |     recommender = Recommender(**fe.model_params)
23 |     recommender.compile()
24 |     recommender.fit(train_data, validation_data=validation_data, **fit_params)
25 |     recommender.save(model.path)
26 |
--------------------------------------------------------------------------------
/rexify/schema.py:
--------------------------------------------------------------------------------
1 | import json
2 | from copy import deepcopy
3 |
4 | from rexify.utils import get_target_id
5 |
6 |
7 | class _JSONSerializable:
8 |     def to_dict(self):
9 |         return self.__dict__.copy()
10 |
11 |
12 | class _TargetSchema(_JSONSerializable):
13 |
14 |     _SUPPORTED_DATA_TYPES = ["category", "number", "id"]
15 |
16 |     def __init__(self, id_: str, **features):
17 |         self.id = id_
18 |         for feature_name, dtype in features.items():
19 |             self._validate_features(feature_name, dtype)
20 |             setattr(self, feature_name, dtype)
21 |
22 |     @classmethod
23 |     def _validate_features(cls, feature_name: str, dtype: str):
24 |         if dtype not in cls._SUPPORTED_DATA_TYPES:
25 |             raise ValueError(
26 |                 f"""
27 |                 Data type not supported for feature `{feature_name}`.
28 |                 Supported data types are: {cls._SUPPORTED_DATA_TYPES}
29 |                 """
30 |             )
31 |
32 |
33 | class Schema(_JSONSerializable):
34 |     def __init__(
35 |         self,
36 |         user_id: str,
37 |         item_id: str,
38 |         timestamp: str,
39 |         event_type: str,
40 |         user_features: dict[str, str] = None,
41 |         item_features: dict[str, str] = None,
42 |     ):
43 |         user_features = user_features or {}
44 |         item_features = item_features or {}
45 |         self.user = _TargetSchema(user_id, **user_features)
46 |         self.item = _TargetSchema(item_id, **item_features)
47 |         self.timestamp = timestamp
48 |         self.event_type = event_type
49 |
50 |     @classmethod
51 |     def from_json(cls, schema_path: str):
52 |         with open(schema_path, "r") as f:
53 |             schema = json.load(f)
54 |         return cls.from_dict(schema)
55 |
56 |     @classmethod
57 |     def from_dict(cls, schema: dict[str, str | dict[str, str]]):
58 |         schema_ = deepcopy(schema)
59 |         user_id = get_target_id(schema_, "user")[0]
60 |         _ = schema_["user"].pop(user_id)
61 |
62 |         item_id = get_target_id(schema_, "item")[0]
63 |         _ = schema_["item"].pop(item_id)
64 |
65 |         return cls(
66 |             user_id=user_id,
67 |             item_id=item_id,
68 |             timestamp=schema_["timestamp"],
69 |             event_type=schema_["event_type"],
70 |             user_features=schema_["user"],
71 |             item_features=schema_["item"],
72 |         )
73 |
74 |     def to_dict(self):
75 |         schema = dict()
76 |         schema["user"] = self.user.to_dict()
77 |         schema["user"][schema["user"]["id"]] = "id"
78 |         _ = schema["user"].pop("id")
79 |
80 |         schema["item"] = self.item.to_dict()
81 |         schema["item"][schema["item"]["id"]] = "id"
82 |         _ = schema["item"].pop("id")
83 |
84 |         schema["event_type"] = self.event_type
85 |         schema["timestamp"] = self.timestamp
86 |         return schema
87 |
88 |     def save(self, path: str):
89 |         with open(path, "w") as f:
90 |             json.dump(self.to_dict(), f, indent=4)
91 |
--------------------------------------------------------------------------------
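
Note: a minimal usage sketch of the Schema API above; feature names are illustrative. `to_dict` inverts the constructor's keyword form back into the flat `{column: dtype}` mapping that `from_dict` consumes, so the round trip below holds:

    from rexify.schema import Schema

    schema = Schema(
        user_id="user_id",
        item_id="item_id",
        timestamp="timestamp",
        event_type="event_type",
        user_features={"age": "number"},
        item_features={"price": "number"},
    )

    assert Schema.from_dict(schema.to_dict()).to_dict() == schema.to_dict()
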
/rexify/utils.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | import tensorflow as tf
4 |
5 |
6 | def _get_target(schema, target: str):
7 |     return schema[target] if isinstance(schema, dict) else getattr(schema, target).to_dict()
8 |
9 |
10 | def get_target_id(schema, target: str) -> list[str]:
11 |     if not isinstance(schema, dict):
12 |         return [getattr(schema, target).id]
13 |     return [k for k, v in schema[target].items() if v == "id"]
14 |
15 |
16 | def get_target_feature(schema, target: str, type_: str):
17 |     def mask(x: tuple):
18 |         return x[1] == type_
19 |
20 |     schema_dict = _get_target(schema, target)
21 |     return list(map(lambda x: x[0], filter(mask, schema_dict.items())))
22 |
23 |
24 | def make_dirs(*args):
25 |     for dir_ in args:
26 |         Path(dir_).mkdir(parents=True, exist_ok=True)
27 |
28 |
29 | def get_sample_query(x: tf.data.Dataset):
30 |     return list(x.batch(1).take(1))[0]
31 |
--------------------------------------------------------------------------------
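
Note: the helpers above accept either a Schema instance or its plain-dict form; `get_target_id` branches on the type explicitly, while `_get_target` normalizes both to a dict. A quick sketch:

    from rexify.schema import Schema
    from rexify.utils import get_target_id

    schema_dict = {
        "user": {"user_id": "id"},
        "item": {"item_id": "id"},
        "timestamp": "timestamp",
        "event_type": "event_type",
    }

    assert get_target_id(schema_dict, "user") == ["user_id"]
    assert get_target_id(Schema.from_dict(schema_dict), "user") == ["user_id"]
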
/tests/test_extractor.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tempfile
3 | from pathlib import Path
4 |
5 | import numpy as np
6 | import pandas as pd
7 | import pytest
8 | from sklearn.preprocessing import StandardScaler
9 |
10 | from rexify import FeatureExtractor, Output, Schema
11 | from rexify.features.transform import CustomTransformer
12 |
13 |
14 | class TestFeatureExtractor:
15 |     @pytest.fixture(scope="class")
16 |     def schema(self):
17 |         user_id = "user_id"
18 |         item_id = "item_id"
19 |         timestamp = "timestamp"
20 |         event_type = "event_type"
21 |         user_features = {"age": "number", "gender": "category"}
22 |         item_features = {"price": "number", "category": "category"}
23 |         return Schema(
24 |             user_id, item_id, timestamp, event_type, user_features, item_features
25 |         )
26 |
27 |     @pytest.fixture(scope="class")
28 |     def data(self):
29 |         return pd.DataFrame(
30 |             {
31 |                 "user_id": [1, 1, 2, 2, 3, 3],
32 |                 "item_id": [10, 20, 10, 20, 30, 40],
33 |                 "timestamp": [1, 2, 3, 4, 5, 6],
34 |                 "event_type": ["p", "p", "p", "p", "p", "p"],
35 |             }
36 |         )
37 |
38 |     @pytest.fixture(scope="class")
39 |     def users(self):
40 |         return pd.DataFrame(
41 |             {"user_id": [1, 2, 3], "age": [25, 30, 35], "gender": ["M", "F", "M"]}
42 |         )
43 |
44 |     @pytest.fixture(scope="class")
45 |     def items(self):
46 |         return pd.DataFrame(
47 |             {"item_id": [10, 20, 30], "price": [1, 2, 3], "category": ["1", "2", "3"]}
48 |         )
49 |
50 |     @pytest.fixture(scope="class")
51 |     def feat(self, schema, users, items):
52 |         users, items = self._save_users_items(users, items)
53 |         return FeatureExtractor(schema, users, items)
54 |
55 |     def test_fit(self, data, feat):
56 |         _ = feat.fit(data)
57 |
58 |     def test_transform(self, data, feat):
59 |         transformed = feat.fit(data).transform(data)
60 |         assert isinstance(transformed, Output)
61 |
62 |     @pytest.fixture(scope="class")
63 |     def custom_feat(self, schema, users, items):
64 |         users["custom_feature"] = np.random.randint(100, 200, size=users.shape[0])
65 |         users, items = self._save_users_items(users, items)
66 |         return FeatureExtractor(
67 |             schema,
68 |             users,
69 |             items,
70 |             custom_transformers=[
71 |                 CustomTransformer("user", StandardScaler(), ["custom_feature"])
72 |             ],
73 |         )
74 |
75 |     def test_fit_custom(self, data, feat, custom_feat):
76 |         _ = feat.fit(data)
77 |         _ = custom_feat.fit(data)
78 |         assert feat.model_params["user_embeddings"].shape[1] == 3
79 |         assert custom_feat.model_params["user_embeddings"].shape[1] == 4
80 |
81 |     def test_save_load(self, data, feat):
82 |         _ = feat.fit(data).transform(data)
83 |         tmp_dir = tempfile.mkdtemp()
84 |         feat.save(tmp_dir)
85 |         feat_path = Path(tmp_dir) / "feature_extractor.pickle"
86 |         assert feat_path.exists()
87 |
88 |         fe = FeatureExtractor.load(feat_path)
89 |         assert fe
90 |
91 |     @pytest.fixture(scope="class")
92 |     def fe_no_data(self, schema, users, items):
93 |         users, items = self._save_users_items(users, items)
94 |         return FeatureExtractor(schema, users, items, return_dataset=False)
95 |
96 |     def test_make_dataset(self, data, fe_no_data):
97 |         transformed = fe_no_data.fit(data).transform(data)
98 |
99 |         tmp_dir = tempfile.mkdtemp()
100 |         transformed_path = Path(tmp_dir)
101 |         transformed.save(transformed_path)
102 |
103 |         df = Output.load(transformed_path)
104 |         df.to_dataset()
105 |
106 |     def _save_users_items(self, users, items) -> tuple[str, str]:
107 |         tmp_dir = tempfile.mkdtemp()
108 |
109 |         users_path = os.path.join(tmp_dir, "users.csv")
110 |         users.to_csv(users_path)
111 |
112 |         items_path = os.path.join(tmp_dir, "items.csv")
113 |         items.to_csv(items_path)
114 |
115 |         return users_path, items_path
116 |
--------------------------------------------------------------------------------
/tests/test_schema.py:
--------------------------------------------------------------------------------
1 | import json
2 | import tempfile
3 |
4 | import pytest
5 |
6 | from rexify.schema import Schema, _TargetSchema
7 |
8 |
9 | def test_init():
10 |     user_id = "user_id"
11 |     item_id = "item_id"
12 |     timestamp = "timestamp"
13 |     event_type = "event_type"
14 |     user_features = {"age": "number", "gender": "category"}
15 |     item_features = {"price": "number", "category": "category"}
16 |     schema = Schema(
17 |         user_id=user_id,
18 |         item_id=item_id,
19 |         timestamp=timestamp,
20 |         event_type=event_type,
21 |         user_features=user_features,
22 |         item_features=item_features,
23 |     )
24 |
25 |     assert schema.user.id == "user_id"
26 |     assert schema.user.age == "number"
27 |     assert schema.user.gender == "category"
28 |     assert schema.item.id == "item_id"
29 |     assert schema.item.price == "number"
30 |     assert schema.item.category == "category"
31 |     assert schema.timestamp == timestamp
32 |     assert schema.event_type == event_type
33 |
34 |
35 | def test_from_dict():
36 |     schema_dict = {
37 |         "user": {"user_id": "id", "age": "number", "gender": "category"},
38 |         "item": {"item_id": "id", "price": "number", "category": "category"},
39 |         "timestamp": "timestamp",
40 |         "event_type": "event_type",
41 |     }
42 |
43 |     schema = Schema.from_dict(schema_dict)
44 |
45 |     assert schema.user.id == "user_id"
46 |     assert schema.user.age == "number"
47 |     assert schema.user.gender == "category"
48 |     assert schema.item.id == "item_id"
49 |     assert schema.item.price == "number"
50 |     assert schema.item.category == "category"
51 |     assert schema.timestamp == "timestamp"
52 |     assert schema.event_type == "event_type"
53 |
54 |
55 | def test_load():
56 |     schema_dict = {
57 |         "user": {"user_id": "id", "age": "number", "gender": "category"},
58 |         "item": {"item_id": "id", "price": "number", "category": "category"},
59 |         "timestamp": "timestamp",
60 |         "event_type": "event_type",
61 |     }
62 |
63 |     with tempfile.NamedTemporaryFile(mode="w", delete=False) as f:
64 |         json.dump(schema_dict, f)
65 |         f.flush()  # ensure the payload is on disk before from_json re-opens it
66 |         schema = Schema.from_json(f.name)
67 |
68 |     assert schema.user.id == "user_id"
69 |     assert schema.user.age == "number"
70 |     assert schema.user.gender == "category"
71 |     assert schema.item.id == "item_id"
72 |     assert schema.item.price == "number"
73 |     assert schema.item.category == "category"
74 |     assert schema.timestamp == "timestamp"
75 |     assert schema.event_type == "event_type"
76 |
77 |
78 | def test_target_schema():
79 |     # Test data types are valid
80 |     target = _TargetSchema("id", feature1="category", feature2="number")
81 |     assert hasattr(target, "id")
82 |     assert hasattr(target, "feature1")
83 |     assert target.feature1 == "category"
84 |     assert hasattr(target, "feature2")
85 |     assert target.feature2 == "number"
86 |
87 |     # Test unsupported data type throws error
88 |     with pytest.raises(ValueError, match=r"Data type not supported"):
89 |         _ = _TargetSchema("id", feature1="string")
90 |
91 |
92 | def test_schema_io(tmp_path):
93 |     # Test Schema to_dict method
94 |     user_id = "user_id"
95 |     item_id = "item_id"
96 |     timestamp = "timestamp"
97 |     event_type = "event_type"
98 |     user_features = {"age": "number", "gender": "category"}
99 |     item_features = {"price": "number", "category": "category"}
100 |     schema = Schema(
101 |         user_id, item_id, timestamp, event_type, user_features, item_features
102 |     )
103 |     assert schema.to_dict() == {
104 |         "user": {"user_id": "id", "age": "number", "gender": "category"},
105 |         "item": {"item_id": "id", "price": "number", "category": "category"},
106 |         "timestamp": "timestamp",
107 |         "event_type": "event_type",
108 |     }
109 |
110 |     # Test Schema from_dict method
111 |     schema_dict = schema.to_dict()
112 |     schema_loaded = Schema.from_dict(schema_dict)
113 |     assert schema_loaded.to_dict() == schema.to_dict()
114 |
115 |     # Test Schema load method; write under tmp_path so the test leaves no files behind
116 |     with open(tmp_path / "test_schema.json", "w") as f:
117 |         json.dump(schema_dict, f, indent=4)
118 |     schema_loaded = Schema.from_json(str(tmp_path / "test_schema.json"))
119 |     assert schema_loaded.to_dict() == schema.to_dict()
120 |
121 |     # Test Schema save method
122 |     schema.save(str(tmp_path / "test_schema.json"))
123 |     with open(tmp_path / "test_schema.json", "r") as f:
124 |         schema_loaded = json.load(f)
125 |     assert schema_loaded == schema_dict
126 |
--------------------------------------------------------------------------------
/tests/test_utils.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | import pytest
4 |
5 | from rexify.utils import get_target_feature, get_target_id, make_dirs
6 |
7 |
8 | @pytest.fixture
9 | def schema():
10 |     return {
11 |         "target1": {"key1": "id", "key2": "value1"},
12 |         "target2": {"key3": "value2", "key4": "id"},
13 |         "target3": {"key5": "value3", "key6": "value4"},
14 |     }
15 |
16 |
17 | def test_get_target_id(schema):
18 |     assert get_target_id(schema, "target1") == ["key1"]
19 |     assert get_target_id(schema, "target2") == ["key4"]
20 |     assert get_target_id(schema, "target3") == []
21 |
22 |
23 | def test_get_target_feature(schema):
24 |     assert get_target_feature(schema, "target1", "id") == ["key1"]
25 |     assert get_target_feature(schema, "target1", "value1") == ["key2"]
26 |     assert get_target_feature(schema, "target2", "id") == ["key4"]
27 |     assert get_target_feature(schema, "target2", "value2") == ["key3"]
28 |     assert get_target_feature(schema, "target3", "value3") == ["key5"]
29 |     assert get_target_feature(schema, "target3", "value4") == ["key6"]
30 |     assert get_target_feature(schema, "target3", "value5") == []
31 |
32 |
33 | def test_make_dirs(tmpdir):
34 |     dir1 = tmpdir.mkdir("dir1")
35 |     dir2 = tmpdir.mkdir("dir2")
36 |     make_dirs(dir1, dir2)
37 |
38 |     assert Path(dir1).exists()
39 |     assert Path(dir2).exists()
40 |
--------------------------------------------------------------------------------