├── .circleci └── config.yml ├── .flake8 ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yml ├── CODE_OF_CONDUCT.md ├── Dockerfile ├── LICENSE ├── README.md ├── docs ├── api │ ├── modules.rst │ ├── rexify.cli.rst │ ├── rexify.constants.rst │ ├── rexify.exceptions.rst │ ├── rexify.exceptions.schema.rst │ ├── rexify.features.base.rst │ ├── rexify.features.dataset.rst │ ├── rexify.features.extractor.rst │ ├── rexify.features.pipelines.rst │ ├── rexify.features.rst │ ├── rexify.models.candidate.rst │ ├── rexify.models.query.rst │ ├── rexify.models.recommender.rst │ ├── rexify.models.rst │ ├── rexify.models.tower.rst │ ├── rexify.pipeline.rst │ ├── rexify.rst │ └── rexify.utils.rst ├── conf.py ├── genindex.rst ├── index.rst ├── overview │ ├── architecture.md │ ├── inputs.md │ └── overview.md ├── requirements.txt └── tutorials │ ├── configure_pipeline.ipynb │ ├── prebuilt_pipeline.ipynb │ └── quickstart.ipynb ├── pyproject.toml ├── rexify ├── __init__.py ├── data │ ├── __init__.py │ ├── base.py │ ├── input.py │ └── output.py ├── features │ ├── __init__.py │ ├── base.py │ ├── extractor.py │ └── transform │ │ ├── __init__.py │ │ ├── category.py │ │ ├── custom.py │ │ ├── entity.py │ │ ├── event.py │ │ ├── id.py │ │ ├── number.py │ │ └── sequence.py ├── models │ ├── __init__.py │ ├── base.py │ ├── callbacks │ │ ├── __init__.py │ │ ├── index.py │ │ └── mlflow.py │ ├── index.py │ ├── lookup.py │ ├── ranking │ │ ├── __init__.py │ │ ├── base.py │ │ ├── event.py │ │ └── ranking.py │ ├── recommender.py │ ├── retrieval │ │ ├── __init__.py │ │ ├── candidate.py │ │ ├── query.py │ │ ├── retrieval.py │ │ └── tower.py │ └── sequential.py ├── pipeline │ ├── __init__.py │ ├── __main__.py │ └── components │ │ ├── __init__.py │ │ ├── load.py │ │ └── train.py ├── schema.py └── utils.py └── tests ├── test_extractor.py ├── test_schema.py └── test_utils.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2.1 2 | 3 | orbs: 4 | python: circleci/python@2.1.1 5 | 6 | jobs: 7 | 8 | test: 9 | docker: 10 | - image: cimg/python:3.10 11 | steps: 12 | - checkout 13 | - python/install-packages: 14 | pre-install-steps: [] 15 | pkg-manager: poetry 16 | - run: 17 | name: Run tests 18 | command: | 19 | poetry run pytest 20 | 21 | publish: 22 | docker: 23 | - image: cimg/python:3.10 24 | steps: 25 | - checkout 26 | - run: 27 | name: Build and publish 28 | command: | 29 | poetry build 30 | poetry version $(git describe --tags --abbrev=0) 31 | poetry publish --build --username $PYPI_USERNAME --password $PYPI_PASSWORD 32 | 33 | docker: 34 | docker: 35 | - image: cimg/base:2023.04 36 | environment: 37 | IMAGE_URI: joseprsm/rexify 38 | steps: 39 | - checkout 40 | - setup_remote_docker 41 | - run: 42 | name: Build Docker image 43 | command: docker build . 
-t $IMAGE_URI 44 | - run: 45 | name: Push Docker image 46 | command: | 47 | echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin 48 | docker push $IMAGE_URI 49 | 50 | workflows: 51 | test_only: 52 | jobs: 53 | - test 54 | 55 | test_and_build: 56 | jobs: 57 | - test: &tags_only 58 | filters: 59 | branches: 60 | ignore: /.*/ 61 | tags: 62 | only: /^\d+\.\d+\.\d+$/ 63 | - publish: 64 | <<: *tags_only 65 | requires: 66 | - test 67 | - request_docker: 68 | <<: *tags_only 69 | type: approval 70 | requires: 71 | - test 72 | - docker: 73 | <<: *tags_only 74 | requires: 75 | - request_docker 76 | - publish 77 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 88 3 | select = C,E,F,W,B,B9 4 | ignore = E203, E501, W503 5 | exclude = 6 | docs/conf.py 7 | __init__.py 8 | build -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /build 2 | /dist 3 | 4 | .idea 5 | .env 6 | .coverage 7 | .pytest_cache 8 | 9 | */__pycache__/* 10 | */.ipynb_checkpoints/ 11 | *.egg-info/ 12 | 13 | outputs 14 | 15 | /docs/_build/ 16 | /docs/api 17 | /docs/reference 18 | 19 | .DS_Store 20 | 21 | *.pyc 22 | .vscode/settings.json 23 | 24 | mlruns 25 | 26 | /*.json -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: local 3 | hooks: 4 | - id: black 5 | name: black 6 | language: system 7 | entry: black 8 | types: [ python ] 9 | require_serial: true 10 | - id: flake8 11 | name: flake8 12 | entry: flake8 13 | language: system 14 | types: [ python ] 15 | require_serial: true 16 | - id: isort 17 | name: isort 18 | entry: isort 19 | require_serial: true 20 | language: system 21 | types_or: [cython, pyi, python] 22 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: ubuntu-20.04 5 | tools: 6 | python: "3.10" 7 | 8 | sphinx: 9 | builder: html 10 | configuration: docs/conf.py 11 | 12 | python: 13 | install: 14 | - requirements: docs/requirements.txt 15 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 
14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | joseprsm@gmail.com. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. 
Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. 129 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ARG python="3.10" 2 | ARG filesystem="gcs" 3 | 4 | FROM python:${python} AS base 5 | 6 | RUN case "$(uname -m)" in *arm*|aarch64) ;; *) pip install scann==1.2.3 ;; esac 7 | 8 | RUN pip install pandas numpy scikit-learn fsspec rexify 9 | 10 | FROM base AS fs-s3 11 | 12 | RUN pip install s3fs 13 | 14 | FROM base AS fs-gcs 15 | 16 | RUN pip install gcsfs 17 | 18 | FROM fs-${filesystem} AS final 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 José Medeiros 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 |
3 | 4 |
5 |

6 | 7 |

8 | 9 | Build 10 | 11 | 12 | License 13 | 14 | 15 | Documentation 16 | 17 | 18 | GitHub release 19 | 20 |

21 | 22 | Rexify is a library to streamline recommender systems model development. 23 | 24 | In essence, Rexify adapts dynamically to your data, and outputs high-performing TensorFlow 25 | models that may be used wherever you want, independently of how your data looks. Rexify also includes 26 | modules that handle feature engineering as Scikit-Learn Transformers and Pipelines. 27 | 28 | With Rexify, users may easily train Recommender Systems models just by specifying what their 29 | data looks like. Rexify also comes equipped with pre-built machine learning pipelines which can 30 | be used serverlessly. 31 | 32 | ## What is Rexify? 33 | 34 | Rexify is a low-code personalization tool that makes use of traditional machine learning 35 | frameworks, such as Scikit-Learn and TensorFlow, to create scalable Recommender Systems 36 | workflows that anyone can use. 37 | 38 | ### Who is it for? 39 | 40 | Rexify is a project that simplifies and standardizes the workflow of recommender systems. It is 41 | mostly geared towards people with little to no machine learning knowledge who want to implement 42 | somewhat scalable Recommender Systems in their applications. 43 | 44 | ## Installation 45 | 46 | The easiest way to install Rexify is via `pip`: 47 | 48 | ```shell 49 | pip install rexify 50 | ``` 51 | 52 | ## Quick Tour 53 | 54 | Rexify is meant to be usable right out of the box. All you need to set up your model is interaction 55 | data - something that looks like this: 56 | 57 | | user_id | item_id | timestamp | event_type | 58 |---------|---------|------------|-------------| 59 | | 22 | 67 | 2021/05/13 | Purchase | 60 | | 37 | 9 | 2021/04/11 | Page View | 61 | | 22 | 473 | 2021/04/11 | Add to Cart | 62 | | ... | ... | ... | ... | 63 | | 358 | 51 | 2021/04/11 | Purchase | 64 | 65 | Additionally, you'll need to configure a schema for the data. 66 | This schema is what will allow Rexify to generate a dynamic model and preprocessing steps. 67 | The schema consists of two dictionaries (`user`, `item`) and two key-value 68 | pairs: `event_type` (which should point to the event type column) and `timestamp` ( 69 | which should point to the timestamp column). 70 | 71 | Each of these dictionaries maps feature names to internal data types, 72 | such as `id`, `category`, and `number`. More data types will be available 73 | in the future. 74 | 75 | ```json 76 | { 77 | "user": { 78 | "user_id": "id", 79 | "age": "number" 80 | }, 81 | "item": { 82 | "item_id": "id", 83 | "category": "category" 84 | }, 85 | "timestamp": "timestamp", 86 | "event_type": "event_type" 87 | } 88 | ``` 89 | 90 | Essentially, what Rexify will do is take the schema and dynamically adapt to the data. 91 | 92 | There are two main components in Rexify workflows: `FeatureExtractor` and `Recommender`. 93 | 94 | The `FeatureExtractor` is a scikit-learn Transformer that takes the schema of 95 | the data and transforms the event data accordingly. Another method, `.make_dataset()`, 96 | converts the transformed data into a `tf.data.Dataset`, all correctly configured to be fed 97 | to the `Recommender` model. 98 | 99 | `Recommender` is a `tfrs.Model` that implements the Query and Candidate towers. 100 | During training, the Query tower will take the user ID, user features, and context to 101 | learn an embedding; the Candidate tower will do the same for the item ID and its features.
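To make that concrete, here's a minimal sketch of inspecting one element of the dataset produced by `.make_dataset()`. The `query`/`candidate` key names are assumptions based on the structure described in the architecture docs, and may differ between versions:

```python
import tensorflow as tf


def peek(dataset: tf.data.Dataset) -> None:
    # Take a single element without iterating over the whole dataset
    for element in dataset.take(1):
        print(element["query"]["user_id"])  # query-side (user) inputs
        print(element["candidate"]["item_id"])  # candidate-side (item) inputs
```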
102 | 103 | More information about how the `FeatureExtractor` and the `Recommender` work can be found 104 | [here](https://rexify.readthedocs.io/en/latest/overview/architecture.html). 105 | 106 | A sample Rexify workflow should sort of look like this: 107 | 108 | ````python 109 | 110 | import pandas as pd 111 | 112 | from rexify import Schema, FeatureExtractor, Recommender 113 | 114 | events = pd.read_csv('path/to/events/data') 115 | schema = Schema.load('path/to/schema') 116 | 117 | fe = FeatureExtractor(schema, users='path/to/users/data', items='path/to/items/data', return_dataset=True) 118 | x = fe.fit(events).transform(events) 119 | 120 | model = Recommender(**fe.model_params) 121 | model.compile() 122 | model.fit(x, batch_size=512) 123 | ```` 124 | 125 | When training is complete, you'll have a trained `tf.keras.Model` ready to be used, as 126 | you normally would. 127 | 128 | Alternatively, you can also run: 129 | 130 | ```shell 131 | python -m rexify.pipeline -p events=$EVENTS_PATH -p users=$USER_PATH -p items=$ITEMS_PATH -p schema=$SCHEMA_PATH 132 | ``` 133 | 134 | This will generate a `pipeline.json` file that you can use on Kubeflow Pipelines (or Vertex AI Pipelines). 135 | 136 | ## License 137 | 138 | [MIT](https://github.com/joseprsm/rexify/blob/main/LICENSE) 139 | -------------------------------------------------------------------------------- /docs/api/modules.rst: -------------------------------------------------------------------------------- 1 | rexify 2 | ====== 3 | 4 | .. toctree:: 5 | :maxdepth: 6 6 | 7 | rexify 8 | -------------------------------------------------------------------------------- /docs/api/rexify.cli.rst: -------------------------------------------------------------------------------- 1 | rexify.cli module 2 | ================= 3 | 4 | .. automodule:: rexify.cli 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/api/rexify.constants.rst: -------------------------------------------------------------------------------- 1 | rexify.constants module 2 | ======================= 3 | 4 | .. automodule:: rexify.constants 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/api/rexify.exceptions.rst: -------------------------------------------------------------------------------- 1 | rexify.exceptions package 2 | ========================= 3 | 4 | .. automodule:: rexify.exceptions 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | .. toctree:: 13 | :maxdepth: 6 14 | 15 | rexify.exceptions.schema 16 | -------------------------------------------------------------------------------- /docs/api/rexify.exceptions.schema.rst: -------------------------------------------------------------------------------- 1 | rexify.exceptions.schema module 2 | =============================== 3 | 4 | .. automodule:: rexify.exceptions.schema 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/api/rexify.features.base.rst: -------------------------------------------------------------------------------- 1 | rexify.features.base module 2 | =========================== 3 | 4 | .. 
automodule:: rexify.features.base 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/api/rexify.features.dataset.rst: -------------------------------------------------------------------------------- 1 | rexify.features.dataset module 2 | ============================== 3 | 4 | .. automodule:: rexify.features.dataset 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/api/rexify.features.extractor.rst: -------------------------------------------------------------------------------- 1 | rexify.features.extractor module 2 | ================================ 3 | 4 | .. automodule:: rexify.features.extractor 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/api/rexify.features.pipelines.rst: -------------------------------------------------------------------------------- 1 | rexify.features.pipelines module 2 | ================================ 3 | 4 | .. automodule:: rexify.features.pipelines 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/api/rexify.features.rst: -------------------------------------------------------------------------------- 1 | rexify.features package 2 | ======================= 3 | 4 | .. automodule:: rexify.features 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | .. toctree:: 13 | :maxdepth: 6 14 | 15 | rexify.features.base 16 | rexify.features.dataset 17 | rexify.features.extractor 18 | rexify.features.pipelines 19 | -------------------------------------------------------------------------------- /docs/api/rexify.models.candidate.rst: -------------------------------------------------------------------------------- 1 | rexify.models.candidate module 2 | ============================== 3 | 4 | .. automodule:: rexify.models.candidate 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/api/rexify.models.query.rst: -------------------------------------------------------------------------------- 1 | rexify.models.query module 2 | ========================== 3 | 4 | .. automodule:: rexify.models.query 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/api/rexify.models.recommender.rst: -------------------------------------------------------------------------------- 1 | rexify.models.recommender module 2 | ================================ 3 | 4 | .. automodule:: rexify.models.recommender 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/api/rexify.models.rst: -------------------------------------------------------------------------------- 1 | rexify.models package 2 | ===================== 3 | 4 | .. automodule:: rexify.models 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | .. 
toctree:: 13 | :maxdepth: 6 14 | 15 | rexify.models.candidate 16 | rexify.models.query 17 | rexify.models.ranking 18 | rexify.models.recommender 19 | rexify.models.retrieval 20 | rexify.models.tower 21 | -------------------------------------------------------------------------------- /docs/api/rexify.models.tower.rst: -------------------------------------------------------------------------------- 1 | rexify.models.tower module 2 | ========================== 3 | 4 | .. automodule:: rexify.models.tower 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/api/rexify.pipeline.rst: -------------------------------------------------------------------------------- 1 | rexify.pipeline module 2 | ====================== 3 | 4 | .. automodule:: rexify.pipeline 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/api/rexify.rst: -------------------------------------------------------------------------------- 1 | rexify package 2 | ============== 3 | 4 | .. automodule:: rexify 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Subpackages 10 | ----------- 11 | 12 | .. toctree:: 13 | :maxdepth: 6 14 | 15 | rexify.exceptions 16 | rexify.features 17 | rexify.models 18 | 19 | Submodules 20 | ---------- 21 | 22 | .. toctree:: 23 | :maxdepth: 6 24 | 25 | rexify.cli 26 | rexify.constants 27 | rexify.pipeline 28 | rexify.utils 29 | -------------------------------------------------------------------------------- /docs/api/rexify.utils.rst: -------------------------------------------------------------------------------- 1 | rexify.utils module 2 | =================== 3 | 4 | .. 
automodule:: rexify.utils 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | import sphinx_material 2 | 3 | 4 | project = "Rexify" 5 | html_title = "Rexify" 6 | 7 | html_theme = "sphinx_material" 8 | 9 | extensions = [ 10 | "sphinx.ext.autodoc", 11 | "sphinx.ext.githubpages", 12 | "m2r2", 13 | "sphinx.ext.napoleon", 14 | "sphinx_search.extension", 15 | "sphinxcontrib.apidoc", 16 | "nbsphinx", 17 | ] 18 | source_suffix = [".rst", ".md"] 19 | 20 | napoleon_google_docstring = True 21 | napoleon_numpy_docstring = True 22 | napoleon_include_init_with_doc = True 23 | napoleon_include_private_with_doc = False 24 | napoleon_include_special_with_doc = True 25 | napoleon_use_admonition_for_examples = False 26 | napoleon_use_admonition_for_notes = False 27 | napoleon_use_admonition_for_references = False 28 | napoleon_use_ivar = False 29 | napoleon_use_param = True 30 | napoleon_use_rtype = False 31 | 32 | apidoc_module_dir = "../rexify" 33 | apidoc_output_dir = "api" 34 | apidoc_excluded_paths = ["**/*test*"] 35 | apidoc_module_first = True 36 | apidoc_separate_modules = True 37 | apidoc_extra_args = ["-d 6"] 38 | 39 | html_theme_options = { 40 | "color_primary": "cyan", 41 | "color_accent": "light-blue", 42 | "repo_url": "https://github.com/joseprsm/rexify", 43 | "repo_name": "Rexify", 44 | "globaltoc_depth": 2, 45 | "globaltoc_collapse": False, 46 | "globaltoc_includehidden": False, 47 | "repo_type": "github", 48 | } 49 | 50 | extensions.append("sphinx_material") 51 | html_theme_path = sphinx_material.html_theme_path() 52 | html_context = sphinx_material.get_html_context() 53 | 54 | html_sidebars = { 55 | "**": ["logo-text.html", "globaltoc.html", "localtoc.html", "searchbox.html"] 56 | } 57 | 58 | nbsphinx_allow_errors = True 59 | -------------------------------------------------------------------------------- /docs/genindex.rst: -------------------------------------------------------------------------------- 1 | Main Index 2 | ========== -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. toctree:: 2 | :hidden: 3 | 4 | genindex 5 | 6 | 7 | .. toctree:: 8 | :titlesonly: 9 | 10 | Rexify 11 | Architecture 12 | Inputs 13 | 14 | .. toctree:: 15 | :titlesonly: 16 | :caption: Guides and Examples 17 | 18 | Quickstart 19 | Using a pre-built pipeline 20 | Configuring your own Kubeflow pipeline 21 | 22 | .. toctree:: 23 | :maxdepth: 1 24 | :caption: API reference 25 | 26 | API reference 27 | 28 | .. mdinclude:: ../README.md -------------------------------------------------------------------------------- /docs/overview/architecture.md: -------------------------------------------------------------------------------- 1 | # Architecture 2 | 3 | Rexify has two main components: the `FeatureExtractor` and the `Recommender`. 4 | 5 | The former basically takes the original data, and learns all the transformations 6 | that need to be applied to the dataset. The output is a `tf.data.Dataset` with the 7 | right structure to be passed on to the `Recommender` model. 8 | 9 | This `Recommender` is a TensorFlow model with a dynamic architecture, which adapts 10 | itself according to the schema fed to the `FeatureExtractor`. 
11 | 12 | ## Feature Extractor 13 | 14 | The `FeatureExtractor` is a scikit-learn Transformer. It implements a `.fit()` 15 | and a `.transform()` method that apply a set of transformations to the data. 16 | 17 | Essentially, it has a `_ppl` attribute which is a `sklearn.pipeline.Pipeline`; 18 | the pipeline steps, which are scikit-learn Transformers themselves, are set 19 | according to the `schema` passed during instantiation. 20 | 21 | For example, an attribute classified as `id` would create a pipeline step with a 22 | `sklearn.compose.ColumnTransformer`, composed of a single `sklearn.preprocessing.OrdinalEncoder` 23 | Transformer. 24 | 25 | Additionally, it subclasses `rexify.features.TfDatasetGenerator`, which converts 26 | the output of the `FeatureExtractor`'s transformations into a `tf.data.Dataset`, 27 | with a nested structure such as this: 28 | 29 | ``` 30 | { 31 | "query": { 32 | "user_id": tf.Tensor([]), 33 | "user_features": tf.Tensor([]), 34 | "context": tf.Tensor([]), 35 | }, 36 | "candidate": { 37 | "item_id": tf.Tensor([]), 38 | "item_features": tf.Tensor([]) 39 | } 40 | } 41 | ``` 42 | 43 | With this structure, the Recommender model can call a different set of layers for 44 | the user and item ID attributes and for the remaining transformed features. 45 | 46 | ## Recommender 47 | 48 | The `Recommender` is a `tfrs.models.Model`, which subclasses `tf.keras.Model` 49 | and overrides the `.train_step()` method. According to the [TensorFlow Recommenders documentation](https://www.tensorflow.org/recommenders/api_docs/python/tfrs/models/Model): 50 | 51 | > Many recommender models are relatively complex, and do not neatly 52 | > fit into supervised or unsupervised paradigms. This base class makes it easy to 53 | > define custom training and test losses for such complex models. 54 | 55 | In this case, we use the Recommender model to create a two-tower model architecture, as explained [here](https://research.google/pubs/pub48840/). 56 | In short, it's composed of two main models, a Query model and a Candidate model, both of which learn 57 | to represent queries and candidates in the same vector space. 58 | 59 |
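To make the idea concrete, here is a deliberately minimal two-tower sketch. This is not Rexify's actual `Recommender` - which also handles user/item features, context, and dynamically sized layers - just an illustration of how each tower embeds its ID into a shared vector space, and how a retrieval task scores co-occurring query/candidate pairs:

```python
import tensorflow as tf
import tensorflow_recommenders as tfrs


class MinimalTwoTower(tfrs.models.Model):
    def __init__(self, n_users: int, n_items: int, embedding_dim: int = 32):
        super().__init__()
        # Each tower is reduced to a single Embedding layer in this sketch
        self.query_tower = tf.keras.layers.Embedding(n_users, embedding_dim)
        self.candidate_tower = tf.keras.layers.Embedding(n_items, embedding_dim)
        # The retrieval task computes an in-batch softmax loss over the
        # similarities between query and candidate embeddings
        self.task = tfrs.tasks.Retrieval()

    def compute_loss(self, features, training=False):
        query_embeddings = self.query_tower(features["query"]["user_id"])
        candidate_embeddings = self.candidate_tower(features["candidate"]["item_id"])
        return self.task(query_embeddings, candidate_embeddings)
```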

60 | 61 |

62 | 63 | Basically, it takes the `tf.data.Dataset` output by the `FeatureExtractor` and passes it through the two 64 | Query and Candidate towers. Due to the nested structure of the dataset, we're able to apply 65 | different transformations to different sets of features. 66 | 67 | ### Query Tower 68 | 69 | The Query Tower is responsible for learning a representation for the queries. That representation is a 70 | combination of the user embedding and the features learned from the remaining 71 | user and context attributes. 72 | 73 | Essentially, it takes the user ID attribute and passes it to an Embedding layer. The user and context 74 | features are concatenated and passed to a model composed of Dense layers. The output of that model and 75 | the user embedding are then concatenated and subsequently fed to another set of Dense layers. 76 | 77 | The resulting vector should represent a single query, which can be used to compute the similarity 78 | to the candidate vectors. 79 | 80 | ### Candidate Tower 81 | 82 | In essence, the Candidate Tower shares the same behavior as the Query Tower's. The key difference is that instead 83 | of using the user ID, user features, and context, it solely uses the item ID and the remaining item features. 84 | 85 | On a deeper level, it takes the item ID attribute and passes it to an Embedding layer. The item features are 86 | passed to a set of Dense layers. The output of these layers and the Embedding layer are then concatenated and 87 | passed to another set of Dense layers. 88 | 89 | The resulting vector should represent a single candidate (an item, in this case), which can be used to compute 90 | the similarity to a query vector or to other candidate vectors. -------------------------------------------------------------------------------- /docs/overview/inputs.md: -------------------------------------------------------------------------------- 1 | # Inputs 2 | 3 | ## Data 4 | 5 | There are three main types of data that may be input to Rexify: [Events](#Events), [Users](#Users), and [Items](#Items). 6 | 7 | ### Events 8 | 9 | Events are the main type of data Rexify uses: interaction data, where each record links a user to an item at a given point in time. 10 | 11 | ### Users 12 | 13 | ### Items 14 | 15 | ## Schema -------------------------------------------------------------------------------- /docs/overview/overview.md: -------------------------------------------------------------------------------- 1 | # Rexify 2 | 3 | Rexify is a library to streamline recommender systems model development. It is built on 4 | top of [Tensorflow Recommenders](https://github.com/tensorflow/recommenders) models and 5 | [Kubeflow](https://github.com/kubeflow/pipelines) pipelines. 6 | 7 | In essence, Rexify adapts dynamically to your data, and outputs high-performing TensorFlow 8 | models that may be used wherever you want, independently of how your data looks. Rexify also includes modules that handle feature engineering as Scikit-Learn Transformers 9 | and Pipelines. 10 | 11 | ## Who is Rexify for? 12 | 13 | Rexify is a project that simplifies and standardizes the workflow of recommender systems. It is 14 | mostly geared towards people with little to no machine learning knowledge who want to implement 15 | somewhat scalable Recommender Systems in their applications. 16 | 17 | ## Quick Tour 18 | 19 | Rexify is meant to be usable right out of the box. 
All you need to set up your model is interaction data - something that looks like this: 20 | 21 | | user_id | item_id | timestamp | item_name | event_type | 22 |---------|---------|------------|-------------|-------------| 23 | | 22 | 67 | 2021/05/13 | Blue Jeans | Purchase | 24 | | 37 | 9 | 2021/04/11 | White Shirt | Page View | 25 | | 22 | 473 | 2021/04/11 | Red Purse | Add to Cart | 26 | | ... | ... | ... | ... | ... | 27 | | 358 | 51 | 2021/04/11 | Bracelet | Purchase | 28 | 29 | Additionally, you'll need to configure a schema for the data. 30 | This schema is what will allow Rexify to generate a dynamic model and preprocessing steps. 31 | The schema consists of three dictionaries: `user`, `item`, and `context`. 32 | 33 | Each of these dictionaries maps feature names to internal data types, 34 | such as `id`, `categorical`, `timestamp`, or `text`. More data types will be available 35 | in the future. 36 | 37 | ```json 38 | { 39 | "user": { 40 | "user_id": "id" 41 | }, 42 | "item": { 43 | "item_id": "id", 44 | "timestamp": "timestamp", 45 | "item_name": "text" 46 | }, 47 | "context": { 48 | "event_type": "categorical" 49 | } 50 | } 51 | ``` 52 | 53 | Essentially, what Rexify will do is take the schema and dynamically adapt to the data. 54 | 55 | ### As a package 56 | 57 | There are two main components in Rexify workflows: `FeatureExtractor` and `Recommender`. 58 | 59 | The `FeatureExtractor` is a scikit-learn Transformer that takes the schema of the data and transforms the event data accordingly. Another method, `.make_dataset()`, converts the transformed data into a `tf.data.Dataset`, all correctly configured to be fed to the `Recommender` model. You can read more about how the `FeatureExtractor` works here. 60 | 61 | `Recommender` is a `tfrs.Model` that implements the Query and Candidate towers. During training, the Query tower will take the user ID, user features, and context to learn an embedding; the Candidate tower will do the same for the item ID and its features. More information about the `Recommender` model can be found here. 62 | 63 | A sample Rexify workflow should sort of look like this: 64 | 65 | ````python 66 | import json 67 | import pandas as pd 68 | 69 | from rexify.features import FeatureExtractor 70 | from rexify.models import Recommender 71 | 72 | events = pd.read_csv('path/to/events/data') 73 | with open('path/to/schema') as f: 74 | schema = json.load(f) 75 | 76 | feat = FeatureExtractor(schema) 77 | prep_data = feat.fit_transform(events) 78 | ds = feat.make_dataset(prep_data) 79 | 80 | model = Recommender(**feat.model_params) 81 | model.compile() 82 | model.fit(ds) 83 | ```` 84 | 85 | When training is complete, you'll have a trained `tf.keras.Model` ready to be used, as you normally would. 86 | 87 | ### As a prebuilt pipeline 88 | 89 | After cloning this project and setting up the necessary environment variables, you can run: 90 | 91 | ```shell 92 | python -m rexify.pipeline 93 | ``` 94 | 95 | This should output a `pipeline.json` file. You can then upload this file manually to 96 | either a Kubeflow Pipelines or Vertex AI Pipelines instance, and it should run seamlessly. 
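For reference, a minimal sketch of submitting the compiled file programmatically with the Kubeflow Pipelines SDK might look like this - the host URL is a placeholder for your own KFP endpoint:

```python
import kfp

# Placeholder endpoint; point this at your Kubeflow Pipelines instance
client = kfp.Client(host="https://<your-kfp-endpoint>")

# Submit the compiled spec; pipeline parameters, if any, go in `arguments`
client.create_run_from_pipeline_package(
    "pipeline.json",
    arguments={},
)
```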
You can also check the [Kubeflow Pipeline](https://kubeflow-pipelines.readthedocs.io/en/latest/source/kfp.client.html#kfp.Client.create_run_from_pipeline_package) 99 | and [Vertex AI](https://cloud.google.com/vertex-ai/docs/pipelines/run-pipeline#create_a_pipeline_run) 100 | documentation to learn how to submit these pipelines programmatically. 101 | 102 | The prebuilt pipeline consists of 5 components: 103 | 104 | 1. `download`, which downloads the event data from URLs set in the `$INPUT_DATA_URL` and `$SCHEMA_URL` environment variables 105 | 2. `load`, which prepares the data downloaded in the previous step 106 | 3. `train`, which trains a `Recommender` model on the preprocessed data 107 | 4. `index`, which trains a [ScaNN](https://ai.googleblog.com/2020/07/announcing-scann-efficient-vector.html) model to retrieve the nearest neighbors 108 | 5. `retrieval`, which retrieves the nearest _k_ neighbors for each of the known users 109 | 110 | 111 | ### Via the demo application 112 | 113 | After cloning the project, install the demo dependencies and run the Streamlit application: 114 | 115 | ```shell 116 | pip install -r demo/requirements.txt 117 | streamlit run demo/app.py 118 | ``` 119 | 120 | Or, if you're using docker: 121 | 122 | ```shell 123 | docker run joseprsm/rexify-demo 124 | ``` 125 | 126 | You can then follow the steps here to set up your pipeline. 127 | 128 | During setup, you'll be asked to either input a publicly available dataset URL or use a sample data set. 129 | After that, you'll have a form to help you set up the schema for the data. 130 | 131 | Finally, after hitting "Compile", you'll have your Pipeline Spec ready. The resulting JSON file can then 132 | be uploaded to Vertex AI Pipelines or Kubeflow, seamlessly. 133 | 134 | The key difference between this pipeline and the prebuilt one is that instead of using the `download` component 135 | to download the schema, it will pass it as an argument to the pipeline, and then use a `copy` component to 136 | pass it down as an artifact. 
137 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx 2 | mock==1.0.1 3 | alabaster>=0.7,<0.8,!=0.7.5 4 | commonmark==0.8.1 5 | recommonmark==0.5.0 6 | sphinx-rtd-theme 7 | readthedocs-sphinx-ext<2.2 8 | sphinx_material==0.0.30 9 | m2r2 10 | breathe 11 | sphinxcontrib-apidoc>=0.3.0 12 | readthedocs-sphinx-search==0.1.0 13 | jinja2==3.1.2 14 | nbsphinx==0.8.2 15 | nbsphinx-link==1.1.0 16 | ipykernel -------------------------------------------------------------------------------- /docs/tutorials/configure_pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "8e30dc1e-0237-4a9c-94a7-f3495a608ab7", 6 | "metadata": {}, 7 | "source": [ 8 | "# Configuring your own pipeline" 9 | ] 10 | } 11 | ], 12 | "metadata": { 13 | "kernelspec": { 14 | "display_name": "Python 3 (ipykernel)", 15 | "language": "python", 16 | "name": "python3" 17 | }, 18 | "language_info": { 19 | "codemirror_mode": { 20 | "name": "ipython", 21 | "version": 3 22 | }, 23 | "file_extension": ".py", 24 | "mimetype": "text/x-python", 25 | "name": "python", 26 | "nbconvert_exporter": "python", 27 | "pygments_lexer": "ipython3", 28 | "version": "3.9.10" 29 | } 30 | }, 31 | "nbformat": 4, 32 | "nbformat_minor": 5 33 | } 34 | -------------------------------------------------------------------------------- /docs/tutorials/prebuilt_pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "4d7be883-9e12-4f8c-b3b0-6f0505065da9", 6 | "metadata": {}, 7 | "source": [ 8 | "# Using the pre-built pipeline" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "f5ba48c3-1176-4a47-a146-c45e20fb6645", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "!pip install rexify" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "id": "d0e1f889-7ba1-458c-bbee-240cf0ad3b19", 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "!rexify pipeline create --args" 29 | ] 30 | } 31 | ], 32 | "metadata": { 33 | "kernelspec": { 34 | "display_name": "Python 3 (ipykernel)", 35 | "language": "python", 36 | "name": "python3" 37 | }, 38 | "language_info": { 39 | "codemirror_mode": { 40 | "name": "ipython", 41 | "version": 3 42 | }, 43 | "file_extension": ".py", 44 | "mimetype": "text/x-python", 45 | "name": "python", 46 | "nbconvert_exporter": "python", 47 | "pygments_lexer": "ipython3", 48 | "version": "3.9.10" 49 | } 50 | }, 51 | "nbformat": 4, 52 | "nbformat_minor": 5 53 | } 54 | -------------------------------------------------------------------------------- /docs/tutorials/quickstart.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "07e2eea0-dc4a-436c-8605-04df80a20d45", 6 | "metadata": {}, 7 | "source": [ 8 | "# Quickstart\n", 9 | "\n", 10 | "Let's start by installing Rexify" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "fee1baf9-f430-44d3-a2f0-82f9cb17f107", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "!pip install rexify" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "f6ed5c5a-f691-4871-94f3-97895132bf91", 26 | "metadata": {}, 27 | 
"source": [ 28 | "Get some data:" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "id": "7e7c8d3a-400c-4a6b-bf1f-171c73793c16", 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "!mkdir data\n", 39 | "!curl --get https://storage.googleapis.com/roostr-ratings-matrices/rexify/completions.csv > data/events.csv" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "id": "e9fbc3cd-e598-4270-a15e-d9a5cfb9ba5f", 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "import pandas as pd" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "id": "89e2d0b3-f0fd-4094-b64e-ccca7ae24705", 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "events = pd.read_csv('data/events.csv')\n", 60 | "events" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "id": "47ab6ec6-0d08-40c4-83c6-bd797ae40aca", 66 | "metadata": {}, 67 | "source": [ 68 | "Next, we need to specify our schema:" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "id": "09a944b4-045a-49c0-9e6a-efa2f2be14ae", 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "schema = {\n", 79 | " \"user\": {\n", 80 | " \"account_id\": \"id\",\n", 81 | " },\n", 82 | " \"item\": {\n", 83 | " \"program_id\": \"id\",\n", 84 | " },\n", 85 | " \"context\": {}\n", 86 | "}" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "id": "ea75dc34-0aa3-4d2f-a938-12734d57bff9", 92 | "metadata": {}, 93 | "source": [ 94 | "To preprocess our data, we can use the `FeatureExtractor`" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "id": "cbb99040-4e6c-42f9-87dc-1cbe033989b6", 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "from rexify.features import FeatureExtractor" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "id": "616e0441-d2ef-4d2d-8524-35635ed310a1", 110 | "metadata": {}, 111 | "source": [ 112 | "We just need to pass it the schema, and it's ready to roll out." 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "id": "0198ea5f-bd27-4304-a4ae-9218fcccc7eb", 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "feat = FeatureExtractor(schema=schema)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "id": "40911616-99d7-4510-8946-7219d507b87b", 128 | "metadata": {}, 129 | "source": [ 130 | "As a scikit-learn Transformer, it has two main methods: `.fit()` and `.transform()`. What `.fit_transform()` essentially does is: `.fit().transform()`.\n", 131 | "\n", 132 | "During `.fit()`, it will take the schema, and infer what the preprocessing should look like - what transformations it should apply to the data before it's ready to be passed to the model. During `.transform()` it will apply those transformations, resulting in a `numpy.array` with the same number of rows as the original data." 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "id": "8f12e2f1-a724-4139-9102-009b11cda8df", 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "features = feat.fit_transform(events)\n", 143 | "features" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "id": "011cd59c-d754-4a22-af0a-de65e81b68f3", 149 | "metadata": {}, 150 | "source": [ 151 | "The `.make_dataset()` method converts the numpy array to a `tf.data.Dataset` with the format it's expecting." 
152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "id": "213b3c47-d612-41d1-a2f1-015f6c0b9b92", 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "dataset = feat.make_dataset(features).batch(512)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "id": "d356f43c-a722-4bfd-bb0c-12a081d39316", 167 | "metadata": {}, 168 | "source": [ 169 | "We can now take our `Recommender` model and instantiate it.\n", 170 | "\n", 171 | "During `.fit`, our `FeatureExtractor` also learns the right model parameters, so we don't need to worry about them. They're stored in the `model_params` property." 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "id": "b1826f76-56a2-44a9-bf49-0854ce1c678a", 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "from rexify.models import Recommender" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "id": "73ff6889-8fc9-4cdf-bf5e-3be307e03235", 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "model = Recommender(**feat.model_params)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "id": "59a0a545-6e0d-4b3d-927e-0282e7760820", 197 | "metadata": {}, 198 | "source": [ 199 | "Being a `tensorflow.keras.Model` itself, in order to fit it, we need to first compile it:" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "id": "62e89747-42fb-4fee-a49f-56328f208b5c", 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "model.compile()" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "id": "d507a703-afa6-44f9-b24c-7362971da047", 215 | "metadata": {}, 216 | "source": [ 217 | "To fit it, all we need to do is pass our `tf.data.Dataset`:" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "id": "0d1ef245-2b9c-4bd0-a256-60595a0b699f", 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "# model.fit(dataset)" 228 | ] 229 | } 230 | ], 231 | "metadata": { 232 | "kernelspec": { 233 | "display_name": "Python 3 (ipykernel)", 234 | "language": "python", 235 | "name": "python3" 236 | }, 237 | "language_info": { 238 | "codemirror_mode": { 239 | "name": "ipython", 240 | "version": 3 241 | }, 242 | "file_extension": ".py", 243 | "mimetype": "text/x-python", 244 | "name": "python", 245 | "nbconvert_exporter": "python", 246 | "pygments_lexer": "ipython3", 247 | "version": "3.9.10" 248 | } 249 | }, 250 | "nbformat": 4, 251 | "nbformat_minor": 5 252 | } 253 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "rexify" 3 | version = "0.0.0" 4 | description = "Streamlined Recommender System workflows with TensorFlow and Kubeflow" 5 | authors = ["José Medeiros "] 6 | license = "MIT" 7 | readme = "README.md" 8 | documentation = "https://rexify.readthedocs.io" 9 | packages = [{ include = "rexify" }] 10 | classifiers = [ 11 | "Development Status :: 3 - Alpha", 12 | "Intended Audience :: Developers", 13 | "Intended Audience :: Information Technology", 14 | "License :: OSI Approved :: MIT License", 15 | "Operating System :: OS Independent", 16 | "Programming Language :: Python :: 3 :: Only", 17 | "Programming Language :: Python :: 3.8", 18 | "Programming Language :: Python :: 3.9", 19 | "Programming Language :: Python :: 3.10", 20 | "Topic :: Software 
Development", 21 | "Topic :: Software Development :: Libraries", 22 | "Topic :: Software Development :: Libraries :: Python Modules", 23 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 24 | ] 25 | 26 | [tool.poetry.dependencies] 27 | python = ">=3.8,<3.11" 28 | tensorflow = { version = "2.9.0", markers = "sys_platform != 'darwin'" } 29 | tensorflow_metal = { version = "0.5.0", markers = "sys_platform == 'darwin'"} 30 | tensorflow_macos = { version = "2.9.0", markers = "sys_platform == 'darwin'"} 31 | tensorflow_recommenders = ">=0.7.2" 32 | scikit-learn = "1.*" 33 | pandas = "^1.4.0" 34 | numpy = ">=1.22.3" 35 | kfp = { version = "^1.8.0", optional = true } 36 | mlflow = { version = "^2.3.0", optional = true } 37 | scann = { version = "^1.2.3", markers = "sys_platform != 'darwin'", optional = true } 38 | fsspec = { version = "2023.4.0", optional = true } 39 | 40 | [tool.poetry.extras] 41 | mlflow = ["mlflow"] 42 | scann = ["scann"] 43 | kfp = ["kfp", "fsspec"] 44 | 45 | [tool.poetry.dev-dependencies] 46 | pytest = "^7.1.2" 47 | flake8 = "^5.0.4" 48 | black = "^22.6.0" 49 | isort = "^5.10.1" 50 | pre-commit = "^2.20.0" 51 | darglint = ">=1.8.1" 52 | coverage = {extras = ["toml"], version = ">=6.2"} 53 | interrogate = "^1.5.0" 54 | 55 | [tool.isort] 56 | profile = "black" 57 | lines_after_imports = 2 58 | 59 | [tool.darglint] 60 | strictness = "long" 61 | 62 | [tool.mypy] 63 | disallow_any_generics = true 64 | disallow_subclassing_any = true 65 | disallow_untyped_calls = true 66 | disallow_untyped_defs = true 67 | disallow_incomplete_defs = true 68 | check_untyped_defs = true 69 | disallow_untyped_decorators = true 70 | no_implicit_optional = true 71 | warn_redundant_casts = true 72 | warn_unused_ignores = true 73 | warn_return_any = true 74 | implicit_reexport = false 75 | strict_equality = true 76 | 77 | [tool.coverage.paths] 78 | source = ["rexify"] 79 | 80 | [tool.coverage.run] 81 | branch = true 82 | source = ["rexify"] 83 | 84 | [tool.coverage.report] 85 | show_missing = true 86 | exclude_lines = ["if __name__ == .__main__.:", "_cmd"] 87 | omit = ["*/__init__.py"] 88 | 89 | [tool.interrogate] 90 | ignore-init-method = true 91 | ignore-init-module = true 92 | ignore-magic = true 93 | ignore-semiprivate = true 94 | ignore-private = true 95 | ignore-module = true 96 | ignore-nested-functions = true 97 | ignore-property-decorators = true 98 | exclude = ["docs", "build", "rexify/pipeline.py", "*/exceptions/*"] 99 | ignore-regex = ["call", "get_config", "compute_loss"] 100 | verbose = 0 101 | omit-covered-files = false 102 | quiet = false 103 | color = false 104 | 105 | 106 | [build-system] 107 | requires = ["poetry-core>=1.0.0"] 108 | build-backend = "poetry.core.masonry.api" 109 | -------------------------------------------------------------------------------- /rexify/__init__.py: -------------------------------------------------------------------------------- 1 | from .data import Events, Items, Output, Users 2 | from .features.extractor import FeatureExtractor 3 | from .models import Recommender 4 | from .schema import Schema 5 | 6 | 7 | BASE_IMAGE = "joseprsm/rexify" 8 | -------------------------------------------------------------------------------- /rexify/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .input import Events, Items, Users 2 | from .output import Output 3 | -------------------------------------------------------------------------------- /rexify/data/base.py: 
-------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from pathlib import Path 3 | 4 | import pandas as pd 5 | 6 | from rexify.features.base import HasSchemaMixin 7 | from rexify.schema import Schema 8 | 9 | 10 | class BaseDataFrame(pd.DataFrame, HasSchemaMixin): 11 | def __init__(self, data: pd.DataFrame, schema: Schema) -> None: 12 | pd.DataFrame.__init__(self, data) 13 | HasSchemaMixin.__init__(self, schema=schema) 14 | 15 | @abstractmethod 16 | def load(cls, path: str | Path, **kwargs): 17 | pass 18 | -------------------------------------------------------------------------------- /rexify/data/input.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from pathlib import Path 3 | 4 | import pandas as pd 5 | from sklearn.model_selection import train_test_split 6 | 7 | from rexify.data.base import BaseDataFrame 8 | from rexify.schema import Schema 9 | 10 | 11 | class Input(BaseDataFrame): 12 | def __init__(self, data: pd.DataFrame, schema: Schema) -> None: 13 | super().__init__(data, schema) 14 | 15 | @classmethod 16 | def load(cls, path: str | Path, load_fn: str = "read_csv", schema: Schema = None): 17 | return cls(data=getattr(pd, load_fn)(path), schema=schema) 18 | 19 | def split(self, **kwargs): 20 | train, val = train_test_split(self, **kwargs) 21 | return self.__class__(train, self.schema), self.__class__(val, self.schema) 22 | 23 | @abstractmethod 24 | def generate(cls, n: int = 100): 25 | raise NotImplementedError 26 | 27 | 28 | class Events(Input): 29 | def __init__(self, data: pd.DataFrame, schema: Schema) -> None: 30 | super().__init__(data, schema) 31 | 32 | 33 | class Users(Input): 34 | def __init__(self, data: pd.DataFrame, schema: Schema) -> None: 35 | super().__init__(data, schema) 36 | 37 | 38 | class Items(Input): 39 | def __init__(self, data: pd.DataFrame, schema: Schema) -> None: 40 | super().__init__(data, schema) 41 | -------------------------------------------------------------------------------- /rexify/data/output.py: -------------------------------------------------------------------------------- 1 | import json 2 | import warnings 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import tensorflow as tf 8 | 9 | from rexify.data.base import BaseDataFrame 10 | from rexify.schema import Schema 11 | from rexify.utils import get_target_id, make_dirs 12 | 13 | 14 | class Output(BaseDataFrame): 15 | def __init__( 16 | self, 17 | data: pd.DataFrame, 18 | schema: Schema, 19 | ranking_features: list[str] | None = None, 20 | ) -> None: 21 | super().__init__(data, schema) 22 | with warnings.catch_warnings(): 23 | warnings.filterwarnings("ignore") 24 | self._ranking_features = ranking_features 25 | 26 | @classmethod 27 | def load(cls, path: str | Path): 28 | path = Path(path) 29 | 30 | history = pd.read_csv(path / "history.csv") 31 | features = pd.read_csv(path / "features.csv") 32 | features["history"] = history.values.tolist() 33 | del history 34 | 35 | schema = Schema.from_json(path / "schema.json") 36 | with open(path / "ranks.json", "r") as f: 37 | ranking_features = json.load(f) 38 | 39 | return cls(features, schema=schema, ranking_features=ranking_features) 40 | 41 | def save(self, path: str | Path, name: str = None): 42 | path = Path(path) 43 | path = path / name if name else path 44 | 45 | history = pd.DataFrame(np.stack(self.loc[:, "history"].values)) 46 | 47 | make_dirs(path) 48 | 
history.to_csv(path / "history.csv", index=None) 49 | self.drop("history", axis=1).to_csv(path / "features.csv", index=None) 50 | 51 | with open(path / "ranks.json", "w") as f: 52 | json.dump(self._ranking_features, f) 53 | 54 | self.schema.save(path / "schema.json") 55 | 56 | def to_dataset(self) -> tf.data.Dataset: 57 | return self._make_dataset().map(self._get_header_fn()) 58 | 59 | def _make_dataset(self) -> tf.data.Dataset: 60 | return tf.data.Dataset.zip( 61 | ( 62 | self._get_target_vector_dataset(self, self._schema, "user"), 63 | self._get_target_vector_dataset(self, self._schema, "item"), 64 | tf.data.Dataset.from_tensor_slices( 65 | np.stack(self["history"].values).astype(np.int32) 66 | ), 67 | self._get_ranking_dataset(self), 68 | ) 69 | ) 70 | 71 | @staticmethod 72 | def _get_target_vector_dataset( 73 | data, schema: Schema, target: str 74 | ) -> tf.data.Dataset: 75 | return tf.data.Dataset.from_tensor_slices( 76 | data.loc[:, get_target_id(schema, target)] 77 | .values.reshape(-1) 78 | .astype(np.int32) 79 | ) 80 | 81 | @staticmethod 82 | def _get_header_fn(): 83 | @tf.autograph.experimental.do_not_convert 84 | def header_fn(user_id, item_id, history, ranks): 85 | return { 86 | "query": {"user_id": user_id, "history": history}, 87 | "candidate": {"item_id": item_id}, 88 | "rank": ranks, 89 | } 90 | 91 | return header_fn 92 | 93 | def _get_ranking_dataset(self, data) -> tf.data.Dataset: 94 | @tf.autograph.experimental.do_not_convert 95 | def add_header(x): 96 | return { 97 | self._ranking_features[i]: x[i] 98 | for i in range(len(self._ranking_features)) 99 | } 100 | 101 | return tf.data.Dataset.from_tensor_slices( 102 | data.loc[:, self._ranking_features].values.astype(np.int32) 103 | ).map(add_header) 104 | 105 | @property 106 | def ranking_features(self): 107 | return self._ranking_features 108 | -------------------------------------------------------------------------------- /rexify/features/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joseprsm/rexify/6efb0cbe8ce9e35b58b200fcb95cf8e65c03d2c2/rexify/features/__init__.py -------------------------------------------------------------------------------- /rexify/features/base.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import re 3 | from pathlib import Path 4 | 5 | from sklearn.base import BaseEstimator, TransformerMixin 6 | from sklearn.compose import make_column_transformer 7 | from sklearn.pipeline import Pipeline 8 | 9 | from rexify.schema import Schema 10 | from rexify.utils import get_target_feature, make_dirs 11 | 12 | 13 | class HasSchemaMixin: 14 | def __init__(self, schema: Schema): 15 | self._schema = schema 16 | 17 | @property 18 | def schema(self): 19 | return self._schema 20 | 21 | 22 | class HasTargetMixin: 23 | 24 | _SUPPORTED_TARGETS = ["user", "item"] 25 | 26 | def __init__(self, target: str): 27 | self._target = target 28 | 29 | @property 30 | def target(self): 31 | return self._target 32 | 33 | @classmethod 34 | def _validate_target(cls, target: str): 35 | if target not in cls._SUPPORTED_TARGETS: 36 | raise ValueError(f"Target {target} not supported") 37 | 38 | 39 | class Serializable: 40 | def save(self, output_dir: str, filename: str = None): 41 | make_dirs(output_dir) 42 | filename = ( 43 | filename or self._camel_to_snake_case(self.__class__.__name__) + ".pickle" 44 | ) 45 | output_path = Path(output_dir) / filename 46 | with open(output_path, "wb") as f: 47 | 
pickle.dump(self, f) 48 | 49 | @classmethod 50 | def load(cls, path: Path | str): 51 | with open(path, "rb") as f: 52 | feat = pickle.load(f) 53 | return feat 54 | 55 | @staticmethod 56 | def _camel_to_snake_case(name: str): 57 | return re.sub(r"(?<!^)(?=[A-Z])", "_", name).lower() 58 | 59 | 60 | class BaseEncoder(HasSchemaMixin, HasTargetMixin): 61 | 62 | ppl: Pipeline 63 | 64 | def __init__(self, dtype: str, target: str, schema: Schema): 65 | HasSchemaMixin.__init__(self, schema) 66 | HasTargetMixin.__init__(self, target) 67 | self._dtype = dtype 68 | self._name = f"{target}_{dtype}Encoder" 69 | self._targets = self._get_features(schema, target, dtype) 70 | 71 | @staticmethod 72 | def _get_features(schema: Schema, target: str, dtype: str) -> list[str]: 73 | return get_target_feature(schema, target, dtype) 74 | 75 | def __iter__(self): 76 | for x in [self._name, self.ppl, self._targets]: 77 | yield x 78 | 79 | def as_tuple(self): 80 | return tuple(self) 81 | 82 | 83 | class BaseTransformer(BaseEstimator, TransformerMixin): 84 | def __init__(self, transformer: TransformerMixin, target_features: list[str]): 85 | super().__init__() 86 | self.transformer = transformer 87 | self.target_features = target_features 88 | 89 | self._column_transformer = make_column_transformer( 90 | (self.transformer, self.target_features), 91 | ) 92 | 93 | def fit(self, X, y=None, **fit_params): 94 | return self 95 | 96 | def transform(self, X): 97 | pass 98 | -------------------------------------------------------------------------------- /rexify/features/extractor.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from sklearn.base import BaseEstimator, TransformerMixin 6 | from sklearn.pipeline import make_pipeline 7 | 8 | from rexify.data import Events, Items, Output, Users 9 | from rexify.features.base import HasSchemaMixin, Serializable 10 | from rexify.features.transform import CustomTransformer, EventEncoder, Sequencer 11 | from rexify.features.transform.entity import EntityTransformer 12 | from rexify.schema import Schema 13 | 14 | 15 | class FeatureExtractor(BaseEstimator, TransformerMixin, HasSchemaMixin, Serializable): 16 | 17 | _model_params: dict[str, Any] 18 | _item_ids: np.ndarray 19 | _user_ids: np.ndarray 20 | 21 | def __init__( 22 | self, 23 | schema: Schema, 24 | users: str = None, 25 | items: str = None, 26 | return_dataset: bool = False, 27 | window_size: int = 3, 28 | custom_transformers: list[CustomTransformer] = None, 29 | ): 30 | HasSchemaMixin.__init__(self, schema) 31 | 32 | self._users = users 33 | self._items = items 34 | self._return_dataset = return_dataset 35 | self._window_size = window_size 36 | 37 | self._timestamp = schema.timestamp 38 | self._custom_transformers = custom_transformers or [] 39 | 40 | self._user_transformer = EntityTransformer( 41 | schema, "user", self._custom_transformers 42 | ) 43 | self._item_transformer = EntityTransformer( 44 | schema, "item", self._custom_transformers 45 | ) 46 | 47 | self._ppl = make_pipeline( 48 | EventEncoder(self._schema), 49 | Sequencer( 50 | self._schema, 51 | timestamp_feature=self._timestamp, 52 | window_size=self._window_size, 53 | ), 54 | ) 55 | 56 | def fit(self, X: Events): 57 | self._fit_transformer(Users) 58 | self._fit_transformer(Items) 59 | 60 | x_ = X.copy() 61 | events = self._encode(self._user_transformer, x_) 62 | events = self._encode(self._item_transformer, events) 63 | _ = self._ppl.fit(events) 64 | 65 | self._model_params = self._get_model_params() 66 | return self 67 | 68 | def transform(self, X: Events) -> Output: 69 | x_ = X.copy() 70 | events = self._encode(self._user_transformer, x_) 71 | events = self._encode(self._item_transformer, events) 72 | events = self._ppl.transform(events) 73 | events = self._drop(events, self._user_transformer) 74 | events = self._drop(events, self._item_transformer) 75 | self._model_params["session_history"] = self.history 76 | 77 
| transformed = Output( 78 | data=events, schema=self._schema, ranking_features=self.ranking_features 79 | ) 80 | 81 | self._user_ids = self._get_ids(transformed, self._user_transformer) 82 | self._item_ids = self._get_ids(transformed, self._item_transformer) 83 | 84 | return transformed.to_dataset() if self._return_dataset else transformed 85 | 86 | def _fit_transformer(self, inputs: Users | Items): 87 | input_name = inputs.__name__.lower() 88 | input_path: str = getattr(self, f"_{input_name}") 89 | transformer = getattr(self, f"_{input_name[:-1]}_transformer") 90 | x = inputs.load(input_path, schema=self._schema) 91 | transformer.fit(x).transform(x) 92 | 93 | @staticmethod 94 | def _encode(transformer: EntityTransformer, data: pd.DataFrame) -> pd.DataFrame: 95 | encoder, feature_names = transformer.encoder 96 | data[feature_names] = encoder.transform(data[feature_names]) 97 | return data 98 | 99 | @staticmethod 100 | def _drop(df: pd.DataFrame, transformer: EntityTransformer): 101 | encoder, id_ = transformer.encoder 102 | return df.loc[df[id_].values.reshape(-1) != encoder.unknown_value, :] 103 | 104 | def _get_model_params(self): 105 | model_params = {} 106 | model_params.update(self._user_transformer.model_params) 107 | model_params.update(self._item_transformer.model_params) 108 | model_params.update({"ranking_features": self.ranking_features}) 109 | model_params["window_size"] = self._window_size 110 | return model_params 111 | 112 | @staticmethod 113 | def _get_ids(df: pd.DataFrame, transformer: EntityTransformer): 114 | return df.loc[:, transformer.encoder[1][0]].values.astype(np.int32) 115 | 116 | @property 117 | def users(self): 118 | return self._users 119 | 120 | @property 121 | def items(self): 122 | return self._items 123 | 124 | @property 125 | def model_params(self): 126 | return self._model_params 127 | 128 | @property 129 | def ranking_features(self): 130 | return self._ppl.steps[0][1].ranking_features 131 | 132 | @property 133 | def history(self): 134 | return self._ppl.steps[1][1].history 135 | 136 | @property 137 | def return_dataset(self): 138 | return self._return_dataset 139 | 140 | @property 141 | def window_size(self): 142 | return self._window_size 143 | 144 | @property 145 | def custom_transformers(self): 146 | return self._custom_transformers 147 | 148 | @property 149 | def item_encoder(self): 150 | return self._item_transformer.encoder[0] 151 | 152 | @property 153 | def item_ids(self): 154 | return self._item_ids 155 | 156 | @property 157 | def user_encoder(self): 158 | return self._user_transformer.encoder[0] 159 | 160 | @property 161 | def user_ids(self): 162 | return self._user_ids 163 | -------------------------------------------------------------------------------- /rexify/features/transform/__init__.py: -------------------------------------------------------------------------------- 1 | from .category import CategoricalEncoder 2 | from .custom import CustomTransformer 3 | from .event import EventEncoder 4 | from .id import IDEncoder 5 | from .number import NumericalEncoder 6 | from .sequence import Sequencer 7 | -------------------------------------------------------------------------------- /rexify/features/transform/category.py: -------------------------------------------------------------------------------- 1 | from sklearn.pipeline import make_pipeline 2 | from sklearn.preprocessing import OneHotEncoder 3 | 4 | from rexify.features.base import BaseEncoder 5 | from rexify.schema import Schema 6 | 7 | 8 | class CategoricalEncoder(BaseEncoder): 9 | def 
__init__(self, schema: Schema, target: str): 10 | super().__init__(dtype="category", target=target, schema=schema) 11 | self.ppl = make_pipeline(OneHotEncoder(sparse_output=False)) 12 | -------------------------------------------------------------------------------- /rexify/features/transform/custom.py: -------------------------------------------------------------------------------- 1 | from sklearn.base import TransformerMixin 2 | 3 | 4 | class CustomTransformer(tuple): 5 | def __new__( 6 | cls, target: str, transformer: TransformerMixin, features: list[str] 7 | ) -> tuple: 8 | name = f"{target}_{''.join([f[0] for f in features])}_customTransformer" 9 | return tuple.__new__(CustomTransformer, (name, transformer, features)) 10 | -------------------------------------------------------------------------------- /rexify/features/transform/entity.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from sklearn.compose import ColumnTransformer 6 | from sklearn.pipeline import Pipeline, make_pipeline 7 | 8 | from rexify.features.base import HasSchemaMixin, HasTargetMixin 9 | from rexify.features.transform import ( 10 | CategoricalEncoder, 11 | CustomTransformer, 12 | IDEncoder, 13 | NumericalEncoder, 14 | ) 15 | from rexify.schema import Schema 16 | from rexify.utils import get_target_id 17 | 18 | 19 | class _FeatureTransformer(ColumnTransformer, HasSchemaMixin, HasTargetMixin): 20 | def __init__(self, schema: Schema, target: str): 21 | HasSchemaMixin.__init__(self, schema=schema) 22 | HasTargetMixin.__init__(self, target=target) 23 | transformers = self._get_transformers() 24 | ColumnTransformer.__init__( 25 | self, transformers=transformers, remainder="passthrough" 26 | ) 27 | 28 | def _get_transformers(self) -> list[tuple[str, Pipeline, list[str]]]: 29 | transformer_list = [] 30 | 31 | cat_encoder = CategoricalEncoder(self._schema, self._target).as_tuple() 32 | transformer_list += [cat_encoder] if cat_encoder[-1] != tuple() else [] 33 | 34 | num_encoder = NumericalEncoder(self._schema, self._target).as_tuple() 35 | transformer_list += [num_encoder] if num_encoder[-1] != tuple() else [] 36 | 37 | return transformer_list 38 | 39 | 40 | class EntityTransformer(ColumnTransformer, HasSchemaMixin, HasTargetMixin): 41 | _features: pd.DataFrame 42 | _model_params: dict[str, Any] 43 | 44 | def __init__( 45 | self, 46 | schema: Schema, 47 | target: str, 48 | custom_transformers: list[CustomTransformer] = None, 49 | ): 50 | HasSchemaMixin.__init__(self, schema) 51 | HasTargetMixin.__init__(self, target) 52 | self._custom_transformers = ( 53 | self._filter_custom_transformers(custom_transformers, self._target) or [] 54 | ) 55 | transformers = [ 56 | self._get_feature_pipeline(self._schema, self._target) 57 | ] + self._custom_transformers 58 | ColumnTransformer.__init__(self, transformers) 59 | 60 | def fit(self, X, y=None): 61 | super().fit(X, y) 62 | n_dims = self._get_n_dims(X) 63 | self._model_params = n_dims 64 | return self 65 | 66 | def transform(self, X) -> pd.DataFrame: 67 | self._features = super().transform(X) 68 | self._features = pd.DataFrame( 69 | self._features[:, :-1], index=self._features[:, -1] 70 | ) 71 | self._features = pd.concat( 72 | [ 73 | self._features, 74 | pd.DataFrame(np.zeros(self._features.shape[1])).transpose(), 75 | ], 76 | ignore_index=True, 77 | ) 78 | 79 | self._model_params.update({f"{self._target}_embeddings": self._features}) 80 | return self._features 81 
| 82 | def _get_n_dims(self, X): 83 | id_col = get_target_id(self._schema, self._target)[0] 84 | input_dims = int(X[id_col].nunique() + 1) 85 | return {f"{self._target}_dims": input_dims} 86 | 87 | @staticmethod 88 | def _filter_custom_transformers( 89 | custom_transformers: list[CustomTransformer], target: str 90 | ): 91 | def target_from_name(x): 92 | return x[0].split("_")[0] == target 93 | 94 | return list(filter(target_from_name, custom_transformers)) 95 | 96 | @staticmethod 97 | def _get_feature_pipeline(schema, target) -> tuple[str, Pipeline, list[str]]: 98 | name = f"{target}_featureExtractor" 99 | ppl = make_pipeline( 100 | IDEncoder(schema, target), 101 | _FeatureTransformer(schema, target), 102 | ) 103 | target_keys = getattr(schema, target).to_dict() 104 | keys = [target_keys.pop("id")] + list(target_keys.keys()) 105 | return name, ppl, keys 106 | 107 | @property 108 | def model_params(self): 109 | return self._model_params 110 | 111 | @property 112 | def identifiers(self): 113 | return self._features.index.values.astype(int) 114 | 115 | @property 116 | def encoder(self): 117 | encoder = self.transformers_[0][1].steps[0][1].transformer.transformers_[0] 118 | return encoder[1], encoder[-1] 119 | 120 | @property 121 | def custom_transformers(self): 122 | return self._custom_transformers 123 | -------------------------------------------------------------------------------- /rexify/features/transform/event.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.base import BaseEstimator, TransformerMixin 3 | from sklearn.compose import make_column_transformer 4 | from sklearn.preprocessing import OneHotEncoder 5 | 6 | from rexify.features.base import HasSchemaMixin 7 | from rexify.schema import Schema 8 | 9 | 10 | class EventEncoder(BaseEstimator, TransformerMixin, HasSchemaMixin): 11 | def __init__(self, schema: Schema): 12 | HasSchemaMixin.__init__(self, schema) 13 | self._event_type = schema.event_type 14 | self._transformer = make_column_transformer( 15 | (OneHotEncoder(sparse_output=False), [self._event_type])  # dense output, so it can be wrapped in a DataFrame below 16 | ) 17 | 18 | def fit(self, X, y=None, **fit_params): 19 | self._transformer.fit(X, y) 20 | return self 21 | 22 | def transform(self, X): 23 | oneh = self._transformer.transform(X) 24 | oneh = pd.DataFrame(oneh, columns=self.transformer.get_feature_names_out()) 25 | x = X.drop(self._event_type, axis=1) 26 | return pd.concat([x, oneh], axis=1) 27 | 28 | @property 29 | def transformer(self): 30 | return self._transformer.transformers_[0][1] 31 | 32 | @property 33 | def ranking_features(self): 34 | return self.transformer.get_feature_names_out().tolist() 35 | -------------------------------------------------------------------------------- /rexify/features/transform/id.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.base import BaseEstimator, TransformerMixin 4 | from sklearn.compose import ColumnTransformer, make_column_transformer 5 | from sklearn.preprocessing import OrdinalEncoder 6 | 7 | from rexify.features.base import HasSchemaMixin, HasTargetMixin 8 | from rexify.utils import get_target_id 9 | 10 | 11 | class IDEncoder(BaseEstimator, TransformerMixin, HasSchemaMixin, HasTargetMixin): 12 | 13 | _transformer: ColumnTransformer 14 | 15 | def __init__(self, schema, target): 16 | HasSchemaMixin.__init__(self, schema) 17 | HasTargetMixin.__init__(self, target) 18 | 19 | def fit(self, X: pd.DataFrame, y=None): 20 | 
target_features = get_target_id(self._schema, self._target) 21 | encoder_args = self._get_encoder_args(X, target_features) 22 | self._transformer = make_column_transformer( 23 | (OrdinalEncoder(**encoder_args), target_features), 24 | remainder="passthrough", 25 | ) 26 | self._transformer.fit(X, y) 27 | return self 28 | 29 | def transform(self, X: pd.DataFrame) -> pd.DataFrame: 30 | x = self._transformer.transform(X) 31 | columns = self._get_features_names_out() 32 | return pd.DataFrame(x, columns=columns) 33 | 34 | def _get_features_names_out(self) -> list[str]: 35 | features = self._transformer.get_feature_names_out() 36 | return [name.split("__")[-1] for name in features] 37 | 38 | @staticmethod 39 | def _get_encoder_args(df: pd.DataFrame, target_features: list[str]): 40 | value = df[target_features].nunique().sum() 41 | return { 42 | "dtype": np.int64, 43 | "handle_unknown": "use_encoded_value", 44 | "unknown_value": value, 45 | } 46 | 47 | @property 48 | def transformer(self): 49 | return self._transformer 50 | 51 | @property 52 | def target_feature(self): 53 | return self._transformer.transformers[0][-1][0] 54 | -------------------------------------------------------------------------------- /rexify/features/transform/number.py: -------------------------------------------------------------------------------- 1 | from sklearn.pipeline import make_pipeline 2 | from sklearn.preprocessing import MinMaxScaler 3 | 4 | from rexify.features.base import BaseEncoder 5 | from rexify.schema import Schema 6 | 7 | 8 | class NumericalEncoder(BaseEncoder): 9 | def __init__(self, schema: Schema, target: str): 10 | super().__init__(dtype="number", target=target, schema=schema) 11 | self.ppl = make_pipeline(MinMaxScaler(feature_range=(-1, 1))) 12 | -------------------------------------------------------------------------------- /rexify/features/transform/sequence.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.base import BaseEstimator, TransformerMixin 4 | 5 | from rexify.features.base import HasSchemaMixin 6 | from rexify.schema import Schema 7 | from rexify.utils import get_target_id 8 | 9 | 10 | class Sequencer(BaseEstimator, TransformerMixin, HasSchemaMixin): 11 | 12 | """Transformer responsible for creating sequential data. 13 | 14 | It creates a new column `history` that holds the previous `window_size` event item IDs. 
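15 | Sequences are left-padded with an out-of-vocabulary item ID (`max(item_id) + 1`, computed during `fit`), and rows whose history cannot be fully formed are dropped, so every retained row carries exactly `window_size` previous item IDs.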
16 | 17 | Args: 18 | schema (rexify.schema.Schema): the data schema 19 | timestamp_feature (str): the dataframe's feature name with a timestamp; defaults to the schema's timestamp 20 | window_size (int): the size of the sliding window 21 | 22 | Examples: 23 | >>> from rexify.features.transform import Sequencer 24 | >>> sequencer = Sequencer(schema, window_size=3) 25 | >>> _ = sequencer.fit(events) 26 | >>> transformed = sequencer.transform(events) 27 | 28 | """ 29 | 30 | _user_id: str 31 | _item_id: str 32 | _columns: list[str] 33 | _padding: list[int] 34 | _history: pd.DataFrame 35 | 36 | def __init__(self, schema: Schema, timestamp_feature: str = None, window_size: int = 3): 37 | super().__init__(schema=schema) 38 | self._timestamp_feature = timestamp_feature or self._schema.timestamp 39 | self._window_size = window_size + 1 40 | 41 | def fit(self, X: pd.DataFrame, *_): 42 | self._user_id = get_target_id(self.schema, "user")[0] 43 | self._item_id = get_target_id(self.schema, "item")[0] 44 | self._columns = [col for col in X.columns if col != self._user_id] 45 | self._padding = [X[self._item_id].max() + 1] * (self._window_size - 2) 46 | return self 47 | 48 | def transform(self, X: pd.DataFrame): 49 | sequences = self._get_sequences(X) 50 | 51 | res = sequences.drop(self._item_id, axis=1).applymap(self._get_last) 52 | res[self._item_id] = sequences.pop(self._item_id) 53 | res["history"] = sequences.pop("history") 54 | res.reset_index(inplace=True) 55 | res = res.loc[res["history"].map(len) == self._window_size - 1, :] 56 | res = res.loc[~res.loc[:, self._timestamp_feature].isna()] 57 | 58 | self._history = self._get_history(res) 59 | 60 | res.drop(self._timestamp_feature, axis=1, inplace=True) 61 | return res 62 | 63 | def _get_sequences(self, df: pd.DataFrame): 64 | sequences: pd.DataFrame = ( 65 | df.sort_values(self._timestamp_feature) 66 | .set_index(self._user_id) 67 | .groupby(level=-1) 68 | .apply(self._mask) 69 | .apply(pd.Series) 70 | .rename(columns=pd.Series(self._columns)) 71 | .applymap(self._pad) 72 | .applymap(self._window) 73 | .apply(lambda x: x.explode()) 74 | ) 75 | 76 | sequences["history"] = sequences[self._item_id].map(lambda x: x[:-1]) 77 | sequences[self._item_id] = sequences[self._item_id].map(self._get_last) 78 | return sequences 79 | 80 | def _get_history(self, df: pd.DataFrame): 81 | return ( 82 | df.groupby([self._user_id]) 83 | .agg({self._timestamp_feature: max, "history": list}) 84 | .drop(self._timestamp_feature, axis=1) 85 | .history.map(self._get_last) 86 | ) 87 | 88 | def _mask(self, df: pd.DataFrame): 89 | return [list(df[col]) for col in self._columns] 90 | 91 | @staticmethod 92 | def _get_last(lst: list): 93 | return lst[-1] 94 | 95 | def _window(self, sequence): 96 | if len(sequence) >= self._window_size: 97 | sequence = np.array(sequence, dtype=object) 98 | 99 | stack = [ 100 | sequence[range(i, i + self._window_size)] 101 | for i in range(len(sequence) - self._window_size + 1) 102 | ] 103 | 104 | if len(stack) > 1: 105 | stack = np.stack(stack) 106 | 107 | return stack 108 | return [sequence] 109 | 110 | def _pad(self, x: list): 111 | return self._padding + x 112 | 113 | @property 114 | def timestamp_feature(self): 115 | return self._timestamp_feature 116 | 117 | @property
118 | def window_size(self): 119 | return self._window_size 120 | 121 | @property 122 | def history(self): 123 | return self._history 124 | -------------------------------------------------------------------------------- /rexify/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .recommender import Recommender 2 | -------------------------------------------------------------------------------- /rexify/models/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | 3 | import tensorflow as tf 4 | 5 | 6 | class DenseSetterMixin(ABC): 7 | @staticmethod 8 | def _set_sequential_model( 9 | layer: str | tf.keras.layers.Layer, layer_sizes: list[int], **kwargs 10 | ) -> list[tf.keras.layers.Layer]: 11 | if type(layer) == str: 12 | layer = getattr(tf.keras.layers, layer) 13 | return [layer(num_neurons, **kwargs) for num_neurons in layer_sizes] 14 | 15 | def _set_dense_layers( 16 | self, layer_sizes: list[int], activation: str | None = "relu" 17 | ) -> list[tf.keras.layers.Layer]: 18 | return self._set_sequential_model("Dense", layer_sizes, activation=activation) 19 | 20 | @staticmethod 21 | def _call_layers(layer_list: list[tf.keras.layers.Layer], inputs): 22 | x = inputs 23 | for layer in layer_list: 24 | x = layer(x) 25 | return x 26 | -------------------------------------------------------------------------------- /rexify/models/callbacks/__init__.py: -------------------------------------------------------------------------------- 1 | from .index import BruteForceCallback, ScaNNCallback 2 | 3 | 4 | try: 5 | from .mlflow import MlflowCallback 6 | except ImportError:  # mlflow is an optional dependency 7 | pass 8 | -------------------------------------------------------------------------------- /rexify/models/callbacks/index.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from rexify.models.index import BruteForce, ScaNN 4 | 5 | 6 | class _IndexCallback(tf.keras.callbacks.Callback): 7 | 8 | INDEX: BruteForce | ScaNN 9 | 10 | def __init__( 11 | self, 12 | sample_query: dict[str, tf.Tensor], 13 | query_model: str = "query_model", 14 | batch_size: int = 128, 15 | **index_args, 16 | ): 17 | super().__init__() 18 | self._query_model = query_model 19 | self._batch_size = batch_size 20 | self._sample_query = sample_query 21 | self._index_args = index_args 22 | self._target = "user" if self._query_model == "query_model" else "item" 23 | 24 | def set(self) -> tf.keras.Model: 25 | query_model = getattr(self.model, self._query_model) 26 | return self.INDEX(query_model, self.model.window_size, **self._index_args) 27 | 28 | def on_train_end(self, logs=None): 29 | index = self.set() 30 | index.index_from_dataset(candidates=self._get_candidates_dataset()) 31 | _ = index(self._sample_query[f"{self._target}_id"]) 32 | setattr(self.model, f"{self._target}_index", index) 33 | 34 | def _get_candidates_dataset(self): 35 | def zip_item_dataset(item): 36 | return (item["item_id"], self.model.candidate_model(item)) 37 | 38 | candidates = self._get_candidates().batch(self._batch_size) 39 | return candidates.map(zip_item_dataset) 40 | 41 | def _get_candidates(self): 42 | def header_fn(item_id): 43 | return {"item_id": tf.cast(item_id, tf.int32)} 44 | 45 | return tf.data.Dataset.from_tensor_slices( 46 | self.model.candidate_model.identifiers 47 | ).map(header_fn) 48 | 49 | 50 | class BruteForceCallback(_IndexCallback): 51 | 52 | INDEX = BruteForce 53 | 54 | 55 | class 
ScaNNCallback(_IndexCallback): 56 | 57 | INDEX = ScaNN 58 | -------------------------------------------------------------------------------- /rexify/models/callbacks/mlflow.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import mlflow 4 | import tensorflow as tf 5 | 6 | 7 | class MlflowCallback(tf.keras.callbacks.Callback): 8 | def __init__( 9 | self, 10 | tracking_uri: str = os.environ.get("MLFLOW_TRACKING_URI"), 11 | experiment_name: str = os.environ.get("MLFLOW_EXPERIMENT_NAME"), 12 | ): 13 | super().__init__() 14 | if tracking_uri: 15 | mlflow.set_tracking_uri(tracking_uri) 16 | if experiment_name: 17 | mlflow.set_experiment(experiment_name) 18 | 19 | def on_train_begin(self, logs=None): 20 | config = self.model.get_config() 21 | 22 | def parse(value): 23 | if type(value).__name__ == "ListWrapper": 24 | return list(value) 25 | return value 26 | 27 | params = {k: parse(v) for k, v in config.items()} 28 | mlflow.log_params(params) 29 | 30 | def on_epoch_end(self, epoch, logs=None): 31 | mlflow.log_metrics(logs) 32 | -------------------------------------------------------------------------------- /rexify/models/index.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow_recommenders as tfrs 3 | 4 | 5 | class _BaseIndex: 6 | def __init__(self, query_model: tf.keras.Model, window_size: int): 7 | self.query_model = query_model 8 | self._window_size = window_size 9 | 10 | def call(self, queries: tf.Tensor, k: int = None): 11 | queries_shape = queries.shape[0] or 1 12 | inputs = ( 13 | { 14 | "user_id": queries, 15 | "history": tf.zeros( 16 | shape=(queries_shape, self._window_size), dtype=tf.int32 17 | ), 18 | } 19 | if self.query_model.name.startswith("query") 20 | else {"item_id": queries} 21 | ) 22 | return self.__class__.__bases__[1].call(self, inputs, k) 23 | 24 | 25 | class BruteForce(_BaseIndex, tfrs.layers.factorized_top_k.BruteForce): 26 | def __init__( 27 | self, 28 | query_model: tf.keras.Model, 29 | window_size: int, 30 | k: int = 2, 31 | name: str = None, 32 | ): 33 | tfrs.layers.factorized_top_k.BruteForce.__init__(self, query_model, k, name) 34 | _BaseIndex.__init__(self, query_model, window_size) 35 | 36 | 37 | class ScaNN(_BaseIndex, tfrs.layers.factorized_top_k.ScaNN): 38 | def __init__( 39 | self, 40 | query_model: tf.keras.Model, 41 | window_size: int, 42 | k: int = 10, 43 | distance_measure: str = "dot_product", 44 | num_leaves: int = 100, 45 | num_leaves_to_search: int = 10, 46 | training_iterations: int = 12, 47 | dimensions_per_block: int = 2, 48 | num_reordering_candidates: int = None, 49 | parallelize_batch_searches: bool = True, 50 | name: str = None, 51 | ): 52 | tfrs.layers.factorized_top_k.ScaNN.__init__( 53 | self, 54 | query_model, 55 | k, 56 | distance_measure, 57 | num_leaves, 58 | num_leaves_to_search, 59 | training_iterations, 60 | dimensions_per_block, 61 | num_reordering_candidates, 62 | parallelize_batch_searches, 63 | name, 64 | ) 65 | _BaseIndex.__init__(self, query_model, window_size) 66 | -------------------------------------------------------------------------------- /rexify/models/lookup.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | 6 | 7 | class _BaseLookupModel(tf.keras.Model): 8 | def __init__(self, ids: np.ndarray, values: np.ndarray): 9 | super().__init__() 10 | self._ids = 
ids 11 | self._values = values 12 | 13 | identifiers_idx = np.arange(0, self._ids.shape[0]) 14 | init = tf.lookup.KeyValueTensorInitializer( 15 | keys=self._ids, 16 | values=identifiers_idx, 17 | key_dtype=tf.int32, 18 | value_dtype=tf.int32, 19 | ) 20 | 21 | self.token_to_id = tf.lookup.StaticHashTable(init, default_value=len(ids)) 22 | 23 | @tf.function(input_signature=[tf.TensorSpec([None], tf.int32)]) 24 | def call(self, inputs): 25 | ids = self.token_to_id.lookup(inputs) 26 | return tf.nn.embedding_lookup(params=self._values, ids=ids) 27 | 28 | @abstractmethod 29 | def get_config(self): 30 | pass 31 | 32 | 33 | class EmbeddingLookup(_BaseLookupModel): 34 | def __init__(self, ids: np.ndarray, embeddings: np.ndarray): 35 | super().__init__(ids=ids, values=embeddings) 36 | 37 | def get_config(self): 38 | return {"ids": self._ids, "embeddings": self._values} 39 | 40 | 41 | class SessionLookup(_BaseLookupModel): 42 | def __init__(self, ids: np.ndarray, sessions: np.ndarray): 43 | super().__init__(ids=ids, values=sessions) 44 | 45 | def get_config(self): 46 | return {"ids": self._ids, "sessions": self._values} 47 | -------------------------------------------------------------------------------- /rexify/models/ranking/__init__.py: -------------------------------------------------------------------------------- 1 | from .ranking import RankingMixin 2 | -------------------------------------------------------------------------------- /rexify/models/ranking/base.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow_recommenders as tfrs 3 | 4 | from rexify.models.base import DenseSetterMixin 5 | 6 | 7 | class BaseRankingModel(tf.keras.Model, DenseSetterMixin): 8 | 9 | output_layer: tf.keras.layers.Dense 10 | task: tfrs.tasks.Ranking 11 | 12 | def __init__(self, layer_sizes: list[int]): 13 | super().__init__() 14 | self._layer_sizes = layer_sizes or [64, 32] 15 | self.hidden_layers = self._set_dense_layers(self._layer_sizes) 16 | 17 | def call(self, inputs, labels): 18 | x = self._call_layers(self.hidden_layers, inputs) 19 | x = self.output_layer(x) 20 | return self.task(labels=labels, predictions=x) 21 | -------------------------------------------------------------------------------- /rexify/models/ranking/event.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow_recommenders as tfrs 3 | 4 | from rexify.models.ranking.base import BaseRankingModel 5 | 6 | 7 | class EventModel(BaseRankingModel): 8 | def __init__(self, layer_sizes: list[int] = None, n_dims: int = 1): 9 | super().__init__(layer_sizes=layer_sizes) 10 | self._n_dims = n_dims 11 | self.output_layer = tf.keras.layers.Dense(self._n_dims, activation="softmax") 12 | self.task = tfrs.tasks.Ranking(loss=tf.keras.losses.CategoricalCrossentropy()) 13 | 14 | def get_config(self): 15 | return { 16 | "layer_sizes": self._layer_sizes, 17 | "n_dims": self._n_dims, 18 | } 19 | -------------------------------------------------------------------------------- /rexify/models/ranking/ranking.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | 3 | import tensorflow as tf 4 | import tensorflow_recommenders as tfrs 5 | 6 | from rexify.models.base import DenseSetterMixin 7 | 8 | 9 | class RankingMixin(tfrs.Model, DenseSetterMixin, ABC): 10 | def __init__( 11 | self, 12 | ranking_features: list[str] = None, 13 | layer_sizes: list[int] = 
None, 14 | weights: dict[str, float] = None, 15 | ): 16 | super().__init__() 17 | self._ranking_features = ranking_features or [] 18 | self._ranking_layers = layer_sizes or [64, 32] 19 | 20 | # todo: validate ranking weights 21 | self._ranking_weights = weights or { 22 | feature: 1.0 for feature in self._ranking_features 23 | } 24 | self._ranking_models = { 25 | feature: self._get_ranking_model() for feature in self._ranking_features 26 | } 27 | self._ranking_tasks = { 28 | feature: tfrs.tasks.Ranking(loss=tf.keras.losses.BinaryCrossentropy()) 29 | for feature in self._ranking_features 30 | } 31 | 32 | def get_loss( 33 | self, 34 | query_embeddings: tf.Tensor, 35 | candidate_embeddings: tf.Tensor, 36 | ranks: dict[str, tf.Tensor], 37 | ): 38 | loss = 0 39 | inputs = tf.concat([query_embeddings, candidate_embeddings], axis=1) 40 | for feature, model in self._ranking_models.items(): 41 | rating_preds = self._call_layers(model, inputs) 42 | loss += ( 43 | self._ranking_tasks[feature]( 44 | labels=ranks[feature], predictions=rating_preds 45 | ) 46 | * self._ranking_weights[feature] 47 | ) 48 | return loss 49 | 50 | def _get_ranking_model(self) -> list[tf.keras.layers.Layer]: 51 | model = self._set_dense_layers(self._ranking_layers) 52 | model.append(tf.keras.layers.Dense(1, activation="sigmoid")) 53 | return model 54 | -------------------------------------------------------------------------------- /rexify/models/recommender.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import tensorflow as tf 3 | 4 | from rexify.models.callbacks import BruteForceCallback 5 | from rexify.models.ranking import RankingMixin 6 | from rexify.models.retrieval import RetrievalMixin 7 | from rexify.utils import get_sample_query 8 | 9 | 10 | class Recommender(RetrievalMixin, RankingMixin): 11 | """The main Recommender model. 12 | 13 | It expects a `tf.data.Dataset` where each example holds three keys: "query", "candidate" and "rank"; 14 | the query part of the dataset has two keys: 15 | 16 | * `user_id`, a scalar with the encoded user ID; 17 | * `history`, an array with the user's `window_size` previous item IDs 18 | 19 | The candidate part of the dataset has a single key: 20 | 21 | * `item_id`, a scalar with the encoded item ID 22 | 23 | The "rank" part holds one scalar per ranking feature, keyed by the feature's name. 24 | 25 | The query tower embeds the user ID, passes the item history through a sequential model, 26 | and runs the looked-up user features through a number of Dense layers; the concatenated 27 | outputs form the query representation. The candidate tower does the same with the item 28 | ID and the looked-up item features. 29 | 30 | An optional Ranking model is also included, granted there are `ranking_features`. 
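31 | During training, the total loss is the retrieval loss computed over the query and candidate embeddings, plus one weighted ranking loss per ranking feature (see `compute_loss`).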
32 | 33 | Args: 34 | user_dims (int): number of possible values for the user ID feature 35 | item_dims (int): number of possible values for the item ID feature 36 | user_embeddings (pd.DataFrame): precomputed user feature embeddings, indexed by encoded user ID 37 | item_embeddings (pd.DataFrame): precomputed item feature embeddings, indexed by encoded item ID 38 | session_history (pd.DataFrame): each user's last `window_size` item IDs, indexed by encoded user ID 39 | window_size (int): length of the item history sequences 40 | embedding_dim (int): output dimension of the embedding layers 41 | feature_layers (list): number of neurons in each layer for the feature models 42 | output_layers (list): number of neurons in each layer for the output models 43 | ranking_features (list): names of the event features to learn ranking objectives for 44 | ranking_layers (list): number of neurons in each layer for the ranking models 45 | ranking_weights (dict): weight of each ranking feature in the total loss 46 | 47 | Examples: 48 | >>> from rexify import FeatureExtractor, Recommender 49 | >>> fe = FeatureExtractor(schema, users="users.csv", items="items.csv") 50 | >>> dataset = fe.fit(events).transform(events).to_dataset() 51 | >>> model = Recommender(**fe.model_params) 52 | >>> model.compile() 53 | >>> _ = model.fit(dataset, batch_size=128) 54 | 55 | """ 56 | 57 | def __init__( 58 | self, 59 | user_dims: int, 60 | item_dims: int, 61 | user_embeddings: pd.DataFrame, 62 | item_embeddings: pd.DataFrame, 63 | session_history: pd.DataFrame, 64 | window_size: int = 3, 65 | embedding_dim: int = 32, 66 | feature_layers: list[int] = None, 67 | output_layers: list[int] = None, 68 | ranking_features: list[str] = None, 69 | ranking_layers: list[int] = None, 70 | ranking_weights: dict[str, float] = None, 71 | ): 72 | RetrievalMixin.__init__( 73 | self, 74 | user_dims=user_dims + 1, 75 | item_dims=item_dims + 1, 76 | user_embeddings=user_embeddings, 77 | item_embeddings=item_embeddings, 78 | session_history=session_history, 79 | window_size=window_size, 80 | embedding_dim=embedding_dim, 81 | feature_layers=feature_layers, 82 | output_layers=output_layers, 83 | ) 84 | 85 | RankingMixin.__init__( 86 | self, 87 | ranking_features=ranking_features, 88 | layer_sizes=ranking_layers, 89 | weights=ranking_weights, 90 | ) 91 | 92 | def compute_loss(self, inputs, training: bool = False) -> tf.Tensor: 93 | embeddings = self( 94 | inputs, training=training 95 | )  # Recommender inherits RetrievalMixin's call method 96 | loss = RetrievalMixin.get_loss(self, *embeddings) 97 | loss += RankingMixin.get_loss(self, *embeddings, inputs["rank"]) 98 | return loss 99 | 100 | def fit( 101 | self, 102 | x: tf.data.Dataset, 103 | batch_size: int = None, 104 | epochs: int = 1, 105 | callbacks: list[tf.keras.callbacks.Callback] = None, 106 | validation_data=None, 107 | ): 108 | callbacks = callbacks if callbacks else self._get_callbacks(x, batch_size) 109 | # todo: validate number of index callbacks 110 | # - can't be more than a single index for each model (query, candidate) 111 | 112 | if batch_size: 113 | x = x.batch(batch_size) 114 | if validation_data: 115 | validation_data = validation_data.batch(batch_size) 116 | 117 | return super().fit( 118 | x, epochs=epochs, validation_data=validation_data, callbacks=callbacks 119 | ) 120 | 121 | def get_config(self): 122 | return { 123 | "item_dims": self._item_dims, 124 | "user_dims": self._user_dims, 125 | "output_layers": self._output_layers, 126 | "feature_layers": self._feature_layers, 127 | "ranking_layers": self._ranking_layers, 128 | "ranking_features": self._ranking_features, 129 | "ranking_weights": self._ranking_weights, 130 | } 131 | 132 | @classmethod 133 | def load(cls, export_dir: str) -> 
tf.keras.Model: 134 | return tf.saved_model.load(export_dir) 135 | 136 | @staticmethod 137 | def _get_callbacks(x, batch_size: int = None) -> list[tf.keras.callbacks.Callback]: 138 | # required to set index shapes 139 | sample_query = get_sample_query(x)["query"] 140 | 141 | def get_index_callback(): 142 | try: 143 | import scann  # noqa: F401 144 | 145 | from rexify.models.callbacks import ScaNNCallback 146 | 147 | return ScaNNCallback(sample_query, batch_size=batch_size) 148 | 149 | except ImportError: 150 | return BruteForceCallback(sample_query, batch_size=batch_size) 151 | 152 | def get_mlflow_callback(): 153 | try: 154 | from rexify.models.callbacks import MlflowCallback 155 | 156 | return MlflowCallback() 157 | 158 | except ImportError: 159 | return 160 | 161 | callbacks = [get_index_callback(), get_mlflow_callback()] 162 | callbacks = callbacks[:-1] if callbacks[-1] is None else callbacks 163 | 164 | return callbacks 165 | -------------------------------------------------------------------------------- /rexify/models/retrieval/__init__.py: -------------------------------------------------------------------------------- 1 | from .retrieval import RetrievalMixin 2 | -------------------------------------------------------------------------------- /rexify/models/retrieval/candidate.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | from rexify.models.retrieval.tower import TowerModel 5 | 6 | 7 | class CandidateModel(TowerModel): 8 | """Tower model responsible for computing the candidate representations 9 | 10 | Args: 11 | n_items (int): number of possible values for the item ID feature 12 | identifiers (np.array): the known encoded item IDs 13 | feature_embeddings (np.array): the precomputed item feature embeddings 14 | embedding_dim (int): output dimension of the embedding layer 15 | output_layers (list): number of neurons in each layer for the output model 16 | feature_layers (list): number of neurons in each layer for the feature model 17 | 18 | Examples: 19 | 20 | >>> import numpy as np 21 | >>> from rexify.models.retrieval.candidate import CandidateModel 22 | >>> model = CandidateModel(n_items=15, identifiers=np.arange(15), feature_embeddings=np.random.rand(15, 8)) 23 | >>> model({'item_id': tf.constant([1])}) 24 | <tf.Tensor: shape=(1, 32), dtype=float32, numpy=...> 25 | """ 26 | 27 | def __init__( 28 | self, 29 | n_items: int, 30 | identifiers: np.array, 31 | feature_embeddings: np.array, 32 | embedding_dim: int = 32, 33 | output_layers: list[int] = None, 34 | feature_layers: list[int] = None, 35 | ): 36 | super().__init__( 37 | "item_id", 38 | n_items, 39 | identifiers, 40 | feature_embeddings, 41 | embedding_dim, 42 | output_layers, 43 | feature_layers, 44 | ) 45 | 46 | def call(self, inputs: dict[str, tf.Tensor], training: bool = None) -> tf.Tensor: 47 | x = self.embedding_layer(inputs[self._id_feature]) 48 | features = self.lookup_model(inputs[self._id_feature]) 49 | feature_embedding = self._call_layers(self.feature_model, features) 50 | x = tf.concat([x, feature_embedding], axis=1) 51 | x = self._call_layers(self.output_model, x) 52 | return x 53 | 54 | def get_config(self): 55 | config = super().get_config() 56 | config["n_items"] = self._n_dims 57 | return config 58 | -------------------------------------------------------------------------------- /rexify/models/retrieval/query.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import tensorflow as tf 4 | 5 | from rexify.models.lookup import SessionLookup 6 | from rexify.models.retrieval.tower import TowerModel 7 | from rexify.models.sequential import SequentialModel 8 | 9 | 10 | class 
QueryModel(TowerModel): 11 | """Tower model responsible for computing the query representations 12 | 13 | Args: 14 | n_users (int): number of possible values for the user ID feature 15 | n_items (int): number of possible values for the item ID feature 16 | identifiers (np.array): the known encoded user IDs 17 | feature_embeddings (np.array): the precomputed user feature embeddings 18 | session_history (pd.Series | pd.DataFrame): each user's previous item IDs, indexed by encoded user ID 19 | embedding_dim (int): output dimension of the embedding layer 20 | output_layers (list): number of neurons in each layer for the output model 21 | feature_layers (list): number of neurons in each layer for the feature model 22 | 23 | Examples: 24 | 25 | >>> import numpy as np, pandas as pd 26 | >>> from rexify.models.retrieval.query import QueryModel 27 | >>> history = pd.Series([np.zeros(3, dtype=int)] * 15) 28 | >>> model = QueryModel(n_users=15, n_items=30, identifiers=np.arange(15), feature_embeddings=np.random.rand(15, 8), session_history=history) 29 | >>> model({"user_id": tf.constant([1])}) 30 | <tf.Tensor: shape=(1, 32), dtype=float32, numpy=...> 31 | """ 32 | 33 | def __init__( 34 | self, 35 | n_users: int, 36 | n_items: int, 37 | identifiers: np.array, 38 | feature_embeddings: np.array, 39 | session_history: pd.DataFrame, 40 | embedding_dim: int = 32, 41 | output_layers: list[int] = None, 42 | feature_layers: list[int] = None, 43 | recurrent_layers: list[int] = None, 44 | sequential_dense_layers: list[int] = None, 45 | ): 46 | super().__init__( 47 | "user_id", 48 | n_users, 49 | identifiers, 50 | feature_embeddings, 51 | embedding_dim, 52 | output_layers, 53 | feature_layers, 54 | ) 55 | self._n_items = n_items 56 | self.sequential_model = SequentialModel( 57 | n_dims=n_items, 58 | embedding_dim=self._embedding_dim, 59 | recurrent_layer_sizes=recurrent_layers, 60 | dense_layer_sizes=sequential_dense_layers, 61 | ) 62 | self.session_lookup = SessionLookup( 63 | ids=session_history.index.values.astype(int), 64 | sessions=np.stack(session_history.values).astype(int), 65 | ) 66 | 67 | def call(self, inputs: dict[str, tf.Tensor], training: bool = None) -> tf.Tensor: 68 | x = self.embedding_layer(inputs[self._id_feature]) 69 | features = [self.lookup_model(inputs[self._id_feature])] 70 | 71 | history = ( 72 | self.session_lookup(inputs[self._id_feature]) 73 | if not training 74 | else inputs["history"] 75 | ) 76 | 77 | sequential_embedding = self.sequential_model(history) 78 | x = tf.concat([x, sequential_embedding], axis=1) 79 | 80 | features = tf.concat(features, axis=1) if len(features) > 1 else features[0] 81 | feature_embedding = self._call_layers(self.feature_model, features) 82 | x = tf.concat([x, feature_embedding], axis=1) 83 | 84 | x = self._call_layers(self.output_model, x) 85 | return x 86 | 87 | def get_config(self): 88 | config = super().get_config() 89 | config["user_id"] = self._id_feature 90 | config["n_users"] = self._n_dims 91 | config["n_items"] = self._n_items 92 | return config 93 | -------------------------------------------------------------------------------- /rexify/models/retrieval/retrieval.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | 3 | import pandas as pd 4 | import tensorflow as tf 5 | import tensorflow_recommenders as tfrs 6 | 7 | from rexify.models.retrieval.candidate import CandidateModel 8 | from rexify.models.retrieval.query import QueryModel 9 | 10 | 11 | class RetrievalMixin(tfrs.Model, ABC): 12 | def __init__( 13 | self, 14 | user_dims: int, 15 | item_dims: int, 16 | user_embeddings: pd.DataFrame, 17 | item_embeddings: pd.DataFrame, 18 | session_history: pd.DataFrame, 19 | window_size: int = 3, 20 | embedding_dim: int = 32, 21 | feature_layers: list[int] = None, 22 | output_layers: list[int] = None, 23 | **kwargs 24 | ): 25 | super().__init__() 26 | self._user_dims = user_dims 27 | self._item_dims = item_dims 28 | self._window_size = window_size 29 | self._embedding_dim = 
embedding_dim 30 | self._output_layers = output_layers or [64, 32] 31 | self._feature_layers = feature_layers or [64, 32, 16] 32 | joint_args = { 33 | "embedding_dim": self._embedding_dim, 34 | "output_layers": self._output_layers, 35 | "feature_layers": self._feature_layers, 36 | } 37 | 38 | self.query_model = QueryModel( 39 | self._user_dims, 40 | self._item_dims, 41 | identifiers=user_embeddings.index.values.astype(int), 42 | feature_embeddings=user_embeddings.values.astype(float), 43 | session_history=session_history, 44 | **joint_args 45 | ) 46 | 47 | self.candidate_model = CandidateModel( 48 | self._item_dims, 49 | identifiers=item_embeddings.index.values.astype(int), 50 | feature_embeddings=item_embeddings.values.astype(float), 51 | **joint_args 52 | ) 53 | 54 | self.retrieval_task = tfrs.tasks.Retrieval() 55 | 56 | def call(self, inputs, training: bool = False): 57 | query_embeddings: tf.Tensor = self.query_model( 58 | inputs["query"], training=training 59 | ) 60 | candidate_embeddings: tf.Tensor = self.candidate_model( 61 | inputs["candidate"], training=training 62 | ) 63 | return query_embeddings, candidate_embeddings 64 | 65 | def get_loss(self, *embeddings): 66 | return self.retrieval_task(*embeddings) 67 | 68 | @property 69 | def window_size(self): 70 | return self._window_size 71 | -------------------------------------------------------------------------------- /rexify/models/retrieval/tower.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | 6 | from rexify.models.base import DenseSetterMixin 7 | from rexify.models.lookup import EmbeddingLookup 8 | 9 | 10 | class TowerModel(tf.keras.Model, DenseSetterMixin): 11 | """Base tower model, combining an ID embedding with precomputed feature embeddings. 12 | 13 | Args: 14 | id_feature (str): the name of the ID feature 15 | n_dims (int): number of possible values for the ID feature 16 | embedding_dim (int): output dimension of the embedding layer 17 | layer_sizes (list): number of neurons in each layer for the output model 18 | feature_layers (list): number of neurons in each layer for the feature model 19 | 20 | Attributes: 21 | embedding_layer (tf.keras.layers.Embedding): embedding layer for the ID feature 22 | feature_model (list): Dense layers applied to the looked-up feature embeddings 23 | output_model (list): Dense layers that produce the final representation 24 | """ 25 | 26 | def __init__( 27 | self, 28 | id_feature: str, 29 | n_dims: int, 30 | identifiers: np.array, 31 | feature_embeddings: np.array, 32 | embedding_dim: int = 32, 33 | layer_sizes: list[int] = None, 34 | feature_layers: list[int] = None, 35 | ): 36 | super().__init__() 37 | self._id_feature = id_feature 38 | self._n_dims = n_dims 39 | self._embedding_dim = embedding_dim 40 | self._layer_sizes = layer_sizes or [64, 32] 41 | self._feature_layers = feature_layers or [64, 32, 16] 42 | self._identifiers = identifiers 43 | self._target_features = feature_embeddings 44 | 45 | self.embedding_layer = tf.keras.layers.Embedding(n_dims, embedding_dim) 46 | self.feature_model = self._set_dense_layers(self._feature_layers) 47 | self.lookup_model = EmbeddingLookup( 48 | ids=self._identifiers, embeddings=self._target_features 49 | ) 50 | self.output_model = self._set_dense_layers(self._layer_sizes, activation=None) 51 | 52 | @abstractmethod 53 | def call(self, inputs: dict[str, tf.Tensor], training: bool = None): 54 | raise NotImplementedError 55 | 56 | def get_config(self): 57 | return { 58 | "id_features": self._id_feature, 59 | "n_dims": self._n_dims, 60 | "embedding_dim": self._embedding_dim, 61 | "layer_sizes": self._layer_sizes, 62 | "feature_layers": self._feature_layers, 63 
| "identifiers": self._identifiers, 64 | "feature_embeddings": self._target_features, 65 | } 66 | 67 | @property 68 | def identifiers(self): 69 | return self._identifiers 70 | -------------------------------------------------------------------------------- /rexify/models/sequential.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from rexify.models.base import DenseSetterMixin 4 | 5 | 6 | class SequentialModel(tf.keras.Model, DenseSetterMixin): 7 | def __init__( 8 | self, 9 | n_dims: int, 10 | embedding_dim: int, 11 | layer: str = "LSTM", 12 | activation: str = "relu", 13 | recurrent_layer_sizes: list[int] = None, 14 | dense_layer_sizes: list[int] = None, 15 | ): 16 | super().__init__() 17 | self._layer = layer 18 | self._n_dims = n_dims 19 | self._embedding_dim = embedding_dim 20 | self._activation = activation 21 | self._recurrent_layer_sizes = recurrent_layer_sizes or [32] * 2 22 | self._dense_layer_sizes = dense_layer_sizes or [32, 16] 23 | 24 | self.embedding_layer = tf.keras.layers.Embedding( 25 | self._n_dims, self._embedding_dim 26 | ) 27 | 28 | self.recurrent_model = self._set_recurrent_model() 29 | 30 | self.output_model = self._set_dense_layers( 31 | layer_sizes=self._dense_layer_sizes[:-1], activation=activation 32 | ) 33 | self.output_model.append(tf.keras.layers.Dense(self._dense_layer_sizes[-1])) 34 | 35 | def call(self, inputs: tf.Tensor): 36 | x = tf.cast(inputs, tf.int32) 37 | x = self.embedding_layer(x) 38 | x = self._call_layers(self.recurrent_model, x) 39 | return self._call_layers(self.output_model, x) 40 | 41 | def _set_recurrent_model(self) -> tf.keras.Model: 42 | layer = getattr(tf.keras.layers, self._layer) 43 | layers = self._set_sequential_model( 44 | layer=layer, 45 | layer_sizes=self._recurrent_layer_sizes[:-1], 46 | return_sequences=True, 47 | ) 48 | layers.append(layer(self._recurrent_layer_sizes[-1])) 49 | return layers 50 | 51 | def get_config(self): 52 | return { 53 | "n_dims": self._n_dims, 54 | "embedding_dim": self._embedding_dim, 55 | "layer": self._layer, 56 | "activation": self._activation, 57 | "recurrent_layer_sizes": self._recurrent_layer_sizes, 58 | "dense_layer_sizes": self._dense_layer_sizes, 59 | } 60 | -------------------------------------------------------------------------------- /rexify/pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | PIPELINE_ROOT = os.environ.get("PIPELINE_ROOT", "outputs") 5 | -------------------------------------------------------------------------------- /rexify/pipeline/__main__.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import typer 4 | from kfp.v2.compiler import Compiler 5 | from kfp.v2.dsl import pipeline 6 | 7 | from rexify.pipeline import PIPELINE_ROOT 8 | from rexify.pipeline.components import load, train 9 | 10 | 11 | @pipeline(name="pipeline", pipeline_root=PIPELINE_ROOT) 12 | def pipeline( 13 | events: str, 14 | users: str, 15 | items: str, 16 | schema: str, 17 | epochs: int = 100, 18 | batch_size: int = 512, 19 | ): 20 | 21 | load_task = load( 22 | events=events, 23 | users=users, 24 | items=items, 25 | schema=schema, 26 | ) 27 | 28 | train_task = train( # noqa:F841 29 | feature_extractor=load_task.outputs["feature_extractor"], 30 | train_data=load_task.outputs["train_data"], 31 | validation_data=load_task.outputs["validation_data"], 32 | batch_size=batch_size, 33 | epochs=epochs, 34 | ) 
44 | 45 | 46 | def compile( 47 | output_path: str = typer.Option( 48 | None, help="Output path for the pipeline definition JSON file" 49 | ), 50 | parameter: list[str] = typer.Option( 51 | None, "--parameter", "-p", help="Pipeline parameter, KEY=VALUE" 52 | ), 53 | ): 54 | output_path = output_path if output_path else "pipeline.json" 55 | 56 | pipeline_parameters = ( 57 | {k: v for k, v in [param.split("=") for param in parameter]} 58 | if parameter 59 | else None 60 | ) 61 | 62 | with warnings.catch_warnings(): 63 | warnings.filterwarnings("ignore") 64 | Compiler().compile( 65 | pipeline_func=pipeline, 66 | package_path=output_path, 67 | pipeline_parameters=pipeline_parameters, 68 | ) 69 | 70 | 71 | if __name__ == "__main__": 72 | typer.run(compile) 73 | -------------------------------------------------------------------------------- /rexify/pipeline/components/__init__.py: -------------------------------------------------------------------------------- 1 | from .load import load 2 | from .train import train 3 | -------------------------------------------------------------------------------- /rexify/pipeline/components/load.py: -------------------------------------------------------------------------------- 1 | from kfp.v2.dsl import Artifact, Dataset, Output, component 2 | 3 | from rexify import BASE_IMAGE 4 | 5 | 6 | @component(base_image=BASE_IMAGE) 7 | def load( 8 | events: str, 9 | users: str, 10 | items: str, 11 | schema: str, 12 | feature_extractor: Output[Artifact], 13 | train_data: Output[Dataset], 14 | validation_data: Output[Dataset], 15 | test_size: float = 0.3, 16 | ): 17 | import json 18 | 19 | from rexify import Events, FeatureExtractor, Output, Schema 20 | 21 | schema = Schema.from_dict(json.loads(schema)) 22 | train, val = Events.load(events, schema=schema).split(test_size=test_size) 23 | 24 | fe = FeatureExtractor(schema, users, items, return_dataset=False) 25 | train: Output = fe.fit(train).transform(train) 26 | val: Output = fe.transform(val) 27 | 28 | fe.save(feature_extractor.path) 29 | train.save(train_data.path)  # save directly under the artifact path, so the train component can load it back 30 | val.save(validation_data.path) 31 | -------------------------------------------------------------------------------- /rexify/pipeline/components/train.py: -------------------------------------------------------------------------------- 1 | from kfp.v2.dsl import Artifact, Dataset, Input, Model, Output, component 2 | 3 | from rexify import BASE_IMAGE 4 | 5 | 6 | @component(base_image=BASE_IMAGE) 7 | def train( 8 | feature_extractor: Input[Artifact], 9 | train_data: Input[Dataset], 10 | validation_data: Input[Dataset], 11 | model: Output[Model], 12 | batch_size: int = 512, 13 | epochs: int = 10, 14 | ): 15 | from rexify import FeatureExtractor, Output, Recommender 16 | 17 | fe = FeatureExtractor.load(feature_extractor.path) 18 | train_data = Output.load(train_data.path).to_dataset()  # the model consumes a tf.data.Dataset 19 | validation_data = Output.load(validation_data.path).to_dataset() 20 | 21 | fit_params = {"batch_size": batch_size, "epochs": epochs} 22 | recommender = Recommender(**fe.model_params) 23 | recommender.compile() 24 | recommender.fit(train_data, validation_data=validation_data, **fit_params) 25 | recommender.save(model.path) 26 | -------------------------------------------------------------------------------- /rexify/schema.py: -------------------------------------------------------------------------------- 1 | import json 2 | from copy import deepcopy 3 | 4 | from rexify.utils import get_target_id 5 | 6 | 7 | class _JSONSerializable: 8 | def to_dict(self): 9 | 
9 |         return self.__dict__.copy()
10 | 
11 | 
12 | class _TargetSchema(_JSONSerializable):
13 | 
14 |     _SUPPORTED_DATA_TYPES = ["category", "number", "id"]
15 | 
16 |     def __init__(self, id_: str, **features):
17 |         self.id = id_
18 |         for feature_name, dtype in features.items():
19 |             self._validate_features(feature_name, dtype)
20 |             setattr(self, feature_name, dtype)
21 | 
22 |     @classmethod
23 |     def _validate_features(cls, feature_name: str, dtype: str):
24 |         if dtype not in cls._SUPPORTED_DATA_TYPES:
25 |             raise ValueError(
26 |                 f"""
27 |                 Data type not supported for feature `{feature_name}`.
28 |                 Supported data types are: {cls._SUPPORTED_DATA_TYPES}
29 |                 """
30 |             )
31 | 
32 | 
33 | class Schema(_JSONSerializable):
34 |     def __init__(
35 |         self,
36 |         user_id: str,
37 |         item_id: str,
38 |         timestamp: str,
39 |         event_type: str,
40 |         user_features: dict[str, str] = None,
41 |         item_features: dict[str, str] = None,
42 |     ):
43 |         user_features = user_features or {}
44 |         item_features = item_features or {}
45 |         self.user = _TargetSchema(user_id, **user_features)
46 |         self.item = _TargetSchema(item_id, **item_features)
47 |         self.timestamp = timestamp
48 |         self.event_type = event_type
49 | 
50 |     @classmethod
51 |     def from_json(cls, schema_path: str):
52 |         with open(schema_path, "r") as f:
53 |             schema = json.load(f)
54 |         return cls.from_dict(schema)
55 | 
56 |     @classmethod
57 |     def from_dict(cls, schema: dict[str, str | dict[str, str]]):
58 |         schema_ = deepcopy(schema)
59 |         user_id = get_target_id(schema_, "user")[0]
60 |         _ = schema_["user"].pop(user_id)
61 | 
62 |         item_id = get_target_id(schema_, "item")[0]
63 |         _ = schema_["item"].pop(item_id)
64 | 
65 |         return cls(
66 |             user_id=user_id,
67 |             item_id=item_id,
68 |             timestamp=schema_["timestamp"],
69 |             event_type=schema_["event_type"],
70 |             user_features=schema_["user"],
71 |             item_features=schema_["item"],
72 |         )
73 | 
74 |     def to_dict(self):
75 |         schema = dict()
76 |         schema["user"] = self.user.to_dict()
77 |         schema["user"][schema["user"]["id"]] = "id"  # swap {"id": "user_id"} into {"user_id": "id"}
78 |         _ = schema["user"].pop("id")
79 | 
80 |         schema["item"] = self.item.to_dict()
81 |         schema["item"][schema["item"]["id"]] = "id"
82 |         _ = schema["item"].pop("id")
83 | 
84 |         schema["event_type"] = self.event_type
85 |         schema["timestamp"] = self.timestamp
86 |         return schema
87 | 
88 |     def save(self, path: str):
89 |         with open(path, "w") as f:
90 |             json.dump(self.to_dict(), f, indent=4)
91 | 
--------------------------------------------------------------------------------
/rexify/utils.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | 
3 | import tensorflow as tf
4 | 
5 | 
6 | def _get_target(schema, target: str):
7 |     return schema[target] if isinstance(schema, dict) else getattr(schema, target).to_dict()
8 | 
9 | 
10 | def get_target_id(schema, target: str) -> list[str]:
11 |     if not isinstance(schema, dict):
12 |         return [getattr(schema, target).id]
13 |     return [k for k, v in schema[target].items() if v == "id"]
14 | 
15 | 
16 | def get_target_feature(schema, target: str, type_: str):
17 |     def mask(x: tuple):
18 |         return x[1] == type_
19 | 
20 |     schema_dict = _get_target(schema, target)
21 |     return list(map(lambda x: x[0], filter(mask, schema_dict.items())))
22 | 
23 | 
24 | def make_dirs(*args):
25 |     for dir_ in args:
26 |         Path(dir_).mkdir(parents=True, exist_ok=True)
27 | 
28 | 
29 | def get_sample_query(x: tf.data.Dataset):
30 |     return list(x.batch(1).take(1))[0]
31 | 
--------------------------------------------------------------------------------
/tests/test_extractor.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tempfile
3 | from pathlib import Path
4 | 
5 | import numpy as np
6 | import pandas as pd
7 | import pytest
8 | from sklearn.preprocessing import StandardScaler
9 | 
10 | from rexify import FeatureExtractor, Output, Schema
11 | from rexify.features.transform import CustomTransformer
12 | 
13 | 
14 | class TestFeatureExtractor:
15 |     @pytest.fixture(scope="class")
16 |     def schema(self):
17 |         user_id = "user_id"
18 |         item_id = "item_id"
19 |         timestamp = "timestamp"
20 |         event_type = "event_type"
21 |         user_features = {"age": "number", "gender": "category"}
22 |         item_features = {"price": "number", "category": "category"}
23 |         return Schema(
24 |             user_id, item_id, timestamp, event_type, user_features, item_features
25 |         )
26 | 
27 |     @pytest.fixture(scope="class")
28 |     def data(self):
29 |         return pd.DataFrame(
30 |             {
31 |                 "user_id": [1, 1, 2, 2, 3, 3],
32 |                 "item_id": [10, 20, 10, 20, 30, 40],
33 |                 "timestamp": [1, 2, 3, 4, 5, 6],
34 |                 "event_type": ["p", "p", "p", "p", "p", "p"],
35 |             }
36 |         )
37 | 
38 |     @pytest.fixture(scope="class")
39 |     def users(self):
40 |         return pd.DataFrame(
41 |             {"user_id": [1, 2, 3], "age": [25, 30, 35], "gender": ["M", "F", "M"]}
42 |         )
43 | 
44 |     @pytest.fixture(scope="class")
45 |     def items(self):
46 |         return pd.DataFrame(
47 |             {"item_id": [10, 20, 30], "price": [1, 2, 3], "category": ["1", "2", "3"]}
48 |         )
49 | 
50 |     @pytest.fixture(scope="class")
51 |     def feat(self, schema, users, items):
52 |         users, items = self._save_users_items(users, items)
53 |         return FeatureExtractor(schema, users, items)
54 | 
55 |     def test_fit(self, data, feat):
56 |         _ = feat.fit(data)
57 | 
58 |     def test_transform(self, data, feat):
59 |         transformed = feat.fit(data).transform(data)
60 |         assert isinstance(transformed, Output)
61 | 
62 |     @pytest.fixture(scope="class")
63 |     def custom_feat(self, schema, users, items):
64 |         users["custom_feature"] = np.random.randint(100, 200, size=users.shape[0])
65 |         users, items = self._save_users_items(users, items)
66 |         return FeatureExtractor(
67 |             schema,
68 |             users,
69 |             items,
70 |             custom_transformers=[
71 |                 CustomTransformer("user", StandardScaler(), ["custom_feature"])
72 |             ],
73 |         )
74 | 
75 |     def test_fit_custom(self, data, feat, custom_feat):
76 |         _ = feat.fit(data)
77 |         _ = custom_feat.fit(data)
78 |         assert feat.model_params["user_embeddings"].shape[1] == 3
79 |         assert custom_feat.model_params["user_embeddings"].shape[1] == 4
80 | 
81 |     def test_save_load(self, data, feat):
82 |         _ = feat.fit(data).transform(data)
83 |         tmp_dir = tempfile.mkdtemp()
84 |         feat.save(tmp_dir)
85 |         feat_path = Path(tmp_dir) / "feature_extractor.pickle"
86 |         assert feat_path.exists()
87 | 
88 |         fe = FeatureExtractor.load(feat_path)
89 |         assert fe
90 | 
91 |     @pytest.fixture(scope="class")
92 |     def fe_no_data(self, schema, users, items):
93 |         users, items = self._save_users_items(users, items)
94 |         return FeatureExtractor(schema, users, items, return_dataset=False)
95 | 
96 |     def test_make_dataset(self, data, fe_no_data):
97 |         transformed = fe_no_data.fit(data).transform(data)
98 | 
99 |         tmp_dir = tempfile.mkdtemp()
100 |         transformed_path = Path(tmp_dir)
101 |         transformed.save(transformed_path)
102 | 
103 |         df = Output.load(transformed_path)
104 |         df.to_dataset()
105 | 
106 |     def _save_users_items(self, users, items) -> tuple[str, str]:
107 |         tmp_dir = tempfile.mkdtemp()
108 | 
109 |         users_path = os.path.join(tmp_dir, "users.csv")
110 |         users.to_csv(users_path)
111 | 
112 |         items_path = os.path.join(tmp_dir, "items.csv")
113 |         items.to_csv(items_path)
114 | 
115 |         return users_path, items_path
116 | 
--------------------------------------------------------------------------------
/tests/test_schema.py:
--------------------------------------------------------------------------------
1 | import json
2 | import tempfile
3 | 
4 | import pytest
5 | 
6 | from rexify.schema import Schema, _TargetSchema
7 | 
8 | 
9 | def test_init():
10 |     user_id = "user_id"
11 |     item_id = "item_id"
12 |     timestamp = "timestamp"
13 |     event_type = "event_type"
14 |     user_features = {"age": "number", "gender": "category"}
15 |     item_features = {"price": "number", "category": "category"}
16 |     schema = Schema(
17 |         user_id=user_id,
18 |         item_id=item_id,
19 |         timestamp=timestamp,
20 |         event_type=event_type,
21 |         user_features=user_features,
22 |         item_features=item_features,
23 |     )
24 | 
25 |     assert schema.user.id == "user_id"
26 |     assert schema.user.age == "number"
27 |     assert schema.user.gender == "category"
28 |     assert schema.item.id == "item_id"
29 |     assert schema.item.price == "number"
30 |     assert schema.item.category == "category"
31 |     assert schema.timestamp == timestamp
32 |     assert schema.event_type == event_type
33 | 
34 | 
35 | def test_from_dict():
36 |     schema_dict = {
37 |         "user": {"user_id": "id", "age": "number", "gender": "category"},
38 |         "item": {"item_id": "id", "price": "number", "category": "category"},
39 |         "timestamp": "timestamp",
40 |         "event_type": "event_type",
41 |     }
42 | 
43 |     schema = Schema.from_dict(schema_dict)
44 | 
45 |     assert schema.user.id == "user_id"
46 |     assert schema.user.age == "number"
47 |     assert schema.user.gender == "category"
48 |     assert schema.item.id == "item_id"
49 |     assert schema.item.price == "number"
50 |     assert schema.item.category == "category"
51 |     assert schema.timestamp == "timestamp"
52 |     assert schema.event_type == "event_type"
53 | 
54 | 
55 | def test_load():
56 |     schema_dict = {
57 |         "user": {"user_id": "id", "age": "number", "gender": "category"},
58 |         "item": {"item_id": "id", "price": "number", "category": "category"},
59 |         "timestamp": "timestamp",
60 |         "event_type": "event_type",
61 |     }
62 | 
63 |     with tempfile.NamedTemporaryFile(mode="w", delete=False) as f:
64 |         json.dump(schema_dict, f)
65 |         f.seek(0)  # seeking flushes the write buffer so the file can be re-read
66 |         schema = Schema.from_json(f.name)
67 | 
68 |     assert schema.user.id == "user_id"
69 |     assert schema.user.age == "number"
70 |     assert schema.user.gender == "category"
71 |     assert schema.item.id == "item_id"
72 |     assert schema.item.price == "number"
73 |     assert schema.item.category == "category"
74 |     assert schema.timestamp == "timestamp"
75 |     assert schema.event_type == "event_type"
76 | 
77 | 
78 | def test_target_schema():
79 |     # Test data types are valid
80 |     target = _TargetSchema("id", feature1="category", feature2="number")
81 |     assert hasattr(target, "id")
82 |     assert hasattr(target, "feature1")
83 |     assert target.feature1 == "category"
84 |     assert hasattr(target, "feature2")
85 |     assert target.feature2 == "number"
86 | 
87 |     # Test unsupported data type throws error
88 |     with pytest.raises(ValueError, match=r"Data type not supported"):
89 |         _ = _TargetSchema("id", feature1="string")
90 | 
91 | 
92 | def test_schema_io():
93 |     # Test Schema to_dict method
94 |     user_id = "user_id"
95 |     item_id = "item_id"
96 |     timestamp = "timestamp"
97 |     event_type = "event_type"
98 |     user_features = {"age": "number", "gender": "category"}
99 |     item_features = {"price": "number", "category": "category"}
100 |     schema = Schema(
101 |         user_id, item_id, timestamp, event_type, user_features, item_features
102 |     )
103 |     assert schema.to_dict() == {
104 |         "user": {"user_id": "id", "age": "number", "gender": "category"},
105 |         "item": {"item_id": "id", "price": "number", "category": "category"},
106 |         "timestamp": "timestamp",
107 |         "event_type": "event_type",
108 |     }
109 | 
110 |     # Test Schema from_dict method
111 |     schema_dict = schema.to_dict()
112 |     schema_loaded = Schema.from_dict(schema_dict)
113 |     assert schema_loaded.to_dict() == schema.to_dict()
114 | 
115 |     # Test Schema load method
116 |     with open("test_schema.json", "w") as f:
117 |         json.dump(schema_dict, f, indent=4)
118 |     schema_loaded = Schema.from_json("test_schema.json")
119 |     assert schema_loaded.to_dict() == schema.to_dict()
120 | 
121 |     # Test Schema save method
122 |     schema.save("test_schema.json")
123 |     with open("test_schema.json", "r") as f:
124 |         schema_loaded = json.load(f)
125 |     assert schema_loaded == schema_dict
126 | 
--------------------------------------------------------------------------------
/tests/test_utils.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | 
3 | import pytest
4 | 
5 | from rexify.utils import get_target_feature, get_target_id, make_dirs
6 | 
7 | 
8 | @pytest.fixture
9 | def schema():
10 |     return {
11 |         "target1": {"key1": "id", "key2": "value1"},
12 |         "target2": {"key3": "value2", "key4": "id"},
13 |         "target3": {"key5": "value3", "key6": "value4"},
14 |     }
15 | 
16 | 
17 | def test_get_target_id(schema):
18 |     assert get_target_id(schema, "target1") == ["key1"]
19 |     assert get_target_id(schema, "target2") == ["key4"]
20 |     assert get_target_id(schema, "target3") == []
21 | 
22 | 
23 | def test_get_target_feature(schema):
24 |     assert get_target_feature(schema, "target1", "id") == ["key1"]
25 |     assert get_target_feature(schema, "target1", "value1") == ["key2"]
26 |     assert get_target_feature(schema, "target2", "id") == ["key4"]
27 |     assert get_target_feature(schema, "target2", "value2") == ["key3"]
28 |     assert get_target_feature(schema, "target3", "value3") == ["key5"]
29 |     assert get_target_feature(schema, "target3", "value4") == ["key6"]
30 |     assert get_target_feature(schema, "target3", "value5") == []
31 | 
32 | 
33 | def test_make_dirs(tmpdir):
34 |     dir1 = tmpdir.join("dir1")  # paths only, not created yet, so make_dirs does the work
35 |     dir2 = tmpdir.join("dir2")
36 |     make_dirs(dir1, dir2)
37 | 
38 |     assert Path(dir1).exists()
39 |     assert Path(dir2).exists()
40 | 
--------------------------------------------------------------------------------