├── .circleci └── config.yml ├── .flake8 ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yml ├── CODE_OF_CONDUCT.md ├── Dockerfile ├── LICENSE ├── README.md ├── docs ├── api │ ├── modules.rst │ ├── rexify.cli.rst │ ├── rexify.constants.rst │ ├── rexify.exceptions.rst │ ├── rexify.exceptions.schema.rst │ ├── rexify.features.base.rst │ ├── rexify.features.dataset.rst │ ├── rexify.features.extractor.rst │ ├── rexify.features.pipelines.rst │ ├── rexify.features.rst │ ├── rexify.models.candidate.rst │ ├── rexify.models.query.rst │ ├── rexify.models.recommender.rst │ ├── rexify.models.rst │ ├── rexify.models.tower.rst │ ├── rexify.pipeline.rst │ ├── rexify.rst │ └── rexify.utils.rst ├── conf.py ├── genindex.rst ├── index.rst ├── overview │ ├── architecture.md │ ├── inputs.md │ └── overview.md ├── requirements.txt └── tutorials │ ├── configure_pipeline.ipynb │ ├── prebuilt_pipeline.ipynb │ └── quickstart.ipynb ├── pyproject.toml ├── rexify ├── __init__.py ├── data │ ├── __init__.py │ ├── base.py │ ├── input.py │ └── output.py ├── features │ ├── __init__.py │ ├── base.py │ ├── extractor.py │ └── transform │ │ ├── __init__.py │ │ ├── category.py │ │ ├── custom.py │ │ ├── entity.py │ │ ├── event.py │ │ ├── id.py │ │ ├── number.py │ │ └── sequence.py ├── models │ ├── __init__.py │ ├── base.py │ ├── callbacks │ │ ├── __init__.py │ │ ├── index.py │ │ └── mlflow.py │ ├── index.py │ ├── lookup.py │ ├── ranking │ │ ├── __init__.py │ │ ├── base.py │ │ ├── event.py │ │ └── ranking.py │ ├── recommender.py │ ├── retrieval │ │ ├── __init__.py │ │ ├── candidate.py │ │ ├── query.py │ │ ├── retrieval.py │ │ └── tower.py │ └── sequential.py ├── pipeline │ ├── __init__.py │ ├── __main__.py │ └── components │ │ ├── __init__.py │ │ ├── load.py │ │ └── train.py ├── schema.py └── utils.py └── tests ├── test_extractor.py ├── test_schema.py └── test_utils.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2.1 2 | 3 | orbs: 4 | python: circleci/python@2.1.1 5 | 6 | jobs: 7 | 8 | test: 9 | docker: 10 | - image: cimg/python:3.10 11 | steps: 12 | - checkout 13 | - python/install-packages: 14 | pre-install-steps: [] 15 | pkg-manager: poetry 16 | - run: 17 | name: Run tests 18 | command: | 19 | poetry run pytest 20 | 21 | publish: 22 | docker: 23 | - image: cimg/python:3.10 24 | steps: 25 | - checkout 26 | - run: 27 | name: Build and publish 28 | command: | 29 | poetry build 30 | poetry version $(git describe --tags --abbrev=0) 31 | poetry publish --build --username $PYPI_USERNAME --password $PYPI_PASSWORD 32 | 33 | docker: 34 | docker: 35 | - image: cimg/base:2023.04 36 | environment: 37 | IMAGE_URI: joseprsm/rexify 38 | steps: 39 | - checkout 40 | - setup_remote_docker 41 | - run: 42 | name: Build Docker image 43 | command: docker build . 
-t $IMAGE_URI 44 | - run: 45 | name: Push Docker image 46 | command: | 47 | echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin 48 | docker push $IMAGE_URI 49 | 50 | workflows: 51 | test_only: 52 | jobs: 53 | - test 54 | 55 | test_and_build: 56 | jobs: 57 | - test: &tags_only 58 | filters: 59 | branches: 60 | ignore: /.*/ 61 | tags: 62 | only: /^\d+\.\d+\.\d+$/ 63 | - publish: 64 | <<: *tags_only 65 | requires: 66 | - test 67 | - request_docker: 68 | <<: *tags_only 69 | type: approval 70 | requires: 71 | - test 72 | - docker: 73 | <<: *tags_only 74 | requires: 75 | - request_docker 76 | - publish 77 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 88 3 | select = C,E,F,W,B,B9 4 | ignore = E203, E501, W503 5 | exclude = 6 | docs/conf.py 7 | __init__.py 8 | build -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /build 2 | /dist 3 | 4 | .idea 5 | .env 6 | .coverage 7 | .pytest_cache 8 | 9 | */__pycache__/* 10 | */.ipynb_checkpoints/ 11 | *.egg-info/ 12 | 13 | outputs 14 | 15 | /docs/_build/ 16 | /docs/api 17 | /docs/reference 18 | 19 | .DS_Store 20 | 21 | *.pyc 22 | .vscode/settings.json 23 | 24 | mlruns 25 | 26 | /*.json -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: local 3 | hooks: 4 | - id: black 5 | name: black 6 | language: system 7 | entry: black 8 | types: [ python ] 9 | require_serial: true 10 | - id: flake8 11 | name: flake8 12 | entry: flake8 13 | language: system 14 | types: [ python ] 15 | require_serial: true 16 | - id: isort 17 | name: isort 18 | entry: isort 19 | require_serial: true 20 | language: system 21 | types_or: [cython, pyi, python] 22 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: ubuntu-20.04 5 | tools: 6 | python: "3.10" 7 | 8 | sphinx: 9 | builder: html 10 | configuration: docs/conf.py 11 | 12 | python: 13 | install: 14 | - requirements: docs/requirements.txt 15 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 
14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | joseprsm@gmail.com. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. 
Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. 129 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ARG python="3.10" 2 | ARG filesystem="gcs" 3 | 4 | FROM python:${python} AS base 5 | 6 | RUN case "$(uname -m)" in *arm*|aarch64) ;; *) pip install scann==1.2.3 ;; esac 7 | 8 | RUN pip install pandas numpy scikit-learn fsspec rexify 9 | 10 | FROM base AS fs-s3 11 | 12 | RUN pip install s3fs 13 | 14 | FROM base AS fs-gcs 15 | 16 | RUN pip install gcsfs 17 | 18 | FROM fs-${filesystem} AS final 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 José Medeiros 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 |
3 | 4 |
5 |

6 | 7 |

8 | 9 | Build 10 | 11 | 12 | License 13 | 14 | 15 | Documentation 16 | 17 | 18 | GitHub release 19 | 20 |

21 | 22 | Rexify is a library to streamline recommender systems model development. 23 | 24 | In essence, Rexify adapts dynamically to your data, and outputs high-performing TensorFlow 25 | models that may be used wherever you want, independently of how your data looks. Rexify also includes 26 | modules that handle feature engineering as Scikit-Learn Transformers and Pipelines. 27 | 28 | With Rexify, users may easily train Recommender Systems models just by specifying what their 29 | data looks like. Rexify also comes equipped with pre-built machine learning pipelines which can 30 | be used serverlessly. 31 | 32 | ## What is Rexify? 33 | 34 | Rexify is a low-code personalization tool that makes use of traditional machine learning 35 | frameworks, such as Scikit-Learn and TensorFlow, to create scalable Recommender Systems 36 | workflows that anyone can use. 37 | 38 | ### Who is it for? 39 | 40 | Rexify is a project that simplifies and standardizes the workflow of recommender systems. It is 41 | mostly geared towards people with little to no machine learning knowledge who want to implement 42 | somewhat scalable Recommender Systems in their applications. 43 | 44 | ## Installation 45 | 46 | The easiest way to install Rexify is via `pip`: 47 | 48 | ```shell 49 | pip install rexify 50 | ``` 51 | 52 | ## Quick Tour 53 | 54 | Rexify is meant to be usable right out of the box. All you need to set up your model is interaction 55 | data - something that looks like this: 56 | 57 | | user_id | item_id | timestamp | event_type | 58 |---------|---------|------------|-------------| 59 | | 22 | 67 | 2021/05/13 | Purchase | 60 | | 37 | 9 | 2021/04/11 | Page View | 61 | | 22 | 473 | 2021/04/11 | Add to Cart | 62 | | ... | ... | ... | ... | 63 | | 358 | 51 | 2021/04/11 | Purchase | 64 | 65 | Additionally, you'll need to configure a schema for the data. 66 | This schema is what will allow Rexify to generate a dynamic model and preprocessing steps. 67 | The schema consists of two dictionaries (`user`, `item`) and two key-value 68 | pairs: `event_type` (which should point to the event type column) and `timestamp` ( 69 | which should point to the timestamp column). 70 | 71 | Each of these dictionaries maps feature names to internal data types, 72 | such as `id`, `category`, and `number`. More data types will be available 73 | in the future. 74 | 75 | ```json 76 | { 77 | "user": { 78 | "user_id": "id", 79 | "age": "number" 80 | }, 81 | "item": { 82 | "item_id": "id", 83 | "category": "category" 84 | }, 85 | "timestamp": "timestamp", 86 | "event_type": "event_type" 87 | } 88 | ``` 89 | 90 | Essentially, what Rexify will do is take the schema and dynamically adapt to the data. 91 | 92 | There are two main components in Rexify workflows: `FeatureExtractor` and `Recommender`. 93 | 94 | The `FeatureExtractor` is a scikit-learn Transformer that takes the schema of 95 | the data and transforms the event data accordingly. Another method, `.make_dataset()`, 96 | converts the transformed data into a `tf.data.Dataset`, all correctly configured to be fed 97 | to the `Recommender` model. 98 | 99 | `Recommender` is a `tfrs.Model` that implements the Query and Candidate towers. 100 | During training, the Query tower will take the user ID, user features, and context to 101 | learn an embedding; the Candidate tower will do the same for the item ID and its features.
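To make that concrete, here's a minimal sketch of inspecting one element of the dataset produced by `.make_dataset()`. The `query`/`candidate` key names are assumptions based on the structure described in the architecture docs, and may differ between versions:

```python
import tensorflow as tf


def peek(dataset: tf.data.Dataset) -> None:
    # Take a single element without iterating over the whole dataset
    for element in dataset.take(1):
        print(element["query"]["user_id"])  # query-side (user) inputs
        print(element["candidate"]["item_id"])  # candidate-side (item) inputs
```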
102 | 103 | More information about how the `FeatureExtractor` and the `Recommender` work can be found 104 | [here](https://rexify.readthedocs.io/en/latest/overview/architecture.html). 105 | 106 | A sample Rexify workflow should sort of look like this: 107 | 108 | ````python 109 | 110 | import pandas as pd 111 | 112 | from rexify import Schema, FeatureExtractor, Recommender 113 | 114 | events = pd.read_csv('path/to/events/data') 115 | schema = Schema.load('path/to/schema') 116 | 117 | fe = FeatureExtractor(schema, users='path/to/users/data', items='path/to/items/data', return_dataset=True) 118 | x = fe.fit(events).transform(events) 119 | 120 | model = Recommender(**fe.model_params) 121 | model.compile() 122 | model.fit(x, batch_size=512) 123 | ```` 124 | 125 | When training is complete, you'll have a trained `tf.keras.Model` ready to be used, as 126 | you normally would. 127 | 128 | Alternatively, you can also run: 129 | 130 | ```shell 131 | python -m rexify.pipeline -p events=$EVENTS_PATH -p users=$USER_PATH -p items=$ITEMS_PATH -p schema=$SCHEMA_PATH 132 | ``` 133 | 134 | This will generate a `pipeline.json` file that you can use on Kubeflow Pipelines (or Vertex AI Pipelines). 135 | 136 | ## License 137 | 138 | [MIT](https://github.com/joseprsm/rexify/blob/main/LICENSE) 139 | -------------------------------------------------------------------------------- /docs/api/modules.rst: -------------------------------------------------------------------------------- 1 | rexify 2 | ====== 3 | 4 | .. toctree:: 5 | :maxdepth: 6 6 | 7 | rexify 8 | -------------------------------------------------------------------------------- /docs/api/rexify.cli.rst: -------------------------------------------------------------------------------- 1 | rexify.cli module 2 | ================= 3 | 4 | .. automodule:: rexify.cli 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/api/rexify.constants.rst: -------------------------------------------------------------------------------- 1 | rexify.constants module 2 | ======================= 3 | 4 | .. automodule:: rexify.constants 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/api/rexify.exceptions.rst: -------------------------------------------------------------------------------- 1 | rexify.exceptions package 2 | ========================= 3 | 4 | .. automodule:: rexify.exceptions 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | .. toctree:: 13 | :maxdepth: 6 14 | 15 | rexify.exceptions.schema 16 | -------------------------------------------------------------------------------- /docs/api/rexify.exceptions.schema.rst: -------------------------------------------------------------------------------- 1 | rexify.exceptions.schema module 2 | =============================== 3 | 4 | .. automodule:: rexify.exceptions.schema 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/api/rexify.features.base.rst: -------------------------------------------------------------------------------- 1 | rexify.features.base module 2 | =========================== 3 | 4 | .. 
automodule:: rexify.features.base 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/api/rexify.features.dataset.rst: -------------------------------------------------------------------------------- 1 | rexify.features.dataset module 2 | ============================== 3 | 4 | .. automodule:: rexify.features.dataset 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/api/rexify.features.extractor.rst: -------------------------------------------------------------------------------- 1 | rexify.features.extractor module 2 | ================================ 3 | 4 | .. automodule:: rexify.features.extractor 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/api/rexify.features.pipelines.rst: -------------------------------------------------------------------------------- 1 | rexify.features.pipelines module 2 | ================================ 3 | 4 | .. automodule:: rexify.features.pipelines 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/api/rexify.features.rst: -------------------------------------------------------------------------------- 1 | rexify.features package 2 | ======================= 3 | 4 | .. automodule:: rexify.features 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | .. toctree:: 13 | :maxdepth: 6 14 | 15 | rexify.features.base 16 | rexify.features.dataset 17 | rexify.features.extractor 18 | rexify.features.pipelines 19 | -------------------------------------------------------------------------------- /docs/api/rexify.models.candidate.rst: -------------------------------------------------------------------------------- 1 | rexify.models.candidate module 2 | ============================== 3 | 4 | .. automodule:: rexify.models.candidate 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/api/rexify.models.query.rst: -------------------------------------------------------------------------------- 1 | rexify.models.query module 2 | ========================== 3 | 4 | .. automodule:: rexify.models.query 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/api/rexify.models.recommender.rst: -------------------------------------------------------------------------------- 1 | rexify.models.recommender module 2 | ================================ 3 | 4 | .. automodule:: rexify.models.recommender 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/api/rexify.models.rst: -------------------------------------------------------------------------------- 1 | rexify.models package 2 | ===================== 3 | 4 | .. automodule:: rexify.models 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | .. 
toctree:: 13 | :maxdepth: 6 14 | 15 | rexify.models.candidate 16 | rexify.models.query 17 | rexify.models.ranking 18 | rexify.models.recommender 19 | rexify.models.retrieval 20 | rexify.models.tower 21 | -------------------------------------------------------------------------------- /docs/api/rexify.models.tower.rst: -------------------------------------------------------------------------------- 1 | rexify.models.tower module 2 | ========================== 3 | 4 | .. automodule:: rexify.models.tower 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/api/rexify.pipeline.rst: -------------------------------------------------------------------------------- 1 | rexify.pipeline module 2 | ====================== 3 | 4 | .. automodule:: rexify.pipeline 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/api/rexify.rst: -------------------------------------------------------------------------------- 1 | rexify package 2 | ============== 3 | 4 | .. automodule:: rexify 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Subpackages 10 | ----------- 11 | 12 | .. toctree:: 13 | :maxdepth: 6 14 | 15 | rexify.exceptions 16 | rexify.features 17 | rexify.models 18 | 19 | Submodules 20 | ---------- 21 | 22 | .. toctree:: 23 | :maxdepth: 6 24 | 25 | rexify.cli 26 | rexify.constants 27 | rexify.pipeline 28 | rexify.utils 29 | -------------------------------------------------------------------------------- /docs/api/rexify.utils.rst: -------------------------------------------------------------------------------- 1 | rexify.utils module 2 | =================== 3 | 4 | .. 
automodule:: rexify.utils 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | import sphinx_material 2 | 3 | 4 | project = "Rexify" 5 | html_title = "Rexify" 6 | 7 | html_theme = "sphinx_material" 8 | 9 | extensions = [ 10 | "sphinx.ext.autodoc", 11 | "sphinx.ext.githubpages", 12 | "m2r2", 13 | "sphinx.ext.napoleon", 14 | "sphinx_search.extension", 15 | "sphinxcontrib.apidoc", 16 | "nbsphinx", 17 | ] 18 | source_suffix = [".rst", ".md"] 19 | 20 | napoleon_google_docstring = True 21 | napoleon_numpy_docstring = True 22 | napoleon_include_init_with_doc = True 23 | napoleon_include_private_with_doc = False 24 | napoleon_include_special_with_doc = True 25 | napoleon_use_admonition_for_examples = False 26 | napoleon_use_admonition_for_notes = False 27 | napoleon_use_admonition_for_references = False 28 | napoleon_use_ivar = False 29 | napoleon_use_param = True 30 | napoleon_use_rtype = False 31 | 32 | apidoc_module_dir = "../rexify" 33 | apidoc_output_dir = "api" 34 | apidoc_excluded_paths = ["**/*test*"] 35 | apidoc_module_first = True 36 | apidoc_separate_modules = True 37 | apidoc_extra_args = ["-d 6"] 38 | 39 | html_theme_options = { 40 | "color_primary": "cyan", 41 | "color_accent": "light-blue", 42 | "repo_url": "https://github.com/joseprsm/rexify", 43 | "repo_name": "Rexify", 44 | "globaltoc_depth": 2, 45 | "globaltoc_collapse": False, 46 | "globaltoc_includehidden": False, 47 | "repo_type": "github", 48 | } 49 | 50 | extensions.append("sphinx_material") 51 | html_theme_path = sphinx_material.html_theme_path() 52 | html_context = sphinx_material.get_html_context() 53 | 54 | html_sidebars = { 55 | "**": ["logo-text.html", "globaltoc.html", "localtoc.html", "searchbox.html"] 56 | } 57 | 58 | nbsphinx_allow_errors = True 59 | -------------------------------------------------------------------------------- /docs/genindex.rst: -------------------------------------------------------------------------------- 1 | Main Index 2 | ========== -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. toctree:: 2 | :hidden: 3 | 4 | genindex 5 | 6 | 7 | .. toctree:: 8 | :titlesonly: 9 | 10 | Rexify 11 | Architecture 12 | Inputs 13 | 14 | .. toctree:: 15 | :titlesonly: 16 | :caption: Guides and Examples 17 | 18 | Quickstart 19 | Using a pre-built pipeline 20 | Configuring your own Kubeflow pipeline 21 | 22 | .. toctree:: 23 | :maxdepth: 1 24 | :caption: API reference 25 | 26 | API reference 27 | 28 | .. mdinclude:: ../README.md -------------------------------------------------------------------------------- /docs/overview/architecture.md: -------------------------------------------------------------------------------- 1 | # Architecture 2 | 3 | Rexify has two main components: the `FeatureExtractor` and the `Recommender`. 4 | 5 | The former basically takes the original data, and learns all the transformations 6 | that need to be applied to the dataset. The output is a `tf.data.Dataset` with the 7 | right structure to be passed on to the `Recommender` model. 8 | 9 | This `Recommender` is a TensorFlow model with a dynamic architecture, which adapts 10 | itself according to the schema fed to the `FeatureExtractor`. 
11 | 12 | ## Feature Extractor 13 | 14 | The `FeatureExtractor` is a scikit-learn Transformer. It implements a `.fit()` 15 | and a `.transform()` method that apply a set of transformations to the data. 16 | 17 | Essentially, it has a `_ppl` attribute which is a `sklearn.pipeline.Pipeline`; 18 | the pipeline steps, which are scikit-learn Transformers themselves, are set 19 | according to the `schema` passed during instantiation. 20 | 21 | For example, an attribute classified as `id` would create a pipeline step with a 22 | `sklearn.compose.ColumnTransformer`, composed of a single `sklearn.preprocessing.OrdinalEncoder` 23 | Transformer. 24 | 25 | Additionally, it subclasses `rexify.features.TfDatasetGenerator`, which converts 26 | the output of the `FeatureExtractor`'s transformations into a `tf.data.Dataset`, 27 | with a nested structure such as this: 28 | 29 | ``` 30 | { 31 | "query": { 32 | "user_id": tf.Tensor([]), 33 | "user_features": tf.Tensor([]), 34 | "context": tf.Tensor([]), 35 | }, 36 | "candidate": { 37 | "item_id": tf.Tensor([]), 38 | "item_features": tf.Tensor([]) 39 | } 40 | } 41 | ``` 42 | 43 | With this structure, the Recommender model can call a different set of layers for 44 | the user and item ID attributes and for the remaining transformed features. 45 | 46 | ## Recommender 47 | 48 | The `Recommender` is a `tfrs.models.Model`, which subclasses `tf.keras.Model` 49 | and overrides the `.train_step()` method. According to the [TensorFlow Recommenders documentation](https://www.tensorflow.org/recommenders/api_docs/python/tfrs/models/Model): 50 | 51 | > Many recommender models are relatively complex, and do not neatly 52 | > fit into supervised or unsupervised paradigms. This base class makes it easy to 53 | > define custom training and test losses for such complex models. 54 | 55 | In this case, we use the Recommender model to create a two-tower model architecture, as explained [here](https://research.google/pubs/pub48840/). 56 | In short, it's composed of two main models, a Query model and a Candidate model, both of which learn 57 | to represent queries and candidates in the same vector space. 58 | 59 |
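To make the idea concrete, here is a deliberately minimal two-tower sketch. This is not Rexify's actual `Recommender` - which also handles user/item features, context, and dynamically sized layers - just an illustration of how each tower embeds its ID into a shared vector space, and how a retrieval task scores co-occurring query/candidate pairs:

```python
import tensorflow as tf
import tensorflow_recommenders as tfrs


class MinimalTwoTower(tfrs.models.Model):
    def __init__(self, n_users: int, n_items: int, embedding_dim: int = 32):
        super().__init__()
        # Each tower is reduced to a single Embedding layer in this sketch
        self.query_tower = tf.keras.layers.Embedding(n_users, embedding_dim)
        self.candidate_tower = tf.keras.layers.Embedding(n_items, embedding_dim)
        # The retrieval task computes an in-batch softmax loss over the
        # similarities between query and candidate embeddings
        self.task = tfrs.tasks.Retrieval()

    def compute_loss(self, features, training=False):
        query_embeddings = self.query_tower(features["query"]["user_id"])
        candidate_embeddings = self.candidate_tower(features["candidate"]["item_id"])
        return self.task(query_embeddings, candidate_embeddings)
```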

60 | 61 |

62 | 63 | Basically, it takes the `tf.data.Dataset` output by the `FeatureExtractor` and passes it through the two 64 | Query and Candidate towers. Due to the nested structure of the dataset, we're able to apply 65 | different transformations to different sets of features. 66 | 67 | ### Query Tower 68 | 69 | The Query Tower is responsible for learning a representation for the queries. That representation is a 70 | combination of the user embedding and the features learned from the remaining 71 | user and context attributes. 72 | 73 | Essentially, it takes the user ID attribute and passes it to an Embedding layer. The user and context 74 | features are concatenated and passed to a model composed of Dense layers. The output of that model and 75 | the user embedding are then concatenated and subsequently fed to another set of Dense layers. 76 | 77 | The resulting vector should represent a single query, which can be used to compute the similarity 78 | to the candidate vectors. 79 | 80 | ### Candidate Tower 81 | 82 | In essence, the Candidate Tower shares the same behavior as the Query Tower's. The key difference is that instead 83 | of using the user ID, user features, and context, it solely uses the item ID and the remaining item features. 84 | 85 | On a deeper level, it takes the item ID attribute and passes it to an Embedding layer. The item features are 86 | passed to a set of Dense layers. The output of these layers and the Embedding layer are then concatenated and 87 | passed to another set of Dense layers. 88 | 89 | The resulting vector should represent a single candidate (an item, in this case), which can be used to compute 90 | the similarity to a query vector or to other candidate vectors. -------------------------------------------------------------------------------- /docs/overview/inputs.md: -------------------------------------------------------------------------------- 1 | # Inputs 2 | 3 | ## Data 4 | 5 | There are three main types of data that may be input to Rexify: [Events](#Events), [Users](#Users), and [Items](#Items). 6 | 7 | ### Events 8 | 9 | Events are the main type of data Rexify uses: interaction data, where each record links a user to an item at a given point in time. 10 | 11 | ### Users 12 | 13 | ### Items 14 | 15 | ## Schema -------------------------------------------------------------------------------- /docs/overview/overview.md: -------------------------------------------------------------------------------- 1 | # Rexify 2 | 3 | Rexify is a library to streamline recommender systems model development. It is built on 4 | top of [Tensorflow Recommenders](https://github.com/tensorflow/recommenders) models and 5 | [Kubeflow](https://github.com/kubeflow/pipelines) pipelines. 6 | 7 | In essence, Rexify adapts dynamically to your data, and outputs high-performing TensorFlow 8 | models that may be used wherever you want, independently of how your data looks. Rexify also includes modules that handle feature engineering as Scikit-Learn Transformers 9 | and Pipelines. 10 | 11 | ## Who is Rexify for? 12 | 13 | Rexify is a project that simplifies and standardizes the workflow of recommender systems. It is 14 | mostly geared towards people with little to no machine learning knowledge who want to implement 15 | somewhat scalable Recommender Systems in their applications. 16 | 17 | ## Quick Tour 18 | 19 | Rexify is meant to be usable right out of the box. 
All you need to set up your model is interaction data - something that looks like this: 20 | 21 | | user_id | item_id | timestamp | item_name | event_type | 22 |---------|---------|------------|-------------|-------------| 23 | | 22 | 67 | 2021/05/13 | Blue Jeans | Purchase | 24 | | 37 | 9 | 2021/04/11 | White Shirt | Page View | 25 | | 22 | 473 | 2021/04/11 | Red Purse | Add to Cart | 26 | | ... | ... | ... | ... | ... | 27 | | 358 | 51 | 2021/04/11 | Bracelet | Purchase | 28 | 29 | Additionally, you'll need to configure a schema for the data. 30 | This schema is what will allow Rexify to generate a dynamic model and preprocessing steps. 31 | The schema consists of three dictionaries: `user`, `item`, and `context`. 32 | 33 | Each of these dictionaries maps feature names to internal data types, 34 | such as `id`, `categorical`, `timestamp`, or `text`. More data types will be available 35 | in the future. 36 | 37 | ```json 38 | { 39 | "user": { 40 | "user_id": "id" 41 | }, 42 | "item": { 43 | "item_id": "id", 44 | "timestamp": "timestamp", 45 | "item_name": "text" 46 | }, 47 | "context": { 48 | "event_type": "categorical" 49 | } 50 | } 51 | ``` 52 | 53 | Essentially, what Rexify will do is take the schema and dynamically adapt to the data. 54 | 55 | ### As a package 56 | 57 | There are two main components in Rexify workflows: `FeatureExtractor` and `Recommender`. 58 | 59 | The `FeatureExtractor` is a scikit-learn Transformer that takes the schema of the data and transforms the event data accordingly. Another method, `.make_dataset()`, converts the transformed data into a `tf.data.Dataset`, all correctly configured to be fed to the `Recommender` model. You can read more about how the `FeatureExtractor` works here. 60 | 61 | `Recommender` is a `tfrs.Model` that implements the Query and Candidate towers. During training, the Query tower will take the user ID, user features, and context to learn an embedding; the Candidate tower will do the same for the item ID and its features. More information about the `Recommender` model can be found here. 62 | 63 | A sample Rexify workflow should sort of look like this: 64 | 65 | ````python 66 | import json 67 | import pandas as pd 68 | 69 | from rexify.features import FeatureExtractor 70 | from rexify.models import Recommender 71 | 72 | events = pd.read_csv('path/to/events/data') 73 | with open('path/to/schema') as f: 74 | schema = json.load(f) 75 | 76 | feat = FeatureExtractor(schema) 77 | prep_data = feat.fit_transform(events) 78 | ds = feat.make_dataset(prep_data) 79 | 80 | model = Recommender(**feat.model_params) 81 | model.compile() 82 | model.fit(ds) 83 | ```` 84 | 85 | When training is complete, you'll have a trained `tf.keras.Model` ready to be used, as you normally would. 86 | 87 | ### As a prebuilt pipeline 88 | 89 | After cloning this project and setting up the necessary environment variables, you can run: 90 | 91 | ```shell 92 | python -m rexify.pipeline 93 | ``` 94 | 95 | This should output a `pipeline.json` file. You can then upload this file manually to 96 | either a Kubeflow Pipelines or Vertex AI Pipelines instance, and it should run seamlessly. 
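For reference, a minimal sketch of submitting the compiled file programmatically with the Kubeflow Pipelines SDK might look like this - the host URL is a placeholder for your own KFP endpoint:

```python
import kfp

# Placeholder endpoint; point this at your Kubeflow Pipelines instance
client = kfp.Client(host="https://<your-kfp-endpoint>")

# Submit the compiled spec; pipeline parameters, if any, go in `arguments`
client.create_run_from_pipeline_package(
    "pipeline.json",
    arguments={},
)
```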
You can also check the [Kubeflow Pipeline](https://kubeflow-pipelines.readthedocs.io/en/latest/source/kfp.client.html#kfp.Client.create_run_from_pipeline_package) 99 | and [Vertex AI](https://cloud.google.com/vertex-ai/docs/pipelines/run-pipeline#create_a_pipeline_run) 100 | documentation to learn how to submit these pipelines programmatically. 101 | 102 | The prebuilt pipeline consists of 5 components: 103 | 104 | 1. `download`, which downloads the event data from URLs set in the `$INPUT_DATA_URL` and `$SCHEMA_URL` environment variables 105 | 2. `load`, which prepares the data downloaded in the previous step 106 | 3. `train`, which trains a `Recommender` model on the preprocessed data 107 | 4. `index`, which trains a [ScaNN](https://ai.googleblog.com/2020/07/announcing-scann-efficient-vector.html) model to retrieve the nearest neighbors 108 | 5. `retrieval`, which retrieves the nearest _k_ neighbors for each of the known users 109 | 110 | 111 | ### Via the demo application 112 | 113 | After cloning the project, install the demo dependencies and run the Streamlit application: 114 | 115 | ```shell 116 | pip install -r demo/requirements.txt 117 | streamlit run demo/app.py 118 | ``` 119 | 120 | Or, if you're using docker: 121 | 122 | ```shell 123 | docker run joseprsm/rexify-demo 124 | ``` 125 | 126 | You can then follow the steps here to set up your pipeline. 127 | 128 | During setup, you'll be asked to either input a publicly available dataset URL or use a sample data set. 129 | After that, you'll have a form to help you set up the schema for the data. 130 | 131 | Finally, after hitting "Compile", you'll have your Pipeline Spec ready. The resulting JSON file can then 132 | be uploaded to Vertex AI Pipelines or Kubeflow, seamlessly. 133 | 134 | The key difference between this pipeline and the prebuilt one is that instead of using the `download` component 135 | to download the schema, it will pass it as an argument to the pipeline, and then use a `copy` component to 136 | pass it down as an artifact. 
137 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx 2 | mock==1.0.1 3 | alabaster>=0.7,<0.8,!=0.7.5 4 | commonmark==0.8.1 5 | recommonmark==0.5.0 6 | sphinx-rtd-theme 7 | readthedocs-sphinx-ext<2.2 8 | sphinx_material==0.0.30 9 | m2r2 10 | breathe 11 | sphinxcontrib-apidoc>=0.3.0 12 | readthedocs-sphinx-search==0.1.0 13 | jinja2==3.1.2 14 | nbsphinx==0.8.2 15 | nbsphinx-link==1.1.0 16 | ipykernel -------------------------------------------------------------------------------- /docs/tutorials/configure_pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "8e30dc1e-0237-4a9c-94a7-f3495a608ab7", 6 | "metadata": {}, 7 | "source": [ 8 | "# Configuring your own pipeline" 9 | ] 10 | } 11 | ], 12 | "metadata": { 13 | "kernelspec": { 14 | "display_name": "Python 3 (ipykernel)", 15 | "language": "python", 16 | "name": "python3" 17 | }, 18 | "language_info": { 19 | "codemirror_mode": { 20 | "name": "ipython", 21 | "version": 3 22 | }, 23 | "file_extension": ".py", 24 | "mimetype": "text/x-python", 25 | "name": "python", 26 | "nbconvert_exporter": "python", 27 | "pygments_lexer": "ipython3", 28 | "version": "3.9.10" 29 | } 30 | }, 31 | "nbformat": 4, 32 | "nbformat_minor": 5 33 | } 34 | -------------------------------------------------------------------------------- /docs/tutorials/prebuilt_pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "4d7be883-9e12-4f8c-b3b0-6f0505065da9", 6 | "metadata": {}, 7 | "source": [ 8 | "# Using the pre-built pipeline" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "f5ba48c3-1176-4a47-a146-c45e20fb6645", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "!pip install rexify" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "id": "d0e1f889-7ba1-458c-bbee-240cf0ad3b19", 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "!rexify pipeline create --args" 29 | ] 30 | } 31 | ], 32 | "metadata": { 33 | "kernelspec": { 34 | "display_name": "Python 3 (ipykernel)", 35 | "language": "python", 36 | "name": "python3" 37 | }, 38 | "language_info": { 39 | "codemirror_mode": { 40 | "name": "ipython", 41 | "version": 3 42 | }, 43 | "file_extension": ".py", 44 | "mimetype": "text/x-python", 45 | "name": "python", 46 | "nbconvert_exporter": "python", 47 | "pygments_lexer": "ipython3", 48 | "version": "3.9.10" 49 | } 50 | }, 51 | "nbformat": 4, 52 | "nbformat_minor": 5 53 | } 54 | -------------------------------------------------------------------------------- /docs/tutorials/quickstart.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "07e2eea0-dc4a-436c-8605-04df80a20d45", 6 | "metadata": {}, 7 | "source": [ 8 | "# Quickstart\n", 9 | "\n", 10 | "Let's start by installing Rexify" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "fee1baf9-f430-44d3-a2f0-82f9cb17f107", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "!pip install rexify" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "f6ed5c5a-f691-4871-94f3-97895132bf91", 26 | "metadata": {}, 27 | 
"source": [ 28 | "Get some data:" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "id": "7e7c8d3a-400c-4a6b-bf1f-171c73793c16", 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "!mkdir data\n", 39 | "!curl --get https://storage.googleapis.com/roostr-ratings-matrices/rexify/completions.csv > data/events.csv" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "id": "e9fbc3cd-e598-4270-a15e-d9a5cfb9ba5f", 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "import pandas as pd" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "id": "89e2d0b3-f0fd-4094-b64e-ccca7ae24705", 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "events = pd.read_csv('data/events.csv')\n", 60 | "events" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "id": "47ab6ec6-0d08-40c4-83c6-bd797ae40aca", 66 | "metadata": {}, 67 | "source": [ 68 | "Next, we need to specify our schema:" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "id": "09a944b4-045a-49c0-9e6a-efa2f2be14ae", 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "schema = {\n", 79 | " \"user\": {\n", 80 | " \"account_id\": \"id\",\n", 81 | " },\n", 82 | " \"item\": {\n", 83 | " \"program_id\": \"id\",\n", 84 | " },\n", 85 | " \"context\": {}\n", 86 | "}" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "id": "ea75dc34-0aa3-4d2f-a938-12734d57bff9", 92 | "metadata": {}, 93 | "source": [ 94 | "To preprocess our data, we can use the `FeatureExtractor`" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "id": "cbb99040-4e6c-42f9-87dc-1cbe033989b6", 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "from rexify.features import FeatureExtractor" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "id": "616e0441-d2ef-4d2d-8524-35635ed310a1", 110 | "metadata": {}, 111 | "source": [ 112 | "We just need to pass it the schema, and it's ready to roll out." 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "id": "0198ea5f-bd27-4304-a4ae-9218fcccc7eb", 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "feat = FeatureExtractor(schema=schema)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "id": "40911616-99d7-4510-8946-7219d507b87b", 128 | "metadata": {}, 129 | "source": [ 130 | "As a scikit-learn Transformer, it has two main methods: `.fit()` and `.transform()`. What `.fit_transform()` essentially does is: `.fit().transform()`.\n", 131 | "\n", 132 | "During `.fit()`, it will take the schema, and infer what the preprocessing should look like - what transformations it should apply to the data before it's ready to be passed to the model. During `.transform()` it will apply those transformations, resulting in a `numpy.array` with the same number of rows as the original data." 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "id": "8f12e2f1-a724-4139-9102-009b11cda8df", 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "features = feat.fit_transform(events)\n", 143 | "features" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "id": "011cd59c-d754-4a22-af0a-de65e81b68f3", 149 | "metadata": {}, 150 | "source": [ 151 | "The `.make_dataset()` method converts the numpy array to a `tf.data.Dataset` with the format it's expecting." 
152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "id": "213b3c47-d612-41d1-a2f1-015f6c0b9b92", 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "dataset = feat.make_dataset(features).batch(512)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "id": "d356f43c-a722-4bfd-bb0c-12a081d39316", 167 | "metadata": {}, 168 | "source": [ 169 | "We can now take our `Recommender` model and instantiate it.\n", 170 | "\n", 171 | "During `.fit`, our `FeatureExtractor` also learns the right model parameters, so we don't need to worry about them. They're stored in the `model_params` property." 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "id": "b1826f76-56a2-44a9-bf49-0854ce1c678a", 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "from rexify.models import Recommender" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "id": "73ff6889-8fc9-4cdf-bf5e-3be307e03235", 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "model = Recommender(**feat.model_params)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "id": "59a0a545-6e0d-4b3d-927e-0282e7760820", 197 | "metadata": {}, 198 | "source": [ 199 | "Being a `tensorflow.keras.Model` itself, in order to fit it, we need to first compile it:" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "id": "62e89747-42fb-4fee-a49f-56328f208b5c", 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "model.compile()" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "id": "d507a703-afa6-44f9-b24c-7362971da047", 215 | "metadata": {}, 216 | "source": [ 217 | "To fit it, all we need to do is pass our `tf.data.Dataset`:" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "id": "0d1ef245-2b9c-4bd0-a256-60595a0b699f", 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "# model.fit(dataset)" 228 | ] 229 | } 230 | ], 231 | "metadata": { 232 | "kernelspec": { 233 | "display_name": "Python 3 (ipykernel)", 234 | "language": "python", 235 | "name": "python3" 236 | }, 237 | "language_info": { 238 | "codemirror_mode": { 239 | "name": "ipython", 240 | "version": 3 241 | }, 242 | "file_extension": ".py", 243 | "mimetype": "text/x-python", 244 | "name": "python", 245 | "nbconvert_exporter": "python", 246 | "pygments_lexer": "ipython3", 247 | "version": "3.9.10" 248 | } 249 | }, 250 | "nbformat": 4, 251 | "nbformat_minor": 5 252 | } 253 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "rexify" 3 | version = "0.0.0" 4 | description = "Streamlined Recommender System workflows with TensorFlow and Kubeflow" 5 | authors = ["José Medeiros "] 6 | license = "MIT" 7 | readme = "README.md" 8 | documentation = "https://rexify.readthedocs.io" 9 | packages = [{ include = "rexify" }] 10 | classifiers = [ 11 | "Development Status :: 3 - Alpha", 12 | "Intended Audience :: Developers", 13 | "Intended Audience :: Information Technology", 14 | "License :: OSI Approved :: MIT License", 15 | "Operating System :: OS Independent", 16 | "Programming Language :: Python :: 3 :: Only", 17 | "Programming Language :: Python :: 3.8", 18 | "Programming Language :: Python :: 3.9", 19 | "Programming Language :: Python :: 3.10", 20 | "Topic :: Software 
Development", 21 | "Topic :: Software Development :: Libraries", 22 | "Topic :: Software Development :: Libraries :: Python Modules", 23 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 24 | ] 25 | 26 | [tool.poetry.dependencies] 27 | python = ">=3.8,<3.11" 28 | tensorflow = { version = "2.9.0", markers = "sys_platform != 'darwin'" } 29 | tensorflow_metal = { version = "0.5.0", markers = "sys_platform == 'darwin'"} 30 | tensorflow_macos = { version = "2.9.0", markers = "sys_platform == 'darwin'"} 31 | tensorflow_recommenders = ">=0.7.2" 32 | scikit-learn = "1.*" 33 | pandas = "^1.4.0" 34 | numpy = ">=1.22.3" 35 | kfp = { version = "^1.8.0", optional = true } 36 | mlflow = { version = "^2.3.0", optional = true } 37 | scann = { version = "^1.2.3", markers = "sys_platform != 'darwin'", optional = true } 38 | fsspec = { version = "2023.4.0", optional = true } 39 | 40 | [tool.poetry.extras] 41 | mlflow = ["mlflow"] 42 | scann = ["scann"] 43 | kfp = ["kfp", "fsspec"] 44 | 45 | [tool.poetry.dev-dependencies] 46 | pytest = "^7.1.2" 47 | flake8 = "^5.0.4" 48 | black = "^22.6.0" 49 | isort = "^5.10.1" 50 | pre-commit = "^2.20.0" 51 | darglint = ">=1.8.1" 52 | coverage = {extras = ["toml"], version = ">=6.2"} 53 | interrogate = "^1.5.0" 54 | 55 | [tool.isort] 56 | profile = "black" 57 | lines_after_imports = 2 58 | 59 | [tool.darglint] 60 | strictness = "long" 61 | 62 | [tool.mypy] 63 | disallow_any_generics = true 64 | disallow_subclassing_any = true 65 | disallow_untyped_calls = true 66 | disallow_untyped_defs = true 67 | disallow_incomplete_defs = true 68 | check_untyped_defs = true 69 | disallow_untyped_decorators = true 70 | no_implicit_optional = true 71 | warn_redundant_casts = true 72 | warn_unused_ignores = true 73 | warn_return_any = true 74 | implicit_reexport = false 75 | strict_equality = true 76 | 77 | [tool.coverage.paths] 78 | source = ["rexify"] 79 | 80 | [tool.coverage.run] 81 | branch = true 82 | source = ["rexify"] 83 | 84 | [tool.coverage.report] 85 | show_missing = true 86 | exclude_lines = ["if __name__ == .__main__.:", "_cmd"] 87 | omit = ["*/__init__.py"] 88 | 89 | [tool.interrogate] 90 | ignore-init-method = true 91 | ignore-init-module = true 92 | ignore-magic = true 93 | ignore-semiprivate = true 94 | ignore-private = true 95 | ignore-module = true 96 | ignore-nested-functions = true 97 | ignore-property-decorators = true 98 | exclude = ["docs", "build", "rexify/pipeline.py", "*/exceptions/*"] 99 | ignore-regex = ["call", "get_config", "compute_loss"] 100 | verbose = 0 101 | omit-covered-files = false 102 | quiet = false 103 | color = false 104 | 105 | 106 | [build-system] 107 | requires = ["poetry-core>=1.0.0"] 108 | build-backend = "poetry.core.masonry.api" 109 | -------------------------------------------------------------------------------- /rexify/__init__.py: -------------------------------------------------------------------------------- 1 | from .data import Events, Items, Output, Users 2 | from .features.extractor import FeatureExtractor 3 | from .models import Recommender 4 | from .schema import Schema 5 | 6 | 7 | BASE_IMAGE = "joseprsm/rexify" 8 | -------------------------------------------------------------------------------- /rexify/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .input import Events, Items, Users 2 | from .output import Output 3 | -------------------------------------------------------------------------------- /rexify/data/base.py: 
-------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from pathlib import Path 3 | 4 | import pandas as pd 5 | 6 | from rexify.features.base import HasSchemaMixin 7 | from rexify.schema import Schema 8 | 9 | 10 | class BaseDataFrame(pd.DataFrame, HasSchemaMixin): 11 | def __init__(self, data: pd.DataFrame, schema: Schema) -> None: 12 | pd.DataFrame.__init__(self, data) 13 | HasSchemaMixin.__init__(self, schema=schema) 14 | 15 | @abstractmethod 16 | def load(cls, path: str | Path, **kwargs): 17 | pass 18 | -------------------------------------------------------------------------------- /rexify/data/input.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from pathlib import Path 3 | 4 | import pandas as pd 5 | from sklearn.model_selection import train_test_split 6 | 7 | from rexify.data.base import BaseDataFrame 8 | from rexify.schema import Schema 9 | 10 | 11 | class Input(BaseDataFrame): 12 | def __init__(self, data: pd.DataFrame, schema: Schema) -> None: 13 | super().__init__(data, schema) 14 | 15 | @classmethod 16 | def load(cls, path: str | Path, load_fn: str = "read_csv", schema: Schema = None): 17 | return cls(data=getattr(pd, load_fn)(path), schema=schema) 18 | 19 | def split(self, **kwargs): 20 | train, val = train_test_split(self, **kwargs) 21 | return self.__class__(train, self.schema), self.__class__(val, self.schema) 22 | 23 | @abstractmethod 24 | def generate(cls, n: int = 100): 25 | raise NotImplementedError 26 | 27 | 28 | class Events(Input): 29 | def __init__(self, data: pd.DataFrame, schema: Schema) -> None: 30 | super().__init__(data, schema) 31 | 32 | 33 | class Users(Input): 34 | def __init__(self, data: pd.DataFrame, schema: Schema) -> None: 35 | super().__init__(data, schema) 36 | 37 | 38 | class Items(Input): 39 | def __init__(self, data: pd.DataFrame, schema: Schema) -> None: 40 | super().__init__(data, schema) 41 | -------------------------------------------------------------------------------- /rexify/data/output.py: -------------------------------------------------------------------------------- 1 | import json 2 | import warnings 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import tensorflow as tf 8 | 9 | from rexify.data.base import BaseDataFrame 10 | from rexify.schema import Schema 11 | from rexify.utils import get_target_id, make_dirs 12 | 13 | 14 | class Output(BaseDataFrame): 15 | def __init__( 16 | self, 17 | data: pd.DataFrame, 18 | schema: Schema, 19 | ranking_features: list[str] | None = None, 20 | ) -> None: 21 | super().__init__(data, schema) 22 | with warnings.catch_warnings(): 23 | warnings.filterwarnings("ignore") 24 | self._ranking_features = ranking_features 25 | 26 | @classmethod 27 | def load(cls, path: str | Path): 28 | path = Path(path) 29 | 30 | history = pd.read_csv(path / "history.csv") 31 | features = pd.read_csv(path / "features.csv") 32 | features["history"] = history.values.tolist() 33 | del history 34 | 35 | schema = Schema.from_json(path / "schema.json") 36 | with open(path / "ranks.json", "r") as f: 37 | ranking_features = json.load(f) 38 | 39 | return cls(features, schema=schema, ranking_features=ranking_features) 40 | 41 | def save(self, path: str | Path, name: str = None): 42 | path = Path(path) 43 | path = path / name if name else path 44 | 45 | history = pd.DataFrame(np.stack(self.loc[:, "history"].values)) 46 | 47 | make_dirs(path) 48 | 
history.to_csv(path / "history.csv", index=None) 49 | self.drop("history", axis=1).to_csv(path / "features.csv", index=None) 50 | 51 | with open(path / "ranks.json", "w") as f: 52 | json.dump(self._ranking_features, f) 53 | 54 | self.schema.save(path / "schema.json") 55 | 56 | def to_dataset(self) -> tf.data.Dataset: 57 | return self._make_dataset().map(self._get_header_fn()) 58 | 59 | def _make_dataset(self) -> tf.data.Dataset: 60 | return tf.data.Dataset.zip( 61 | ( 62 | self._get_target_vector_dataset(self, self._schema, "user"), 63 | self._get_target_vector_dataset(self, self._schema, "item"), 64 | tf.data.Dataset.from_tensor_slices( 65 | np.stack(self["history"].values).astype(np.int32) 66 | ), 67 | self._get_ranking_dataset(self), 68 | ) 69 | ) 70 | 71 | @staticmethod 72 | def _get_target_vector_dataset( 73 | data, schema: Schema, target: str 74 | ) -> tf.data.Dataset: 75 | return tf.data.Dataset.from_tensor_slices( 76 | data.loc[:, get_target_id(schema, target)] 77 | .values.reshape(-1) 78 | .astype(np.int32) 79 | ) 80 | 81 | @staticmethod 82 | def _get_header_fn(): 83 | @tf.autograph.experimental.do_not_convert 84 | def header_fn(user_id, item_id, history, ranks): 85 | return { 86 | "query": {"user_id": user_id, "history": history}, 87 | "candidate": {"item_id": item_id}, 88 | "rank": ranks, 89 | } 90 | 91 | return header_fn 92 | 93 | def _get_ranking_dataset(self, data) -> tf.data.Dataset: 94 | @tf.autograph.experimental.do_not_convert 95 | def add_header(x): 96 | return { 97 | self._ranking_features[i]: x[i] 98 | for i in range(len(self._ranking_features)) 99 | } 100 | 101 | return tf.data.Dataset.from_tensor_slices( 102 | data.loc[:, self._ranking_features].values.astype(np.int32) 103 | ).map(add_header) 104 | 105 | @property 106 | def ranking_features(self): 107 | return self._ranking_features 108 | -------------------------------------------------------------------------------- /rexify/features/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joseprsm/rexify/6efb0cbe8ce9e35b58b200fcb95cf8e65c03d2c2/rexify/features/__init__.py -------------------------------------------------------------------------------- /rexify/features/base.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import re 3 | from pathlib import Path 4 | 5 | from sklearn.base import BaseEstimator, TransformerMixin 6 | from sklearn.compose import make_column_transformer 7 | from sklearn.pipeline import Pipeline 8 | 9 | from rexify.schema import Schema 10 | from rexify.utils import get_target_feature, make_dirs 11 | 12 | 13 | class HasSchemaMixin: 14 | def __init__(self, schema: Schema): 15 | self._schema = schema 16 | 17 | @property 18 | def schema(self): 19 | return self._schema 20 | 21 | 22 | class HasTargetMixin: 23 | 24 | _SUPPORTED_TARGETS = ["user", "item"] 25 | 26 | def __init__(self, target: str): 27 | self._target = target 28 | 29 | @property 30 | def target(self): 31 | return self._target 32 | 33 | @classmethod 34 | def _validate_target(cls, target: str): 35 | if target not in cls._SUPPORTED_TARGETS: 36 | raise ValueError(f"Target {target} not supported") 37 | 38 | 39 | class Serializable: 40 | def save(self, output_dir: str, filename: str = None): 41 | make_dirs(output_dir) 42 | filename = ( 43 | filename or self._camel_to_snake_case(self.__class__.__name__) + ".pickle" 44 | ) 45 | output_path = Path(output_dir) / filename 46 | with open(output_path, "wb") as f: 47 | 
pickle.dump(self, f) 48 | 49 | @classmethod 50 | def load(cls, path: Path | str): 51 | with open(path, "rb") as f: 52 | feat = pickle.load(f) 53 | return feat 54 | 55 | @staticmethod 56 | def _camel_to_snake_case(name: str): 57 | return re.sub(r"(?<!^)(?=[A-Z])", "_", name).lower() 58 | 59 | 60 | class BaseEncoder(HasSchemaMixin, HasTargetMixin): 61 | 62 | ppl: Pipeline 63 | 64 | def __init__(self, dtype: str, target: str, schema: Schema): 65 | HasSchemaMixin.__init__(self, schema) 66 | HasTargetMixin.__init__(self, target) 67 | self._dtype = dtype 68 | self._name = f"{target}_{dtype}Encoder" 69 | self._targets = self._get_features(schema, target, dtype) 70 | 71 | @staticmethod 72 | def _get_features(schema: Schema, target: str, dtype: str) -> list[str]: 73 | return get_target_feature(schema, target, dtype) 74 | 75 | def __iter__(self): 76 | for x in [self._name, self.ppl, self._targets]: 77 | yield x 78 | 79 | def as_tuple(self): 80 | return tuple(self) 81 | 82 | 83 | class BaseTransformer(BaseEstimator, TransformerMixin): 84 | def __init__(self, transformer: TransformerMixin, target_features: list[str]): 85 | super().__init__() 86 | self.transformer = transformer 87 | self.target_features = target_features 88 | 89 | self._column_transformer = make_column_transformer( 90 | (self.transformer, self.target_features), 91 | ) 92 | 93 | def fit(self, X, y=None, **fit_params): 94 | return self 95 | 96 | def transform(self, X): 97 | pass 98 | -------------------------------------------------------------------------------- /rexify/features/extractor.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from sklearn.base import BaseEstimator, TransformerMixin 6 | from sklearn.pipeline import make_pipeline 7 | 8 | from rexify.data import Events, Items, Output, Users 9 | from rexify.features.base import HasSchemaMixin, Serializable 10 | from rexify.features.transform import CustomTransformer, EventEncoder, Sequencer 11 | from rexify.features.transform.entity import EntityTransformer 12 | from rexify.schema import Schema 13 | 14 | 15 | class FeatureExtractor(BaseEstimator, TransformerMixin, HasSchemaMixin, Serializable): 16 | 17 | _model_params: dict[str, Any] 18 | _item_ids: np.ndarray 19 | _user_ids: np.ndarray 20 | 21 | def __init__( 22 | self, 23 | schema: Schema, 24 | users: str = None, 25 | items: str = None, 26 | return_dataset: bool = False, 27 | window_size: int = 3, 28 | custom_transformers: list[CustomTransformer] = None, 29 | ): 30 | HasSchemaMixin.__init__(self, schema) 31 | 32 | self._users = users 33 | self._items = items 34 | self._return_dataset = return_dataset 35 | self._window_size = window_size 36 | 37 | self._timestamp = schema.timestamp 38 | self._custom_transformers = custom_transformers or [] 39 | 40 | self._user_transformer = EntityTransformer( 41 | schema, "user", self._custom_transformers 42 | ) 43 | self._item_transformer = EntityTransformer( 44 | schema, "item", self._custom_transformers 45 | ) 46 | 47 | self._ppl = make_pipeline( 48 | EventEncoder(self._schema), 49 | Sequencer( 50 | self._schema, 51 | timestamp_feature=self._timestamp, 52 | window_size=self._window_size, 53 | ), 54 | ) 55 | 56 | def fit(self, X: Events): 57 | self._fit_transformer(Users) 58 | self._fit_transformer(Items) 59 | 60 | x_ = X.copy() 61 | events = self._encode(self._user_transformer, x_) 62 | events = self._encode(self._item_transformer, events) 63 | _ = self._ppl.fit(events) 64 | 65 | self._model_params = self._get_model_params() 66 | return self 67 | 68 | def transform(self, X: Events) -> Output: 69 | x_ = X.copy() 70 | events = self._encode(self._user_transformer, x_) 71 | events = self._encode(self._item_transformer, events) 72 | events = self._ppl.transform(events) 73 | events = self._drop(events, self._user_transformer) 74 | events = self._drop(events, self._item_transformer) 75 | self._model_params["session_history"] = self.history 76 | 77 
| transformed = Output( 78 | data=events, schema=self._schema, ranking_features=self.ranking_features 79 | ) 80 | 81 | self._user_ids = self._get_ids(transformed, self._user_transformer) 82 | self._item_ids = self._get_ids(transformed, self._item_transformer) 83 | 84 | return transformed.to_dataset() if self._return_dataset else transformed 85 | 86 | def _fit_transformer(self, inputs: Users | Items): 87 | input_name = inputs.__name__.lower() 88 | input_path: str = getattr(self, f"_{input_name}") 89 | transformer = getattr(self, f"_{input_name[:-1]}_transformer") 90 | x = inputs.load(input_path, schema=self._schema) 91 | transformer.fit(x).transform(x) 92 | 93 | @staticmethod 94 | def _encode(transformer: EntityTransformer, data: pd.DataFrame) -> pd.DataFrame: 95 | encoder, feature_names = transformer.encoder 96 | data[feature_names] = encoder.transform(data[feature_names]) 97 | return data 98 | 99 | @staticmethod 100 | def _drop(df: pd.DataFrame, transformer: EntityTransformer): 101 | encoder, id_ = transformer.encoder 102 | return df.loc[df[id_].values.reshape(-1) != encoder.unknown_value, :] 103 | 104 | def _get_model_params(self): 105 | model_params = {} 106 | model_params.update(self._user_transformer.model_params) 107 | model_params.update(self._item_transformer.model_params) 108 | model_params.update({"ranking_features": self.ranking_features}) 109 | model_params["window_size"] = self._window_size 110 | return model_params 111 | 112 | @staticmethod 113 | def _get_ids(df: pd.DataFrame, transformer: EntityTransformer): 114 | return df.loc[:, transformer.encoder[1][0]].values.astype(np.int32) 115 | 116 | @property 117 | def users(self): 118 | return self._users 119 | 120 | @property 121 | def items(self): 122 | return self._items 123 | 124 | @property 125 | def model_params(self): 126 | return self._model_params 127 | 128 | @property 129 | def ranking_features(self): 130 | return self._ppl.steps[0][1].ranking_features 131 | 132 | @property 133 | def history(self): 134 | return self._ppl.steps[1][1].history 135 | 136 | @property 137 | def return_dataset(self): 138 | return self._return_dataset 139 | 140 | @property 141 | def window_size(self): 142 | return self._window_size 143 | 144 | @property 145 | def custom_transformers(self): 146 | return self._custom_transformers 147 | 148 | @property 149 | def item_encoder(self): 150 | return self._item_transformer.encoder[0] 151 | 152 | @property 153 | def item_ids(self): 154 | return self._item_ids 155 | 156 | @property 157 | def user_encoder(self): 158 | return self._user_transformer.encoder[0] 159 | 160 | @property 161 | def user_ids(self): 162 | return self._user_ids 163 | -------------------------------------------------------------------------------- /rexify/features/transform/__init__.py: -------------------------------------------------------------------------------- 1 | from .category import CategoricalEncoder 2 | from .custom import CustomTransformer 3 | from .event import EventEncoder 4 | from .id import IDEncoder 5 | from .number import NumericalEncoder 6 | from .sequence import Sequencer 7 | -------------------------------------------------------------------------------- /rexify/features/transform/category.py: -------------------------------------------------------------------------------- 1 | from sklearn.pipeline import make_pipeline 2 | from sklearn.preprocessing import OneHotEncoder 3 | 4 | from rexify.features.base import BaseEncoder 5 | from rexify.schema import Schema 6 | 7 | 8 | class CategoricalEncoder(BaseEncoder): 9 | def 
__init__(self, schema: Schema, target: str): 10 | super().__init__(dtype="category", target=target, schema=schema) 11 | self.ppl = make_pipeline(OneHotEncoder(sparse_output=False)) 12 | -------------------------------------------------------------------------------- /rexify/features/transform/custom.py: -------------------------------------------------------------------------------- 1 | from sklearn.base import TransformerMixin 2 | 3 | 4 | class CustomTransformer(tuple): 5 | def __new__( 6 | cls, target: str, transformer: TransformerMixin, features: list[str] 7 | ) -> tuple: 8 | name = f"{target}_{''.join([f[0] for f in features])}_customTransformer" 9 | return tuple.__new__(CustomTransformer, (name, transformer, features)) 10 | -------------------------------------------------------------------------------- /rexify/features/transform/entity.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from sklearn.compose import ColumnTransformer 6 | from sklearn.pipeline import Pipeline, make_pipeline 7 | 8 | from rexify.features.base import HasSchemaMixin, HasTargetMixin 9 | from rexify.features.transform import ( 10 | CategoricalEncoder, 11 | CustomTransformer, 12 | IDEncoder, 13 | NumericalEncoder, 14 | ) 15 | from rexify.schema import Schema 16 | from rexify.utils import get_target_id 17 | 18 | 19 | class _FeatureTransformer(ColumnTransformer, HasSchemaMixin, HasTargetMixin): 20 | def __init__(self, schema: Schema, target: str): 21 | HasSchemaMixin.__init__(self, schema=schema) 22 | HasTargetMixin.__init__(self, target=target) 23 | transformers = self._get_transformers() 24 | ColumnTransformer.__init__( 25 | self, transformers=transformers, remainder="passthrough" 26 | ) 27 | 28 | def _get_transformers(self) -> list[tuple[str, Pipeline, list[str]]]: 29 | transformer_list = [] 30 | 31 | cat_encoder = CategoricalEncoder(self._schema, self._target).as_tuple() 32 | transformer_list += [cat_encoder] if cat_encoder[-1] != tuple() else [] 33 | 34 | num_encoder = NumericalEncoder(self._schema, self._target).as_tuple() 35 | transformer_list += [num_encoder] if num_encoder[-1] != tuple() else [] 36 | 37 | return transformer_list 38 | 39 | 40 | class EntityTransformer(ColumnTransformer, HasSchemaMixin, HasTargetMixin): 41 | _features: pd.DataFrame 42 | _model_params: dict[str, Any] 43 | 44 | def __init__( 45 | self, 46 | schema: Schema, 47 | target: str, 48 | custom_transformers: list[CustomTransformer] = None, 49 | ): 50 | HasSchemaMixin.__init__(self, schema) 51 | HasTargetMixin.__init__(self, target) 52 | self._custom_transformers = ( 53 | self._filter_custom_transformers(custom_transformers, self._target) or [] 54 | ) 55 | transformers = [ 56 | self._get_feature_pipeline(self._schema, self._target) 57 | ] + self._custom_transformers 58 | ColumnTransformer.__init__(self, transformers) 59 | 60 | def fit(self, X, y=None): 61 | super().fit(X, y) 62 | n_dims = self._get_n_dims(X) 63 | self._model_params = n_dims 64 | return self 65 | 66 | def transform(self, X) -> pd.DataFrame: 67 | self._features = super().transform(X) 68 | self._features = pd.DataFrame( 69 | self._features[:, :-1], index=self._features[:, -1] 70 | ) 71 | self._features = pd.concat( 72 | [ 73 | self._features, 74 | pd.DataFrame(np.zeros(self._features.shape[1])).transpose(), 75 | ], 76 | ignore_index=True, 77 | ) 78 | 79 | self._model_params.update({f"{self._target}_embeddings": self._features}) 80 | return self._features 81 
| 82 | def _get_n_dims(self, X): 83 | id_col = get_target_id(self._schema, self._target)[0] 84 | input_dims = int(X[id_col].nunique() + 1) 85 | return {f"{self._target}_dims": input_dims} 86 | 87 | @staticmethod 88 | def _filter_custom_transformers( 89 | custom_transformers: list[CustomTransformer], target: str 90 | ): 91 | def target_from_name(x): 92 | return x[0].split("_")[0] == target 93 | 94 | return list(filter(target_from_name, custom_transformers)) 95 | 96 | @staticmethod 97 | def _get_feature_pipeline(schema, target) -> tuple[str, Pipeline, list[str]]: 98 | name = f"{target}_featureExtractor" 99 | ppl = make_pipeline( 100 | IDEncoder(schema, target), 101 | _FeatureTransformer(schema, target), 102 | ) 103 | target_keys = getattr(schema, target).to_dict() 104 | keys = [target_keys.pop("id")] + list(target_keys.keys()) 105 | return name, ppl, keys 106 | 107 | @property 108 | def model_params(self): 109 | return self._model_params 110 | 111 | @property 112 | def identifiers(self): 113 | return self._features.index.values.astype(int) 114 | 115 | @property 116 | def encoder(self): 117 | encoder = self.transformers_[0][1].steps[0][1].transformer.transformers_[0] 118 | return encoder[1], encoder[-1] 119 | 120 | @property 121 | def custom_transformers(self): 122 | return self._custom_transformers 123 | -------------------------------------------------------------------------------- /rexify/features/transform/event.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.base import BaseEstimator, TransformerMixin 3 | from sklearn.compose import make_column_transformer 4 | from sklearn.preprocessing import OneHotEncoder 5 | 6 | from rexify.features.base import HasSchemaMixin 7 | from rexify.schema import Schema 8 | 9 | 10 | class EventEncoder(BaseEstimator, TransformerMixin, HasSchemaMixin): 11 | def __init__(self, schema: Schema): 12 | HasSchemaMixin.__init__(self, schema) 13 | self._event_type = schema.event_type 14 | self._transformer = make_column_transformer( 15 | (OneHotEncoder(sparse_output=False), [self._event_type])  # dense output, so it can be wrapped in a DataFrame below 16 | ) 17 | 18 | def fit(self, X, y=None, **fit_params): 19 | self._transformer.fit(X, y) 20 | return self 21 | 22 | def transform(self, X): 23 | oneh = self._transformer.transform(X) 24 | oneh = pd.DataFrame(oneh, columns=self.transformer.get_feature_names_out()) 25 | x = X.drop(self._event_type, axis=1) 26 | return pd.concat([x, oneh], axis=1) 27 | 28 | @property 29 | def transformer(self): 30 | return self._transformer.transformers_[0][1] 31 | 32 | @property 33 | def ranking_features(self): 34 | return self.transformer.get_feature_names_out().tolist() 35 | -------------------------------------------------------------------------------- /rexify/features/transform/id.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.base import BaseEstimator, TransformerMixin 4 | from sklearn.compose import ColumnTransformer, make_column_transformer 5 | from sklearn.preprocessing import OrdinalEncoder 6 | 7 | from rexify.features.base import HasSchemaMixin, HasTargetMixin 8 | from rexify.utils import get_target_id 9 | 10 | 11 | class IDEncoder(BaseEstimator, TransformerMixin, HasSchemaMixin, HasTargetMixin): 12 | 13 | _transformer: ColumnTransformer 14 | 15 | def __init__(self, schema, target): 16 | HasSchemaMixin.__init__(self, schema) 17 | HasTargetMixin.__init__(self, target) 18 | 19 | def fit(self, X: pd.DataFrame, y=None): 20 | 
target_features = get_target_id(self._schema, self._target) 21 | encoder_args = self._get_encoder_args(X, target_features) 22 | self._transformer = make_column_transformer( 23 | (OrdinalEncoder(**encoder_args), target_features), 24 | remainder="passthrough", 25 | ) 26 | self._transformer.fit(X, y) 27 | return self 28 | 29 | def transform(self, X: pd.DataFrame) -> pd.DataFrame: 30 | x = self._transformer.transform(X) 31 | columns = self._get_features_names_out() 32 | return pd.DataFrame(x, columns=columns) 33 | 34 | def _get_features_names_out(self) -> list[str]: 35 | features = self._transformer.get_feature_names_out() 36 | return [name.split("__")[-1] for name in features] 37 | 38 | @staticmethod 39 | def _get_encoder_args(df: pd.DataFrame, target_features: list[str]): 40 | value = df[target_features].nunique().sum() 41 | return { 42 | "dtype": np.int64, 43 | "handle_unknown": "use_encoded_value", 44 | "unknown_value": value, 45 | } 46 | 47 | @property 48 | def transformer(self): 49 | return self._transformer 50 | 51 | @property 52 | def target_feature(self): 53 | return self._transformer.transformers[0][-1][0] 54 | -------------------------------------------------------------------------------- /rexify/features/transform/number.py: -------------------------------------------------------------------------------- 1 | from sklearn.pipeline import make_pipeline 2 | from sklearn.preprocessing import MinMaxScaler 3 | 4 | from rexify.features.base import BaseEncoder 5 | from rexify.schema import Schema 6 | 7 | 8 | class NumericalEncoder(BaseEncoder): 9 | def __init__(self, schema: Schema, target: str): 10 | super().__init__(dtype="number", target=target, schema=schema) 11 | self.ppl = make_pipeline(MinMaxScaler(feature_range=(-1, 1))) 12 | -------------------------------------------------------------------------------- /rexify/features/transform/sequence.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.base import BaseEstimator, TransformerMixin 4 | 5 | from rexify.features.base import HasSchemaMixin 6 | from rexify.schema import Schema 7 | from rexify.utils import get_target_id 8 | 9 | 10 | class Sequencer(BaseEstimator, TransformerMixin, HasSchemaMixin): 11 | 12 | """Transformer responsible for creating sequential data. 13 | 14 | It creates a new column `history` that holds the previous `window_size` event item IDs. 
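15 | Sequences are left-padded with an out-of-vocabulary item ID (`max(item_id) + 1`, computed during `fit`), and rows whose history cannot be fully formed are dropped, so every retained row carries exactly `window_size` previous item IDs.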
16 | 17 | Args: 18 | schema (rexify.schema.Schema): the data schema 19 | timestamp_feature (str): the dataframe's feature name with a timestamp; defaults to the schema's timestamp 20 | window_size (int): the size of the sliding window 21 | 22 | Examples: 23 | >>> from rexify.features.transform import Sequencer 24 | >>> sequencer = Sequencer(schema, window_size=3) 25 | >>> _ = sequencer.fit(events) 26 | >>> transformed = sequencer.transform(events) 27 | 28 | """ 29 | 30 | _user_id: str 31 | _item_id: str 32 | _columns: list[str] 33 | _padding: list[int] 34 | _history: pd.DataFrame 35 | 36 | def __init__(self, schema: Schema, timestamp_feature: str = None, window_size: int = 3): 37 | super().__init__(schema=schema) 38 | self._timestamp_feature = timestamp_feature or self._schema.timestamp 39 | self._window_size = window_size + 1 40 | 41 | def fit(self, X: pd.DataFrame, *_): 42 | self._user_id = get_target_id(self.schema, "user")[0] 43 | self._item_id = get_target_id(self.schema, "item")[0] 44 | self._columns = [col for col in X.columns if col != self._user_id] 45 | self._padding = [X[self._item_id].max() + 1] * (self._window_size - 2) 46 | return self 47 | 48 | def transform(self, X: pd.DataFrame): 49 | sequences = self._get_sequences(X) 50 | 51 | res = sequences.drop(self._item_id, axis=1).applymap(self._get_last) 52 | res[self._item_id] = sequences.pop(self._item_id) 53 | res["history"] = sequences.pop("history") 54 | res.reset_index(inplace=True) 55 | res = res.loc[res["history"].map(len) == self._window_size - 1, :] 56 | res = res.loc[~res.loc[:, self._timestamp_feature].isna()] 57 | 58 | self._history = self._get_history(res) 59 | 60 | res.drop(self._timestamp_feature, axis=1, inplace=True) 61 | return res 62 | 63 | def _get_sequences(self, df: pd.DataFrame): 64 | sequences: pd.DataFrame = ( 65 | df.sort_values(self._timestamp_feature) 66 | .set_index(self._user_id) 67 | .groupby(level=-1) 68 | .apply(self._mask) 69 | .apply(pd.Series) 70 | .rename(columns=pd.Series(self._columns)) 71 | .applymap(self._pad) 72 | .applymap(self._window) 73 | .apply(lambda x: x.explode()) 74 | ) 75 | 76 | sequences["history"] = sequences[self._item_id].map(lambda x: x[:-1]) 77 | sequences[self._item_id] = sequences[self._item_id].map(self._get_last) 78 | return sequences 79 | 80 | def _get_history(self, df: pd.DataFrame): 81 | return ( 82 | df.groupby([self._user_id]) 83 | .agg({self._timestamp_feature: max, "history": list}) 84 | .drop(self._timestamp_feature, axis=1) 85 | .history.map(self._get_last) 86 | ) 87 | 88 | def _mask(self, df: pd.DataFrame): 89 | return [list(df[col]) for col in self._columns] 90 | 91 | @staticmethod 92 | def _get_last(lst: list): 93 | return lst[-1] 94 | 95 | def _window(self, sequence): 96 | if len(sequence) >= self._window_size: 97 | sequence = np.array(sequence, dtype=object) 98 | 99 | stack = [ 100 | sequence[range(i, i + self._window_size)] 101 | for i in range(len(sequence) - self._window_size + 1) 102 | ] 103 | 104 | if len(stack) > 1: 105 | stack = np.stack(stack) 106 | 107 | return stack 108 | return [sequence] 109 | 110 | def _pad(self, x: list): 111 | return self._padding + x 112 | 113 | @property 114 | def timestamp_feature(self): 115 | return self._timestamp_feature 116 | 117 | @property
118 | def window_size(self): 119 | return self._window_size 120 | 121 | @property 122 | def history(self): 123 | return self._history 124 | -------------------------------------------------------------------------------- /rexify/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .recommender import Recommender 2 | -------------------------------------------------------------------------------- /rexify/models/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | 3 | import tensorflow as tf 4 | 5 | 6 | class DenseSetterMixin(ABC): 7 | @staticmethod 8 | def _set_sequential_model( 9 | layer: str | tf.keras.layers.Layer, layer_sizes: list[int], **kwargs 10 | ) -> list[tf.keras.layers.Layer]: 11 | if type(layer) == str: 12 | layer = getattr(tf.keras.layers, layer) 13 | return [layer(num_neurons, **kwargs) for num_neurons in layer_sizes] 14 | 15 | def _set_dense_layers( 16 | self, layer_sizes: list[int], activation: str | None = "relu" 17 | ) -> list[tf.keras.layers.Layer]: 18 | return self._set_sequential_model("Dense", layer_sizes, activation=activation) 19 | 20 | @staticmethod 21 | def _call_layers(layer_list: list[tf.keras.layers.Layer], inputs): 22 | x = inputs 23 | for layer in layer_list: 24 | x = layer(x) 25 | return x 26 | -------------------------------------------------------------------------------- /rexify/models/callbacks/__init__.py: -------------------------------------------------------------------------------- 1 | from .index import BruteForceCallback, ScaNNCallback 2 | 3 | 4 | try: 5 | from .mlflow import MlflowCallback 6 | except ImportError:  # mlflow is an optional dependency 7 | pass 8 | -------------------------------------------------------------------------------- /rexify/models/callbacks/index.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from rexify.models.index import BruteForce, ScaNN 4 | 5 | 6 | class _IndexCallback(tf.keras.callbacks.Callback): 7 | 8 | INDEX: BruteForce | ScaNN 9 | 10 | def __init__( 11 | self, 12 | sample_query: dict[str, tf.Tensor], 13 | query_model: str = "query_model", 14 | batch_size: int = 128, 15 | **index_args, 16 | ): 17 | super().__init__() 18 | self._query_model = query_model 19 | self._batch_size = batch_size 20 | self._sample_query = sample_query 21 | self._index_args = index_args 22 | self._target = "user" if self._query_model == "query_model" else "item" 23 | 24 | def set(self) -> tf.keras.Model: 25 | query_model = getattr(self.model, self._query_model) 26 | return self.INDEX(query_model, self.model.window_size, **self._index_args) 27 | 28 | def on_train_end(self, logs=None): 29 | index = self.set() 30 | index.index_from_dataset(candidates=self._get_candidates_dataset()) 31 | _ = index(self._sample_query[f"{self._target}_id"]) 32 | setattr(self.model, f"{self._target}_index", index) 33 | 34 | def _get_candidates_dataset(self): 35 | def zip_item_dataset(item): 36 | return (item["item_id"], self.model.candidate_model(item)) 37 | 38 | candidates = self._get_candidates().batch(self._batch_size) 39 | return candidates.map(zip_item_dataset) 40 | 41 | def _get_candidates(self): 42 | def header_fn(item_id): 43 | return {"item_id": tf.cast(item_id, tf.int32)} 44 | 45 | return tf.data.Dataset.from_tensor_slices( 46 | self.model.candidate_model.identifiers 47 | ).map(header_fn) 48 | 49 | 50 | class BruteForceCallback(_IndexCallback): 51 | 52 | INDEX = BruteForce 53 | 54 | 55 | class 
ScaNNCallback(_IndexCallback): 56 | 57 | INDEX = ScaNN 58 | -------------------------------------------------------------------------------- /rexify/models/callbacks/mlflow.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import mlflow 4 | import tensorflow as tf 5 | 6 | 7 | class MlflowCallback(tf.keras.callbacks.Callback): 8 | def __init__( 9 | self, 10 | tracking_uri: str = os.environ.get("MLFLOW_TRACKING_URI"), 11 | experiment_name: str = os.environ.get("MLFLOW_EXPERIMENT_NAME"), 12 | ): 13 | super().__init__() 14 | if tracking_uri: 15 | mlflow.set_tracking_uri(tracking_uri) 16 | if experiment_name: 17 | mlflow.set_experiment(experiment_name) 18 | 19 | def on_train_begin(self, logs=None): 20 | config = self.model.get_config() 21 | 22 | def parse(value): 23 | if type(value).__name__ == "ListWrapper": 24 | return list(value) 25 | return value 26 | 27 | params = {k: parse(v) for k, v in config.items()} 28 | mlflow.log_params(params) 29 | 30 | def on_epoch_end(self, epoch, logs=None): 31 | mlflow.log_metrics(logs) 32 | -------------------------------------------------------------------------------- /rexify/models/index.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow_recommenders as tfrs 3 | 4 | 5 | class _BaseIndex: 6 | def __init__(self, query_model: tf.keras.Model, window_size: int): 7 | self.query_model = query_model 8 | self._window_size = window_size 9 | 10 | def call(self, queries: tf.Tensor, k: int = None): 11 | queries_shape = queries.shape[0] or 1 12 | inputs = ( 13 | { 14 | "user_id": queries, 15 | "history": tf.zeros( 16 | shape=(queries_shape, self._window_size), dtype=tf.int32 17 | ), 18 | } 19 | if self.query_model.name.startswith("query") 20 | else {"item_id": queries} 21 | ) 22 | return self.__class__.__bases__[1].call(self, inputs, k) 23 | 24 | 25 | class BruteForce(_BaseIndex, tfrs.layers.factorized_top_k.BruteForce): 26 | def __init__( 27 | self, 28 | query_model: tf.keras.Model, 29 | window_size: int, 30 | k: int = 2, 31 | name: str = None, 32 | ): 33 | tfrs.layers.factorized_top_k.BruteForce.__init__(self, query_model, k, name) 34 | _BaseIndex.__init__(self, query_model, window_size) 35 | 36 | 37 | class ScaNN(_BaseIndex, tfrs.layers.factorized_top_k.ScaNN): 38 | def __init__( 39 | self, 40 | query_model: tf.keras.Model, 41 | window_size: int, 42 | k: int = 10, 43 | distance_measure: str = "dot_product", 44 | num_leaves: int = 100, 45 | num_leaves_to_search: int = 10, 46 | training_iterations: int = 12, 47 | dimensions_per_block: int = 2, 48 | num_reordering_candidates: int = None, 49 | parallelize_batch_searches: bool = True, 50 | name: str = None, 51 | ): 52 | tfrs.layers.factorized_top_k.ScaNN.__init__( 53 | self, 54 | query_model, 55 | k, 56 | distance_measure, 57 | num_leaves, 58 | num_leaves_to_search, 59 | training_iterations, 60 | dimensions_per_block, 61 | num_reordering_candidates, 62 | parallelize_batch_searches, 63 | name, 64 | ) 65 | _BaseIndex.__init__(self, query_model, window_size) 66 | -------------------------------------------------------------------------------- /rexify/models/lookup.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | 6 | 7 | class _BaseLookupModel(tf.keras.Model): 8 | def __init__(self, ids: np.ndarray, values: np.ndarray): 9 | super().__init__() 10 | self._ids = 
ids 11 | self._values = values 12 | 13 | identifiers_idx = np.arange(0, self._ids.shape[0]) 14 | init = tf.lookup.KeyValueTensorInitializer( 15 | keys=self._ids, 16 | values=identifiers_idx, 17 | key_dtype=tf.int32, 18 | value_dtype=tf.int32, 19 | ) 20 | 21 | self.token_to_id = tf.lookup.StaticHashTable(init, default_value=len(ids)) 22 | 23 | @tf.function(input_signature=[tf.TensorSpec([None], tf.int32)]) 24 | def call(self, inputs): 25 | ids = self.token_to_id.lookup(inputs) 26 | return tf.nn.embedding_lookup(params=self._values, ids=ids) 27 | 28 | @abstractmethod 29 | def get_config(self): 30 | pass 31 | 32 | 33 | class EmbeddingLookup(_BaseLookupModel): 34 | def __init__(self, ids: np.ndarray, embeddings: np.ndarray): 35 | super().__init__(ids=ids, values=embeddings) 36 | 37 | def get_config(self): 38 | return {"ids": self._ids, "embeddings": self._values} 39 | 40 | 41 | class SessionLookup(_BaseLookupModel): 42 | def __init__(self, ids: np.ndarray, sessions: np.ndarray): 43 | super().__init__(ids=ids, values=sessions) 44 | 45 | def get_config(self): 46 | return {"ids": self._ids, "sessions": self._values} 47 | -------------------------------------------------------------------------------- /rexify/models/ranking/__init__.py: -------------------------------------------------------------------------------- 1 | from .ranking import RankingMixin 2 | -------------------------------------------------------------------------------- /rexify/models/ranking/base.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow_recommenders as tfrs 3 | 4 | from rexify.models.base import DenseSetterMixin 5 | 6 | 7 | class BaseRankingModel(tf.keras.Model, DenseSetterMixin): 8 | 9 | output_layer: tf.keras.layers.Dense 10 | task: tfrs.tasks.Ranking 11 | 12 | def __init__(self, layer_sizes: list[int]): 13 | super().__init__() 14 | self._layer_sizes = layer_sizes or [64, 32] 15 | self.hidden_layers = self._set_dense_layers(self._layer_sizes) 16 | 17 | def call(self, inputs, labels): 18 | x = self._call_layers(self.hidden_layers, inputs) 19 | x = self.output_layer(x) 20 | return self.task(labels=labels, predictions=x) 21 | -------------------------------------------------------------------------------- /rexify/models/ranking/event.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow_recommenders as tfrs 3 | 4 | from rexify.models.ranking.base import BaseRankingModel 5 | 6 | 7 | class EventModel(BaseRankingModel): 8 | def __init__(self, layer_sizes: list[int] = None, n_dims: int = 1): 9 | super().__init__(layer_sizes=layer_sizes) 10 | self._n_dims = n_dims 11 | self.output_layer = tf.keras.layers.Dense(self._n_dims, activation="softmax") 12 | self.task = tfrs.tasks.Ranking(loss=tf.keras.losses.CategoricalCrossentropy()) 13 | 14 | def get_config(self): 15 | return { 16 | "layer_sizes": self._layer_sizes, 17 | "n_dims": self._n_dims, 18 | } 19 | -------------------------------------------------------------------------------- /rexify/models/ranking/ranking.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | 3 | import tensorflow as tf 4 | import tensorflow_recommenders as tfrs 5 | 6 | from rexify.models.base import DenseSetterMixin 7 | 8 | 9 | class RankingMixin(tfrs.Model, DenseSetterMixin, ABC): 10 | def __init__( 11 | self, 12 | ranking_features: list[str] = None, 13 | layer_sizes: list[int] = 
None, 14 | weights: dict[str, float] = None, 15 | ): 16 | super().__init__() 17 | self._ranking_features = ranking_features or [] 18 | self._ranking_layers = layer_sizes or [64, 32] 19 | 20 | # todo: validate ranking weights 21 | self._ranking_weights = weights or { 22 | feature: 1.0 for feature in self._ranking_features 23 | } 24 | self._ranking_models = { 25 | feature: self._get_ranking_model() for feature in self._ranking_features 26 | } 27 | self._ranking_tasks = { 28 | feature: tfrs.tasks.Ranking(loss=tf.keras.losses.BinaryCrossentropy()) 29 | for feature in self._ranking_features 30 | } 31 | 32 | def get_loss( 33 | self, 34 | query_embeddings: tf.Tensor, 35 | candidate_embeddings: tf.Tensor, 36 | ranks: dict[str, tf.Tensor], 37 | ): 38 | loss = 0 39 | inputs = tf.concat([query_embeddings, candidate_embeddings], axis=1) 40 | for feature, model in self._ranking_models.items(): 41 | rating_preds = self._call_layers(model, inputs) 42 | loss += ( 43 | self._ranking_tasks[feature]( 44 | labels=ranks[feature], predictions=rating_preds 45 | ) 46 | * self._ranking_weights[feature] 47 | ) 48 | return loss 49 | 50 | def _get_ranking_model(self) -> list[tf.keras.layers.Layer]: 51 | model = self._set_dense_layers(self._ranking_layers) 52 | model.append(tf.keras.layers.Dense(1, activation="sigmoid")) 53 | return model 54 | -------------------------------------------------------------------------------- /rexify/models/recommender.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import tensorflow as tf 3 | 4 | from rexify.models.callbacks import BruteForceCallback 5 | from rexify.models.ranking import RankingMixin 6 | from rexify.models.retrieval import RetrievalMixin 7 | from rexify.utils import get_sample_query 8 | 9 | 10 | class Recommender(RetrievalMixin, RankingMixin): 11 | """The main Recommender model. 12 | 13 | It expects a `tf.data.Dataset` where each example holds three keys: "query", "candidate" and "rank"; 14 | the query part of the dataset has two keys: 15 | 16 | * `user_id`, a scalar with the encoded user ID; 17 | * `history`, an array with the user's `window_size` previous item IDs 18 | 19 | The candidate part of the dataset has a single key: 20 | 21 | * `item_id`, a scalar with the encoded item ID 22 | 23 | The "rank" part holds one scalar per ranking feature, keyed by the feature's name. 24 | 25 | The query tower embeds the user ID, passes the item history through a sequential model, 26 | and runs the looked-up user features through a number of Dense layers; the concatenated 27 | outputs form the query representation. The candidate tower does the same with the item 28 | ID and the looked-up item features. 29 | 30 | An optional Ranking model is also included, granted there are `ranking_features`. 
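31 | During training, the total loss is the retrieval loss computed over the query and candidate embeddings, plus one weighted ranking loss per ranking feature (see `compute_loss`).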
32 | 33 | Args: 34 | user_dims (int): number of possible values for the user ID feature 35 | item_dims (int): number of possible values for the item ID feature 36 | user_embeddings (pd.DataFrame): precomputed user feature embeddings, indexed by encoded user ID 37 | item_embeddings (pd.DataFrame): precomputed item feature embeddings, indexed by encoded item ID 38 | session_history (pd.DataFrame): each user's last `window_size` item IDs, indexed by encoded user ID 39 | window_size (int): length of the item history sequences 40 | embedding_dim (int): output dimension of the embedding layers 41 | feature_layers (list): number of neurons in each layer for the feature models 42 | output_layers (list): number of neurons in each layer for the output models 43 | ranking_features (list): names of the event features to learn ranking objectives for 44 | ranking_layers (list): number of neurons in each layer for the ranking models 45 | ranking_weights (dict): weight of each ranking feature in the total loss 46 | 47 | Examples: 48 | >>> from rexify import FeatureExtractor, Recommender 49 | >>> fe = FeatureExtractor(schema, users="users.csv", items="items.csv") 50 | >>> dataset = fe.fit(events).transform(events).to_dataset() 51 | >>> model = Recommender(**fe.model_params) 52 | >>> model.compile() 53 | >>> _ = model.fit(dataset, batch_size=128) 54 | 55 | """ 56 | 57 | def __init__( 58 | self, 59 | user_dims: int, 60 | item_dims: int, 61 | user_embeddings: pd.DataFrame, 62 | item_embeddings: pd.DataFrame, 63 | session_history: pd.DataFrame, 64 | window_size: int = 3, 65 | embedding_dim: int = 32, 66 | feature_layers: list[int] = None, 67 | output_layers: list[int] = None, 68 | ranking_features: list[str] = None, 69 | ranking_layers: list[int] = None, 70 | ranking_weights: dict[str, float] = None, 71 | ): 72 | RetrievalMixin.__init__( 73 | self, 74 | user_dims=user_dims + 1, 75 | item_dims=item_dims + 1, 76 | user_embeddings=user_embeddings, 77 | item_embeddings=item_embeddings, 78 | session_history=session_history, 79 | window_size=window_size, 80 | embedding_dim=embedding_dim, 81 | feature_layers=feature_layers, 82 | output_layers=output_layers, 83 | ) 84 | 85 | RankingMixin.__init__( 86 | self, 87 | ranking_features=ranking_features, 88 | layer_sizes=ranking_layers, 89 | weights=ranking_weights, 90 | ) 91 | 92 | def compute_loss(self, inputs, training: bool = False) -> tf.Tensor: 93 | embeddings = self( 94 | inputs, training=training 95 | )  # Recommender inherits RetrievalMixin's call method 96 | loss = RetrievalMixin.get_loss(self, *embeddings) 97 | loss += RankingMixin.get_loss(self, *embeddings, inputs["rank"]) 98 | return loss 99 | 100 | def fit( 101 | self, 102 | x: tf.data.Dataset, 103 | batch_size: int = None, 104 | epochs: int = 1, 105 | callbacks: list[tf.keras.callbacks.Callback] = None, 106 | validation_data=None, 107 | ): 108 | callbacks = callbacks if callbacks else self._get_callbacks(x, batch_size) 109 | # todo: validate number of index callbacks 110 | # - can't be more than a single index for each model (query, candidate) 111 | 112 | if batch_size: 113 | x = x.batch(batch_size) 114 | if validation_data: 115 | validation_data = validation_data.batch(batch_size) 116 | 117 | return super().fit( 118 | x, epochs=epochs, validation_data=validation_data, callbacks=callbacks 119 | ) 120 | 121 | def get_config(self): 122 | return { 123 | "item_dims": self._item_dims, 124 | "user_dims": self._user_dims, 125 | "output_layers": self._output_layers, 126 | "feature_layers": self._feature_layers, 127 | "ranking_layers": self._ranking_layers, 128 | "ranking_features": self._ranking_features, 129 | "ranking_weights": self._ranking_weights, 130 | } 131 | 132 | @classmethod 133 | def load(cls, export_dir: str) -> 
tf.keras.Model: 134 | return tf.saved_model.load(export_dir) 135 | 136 | @staticmethod 137 | def _get_callbacks(x, batch_size: int = None) -> list[tf.keras.callbacks.Callback]: 138 | # required to set index shapes 139 | sample_query = get_sample_query(x)["query"] 140 | 141 | def get_index_callback(): 142 | try: 143 | import scann  # noqa: F401 144 | 145 | from rexify.models.callbacks import ScaNNCallback 146 | 147 | return ScaNNCallback(sample_query, batch_size=batch_size) 148 | 149 | except ImportError: 150 | return BruteForceCallback(sample_query, batch_size=batch_size) 151 | 152 | def get_mlflow_callback(): 153 | try: 154 | from rexify.models.callbacks import MlflowCallback 155 | 156 | return MlflowCallback() 157 | 158 | except ImportError: 159 | return 160 | 161 | callbacks = [get_index_callback(), get_mlflow_callback()] 162 | callbacks = callbacks[:-1] if callbacks[-1] is None else callbacks 163 | 164 | return callbacks 165 | -------------------------------------------------------------------------------- /rexify/models/retrieval/__init__.py: -------------------------------------------------------------------------------- 1 | from .retrieval import RetrievalMixin 2 | -------------------------------------------------------------------------------- /rexify/models/retrieval/candidate.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | from rexify.models.retrieval.tower import TowerModel 5 | 6 | 7 | class CandidateModel(TowerModel): 8 | """Tower model responsible for computing the candidate representations 9 | 10 | Args: 11 | n_items (int): number of possible values for the item ID feature 12 | identifiers (np.array): the known encoded item IDs 13 | feature_embeddings (np.array): the precomputed item feature embeddings 14 | embedding_dim (int): output dimension of the embedding layer 15 | output_layers (list): number of neurons in each layer for the output model 16 | feature_layers (list): number of neurons in each layer for the feature model 17 | 18 | Examples: 19 | 20 | >>> import numpy as np 21 | >>> from rexify.models.retrieval.candidate import CandidateModel 22 | >>> model = CandidateModel(n_items=15, identifiers=np.arange(15), feature_embeddings=np.random.rand(15, 8)) 23 | >>> model({'item_id': tf.constant([1])}) 24 | <tf.Tensor: shape=(1, 32), dtype=float32, numpy=...> 25 | """ 26 | 27 | def __init__( 28 | self, 29 | n_items: int, 30 | identifiers: np.array, 31 | feature_embeddings: np.array, 32 | embedding_dim: int = 32, 33 | output_layers: list[int] = None, 34 | feature_layers: list[int] = None, 35 | ): 36 | super().__init__( 37 | "item_id", 38 | n_items, 39 | identifiers, 40 | feature_embeddings, 41 | embedding_dim, 42 | output_layers, 43 | feature_layers, 44 | ) 45 | 46 | def call(self, inputs: dict[str, tf.Tensor], training: bool = None) -> tf.Tensor: 47 | x = self.embedding_layer(inputs[self._id_feature]) 48 | features = self.lookup_model(inputs[self._id_feature]) 49 | feature_embedding = self._call_layers(self.feature_model, features) 50 | x = tf.concat([x, feature_embedding], axis=1) 51 | x = self._call_layers(self.output_model, x) 52 | return x 53 | 54 | def get_config(self): 55 | config = super().get_config() 56 | config["n_items"] = self._n_dims 57 | return config 58 | -------------------------------------------------------------------------------- /rexify/models/retrieval/query.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import tensorflow as tf 4 | 5 | from rexify.models.lookup import SessionLookup 6 | from rexify.models.retrieval.tower import TowerModel 7 | from rexify.models.sequential import SequentialModel 8 | 9 | 10 | class 
QueryModel(TowerModel): 11 | """Tower model responsible for computing the query representations 12 | 13 | Args: 14 | n_users (int): number of possible values for the user ID feature 15 | n_items (int): number of possible values for the item ID feature 16 | identifiers (np.array): the known encoded user IDs 17 | feature_embeddings (np.array): the precomputed user feature embeddings 18 | session_history (pd.Series | pd.DataFrame): each user's previous item IDs, indexed by encoded user ID 19 | embedding_dim (int): output dimension of the embedding layer 20 | output_layers (list): number of neurons in each layer for the output model 21 | feature_layers (list): number of neurons in each layer for the feature model 22 | 23 | Examples: 24 | 25 | >>> import numpy as np, pandas as pd 26 | >>> from rexify.models.retrieval.query import QueryModel 27 | >>> history = pd.Series([np.zeros(3, dtype=int)] * 15) 28 | >>> model = QueryModel(n_users=15, n_items=30, identifiers=np.arange(15), feature_embeddings=np.random.rand(15, 8), session_history=history) 29 | >>> model({"user_id": tf.constant([1])}) 30 | <tf.Tensor: shape=(1, 32), dtype=float32, numpy=...> 31 | """ 32 | 33 | def __init__( 34 | self, 35 | n_users: int, 36 | n_items: int, 37 | identifiers: np.array, 38 | feature_embeddings: np.array, 39 | session_history: pd.DataFrame, 40 | embedding_dim: int = 32, 41 | output_layers: list[int] = None, 42 | feature_layers: list[int] = None, 43 | recurrent_layers: list[int] = None, 44 | sequential_dense_layers: list[int] = None, 45 | ): 46 | super().__init__( 47 | "user_id", 48 | n_users, 49 | identifiers, 50 | feature_embeddings, 51 | embedding_dim, 52 | output_layers, 53 | feature_layers, 54 | ) 55 | self._n_items = n_items 56 | self.sequential_model = SequentialModel( 57 | n_dims=n_items, 58 | embedding_dim=self._embedding_dim, 59 | recurrent_layer_sizes=recurrent_layers, 60 | dense_layer_sizes=sequential_dense_layers, 61 | ) 62 | self.session_lookup = SessionLookup( 63 | ids=session_history.index.values.astype(int), 64 | sessions=np.stack(session_history.values).astype(int), 65 | ) 66 | 67 | def call(self, inputs: dict[str, tf.Tensor], training: bool = None) -> tf.Tensor: 68 | x = self.embedding_layer(inputs[self._id_feature]) 69 | features = [self.lookup_model(inputs[self._id_feature])] 70 | 71 | history = ( 72 | self.session_lookup(inputs[self._id_feature]) 73 | if not training 74 | else inputs["history"] 75 | ) 76 | 77 | sequential_embedding = self.sequential_model(history) 78 | x = tf.concat([x, sequential_embedding], axis=1) 79 | 80 | features = tf.concat(features, axis=1) if len(features) > 1 else features[0] 81 | feature_embedding = self._call_layers(self.feature_model, features) 82 | x = tf.concat([x, feature_embedding], axis=1) 83 | 84 | x = self._call_layers(self.output_model, x) 85 | return x 86 | 87 | def get_config(self): 88 | config = super().get_config() 89 | config["user_id"] = self._id_feature 90 | config["n_users"] = self._n_dims 91 | config["n_items"] = self._n_items 92 | return config 93 | -------------------------------------------------------------------------------- /rexify/models/retrieval/retrieval.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | 3 | import pandas as pd 4 | import tensorflow as tf 5 | import tensorflow_recommenders as tfrs 6 | 7 | from rexify.models.retrieval.candidate import CandidateModel 8 | from rexify.models.retrieval.query import QueryModel 9 | 10 | 11 | class RetrievalMixin(tfrs.Model, ABC): 12 | def __init__( 13 | self, 14 | user_dims: int, 15 | item_dims: int, 16 | user_embeddings: pd.DataFrame, 17 | item_embeddings: pd.DataFrame, 18 | session_history: pd.DataFrame, 19 | window_size: int = 3, 20 | embedding_dim: int = 32, 21 | feature_layers: list[int] = None, 22 | output_layers: list[int] = None, 23 | **kwargs 24 | ): 25 | super().__init__() 26 | self._user_dims = user_dims 27 | self._item_dims = item_dims 28 | self._window_size = window_size 29 | self._embedding_dim = 
embedding_dim 30 | self._output_layers = output_layers or [64, 32] 31 | self._feature_layers = feature_layers or [64, 32, 16] 32 | joint_args = { 33 | "embedding_dim": self._embedding_dim, 34 | "output_layers": self._output_layers, 35 | "feature_layers": self._feature_layers, 36 | } 37 | 38 | self.query_model = QueryModel( 39 | self._user_dims, 40 | self._item_dims, 41 | identifiers=user_embeddings.index.values.astype(int), 42 | feature_embeddings=user_embeddings.values.astype(float), 43 | session_history=session_history, 44 | **joint_args 45 | ) 46 | 47 | self.candidate_model = CandidateModel( 48 | self._item_dims, 49 | identifiers=item_embeddings.index.values.astype(int), 50 | feature_embeddings=item_embeddings.values.astype(float), 51 | **joint_args 52 | ) 53 | 54 | self.retrieval_task = tfrs.tasks.Retrieval() 55 | 56 | def call(self, inputs, training: bool = False): 57 | query_embeddings: tf.Tensor = self.query_model( 58 | inputs["query"], training=training 59 | ) 60 | candidate_embeddings: tf.Tensor = self.candidate_model( 61 | inputs["candidate"], training=training 62 | ) 63 | return query_embeddings, candidate_embeddings 64 | 65 | def get_loss(self, *embeddings): 66 | return self.retrieval_task(*embeddings) 67 | 68 | @property 69 | def window_size(self): 70 | return self._window_size 71 | -------------------------------------------------------------------------------- /rexify/models/retrieval/tower.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | 6 | from rexify.models.base import DenseSetterMixin 7 | from rexify.models.lookup import EmbeddingLookup 8 | 9 | 10 | class TowerModel(tf.keras.Model, DenseSetterMixin): 11 | """Base tower model, combining an ID embedding with precomputed feature embeddings. 12 | 13 | Args: 14 | id_feature (str): the name of the ID feature 15 | n_dims (int): number of possible values for the ID feature 16 | embedding_dim (int): output dimension of the embedding layer 17 | layer_sizes (list): number of neurons in each layer for the output model 18 | feature_layers (list): number of neurons in each layer for the feature model 19 | 20 | Attributes: 21 | embedding_layer (tf.keras.layers.Embedding): embedding layer for the ID feature 22 | feature_model (list): Dense layers applied to the looked-up feature embeddings 23 | output_model (list): Dense layers that produce the final representation 24 | """ 25 | 26 | def __init__( 27 | self, 28 | id_feature: str, 29 | n_dims: int, 30 | identifiers: np.array, 31 | feature_embeddings: np.array, 32 | embedding_dim: int = 32, 33 | layer_sizes: list[int] = None, 34 | feature_layers: list[int] = None, 35 | ): 36 | super().__init__() 37 | self._id_feature = id_feature 38 | self._n_dims = n_dims 39 | self._embedding_dim = embedding_dim 40 | self._layer_sizes = layer_sizes or [64, 32] 41 | self._feature_layers = feature_layers or [64, 32, 16] 42 | self._identifiers = identifiers 43 | self._target_features = feature_embeddings 44 | 45 | self.embedding_layer = tf.keras.layers.Embedding(n_dims, embedding_dim) 46 | self.feature_model = self._set_dense_layers(self._feature_layers) 47 | self.lookup_model = EmbeddingLookup( 48 | ids=self._identifiers, embeddings=self._target_features 49 | ) 50 | self.output_model = self._set_dense_layers(self._layer_sizes, activation=None) 51 | 52 | @abstractmethod 53 | def call(self, inputs: dict[str, tf.Tensor], training: bool = None): 54 | raise NotImplementedError 55 | 56 | def get_config(self): 57 | return { 58 | "id_features": self._id_feature, 59 | "n_dims": self._n_dims, 60 | "embedding_dim": self._embedding_dim, 61 | "layer_sizes": self._layer_sizes, 62 | "feature_layers": self._feature_layers, 63 
| "identifiers": self._identifiers, 64 | "feature_embeddings": self._target_features, 65 | } 66 | 67 | @property 68 | def identifiers(self): 69 | return self._identifiers 70 | -------------------------------------------------------------------------------- /rexify/models/sequential.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from rexify.models.base import DenseSetterMixin 4 | 5 | 6 | class SequentialModel(tf.keras.Model, DenseSetterMixin): 7 | def __init__( 8 | self, 9 | n_dims: int, 10 | embedding_dim: int, 11 | layer: str = "LSTM", 12 | activation: str = "relu", 13 | recurrent_layer_sizes: list[int] = None, 14 | dense_layer_sizes: list[int] = None, 15 | ): 16 | super().__init__() 17 | self._layer = layer 18 | self._n_dims = n_dims 19 | self._embedding_dim = embedding_dim 20 | self._activation = activation 21 | self._recurrent_layer_sizes = recurrent_layer_sizes or [32] * 2 22 | self._dense_layer_sizes = dense_layer_sizes or [32, 16] 23 | 24 | self.embedding_layer = tf.keras.layers.Embedding( 25 | self._n_dims, self._embedding_dim 26 | ) 27 | 28 | self.recurrent_model = self._set_recurrent_model() 29 | 30 | self.output_model = self._set_dense_layers( 31 | layer_sizes=self._dense_layer_sizes[:-1], activation=activation 32 | ) 33 | self.output_model.append(tf.keras.layers.Dense(self._dense_layer_sizes[-1])) 34 | 35 | def call(self, inputs: tf.Tensor): 36 | x = tf.cast(inputs, tf.int32) 37 | x = self.embedding_layer(x) 38 | x = self._call_layers(self.recurrent_model, x) 39 | return self._call_layers(self.output_model, x) 40 | 41 | def _set_recurrent_model(self) -> tf.keras.Model: 42 | layer = getattr(tf.keras.layers, self._layer) 43 | layers = self._set_sequential_model( 44 | layer=layer, 45 | layer_sizes=self._recurrent_layer_sizes[:-1], 46 | return_sequences=True, 47 | ) 48 | layers.append(layer(self._recurrent_layer_sizes[-1])) 49 | return layers 50 | 51 | def get_config(self): 52 | return { 53 | "n_dims": self._n_dims, 54 | "embedding_dim": self._embedding_dim, 55 | "layer": self._layer, 56 | "activation": self._activation, 57 | "recurrent_layer_sizes": self._recurrent_layer_sizes, 58 | "dense_layer_sizes": self._dense_layer_sizes, 59 | } 60 | -------------------------------------------------------------------------------- /rexify/pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | PIPELINE_ROOT = os.environ.get("PIPELINE_ROOT", "outputs") 5 | -------------------------------------------------------------------------------- /rexify/pipeline/__main__.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import typer 4 | from kfp.v2.compiler import Compiler 5 | from kfp.v2.dsl import pipeline 6 | 7 | from rexify.pipeline import PIPELINE_ROOT 8 | from rexify.pipeline.components import load, train 9 | 10 | 11 | @pipeline(name="pipeline", pipeline_root=PIPELINE_ROOT) 12 | def pipeline( 13 | events: str, 14 | users: str, 15 | items: str, 16 | schema: str, 17 | epochs: int = 100, 18 | batch_size: int = 512, 19 | ): 20 | 21 | load_task = load( 22 | events=events, 23 | users=users, 24 | items=items, 25 | schema=schema, 26 | ) 27 | 28 | train_task = train( # noqa:F841 29 | feature_extractor=load_task.outputs["feature_extractor"], 30 | train_data=load_task.outputs["train_data"], 31 | validation_data=load_task.outputs["validation_data"], 32 | batch_size=batch_size, 33 | epochs=epochs, 34 | ) 
44 | 45 | 46 | def compile( 47 | output_path: str = typer.Option( 48 | None, help="Output path for the pipeline definition JSON file" 49 | ), 50 | parameter: list[str] = typer.Option( 51 | None, "--parameter", "-p", help="Pipeline parameter, KEY=VALUE" 52 | ), 53 | ): 54 | output_path = output_path if output_path else "pipeline.json" 55 | 56 | pipeline_parameters = ( 57 | {k: v for k, v in [param.split("=") for param in parameter]} 58 | if parameter 59 | else None 60 | ) 61 | 62 | with warnings.catch_warnings(): 63 | warnings.filterwarnings("ignore") 64 | Compiler().compile( 65 | pipeline_func=pipeline, 66 | package_path=output_path, 67 | pipeline_parameters=pipeline_parameters, 68 | ) 69 | 70 | 71 | if __name__ == "__main__": 72 | typer.run(compile) 73 | -------------------------------------------------------------------------------- /rexify/pipeline/components/__init__.py: -------------------------------------------------------------------------------- 1 | from .load import load 2 | from .train import train 3 | -------------------------------------------------------------------------------- /rexify/pipeline/components/load.py: -------------------------------------------------------------------------------- 1 | from kfp.v2.dsl import Artifact, Dataset, Output, component 2 | 3 | from rexify import BASE_IMAGE 4 | 5 | 6 | @component(base_image=BASE_IMAGE) 7 | def load( 8 | events: str, 9 | users: str, 10 | items: str, 11 | schema: str, 12 | feature_extractor: Output[Artifact], 13 | train_data: Output[Dataset], 14 | validation_data: Output[Dataset], 15 | test_size: float = 0.3, 16 | ): 17 | import json 18 | 19 | from rexify import Events, FeatureExtractor, Output, Schema 20 | 21 | schema = Schema.from_dict(json.loads(schema)) 22 | train, val = Events.load(events, schema=schema).split(test_size=test_size) 23 | 24 | fe = FeatureExtractor(schema, users, items, return_dataset=False) 25 | train: Output = fe.fit(train).transform(train) 26 | val: Output = fe.transform(val) 27 | 28 | fe.save(feature_extractor.path) 29 | train.save(train_data.path)  # save directly under the artifact path, so the train component can load it back 30 | val.save(validation_data.path) 31 | -------------------------------------------------------------------------------- /rexify/pipeline/components/train.py: -------------------------------------------------------------------------------- 1 | from kfp.v2.dsl import Artifact, Dataset, Input, Model, Output, component 2 | 3 | from rexify import BASE_IMAGE 4 | 5 | 6 | @component(base_image=BASE_IMAGE) 7 | def train( 8 | feature_extractor: Input[Artifact], 9 | train_data: Input[Dataset], 10 | validation_data: Input[Dataset], 11 | model: Output[Model], 12 | batch_size: int = 512, 13 | epochs: int = 10, 14 | ): 15 | from rexify import FeatureExtractor, Output, Recommender 16 | 17 | fe = FeatureExtractor.load(feature_extractor.path) 18 | train_data = Output.load(train_data.path).to_dataset()  # the model consumes a tf.data.Dataset 19 | validation_data = Output.load(validation_data.path).to_dataset() 20 | 21 | fit_params = {"batch_size": batch_size, "epochs": epochs} 22 | recommender = Recommender(**fe.model_params) 23 | recommender.compile() 24 | recommender.fit(train_data, validation_data=validation_data, **fit_params) 25 | recommender.save(model.path) 26 | -------------------------------------------------------------------------------- /rexify/schema.py: -------------------------------------------------------------------------------- 1 | import json 2 | from copy import deepcopy 3 | 4 | from rexify.utils import get_target_id 5 | 6 | 7 | class _JSONSerializable: 8 | def to_dict(self): 9 | 
9 |         return self.__dict__.copy()
10 | 
11 | 
12 | class _TargetSchema(_JSONSerializable):
13 | 
14 |     _SUPPORTED_DATA_TYPES = ["category", "number", "id"]
15 | 
16 |     def __init__(self, id_: str, **features):
17 |         self.id = id_
18 |         for feature_name, dtype in features.items():
19 |             self._validate_features(feature_name, dtype)
20 |             setattr(self, feature_name, dtype)
21 | 
22 |     @classmethod
23 |     def _validate_features(cls, feature_name: str, dtype: str):
24 |         if dtype not in cls._SUPPORTED_DATA_TYPES:
25 |             raise ValueError(
26 |                 f"""
27 |                 Data type not supported for feature `{feature_name}`.
28 |                 Supported data types are: {cls._SUPPORTED_DATA_TYPES}
29 |                 """
30 |             )
31 | 
32 | 
33 | class Schema(_JSONSerializable):
34 |     def __init__(
35 |         self,
36 |         user_id: str,
37 |         item_id: str,
38 |         timestamp: str,
39 |         event_type: str,
40 |         user_features: dict[str, str] = None,
41 |         item_features: dict[str, str] = None,
42 |     ):
43 |         user_features = user_features or {}
44 |         item_features = item_features or {}
45 |         self.user = _TargetSchema(user_id, **user_features)
46 |         self.item = _TargetSchema(item_id, **item_features)
47 |         self.timestamp = timestamp
48 |         self.event_type = event_type
49 | 
50 |     @classmethod
51 |     def from_json(cls, schema_path: str):
52 |         with open(schema_path, "r") as f:
53 |             schema = json.load(f)
54 |         return cls.from_dict(schema)
55 | 
56 |     @classmethod
57 |     def from_dict(cls, schema: dict[str, str | dict[str, str]]):
58 |         schema_ = deepcopy(schema)
59 |         user_id = get_target_id(schema_, "user")[0]
60 |         _ = schema_["user"].pop(user_id)
61 | 
62 |         item_id = get_target_id(schema_, "item")[0]
63 |         _ = schema_["item"].pop(item_id)
64 | 
65 |         return cls(
66 |             user_id=user_id,
67 |             item_id=item_id,
68 |             timestamp=schema_["timestamp"],
69 |             event_type=schema_["event_type"],
70 |             user_features=schema_["user"],
71 |             item_features=schema_["item"],
72 |         )
73 | 
74 |     def to_dict(self):
75 |         schema = dict()
76 |         schema["user"] = self.user.to_dict()
77 |         schema["user"][schema["user"]["id"]] = "id"  # swap {"id": "user_id"} into {"user_id": "id"}
78 |         _ = schema["user"].pop("id")
79 | 
80 |         schema["item"] = self.item.to_dict()
81 |         schema["item"][schema["item"]["id"]] = "id"
82 |         _ = schema["item"].pop("id")
83 | 
84 |         schema["event_type"] = self.event_type
85 |         schema["timestamp"] = self.timestamp
86 |         return schema
87 | 
88 |     def save(self, path: str):
89 |         with open(path, "w") as f:
90 |             json.dump(self.to_dict(), f, indent=4)
91 | 
--------------------------------------------------------------------------------
/rexify/utils.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | 
3 | import tensorflow as tf
4 | 
5 | 
6 | def _get_target(schema, target: str):
7 |     return schema[target] if isinstance(schema, dict) else getattr(schema, target).to_dict()
8 | 
9 | 
10 | def get_target_id(schema, target: str) -> list[str]:
11 |     if not isinstance(schema, dict):
12 |         return [getattr(schema, target).id]
13 |     return [k for k, v in schema[target].items() if v == "id"]
14 | 
15 | 
16 | def get_target_feature(schema, target: str, type_: str):
17 |     def mask(x: tuple):
18 |         return x[1] == type_
19 | 
20 |     schema_dict = _get_target(schema, target)
21 |     return list(map(lambda x: x[0], filter(mask, schema_dict.items())))
22 | 
23 | 
24 | def make_dirs(*args):
25 |     for dir_ in args:
26 |         Path(dir_).mkdir(parents=True, exist_ok=True)
27 | 
28 | 
29 | def get_sample_query(x: tf.data.Dataset):
30 |     return list(x.batch(1).take(1))[0]
31 | 
--------------------------------------------------------------------------------
/tests/test_extractor.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tempfile
3 | from pathlib import Path
4 | 
5 | import numpy as np
6 | import pandas as pd
7 | import pytest
8 | from sklearn.preprocessing import StandardScaler
9 | 
10 | from rexify import FeatureExtractor, Output, Schema
11 | from rexify.features.transform import CustomTransformer
12 | 
13 | 
14 | class TestFeatureExtractor:
15 |     @pytest.fixture(scope="class")
16 |     def schema(self):
17 |         user_id = "user_id"
18 |         item_id = "item_id"
19 |         timestamp = "timestamp"
20 |         event_type = "event_type"
21 |         user_features = {"age": "number", "gender": "category"}
22 |         item_features = {"price": "number", "category": "category"}
23 |         return Schema(
24 |             user_id, item_id, timestamp, event_type, user_features, item_features
25 |         )
26 | 
27 |     @pytest.fixture(scope="class")
28 |     def data(self):
29 |         return pd.DataFrame(
30 |             {
31 |                 "user_id": [1, 1, 2, 2, 3, 3],
32 |                 "item_id": [10, 20, 10, 20, 30, 40],
33 |                 "timestamp": [1, 2, 3, 4, 5, 6],
34 |                 "event_type": ["p", "p", "p", "p", "p", "p"],
35 |             }
36 |         )
37 | 
38 |     @pytest.fixture(scope="class")
39 |     def users(self):
40 |         return pd.DataFrame(
41 |             {"user_id": [1, 2, 3], "age": [25, 30, 35], "gender": ["M", "F", "M"]}
42 |         )
43 | 
44 |     @pytest.fixture(scope="class")
45 |     def items(self):
46 |         return pd.DataFrame(
47 |             {"item_id": [10, 20, 30], "price": [1, 2, 3], "category": ["1", "2", "3"]}
48 |         )
49 | 
50 |     @pytest.fixture(scope="class")
51 |     def feat(self, schema, users, items):
52 |         users, items = self._save_users_items(users, items)
53 |         return FeatureExtractor(schema, users, items)
54 | 
55 |     def test_fit(self, data, feat):
56 |         _ = feat.fit(data)
57 | 
58 |     def test_transform(self, data, feat):
59 |         transformed = feat.fit(data).transform(data)
60 |         assert isinstance(transformed, Output)
61 | 
62 |     @pytest.fixture(scope="class")
63 |     def custom_feat(self, schema, users, items):
64 |         users["custom_feature"] = np.random.randint(100, 200, size=users.shape[0])
65 |         users, items = self._save_users_items(users, items)
66 |         return FeatureExtractor(
67 |             schema,
68 |             users,
69 |             items,
70 |             custom_transformers=[
71 |                 CustomTransformer("user", StandardScaler(), ["custom_feature"])
72 |             ],
73 |         )
74 | 
75 |     def test_fit_custom(self, data, feat, custom_feat):
76 |         _ = feat.fit(data)
77 |         _ = custom_feat.fit(data)
78 |         assert feat.model_params["user_embeddings"].shape[1] == 3
79 |         assert custom_feat.model_params["user_embeddings"].shape[1] == 4
80 | 
81 |     def test_save_load(self, data, feat):
82 |         _ = feat.fit(data).transform(data)
83 |         tmp_dir = tempfile.mkdtemp()
84 |         feat.save(tmp_dir)
85 |         feat_path = Path(tmp_dir) / "feature_extractor.pickle"
86 |         assert feat_path.exists()
87 | 
88 |         fe = FeatureExtractor.load(feat_path)
89 |         assert fe
90 | 
91 |     @pytest.fixture(scope="class")
92 |     def fe_no_data(self, schema, users, items):
93 |         users, items = self._save_users_items(users, items)
94 |         return FeatureExtractor(schema, users, items, return_dataset=False)
95 | 
96 |     def test_make_dataset(self, data, fe_no_data):
97 |         transformed = fe_no_data.fit(data).transform(data)
98 | 
99 |         tmp_dir = tempfile.mkdtemp()
100 |         transformed_path = Path(tmp_dir)
101 |         transformed.save(transformed_path)
102 | 
103 |         df = Output.load(transformed_path)
104 |         df.to_dataset()
105 | 
106 |     def _save_users_items(self, users, items) -> tuple[str, str]:
107 |         tmp_dir = tempfile.mkdtemp()
108 | 
109 |         users_path = os.path.join(tmp_dir, "users.csv")
110 |         users.to_csv(users_path)
111 | 
112 |         items_path = os.path.join(tmp_dir, "items.csv")
113 |         items.to_csv(items_path)
114 | 
115 |         return users_path, items_path
116 | 
--------------------------------------------------------------------------------
/tests/test_schema.py:
--------------------------------------------------------------------------------
1 | import json
2 | import tempfile
3 | 
4 | import pytest
5 | 
6 | from rexify.schema import Schema, _TargetSchema
7 | 
8 | 
9 | def test_init():
10 |     user_id = "user_id"
11 |     item_id = "item_id"
12 |     timestamp = "timestamp"
13 |     event_type = "event_type"
14 |     user_features = {"age": "number", "gender": "category"}
15 |     item_features = {"price": "number", "category": "category"}
16 |     schema = Schema(
17 |         user_id=user_id,
18 |         item_id=item_id,
19 |         timestamp=timestamp,
20 |         event_type=event_type,
21 |         user_features=user_features,
22 |         item_features=item_features,
23 |     )
24 | 
25 |     assert schema.user.id == "user_id"
26 |     assert schema.user.age == "number"
27 |     assert schema.user.gender == "category"
28 |     assert schema.item.id == "item_id"
29 |     assert schema.item.price == "number"
30 |     assert schema.item.category == "category"
31 |     assert schema.timestamp == timestamp
32 |     assert schema.event_type == event_type
33 | 
34 | 
35 | def test_from_dict():
36 |     schema_dict = {
37 |         "user": {"user_id": "id", "age": "number", "gender": "category"},
38 |         "item": {"item_id": "id", "price": "number", "category": "category"},
39 |         "timestamp": "timestamp",
40 |         "event_type": "event_type",
41 |     }
42 | 
43 |     schema = Schema.from_dict(schema_dict)
44 | 
45 |     assert schema.user.id == "user_id"
46 |     assert schema.user.age == "number"
47 |     assert schema.user.gender == "category"
48 |     assert schema.item.id == "item_id"
49 |     assert schema.item.price == "number"
50 |     assert schema.item.category == "category"
51 |     assert schema.timestamp == "timestamp"
52 |     assert schema.event_type == "event_type"
53 | 
54 | 
55 | def test_load():
56 |     schema_dict = {
57 |         "user": {"user_id": "id", "age": "number", "gender": "category"},
58 |         "item": {"item_id": "id", "price": "number", "category": "category"},
59 |         "timestamp": "timestamp",
60 |         "event_type": "event_type",
61 |     }
62 | 
63 |     with tempfile.NamedTemporaryFile(mode="w", delete=False) as f:
64 |         json.dump(schema_dict, f)
65 |         f.seek(0)  # seeking flushes the write buffer so the file can be re-read
66 |         schema = Schema.from_json(f.name)
67 | 
68 |     assert schema.user.id == "user_id"
69 |     assert schema.user.age == "number"
70 |     assert schema.user.gender == "category"
71 |     assert schema.item.id == "item_id"
72 |     assert schema.item.price == "number"
73 |     assert schema.item.category == "category"
74 |     assert schema.timestamp == "timestamp"
75 |     assert schema.event_type == "event_type"
76 | 
77 | 
78 | def test_target_schema():
79 |     # Test data types are valid
80 |     target = _TargetSchema("id", feature1="category", feature2="number")
81 |     assert hasattr(target, "id")
82 |     assert hasattr(target, "feature1")
83 |     assert target.feature1 == "category"
84 |     assert hasattr(target, "feature2")
85 |     assert target.feature2 == "number"
86 | 
87 |     # Test unsupported data type throws error
88 |     with pytest.raises(ValueError, match=r"Data type not supported"):
89 |         _ = _TargetSchema("id", feature1="string")
90 | 
91 | 
92 | def test_schema_io():
93 |     # Test Schema to_dict method
94 |     user_id = "user_id"
95 |     item_id = "item_id"
96 |     timestamp = "timestamp"
97 |     event_type = "event_type"
98 |     user_features = {"age": "number", "gender": "category"}
99 |     item_features = {"price": "number", "category": "category"}
100 |     schema = Schema(
101 |         user_id, item_id, timestamp, event_type, user_features, item_features
102 |     )
103 |     assert schema.to_dict() == {
104 |         "user": {"user_id": "id", "age": "number", "gender": "category"},
105 |         "item": {"item_id": "id", "price": "number", "category": "category"},
106 |         "timestamp": "timestamp",
107 |         "event_type": "event_type",
108 |     }
109 | 
110 |     # Test Schema from_dict method
111 |     schema_dict = schema.to_dict()
112 |     schema_loaded = Schema.from_dict(schema_dict)
113 |     assert schema_loaded.to_dict() == schema.to_dict()
114 | 
115 |     # Test Schema load method
116 |     with open("test_schema.json", "w") as f:
117 |         json.dump(schema_dict, f, indent=4)
118 |     schema_loaded = Schema.from_json("test_schema.json")
119 |     assert schema_loaded.to_dict() == schema.to_dict()
120 | 
121 |     # Test Schema save method
122 |     schema.save("test_schema.json")
123 |     with open("test_schema.json", "r") as f:
124 |         schema_loaded = json.load(f)
125 |     assert schema_loaded == schema_dict
126 | 
--------------------------------------------------------------------------------
/tests/test_utils.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | 
3 | import pytest
4 | 
5 | from rexify.utils import get_target_feature, get_target_id, make_dirs
6 | 
7 | 
8 | @pytest.fixture
9 | def schema():
10 |     return {
11 |         "target1": {"key1": "id", "key2": "value1"},
12 |         "target2": {"key3": "value2", "key4": "id"},
13 |         "target3": {"key5": "value3", "key6": "value4"},
14 |     }
15 | 
16 | 
17 | def test_get_target_id(schema):
18 |     assert get_target_id(schema, "target1") == ["key1"]
19 |     assert get_target_id(schema, "target2") == ["key4"]
20 |     assert get_target_id(schema, "target3") == []
21 | 
22 | 
23 | def test_get_target_feature(schema):
24 |     assert get_target_feature(schema, "target1", "id") == ["key1"]
25 |     assert get_target_feature(schema, "target1", "value1") == ["key2"]
26 |     assert get_target_feature(schema, "target2", "id") == ["key4"]
27 |     assert get_target_feature(schema, "target2", "value2") == ["key3"]
28 |     assert get_target_feature(schema, "target3", "value3") == ["key5"]
29 |     assert get_target_feature(schema, "target3", "value4") == ["key6"]
30 |     assert get_target_feature(schema, "target3", "value5") == []
31 | 
32 | 
33 | def test_make_dirs(tmpdir):
34 |     dir1 = tmpdir.join("dir1")  # paths only, not created yet, so make_dirs does the work
35 |     dir2 = tmpdir.join("dir2")
36 |     make_dirs(dir1, dir2)
37 | 
38 |     assert Path(dir1).exists()
39 |     assert Path(dir2).exists()
40 | 
--------------------------------------------------------------------------------