├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── custom_inference ├── python │ ├── cats_and_dogs │ │ ├── Main_Script.ipynb │ │ └── cats-and-dogs-README.md │ ├── gan_mnist │ │ ├── Colab - MNIST GAN Unstructured Model.ipynb │ │ ├── README.md │ │ ├── custom.py │ │ └── gan_weights.h5 │ ├── image_object_detection │ │ ├── README.md │ │ ├── analyze_image_object_detection_deployment.ipynb │ │ ├── coco_90.json │ │ └── model │ │ │ ├── custom.py │ │ │ ├── efficientdet-d1.h5 │ │ │ ├── efficientnet.py │ │ │ ├── initializers.py │ │ │ ├── layers.py │ │ │ ├── model_load_utils.py │ │ │ ├── object_detection_model.py │ │ │ ├── object_detection_utils.py │ │ │ ├── requirements.txt │ │ │ └── tfkeras.py │ ├── imdb_graph_isomorphism │ │ ├── Colab - Graph Isomorphism Network.ipynb │ │ ├── README.md │ │ ├── custom.py │ │ ├── gin.py │ │ ├── gin_model.h5 │ │ ├── requirements.txt │ │ ├── schema_graph.avsc │ │ └── score_data.py │ ├── insurance_pricing │ │ ├── Main_Script.ipynb │ │ ├── README.md │ │ ├── artifacts │ │ │ ├── lgbm.joblib │ │ │ ├── ordinalEncoder.joblib │ │ │ ├── simpleImputerCat.joblib │ │ │ └── simpleImputerNum.joblib │ │ ├── custom.py │ │ ├── data │ │ │ └── loss_cost_short.csv │ │ ├── feature_detail.yaml │ │ ├── mymodel.py │ │ ├── requirements.txt │ │ └── schema.avsc │ ├── mnist │ │ ├── MNIST for DRUM.ipynb │ │ ├── data │ │ │ ├── mnist.csv │ │ │ └── test.csv │ │ └── drum │ │ │ ├── classlabels.txt │ │ │ ├── custom.py │ │ │ ├── mnist.h5 │ │ │ └── requirements.txt │ ├── movie_recommender │ │ ├── DRUM_Recommender.ipynb │ │ ├── read_me.md │ │ └── recommender_model │ │ │ ├── custom.py │ │ │ ├── movies.csv │ │ │ ├── predict.csv │ │ │ ├── ratings_file.csv │ │ │ ├── saved_model.pb │ │ │ └── variables │ │ │ ├── variables.data-00000-of-00001 │ │ │ └── variables.index │ └── readmissions │ │ ├── README.md │ │ ├── Readmission_level_1 │ │ ├── Main_Script_Level_1.ipynb │ │ ├── catboost_info │ │ │ ├── catboost_training.json │ │ │ ├── learn │ │ │ │ └── events.out.tfevents │ │ │ ├── learn_error.tsv │ │ │ └── time_left.tsv │ │ └── custom_model │ │ │ ├── custom.py │ │ │ ├── model.pkl │ │ │ └── requirements.txt │ │ ├── Readmission_level_2 │ │ ├── Main_Script_Level_2.ipynb │ │ └── custom_model │ │ │ ├── custom.py │ │ │ ├── model.pkl │ │ │ ├── preprocessing.pkl │ │ │ └── requirements.txt │ │ ├── Readmission_level_3 │ │ ├── Main_Script_Level_3.ipynb │ │ └── custom_model │ │ │ ├── custom.py │ │ │ ├── model.pkl │ │ │ ├── preprocessing.pkl │ │ │ └── requirements.txt │ │ ├── data │ │ ├── readmissions_test.csv │ │ └── readmissions_train.csv │ │ └── requirements.txt ├── r │ ├── README.md │ ├── r_glm_noncaret_basic │ │ ├── README.md │ │ ├── create_pipeline.R │ │ └── custom.R │ ├── r_glm_noncaret_feateng │ │ ├── README.md │ │ ├── create_pipeline.R │ │ ├── custom.R │ │ ├── preprocess.R │ │ ├── rmcons.R │ │ └── rmident.R │ ├── r_glm_noncaret_gamma │ │ ├── README.md │ │ ├── create_pipeline.R │ │ └── custom.R │ ├── r_glm_noncaret_logit │ │ ├── README.md │ │ ├── create_pipeline.R │ │ └── custom.R │ └── r_glm_noncaret_recipe │ │ ├── README.md │ │ ├── create_pipeline.R │ │ └── custom.R └── scala │ └── iris_binary │ ├── README.md │ ├── build.sbt │ ├── custom-model │ ├── custom-scala-assembly-0.1.0.jar │ └── xgb-model │ │ └── model.bin │ ├── data │ └── iris_binary_training.csv │ ├── lib │ └── predictors.jar │ ├── project │ ├── build.properties │ └── plugins.sbt │ └── src │ └── main │ └── scala │ ├── Main.scala │ └── XGBoostPredictor.scala ├── custom_tasks ├── models │ ├── classification │ │ ├── python │ │ │ 
├── README.md │ │ │ ├── catboost │ │ │ │ ├── catboost_pipeline.py │ │ │ │ ├── custom.py │ │ │ │ ├── feature_selection.py │ │ │ │ └── requirements.txt │ │ │ └── graph_isomorphism_network │ │ │ │ ├── GNN_Custom_Task.ipynb │ │ │ │ ├── README.md │ │ │ │ ├── custom_task_gin │ │ │ │ ├── custom.py │ │ │ │ ├── graph_isomorphism_network.py │ │ │ │ └── requirements.txt │ │ │ │ ├── env │ │ │ │ ├── Dockerfile │ │ │ │ ├── README.md │ │ │ │ ├── __init__.py │ │ │ │ ├── dr_requirements.txt │ │ │ │ ├── env_info.json │ │ │ │ ├── fit.sh │ │ │ │ ├── requirements.txt │ │ │ │ └── start_server.sh │ │ │ │ ├── graph.csv │ │ │ │ └── graph2.csv │ │ └── r │ │ │ └── README.md │ ├── regression │ │ ├── python │ │ │ └── README.md │ │ └── r │ │ │ └── README.md │ └── unsupervised │ │ ├── python │ │ └── README.md │ │ └── r │ │ └── README.md ├── other │ ├── README.md │ ├── python │ │ ├── README.md │ │ └── round_predictions │ │ │ └── custom.py │ └── r │ │ └── README.md └── preprocessing │ ├── categorical │ ├── python │ │ ├── encoding │ │ │ ├── any_target │ │ │ │ ├── all_enc_catboost │ │ │ │ │ ├── README.md │ │ │ │ │ ├── custom.py │ │ │ │ │ └── requirements.txt │ │ │ │ ├── all_enc_hashing │ │ │ │ │ ├── README.md │ │ │ │ │ ├── custom.py │ │ │ │ │ └── requirements.txt │ │ │ │ └── all_enc_mest │ │ │ │ │ ├── README.md │ │ │ │ │ ├── custom.py │ │ │ │ │ └── requirements.txt │ │ │ ├── binary_target │ │ │ │ ├── binary_enc_backward_differencing │ │ │ │ │ ├── README.md │ │ │ │ │ ├── custom.py │ │ │ │ │ └── requirements.txt │ │ │ │ ├── binary_enc_glm │ │ │ │ │ ├── README.md │ │ │ │ │ ├── custom.py │ │ │ │ │ └── requirements.txt │ │ │ │ ├── binary_enc_helmert │ │ │ │ │ ├── README.md │ │ │ │ │ ├── custom.py │ │ │ │ │ └── requirements.txt │ │ │ │ ├── binary_enc_leaveonout │ │ │ │ │ ├── README.md │ │ │ │ │ ├── custom.py │ │ │ │ │ └── requirements.txt │ │ │ │ └── binary_enc_woe │ │ │ │ │ ├── README.md │ │ │ │ │ ├── custom.py │ │ │ │ │ └── requirements.txt │ │ │ ├── multiclass_target │ │ │ │ └── .gitignore │ │ │ └── regression_target │ │ │ │ └── regression_enc_glm │ │ │ │ ├── README.md │ │ │ │ ├── custom.py │ │ │ │ └── requirements.txt │ │ └── imputing │ │ │ └── .gitignore │ └── r │ │ └── README.md │ ├── images │ └── r │ │ └── README.md │ ├── numeric │ ├── python │ │ ├── encoding │ │ │ └── .gitignore │ │ ├── imputing │ │ │ ├── knn_imputer_fixed_n │ │ │ │ ├── README.md │ │ │ │ └── custom.py │ │ │ └── median_impute │ │ │ │ └── custom.py │ │ ├── monotonic transforms │ │ │ └── power_transformer │ │ │ │ └── custom.py │ │ ├── scaling │ │ │ └── minmaxscaler │ │ │ │ └── custom.py │ │ └── signal │ │ │ ├── butter_10_15_hp_1000 │ │ │ └── custom.py │ │ │ ├── butter_4_100_lowpass │ │ │ └── custom.py │ │ │ ├── cheby1_sos_10_1_15 │ │ │ └── custom.py │ │ │ ├── cheby1_sos_5_1_15 │ │ │ └── custom.py │ │ │ ├── cheby2_12_20_17 │ │ │ └── custom.py │ │ │ └── fft │ │ │ └── custom.py │ └── r │ │ └── README.md │ ├── other │ └── r │ │ └── README.md │ └── text │ └── r │ └── README.md ├── drum_overview ├── Main_Script.ipynb ├── custom_model_reg │ ├── custom.py │ └── reg_rf_model.pkl ├── data │ ├── concrete_test.csv │ └── concrete_train.csv ├── readme.MD └── requirements.txt └── tracking_agents └── python ├── Main_Script.ipynb ├── readme.MD └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .vscode 3 | .metals 4 | **/.DS_Store 5 | **/.vscode 6 | 7 | **/ipynb_checkpoints 8 | **/__pycache__ -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: 
-------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at community@datarobot.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 
67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Guidelines for developing and contributing to this project. 4 | 5 | ## List of project maintainers 6 | 7 | - [Matthew Cohen](https://github.com/mcohenmcohen) 8 | - [Thodoris Petropoulos](https://github.com/TheoPetropoulos) 9 | 10 | ## Opening new issues 11 | 12 | - Before opening a new issue check if there are any existing FAQ entries (if one exists), issues or pull requests that match your case 13 | - Open an issue, and make sure to label the issue accordingly - bug, improvement, feature request, etc... 14 | - Be as specific and detailed as possible 15 | 16 | ## Making a pull request 17 | 18 | Due to security concerns (API Keys leaking by mistake), please communicate directly with [Thodoris Petropoulos](https://github.com/TheoPetropoulos) and/or [Matthew Cohen](https://github.com/mcohenmcohen) to request any changes in this repository. 19 | 20 | ## Responding to issues and pull requests 21 | 22 | This project's maintainers will make every effort to respond to any open issues as soon as possible. 23 | 24 | If you don't get a response within 7 days of creating your issue or pull request, please send us an email at community@datarobot.com. 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | **Please note:** The code in these repos is sourced from the DataRobot user community and is not owned or maintained by DataRobot, Inc. You may need to make edits or updates for this code to function properly in your environment. 2 | 3 | # Custom Models and External Deployments Monitoring 4 | 5 | ## Important Note 6 | 7 | This repo contains a library of commonly used tasks submitted by the DataRobot community. They tend to have more complex logic and are 8 | meant to be used as-is rather than as a reference. If you are not familiar with DataRobot's Custom Inference Models, Custom Tasks, or Composable ML 9 | please see this repo instead for tutorials / reference examples: 10 | https://github.com/datarobot/datarobot-user-models 11 | 12 | There is also extensive documentation on the platform docs at: https://docs.datarobot.com/ 13 | 14 | ## Usage 15 | 16 | For each respective guide, follow the instructions in its own `.ipynb` or `.py` file. There will also be a `requirements.txt` file in each folder with instructions on how to create an environment to run everything successfully. 17 | 18 | Here is some explanation of the different definitions used throughout: 19 | - **MLOps Tracking Agents**: MLOps Tracking Agents are used when you want to deploy external models and monitor them in DataRobot. For example, you have a custom (or DataRobot) model and you deploy it in your own Kubernetes cluster (or anywhere really). 
In those cases, MLOps tracking agents will send statistics back to DataRobot so that you can still monitor your model's accuracy, service health, data drift, etc. 20 | - **MLOps DRUM overview**: MLOps DRUM is an open-source framework created and managed by DataRobot that allows you to easily deploy custom models. It provides out-of-the-box consistency and validity checks, as well as single-command deployment. DRUM is also seamlessly integrated with the DataRobot platform. If you use the framework, then you can use your custom models directly within the DataRobot platform. Here is the official GitHub repository for [DRUM](https://github.com/datarobot/datarobot-user-models). 21 | - **Custom Inference Models**: End-to-end examples of custom modeling code and how it is structured in order to be deployable using the DataRobot platform. The custom code here essentially takes advantage of the DRUM framework mentioned above. 22 | - **Custom Tasks**: With Composable AI, DataRobot allows you to manipulate DataRobot-created blueprints and add your own custom preprocessing steps. Within custom tasks, there are examples of what your code needs to look like to achieve this. 23 | 24 | Some of the notebooks can also be executed through Google Colab. 25 | 26 | ## Important Links 27 | 28 | - To learn to use DataRobot, visit [DataRobot University](https://university.datarobot.com/) 29 | - For general articles on DataRobot and news, visit [DataRobot Community](https://community.datarobot.com/) 30 | - End-to-end DataRobot API examples: [Tutorials for Data Scientists](https://github.com/datarobot-community/tutorials-for-data-scientists) 31 | - DataRobot API examples: [Examples for Data Scientists](https://github.com/datarobot-community/examples-for-data-scientists) 32 | 33 | ## Contents 34 | 35 | ### MLOps Tracking Agents Overview 36 | - *MLOps Tracking Agent Notebook*: An example of how you can use DataRobot's MLOps Agents functionality to monitor external deployments. [Python](https://github.com/datarobot-community/custom-models/tree/master/tracking_agents/python) 37 | 38 | ### MLOps DRUM Overview 39 | - *MLOps DRUM Notebook*: An example of how you can use the DataRobot Model Runner (DRUM) library to test your custom models before deploying them using DataRobot. [Python](https://github.com/datarobot-community/custom-models/blob/master/drum_overview/Main_Script.ipynb) 40 | 41 | ### Custom Inference Model Examples 42 | - *Custom Inference Models*: Examples in multiple languages on how to create custom inference models. Some of the scripts have been updated to also include the code needed to run them as custom training models: [Multiple Languages](https://github.com/datarobot-community/custom-models/tree/master/custom_inference) 43 | 44 | ### Custom Tasks 45 | - *Custom Tasks*: Examples of custom tasks that you can use directly within the DataRobot platform to manipulate blueprints. Check out what they look like and create your own tasks! [Multiple Languages](https://github.com/datarobot-community/custom-models/tree/master/custom_tasks) 46 | 47 | 48 | ## Setup/Installation 49 | 50 | Each project folder contains its own instructions on setup and requirements. Furthermore, instructions are also conveniently added to the scripts themselves so that users do not need to share the readme file. 51 | 52 | ## Development and Contributing 53 | 54 | If you'd like to report an issue or bug, suggest improvements, or contribute code to this project, please refer to [CONTRIBUTING.md](CONTRIBUTING.md). 
55 | 56 | 57 | # Code of Conduct 58 | 59 | This project has adopted the Contributor Covenant for its Code of Conduct. 60 | See [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md) to read it in full. 61 | 62 | # License 63 | 64 | Licensed under the Apache License 2.0. 65 | See [LICENSE](LICENSE) to read it in full. 66 | 67 | 68 | -------------------------------------------------------------------------------- /custom_inference/python/cats_and_dogs/cats-and-dogs-README.md: -------------------------------------------------------------------------------- 1 | # Cats and Dogs Example 2 | 3 | This folder includes an example of how to use DRUM with a Keras DNN leveraging GPUs for inference. GPU is a costly resource for inference, but the point here is to show how this can be accomplished via DRUM if the need arises. The model was trained to classify an image as a cat or a dog. This model originated in the model templates available within the [DRUM repository](https://github.com/datarobot/datarobot-user-models/tree/master/model_templates/inference/python3_keras_vizai_joblib). 4 | 5 | Use Google Colab to follow along with `Main_Script.ipynb`. 6 | 7 | In this notebook you will: 8 | 9 | * use DRUM to score data in batch 10 | * use DRUM to serve the model as a REST endpoint leveraging GPUs for inference 11 | 12 | Serving the model can be done with either Flask, or Nginx and uWSGI. Using Nginx, you will have to modify some files, but all of the content is highlighted in the notebook. -------------------------------------------------------------------------------- /custom_inference/python/gan_mnist/README.md: -------------------------------------------------------------------------------- 1 | ## GAN MNIST as Unstructured Model 2 | 3 | #### Owner Tim Whittaker => timothy.whittaker@datarobot.com 4 | 5 | This demonstrates serving a GAN as an unstructured model with [DRUM](https://github.com/datarobot/datarobot-user-models/). 6 | 7 | Please see the notebook in this directory for more detail. 
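For quick reference, here is a minimal client sketch for the deployed GAN. The address, port, and output file names are assumptions (use whatever you pass to `drum server`); as shown in `custom.py` below, the `score_unstructured()` hook expects a JSON payload with a `num_digits` field and returns a pickled list of PIL images, so Pillow is needed on the client side to unpickle them.

```python
# Hypothetical client for the GAN served as a DRUM unstructured model, e.g. after:
#   drum server --code-dir . --target-type unstructured --address localhost:6789
import json
import pickle

import requests

resp = requests.post(
    "http://localhost:6789/predictUnstructured/",
    data=json.dumps({"num_digits": 4}),            # score_unstructured() reads "num_digits"
    headers={"Content-Type": "application/json"},
)
resp.raise_for_status()

images = pickle.loads(resp.content)                # pickled list of PIL images
for i, img in enumerate(images):
    img.save("generated_img_{}.png".format(i))
```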
-------------------------------------------------------------------------------- /custom_inference/python/gan_mnist/custom.py: -------------------------------------------------------------------------------- 1 | 2 | #import pandas as pd 3 | import tensorflow as tf 4 | from tensorflow import keras 5 | from tensorflow.keras import layers 6 | import numpy as np 7 | import os 8 | import pickle 9 | import json 10 | 11 | def load_model(input_dir): 12 | generator = keras.Sequential( 13 | [ 14 | keras.Input(shape=(128,)), 15 | # We want to generate 128 coefficients to reshape into a 7x7x128 map 16 | layers.Dense(7 * 7 * 128), 17 | layers.LeakyReLU(alpha=0.2), 18 | layers.Reshape((7, 7, 128)), 19 | layers.Conv2DTranspose(128, (4, 4), strides=(2, 2), padding="same"), 20 | layers.LeakyReLU(alpha=0.2), 21 | layers.Conv2DTranspose(128, (4, 4), strides=(2, 2), padding="same"), 22 | layers.LeakyReLU(alpha=0.2), 23 | layers.Conv2D(1, (7, 7), padding="same", activation="sigmoid"), 24 | ], 25 | name="generator", 26 | ) 27 | generator.load_weights(os.path.join(input_dir, "gan_weights.h5")) 28 | return generator 29 | 30 | def score_unstructured(model, data, query, **kwargs): 31 | print("Incoming content type params: ", kwargs) 32 | print("Incoming data type: ", type(data)) 33 | print("Incoming query params: ", query) 34 | 35 | ## data is expected to be the number of images to generate 36 | random_latent_vectors = 128 37 | 38 | 39 | num_digits = json.loads(data)["num_digits"] 40 | ## need to parse data to int 41 | 42 | random_latent_vectors = tf.random.normal(shape=(num_digits, 128)) 43 | rand_imgs = model(random_latent_vectors) 44 | rand_imgs *= 255 45 | rand_imgs.numpy() 46 | images = [] 47 | # images["num_images": d] 48 | for i in range(num_digits): 49 | img = keras.preprocessing.image.array_to_img(rand_imgs[i]) 50 | images.append(img) 51 | # img.save("generated_img_{i}_{epoch}.png".format(i=i, epoch=epoch)) 52 | return pickle.dumps(images) 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /custom_inference/python/gan_mnist/gan_weights.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_inference/python/gan_mnist/gan_weights.h5 -------------------------------------------------------------------------------- /custom_inference/python/image_object_detection/README.md: -------------------------------------------------------------------------------- 1 | ## Python Keras Image Object Detection Custom Inference Model Template 2 | 3 | This model is intended to work with the [Python 3 Keras Drop-In Environment](../../public_dropin_environments/python3_keras/). 
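Once the model is running under `drum server` (see the Instructions below), it can be scored with a plain HTTP call. The sketch below is a hypothetical client — the address, port, and input file path are assumptions. The `score_unstructured()` hook takes a base64-encoded image string and returns a JSON list of `[x1, y1, x2, y2, label_id]` rows; `coco_90.json` ships alongside the model as a lookup from ids to class names.

```python
# Hypothetical client for the object detection model, e.g. after:
#   drum server --code-dir ./model --target-type unstructured --address localhost:6789
import json

import requests

# Base64-encoded image, e.g. the image_b64.txt file referenced in the Instructions below
with open("image_b64.txt") as f:
    b64_image = f.read()

resp = requests.post(
    "http://localhost:6789/predictUnstructured/",
    data=b64_image,
    headers={"Content-Type": "text/plain"},
)
resp.raise_for_status()

# Each row is [x1, y1, x2, y2, label_id]
for box in json.loads(resp.text):
    print(box)
```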
4 | 5 | ## Instructions 6 | Create a new custom model with `Unstructured` Target Type, add the files in the model folder and use the [Python 3 Keras Drop-In Environment] with it 7 | 8 | Test with custom-models-wip/drum_overview/data/image_b64.txt 9 | 10 | ### To run locally using 'drum' 11 | Paths are relative to `./custom-models`: 12 | `drum score --code-dir ./custom_inference/python/image_object_detection/model --target-type unstructured --input ./drum_overview/data/image_b64.txt --verbose` 13 | -------------------------------------------------------------------------------- /custom_inference/python/image_object_detection/coco_90.json: -------------------------------------------------------------------------------- 1 | { 2 | "1": { 3 | "id": 1, 4 | "name": "person" 5 | }, 6 | "2": { 7 | "id": 2, 8 | "name": "bicycle" 9 | }, 10 | "3": { 11 | "id": 3, 12 | "name": "car" 13 | }, 14 | "4": { 15 | "id": 4, 16 | "name": "motorcycle" 17 | }, 18 | "5": { 19 | "id": 5, 20 | "name": "airplane" 21 | }, 22 | "6": { 23 | "id": 6, 24 | "name": "bus" 25 | }, 26 | "7": { 27 | "id": 7, 28 | "name": "train" 29 | }, 30 | "8": { 31 | "id": 8, 32 | "name": "truck" 33 | }, 34 | "9": { 35 | "id": 9, 36 | "name": "boat" 37 | }, 38 | "10": { 39 | "id": 10, 40 | "name": "traffic light" 41 | }, 42 | "11": { 43 | "id": 11, 44 | "name": "fire hydrant" 45 | }, 46 | "13": { 47 | "id": 13, 48 | "name": "stop sign" 49 | }, 50 | "14": { 51 | "id": 14, 52 | "name": "parking meter" 53 | }, 54 | "15": { 55 | "id": 15, 56 | "name": "bench" 57 | }, 58 | "16": { 59 | "id": 16, 60 | "name": "bird" 61 | }, 62 | "17": { 63 | "id": 17, 64 | "name": "cat" 65 | }, 66 | "18": { 67 | "id": 18, 68 | "name": "dog" 69 | }, 70 | "19": { 71 | "id": 19, 72 | "name": "horse" 73 | }, 74 | "20": { 75 | "id": 20, 76 | "name": "sheep" 77 | }, 78 | "21": { 79 | "id": 21, 80 | "name": "cow" 81 | }, 82 | "22": { 83 | "id": 22, 84 | "name": "elephant" 85 | }, 86 | "23": { 87 | "id": 23, 88 | "name": "bear" 89 | }, 90 | "24": { 91 | "id": 24, 92 | "name": "zebra" 93 | }, 94 | "25": { 95 | "id": 25, 96 | "name": "giraffe" 97 | }, 98 | "27": { 99 | "id": 27, 100 | "name": "backpack" 101 | }, 102 | "28": { 103 | "id": 28, 104 | "name": "umbrella" 105 | }, 106 | "31": { 107 | "id": 31, 108 | "name": "handbag" 109 | }, 110 | "32": { 111 | "id": 32, 112 | "name": "tie" 113 | }, 114 | "33": { 115 | "id": 33, 116 | "name": "suitcase" 117 | }, 118 | "34": { 119 | "id": 34, 120 | "name": "frisbee" 121 | }, 122 | "35": { 123 | "id": 35, 124 | "name": "skis" 125 | }, 126 | "36": { 127 | "id": 36, 128 | "name": "snowboard" 129 | }, 130 | "37": { 131 | "id": 37, 132 | "name": "sports ball" 133 | }, 134 | "38": { 135 | "id": 38, 136 | "name": "kite" 137 | }, 138 | "39": { 139 | "id": 39, 140 | "name": "baseball bat" 141 | }, 142 | "40": { 143 | "id": 40, 144 | "name": "baseball glove" 145 | }, 146 | "41": { 147 | "id": 41, 148 | "name": "skateboard" 149 | }, 150 | "42": { 151 | "id": 42, 152 | "name": "surfboard" 153 | }, 154 | "43": { 155 | "id": 43, 156 | "name": "tennis racket" 157 | }, 158 | "44": { 159 | "id": 44, 160 | "name": "bottle" 161 | }, 162 | "46": { 163 | "id": 46, 164 | "name": "wine glass" 165 | }, 166 | "47": { 167 | "id": 47, 168 | "name": "cup" 169 | }, 170 | "48": { 171 | "id": 48, 172 | "name": "fork" 173 | }, 174 | "49": { 175 | "id": 49, 176 | "name": "knife" 177 | }, 178 | "50": { 179 | "id": 50, 180 | "name": "spoon" 181 | }, 182 | "51": { 183 | "id": 51, 184 | "name": "bowl" 185 | }, 186 | "52": { 187 | "id": 52, 188 | "name": "banana" 
189 | }, 190 | "53": { 191 | "id": 53, 192 | "name": "apple" 193 | }, 194 | "54": { 195 | "id": 54, 196 | "name": "sandwich" 197 | }, 198 | "55": { 199 | "id": 55, 200 | "name": "orange" 201 | }, 202 | "56": { 203 | "id": 56, 204 | "name": "broccoli" 205 | }, 206 | "57": { 207 | "id": 57, 208 | "name": "carrot" 209 | }, 210 | "58": { 211 | "id": 58, 212 | "name": "hot dog" 213 | }, 214 | "59": { 215 | "id": 59, 216 | "name": "pizza" 217 | }, 218 | "60": { 219 | "id": 60, 220 | "name": "donut" 221 | }, 222 | "61": { 223 | "id": 61, 224 | "name": "cake" 225 | }, 226 | "62": { 227 | "id": 62, 228 | "name": "chair" 229 | }, 230 | "63": { 231 | "id": 63, 232 | "name": "couch" 233 | }, 234 | "64": { 235 | "id": 64, 236 | "name": "potted plant" 237 | }, 238 | "65": { 239 | "id": 65, 240 | "name": "bed" 241 | }, 242 | "67": { 243 | "id": 67, 244 | "name": "dining table" 245 | }, 246 | "70": { 247 | "id": 70, 248 | "name": "toilet" 249 | }, 250 | "72": { 251 | "id": 72, 252 | "name": "tv" 253 | }, 254 | "73": { 255 | "id": 73, 256 | "name": "laptop" 257 | }, 258 | "74": { 259 | "id": 74, 260 | "name": "mouse" 261 | }, 262 | "75": { 263 | "id": 75, 264 | "name": "remote" 265 | }, 266 | "76": { 267 | "id": 76, 268 | "name": "keyboard" 269 | }, 270 | "77": { 271 | "id": 77, 272 | "name": "cell phone" 273 | }, 274 | "78": { 275 | "id": 78, 276 | "name": "microwave" 277 | }, 278 | "79": { 279 | "id": 79, 280 | "name": "oven" 281 | }, 282 | "80": { 283 | "id": 80, 284 | "name": "toaster" 285 | }, 286 | "81": { 287 | "id": 81, 288 | "name": "sink" 289 | }, 290 | "82": { 291 | "id": 82, 292 | "name": "refrigerator" 293 | }, 294 | "84": { 295 | "id": 84, 296 | "name": "book" 297 | }, 298 | "85": { 299 | "id": 85, 300 | "name": "clock" 301 | }, 302 | "86": { 303 | "id": 86, 304 | "name": "vase" 305 | }, 306 | "87": { 307 | "id": 87, 308 | "name": "scissors" 309 | }, 310 | "88": { 311 | "id": 88, 312 | "name": "teddy bear" 313 | }, 314 | "89": { 315 | "id": 89, 316 | "name": "hair drier" 317 | }, 318 | "90": { 319 | "id": 90, 320 | "name": "toothbrush" 321 | } 322 | } -------------------------------------------------------------------------------- /custom_inference/python/image_object_detection/model/custom.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from sklearn.pipeline import Pipeline 4 | import json 5 | 6 | from model_load_utils import ( 7 | load_image_object_detection_inference_pipeline, 8 | predict_with_preprocessing, 9 | ) 10 | 11 | 12 | def load_model(input_dir: str) -> Pipeline: 13 | """ 14 | Note: This hook may not have to be implemented for your model. 15 | In this case implemented for the model used in the example. 16 | 17 | This keras estimator requires 'load_model()' to be overridden. Coz as it involves pipeline of 18 | preprocessor and estimator bundled together, it requires a special handling (oppose to usually 19 | simple keras.models.load_model() or unpickling) to load the model. Currently there is no elegant 20 | default method to save the keras classifier/regressor along with the sklearn pipeline. Hence we 21 | use deserialize_estimator_pipeline() to load the model pipeline to predict. 
22 | 23 | Parameters 24 | ---------- 25 | input_dir: str 26 | 27 | Returns 28 | ------- 29 | pipelined_model: Pipeline 30 | Estimator pipeline obj 31 | """ 32 | model = load_image_object_detection_inference_pipeline(input_dir) 33 | return model 34 | 35 | 36 | def transform(b64_image_array: str, model: Any) -> list: 37 | """ 38 | Intended to apply transformations to the prediction data before making predictions. This is 39 | most useful if DRUM supports the model's library, but your model requires additional data 40 | processing before it can make predictions 41 | 42 | Parameters 43 | ---------- 44 | data : is the dataframe given to DRUM to make predictions on 45 | model : is the deserialized model loaded by DRUM or by `load_model`, if supplied 46 | 47 | Returns 48 | ------- 49 | Transformed data: np.ndarray 50 | """ 51 | predicted_labels = predict_with_preprocessing(model, b64_image_array) 52 | return predicted_labels 53 | 54 | 55 | def score_unstructured(model, data, query, **kwargs): 56 | print("Model: ", model) 57 | print("Incoming content type params: ", kwargs) 58 | print("Incoming data type: ", type(data)) 59 | print("Incoming data: ", data) 60 | 61 | print("Incoming query params: ", query) 62 | if isinstance(data, bytes): 63 | data = data.decode("utf8") 64 | ret = transform(data, model).astype(int).tolist() 65 | 66 | ret_mode = query.get("ret_mode", "") 67 | if ret_mode == "binary": 68 | ret_data = ret.tobytes() 69 | ret_kwargs = {"mimetype": "application/octet-stream"} 70 | ret = ret_data, ret_kwargs 71 | else: 72 | ret = json.dumps(ret) 73 | return ret 74 | 75 | 76 | # def score(data: pd.DataFrame, model: Any, **kwargs: Dict[str, Any]) -> pd.DataFrame: 77 | # """ 78 | # This hook is only needed if you would like to use DRUM with a framework not natively 79 | # supported by the tool. 80 | # 81 | # Parameters 82 | # ---------- 83 | # data : is the dataframe to make predictions against. If `transform` is supplied, 84 | # `data` will be the transformed data. 85 | # model : is the deserialized model loaded by DRUM or by `load_model`, if supplied 86 | # kwargs : additional keyword arguments to the method 87 | # In case of classification model class labels will be provided as the following arguments: 88 | # - `positive_class_label` is the positive class label for a binary classification model 89 | # - `negative_class_label` is the negative class label for a binary classification model 90 | # 91 | # Returns 92 | # ------- 93 | # This method should return predictions as a dataframe with the following format: 94 | # Binary Classification: must have columns for each class label with floating- point class 95 | # probabilities as values. Each row should sum to 1.0 96 | # Regression: must have a single column called `Predictions` with numerical values 97 | # 98 | # """ 99 | 100 | # def post_process(predictions: pd.DataFrame, model: Any) -> pd.DataFrame: 101 | # """ 102 | # This method is only needed if your model's output does not match the above expectations 103 | # 104 | # Parameters 105 | # ---------- 106 | # predictions : is the dataframe of predictions produced by DRUM or by 107 | # the `score` hook, if supplied 108 | # model : is the deserialized model loaded by DRUM or by `load_model`, if supplied 109 | # 110 | # Returns 111 | # ------- 112 | # This method should return predictions as a dataframe with the following format: 113 | # Binary Classification: must have columns for each class label with floating- point class 114 | # probabilities as values. 
Each row 115 | # should sum to 1.0 116 | # Regression: must have a single column called `Predictions` with numerical values 117 | # 118 | # """ 119 | -------------------------------------------------------------------------------- /custom_inference/python/image_object_detection/model/efficientdet-d1.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_inference/python/image_object_detection/model/efficientdet-d1.h5 -------------------------------------------------------------------------------- /custom_inference/python/image_object_detection/model/initializers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2017-2018 Fizyr (https://fizyr.com) 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | 17 | from tensorflow import keras 18 | 19 | import numpy as np 20 | import math 21 | 22 | 23 | class PriorProbability(keras.initializers.Initializer): 24 | """ Apply a prior probability to the weights. 25 | """ 26 | 27 | def __init__(self, probability=0.01): 28 | self.probability = probability 29 | 30 | def get_config(self): 31 | return {"probability": self.probability} 32 | 33 | def __call__(self, shape, dtype=None): 34 | # set bias to -log((1 - p)/p) for foreground 35 | result = np.ones(shape, dtype=np.float32) * -math.log( 36 | (1 - self.probability) / self.probability 37 | ) 38 | 39 | return result 40 | -------------------------------------------------------------------------------- /custom_inference/python/image_object_detection/model/model_load_utils.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import io 3 | import os 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from PIL import Image 8 | from sklearn.pipeline import Pipeline 9 | 10 | from object_detection_model import efficientdet 11 | from object_detection_utils import preprocess_image, postprocess_boxes 12 | 13 | IMG_SIZE = 150 14 | IMG_SHAPE = (IMG_SIZE, IMG_SIZE, 3) 15 | PHI = 1 16 | WEIGHTED_BIFPN = True 17 | MODEL_PATH = "efficientdet-d1.h5" 18 | IMAGE_SIZES = (512, 640, 768, 896, 1024, 1280, 1408) 19 | IMAGE_SIZE = IMAGE_SIZES[PHI] 20 | NUM_CLASSES = 90 # from coco 21 | SCORE_THRESHOLD = 0.3 22 | 23 | 24 | def img_preprocessing(image: Image) -> np.ndarray: 25 | """ given a PIL.Image object resize, convert to RGB and return as np.array """ 26 | image, scale = preprocess_image(image, image_size=IMAGE_SIZE) 27 | return image, scale 28 | 29 | 30 | def get_img_obj_from_base64_str(b64_img_str: str) -> Image: 31 | """ given a base64 encoded image str get the PIL.Image object """ 32 | b64_img = base64.b64decode(b64_img_str) 33 | image_bytes = io.BytesIO(b64_img) 34 | return Image.open(image_bytes) 35 | 36 | 37 | def get_base64_str_from_PIL_img(pillowed_img: Image) -> str: 38 | """ given a PIL.Image object return base64 encoded str of the image 
object """ 39 | buffer = io.BytesIO() 40 | pillowed_img.save(buffer, format="JPEG") 41 | return base64.b64encode(buffer.getvalue()) 42 | 43 | 44 | def load_and_preprocess_image_data(x_data: np.ndarray) -> pd.DataFrame: 45 | """ Apply the preprocessing methods on the data before prediction for the model to work on """ 46 | try: 47 | image = get_img_obj_from_base64_str(x_data) 48 | except: 49 | image = get_imputation_img() 50 | return img_preprocessing(image) 51 | 52 | 53 | def apply_image_data_preprocessing(x_data: np.ndarray) -> np.ndarray: 54 | """ Image data preprocessing before fit """ 55 | x_data, scale = load_and_preprocess_image_data(x_data) 56 | return x_data, scale 57 | 58 | 59 | def get_imputation_img() -> str: 60 | """ Black image in base64 str for data imputation filling """ 61 | black_PIL_img = Image.fromarray(np.zeros(IMG_SHAPE, dtype="float32"), "RGB") 62 | return black_PIL_img 63 | 64 | 65 | def predict_with_preprocessing(model, b64_image_string: str) -> np.ndarray: 66 | """ Apply necessary preprocessing to conver b64 image string to image values, preprocessing to 67 | the image values and finally predict bounding boxes and labels with the preprocessed image 68 | values 69 | """ 70 | image, scale = apply_image_data_preprocessing(b64_image_string) 71 | w, h = image.shape[:2] 72 | boxes, scores, labels = model.predict(np.expand_dims(image, axis=0)) 73 | boxes, scores, labels = np.squeeze(boxes), np.squeeze(scores), np.squeeze(labels) 74 | boxes = postprocess_boxes(boxes=boxes, scale=scale, height=h, width=w) 75 | # select indices which have a score above the threshold 76 | indices = np.where(scores[:] > SCORE_THRESHOLD)[0] 77 | # select those detections 78 | boxes = boxes[indices] 79 | labels = labels[indices] 80 | return np.hstack([boxes, np.expand_dims(labels, 1)]) 81 | 82 | 83 | def load_image_object_detection_inference_pipeline(input_dir: str) -> Pipeline: 84 | """ Load keras based image object detection model used to predict bounding boxes and labels """ 85 | _, model = efficientdet( 86 | phi=PHI, 87 | weighted_bifpn=WEIGHTED_BIFPN, 88 | num_classes=NUM_CLASSES, 89 | score_threshold=SCORE_THRESHOLD, 90 | ) 91 | model.load_weights(os.path.join(input_dir, MODEL_PATH), by_name=True) 92 | return model 93 | -------------------------------------------------------------------------------- /custom_inference/python/image_object_detection/model/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow==1.15.0 2 | -------------------------------------------------------------------------------- /custom_inference/python/image_object_detection/model/tfkeras.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 4 | Copyright 2017 xuannianz github user 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | THE FOLLOWING IS THE COPYRIGHT OF THE ORIGINAL DOCUMENT: 13 | https://github.com/xuannianz/EfficientDet/blob/master/tfkeras.py 14 | """ 15 | from object_detection_utils import inject_tfkeras_modules, init_tfkeras_custom_objects 16 | import efficientnet as model 17 | 18 | EfficientNetB0 = inject_tfkeras_modules(model.EfficientNetB0) 19 | EfficientNetB1 = inject_tfkeras_modules(model.EfficientNetB1) 20 | EfficientNetB2 = inject_tfkeras_modules(model.EfficientNetB2) 21 | EfficientNetB3 = inject_tfkeras_modules(model.EfficientNetB3) 22 | EfficientNetB4 = inject_tfkeras_modules(model.EfficientNetB4) 23 | EfficientNetB5 = inject_tfkeras_modules(model.EfficientNetB5) 24 | EfficientNetB6 = inject_tfkeras_modules(model.EfficientNetB6) 25 | EfficientNetB7 = inject_tfkeras_modules(model.EfficientNetB7) 26 | 27 | preprocess_input = inject_tfkeras_modules(model.preprocess_input) 28 | 29 | init_tfkeras_custom_objects() 30 | -------------------------------------------------------------------------------- /custom_inference/python/imdb_graph_isomorphism/README.md: -------------------------------------------------------------------------------- 1 | ## Graph Isomorphism Network 2 | 3 | #### Owner Tim Whittaker => timothy.whittaker@datarobot.com 4 | 5 | This demonstrates deploying a GNN as an unstructured model with [DRUM](https://github.com/datarobot/datarobot-user-models/). 6 | 7 | Please see the notebook in this directory for more detail. 8 | 9 | -------------------------------------------------------------------------------- /custom_inference/python/imdb_graph_isomorphism/custom.py: -------------------------------------------------------------------------------- 1 | #import pandas as pd 2 | import torch 3 | import os 4 | import io 5 | from io import BytesIO 6 | import avro.io 7 | import avro 8 | from avro.datafile import DataFileReader, DataFileWriter 9 | from gin import * 10 | import dgl 11 | import pandas as pd 12 | 13 | def load_model(input_dir): 14 | """ 15 | This hook can be implemented to adjust logic in the scoring mode. 16 | 17 | The load_model hook provides a way to implement model loading yourself. 18 | This function should return an object that represents your model. This object will 19 | be passed to the predict hook for performing predictions. 
20 | This hook can be used to load supported models if your model has multiple artifacts, or 21 | for loading models that drum does not natively support 22 | 23 | :param input_dir: the directory to load serialized models from 24 | :returns: Object containing the model - the predict hook will get this object as a parameter 25 | """ 26 | model = GIN(2, 2, 1, 20, 2, 0, 0.01, "sum", "sum") 27 | model.load_state_dict(torch.load(os.path.join(input_dir, "gin_model.h5"))) 28 | return model 29 | 30 | def score_unstructured(model, data, query, **kwargs): 31 | print("Incoming content type params: ", kwargs) 32 | print("Incoming data type: ", type(data)) 33 | print("Incoming query params: ", query) 34 | 35 | bytes_reader = io.BytesIO(data) 36 | parsed_data = DataFileReader(bytes_reader, avro.io.DatumReader()) 37 | 38 | gs = [] 39 | for graph in parsed_data: 40 | e = graph["edges"] 41 | u,v = list(zip(*e)) 42 | g = dgl.graph((u,v)) 43 | g.ndata["attr"] = torch.ones(g.num_nodes(), 1) 44 | gs.append(g) 45 | batched_graph = dgl.batch(gs) 46 | feats = batched_graph.ndata['attr'].float() 47 | preds = F.softmax(model(batched_graph, feats), dim=1).detach().numpy() 48 | return pd.DataFrame(preds, columns = ["neg_class", "pos_class"]).to_json(orient="records") 49 | -------------------------------------------------------------------------------- /custom_inference/python/imdb_graph_isomorphism/gin.py: -------------------------------------------------------------------------------- 1 | """ 2 | How Powerful are Graph Neural Networks 3 | https://arxiv.org/abs/1810.00826 4 | https://openreview.net/forum?id=ryGs6iA5Km 5 | Author's implementation: https://github.com/weihua916/powerful-gnns 6 | """ 7 | 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | from dgl.nn.pytorch.conv import GINConv 13 | from dgl.nn.pytorch.glob import SumPooling, AvgPooling, MaxPooling 14 | 15 | 16 | class ApplyNodeFunc(nn.Module): 17 | """Update the node feature hv with MLP, BN and ReLU.""" 18 | def __init__(self, mlp): 19 | super(ApplyNodeFunc, self).__init__() 20 | self.mlp = mlp 21 | self.bn = nn.BatchNorm1d(self.mlp.output_dim) 22 | 23 | def forward(self, h): 24 | h = self.mlp(h) 25 | h = self.bn(h) 26 | h = F.relu(h) 27 | return h 28 | 29 | 30 | class MLP(nn.Module): 31 | """MLP with linear output""" 32 | def __init__(self, num_layers, input_dim, hidden_dim, output_dim): 33 | """MLP layers construction 34 | 35 | Paramters 36 | --------- 37 | num_layers: int 38 | The number of linear layers 39 | input_dim: int 40 | The dimensionality of input features 41 | hidden_dim: int 42 | The dimensionality of hidden units at ALL layers 43 | output_dim: int 44 | The number of classes for prediction 45 | 46 | """ 47 | super(MLP, self).__init__() 48 | self.linear_or_not = False # default is linear model 49 | self.num_layers = num_layers 50 | self.output_dim = output_dim 51 | 52 | if num_layers < 1: 53 | raise ValueError("number of layers should be positive!") 54 | elif num_layers == 1: 55 | # Linear model 56 | self.linear = nn.Linear(input_dim, output_dim) 57 | else: 58 | # Multi-layer model 59 | self.linear_or_not = False 60 | self.linears = torch.nn.ModuleList() 61 | self.batch_norms = torch.nn.ModuleList() 62 | 63 | self.linears.append(nn.Linear(input_dim, hidden_dim)) 64 | for layer in range(num_layers - 2): 65 | self.linears.append(nn.Linear(hidden_dim, hidden_dim)) 66 | self.linears.append(nn.Linear(hidden_dim, output_dim)) 67 | 68 | for layer in range(num_layers - 1): 69 | 
self.batch_norms.append(nn.BatchNorm1d((hidden_dim))) 70 | 71 | def forward(self, x): 72 | if self.linear_or_not: 73 | # If linear model 74 | return self.linear(x) 75 | else: 76 | # If MLP 77 | h = x 78 | for i in range(self.num_layers - 1): 79 | h = F.relu(self.batch_norms[i](self.linears[i](h))) 80 | return self.linears[-1](h) 81 | 82 | 83 | class GIN(nn.Module): 84 | """GIN model""" 85 | def __init__(self, num_layers, num_mlp_layers, input_dim, hidden_dim, 86 | output_dim, final_dropout, learn_eps, graph_pooling_type, 87 | neighbor_pooling_type): 88 | """model parameters setting 89 | 90 | Paramters 91 | --------- 92 | num_layers: int 93 | The number of linear layers in the neural network 94 | num_mlp_layers: int 95 | The number of linear layers in mlps 96 | input_dim: int 97 | The dimensionality of input features 98 | hidden_dim: int 99 | The dimensionality of hidden units at ALL layers 100 | output_dim: int 101 | The number of classes for prediction 102 | final_dropout: float 103 | dropout ratio on the final linear layer 104 | learn_eps: boolean 105 | If True, learn epsilon to distinguish center nodes from neighbors 106 | If False, aggregate neighbors and center nodes altogether. 107 | neighbor_pooling_type: str 108 | how to aggregate neighbors (sum, mean, or max) 109 | graph_pooling_type: str 110 | how to aggregate entire nodes in a graph (sum, mean or max) 111 | 112 | """ 113 | super(GIN, self).__init__() 114 | self.num_layers = num_layers 115 | self.learn_eps = learn_eps 116 | 117 | # List of MLPs 118 | self.ginlayers = torch.nn.ModuleList() 119 | self.batch_norms = torch.nn.ModuleList() 120 | 121 | for layer in range(self.num_layers - 1): 122 | if layer == 0: 123 | mlp = MLP(num_mlp_layers, input_dim, hidden_dim, hidden_dim) 124 | else: 125 | mlp = MLP(num_mlp_layers, hidden_dim, hidden_dim, hidden_dim) 126 | 127 | self.ginlayers.append( 128 | GINConv(ApplyNodeFunc(mlp), neighbor_pooling_type, 0, self.learn_eps)) 129 | self.batch_norms.append(nn.BatchNorm1d(hidden_dim)) 130 | 131 | # Linear function for graph poolings of output of each layer 132 | # which maps the output of different layers into a prediction score 133 | self.linears_prediction = torch.nn.ModuleList() 134 | 135 | for layer in range(num_layers): 136 | if layer == 0: 137 | self.linears_prediction.append( 138 | nn.Linear(input_dim, output_dim)) 139 | else: 140 | self.linears_prediction.append( 141 | nn.Linear(hidden_dim, output_dim)) 142 | 143 | self.drop = nn.Dropout(final_dropout) 144 | 145 | if graph_pooling_type == 'sum': 146 | self.pool = SumPooling() 147 | elif graph_pooling_type == 'mean': 148 | self.pool = AvgPooling() 149 | elif graph_pooling_type == 'max': 150 | self.pool = MaxPooling() 151 | else: 152 | raise NotImplementedError 153 | 154 | def forward(self, g, h): 155 | # list of hidden representation at each layer (including input) 156 | hidden_rep = [h] 157 | 158 | for i in range(self.num_layers - 1): 159 | h = self.ginlayers[i](g, h) 160 | h = self.batch_norms[i](h) 161 | h = F.relu(h) 162 | hidden_rep.append(h) 163 | 164 | score_over_layer = 0 165 | 166 | # perform pooling over all nodes in each graph in every layer 167 | for i, h in enumerate(hidden_rep): 168 | pooled_h = self.pool(g, h) 169 | score_over_layer += self.drop(self.linears_prediction[i](pooled_h)) 170 | 171 | return score_over_layer -------------------------------------------------------------------------------- /custom_inference/python/imdb_graph_isomorphism/gin_model.h5: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_inference/python/imdb_graph_isomorphism/gin_model.h5 -------------------------------------------------------------------------------- /custom_inference/python/imdb_graph_isomorphism/requirements.txt: -------------------------------------------------------------------------------- 1 | networkx==2.5 2 | dgl==0.5.2 3 | avro==1.10.0 4 | datarobot-drum==1.4.2 5 | -------------------------------------------------------------------------------- /custom_inference/python/imdb_graph_isomorphism/schema_graph.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "namespace": "imdb.graph.avro", 3 | "type": "record", 4 | "name": "graph", 5 | "fields": [ 6 | { 7 | "name": "index", 8 | "type": "int" 9 | }, 10 | { 11 | "name": "edges", 12 | "type": [ 13 | { 14 | "type": "array", 15 | "items": { 16 | "type": "array", 17 | "items": "int", 18 | "default": [] 19 | }, 20 | "default": [] 21 | }, 22 | "null" 23 | ] 24 | }, 25 | { 26 | "name": "vertices", 27 | "type": [ 28 | { 29 | "type": "array", 30 | "items": "int", 31 | "default": [] 32 | }, "null" 33 | ] 34 | }, 35 | { 36 | "name": "label", 37 | "type": [ 38 | "int", 39 | "null" 40 | ] 41 | }, 42 | { 43 | "name": "edge_features", 44 | "type": [ 45 | { 46 | "type": "array", 47 | "items": { 48 | "type": "array", 49 | "items": "float", 50 | "default": [] 51 | }, 52 | "default": [] 53 | }, 54 | "null" 55 | ] 56 | }, 57 | { 58 | "name": "vertex_features", 59 | "type": [ 60 | { 61 | "type": "array", 62 | "items": { 63 | "type": "array", 64 | "items": "float", 65 | "default": [] 66 | }, 67 | "default": [] 68 | }, 69 | "null" 70 | ] 71 | } 72 | ] 73 | } -------------------------------------------------------------------------------- /custom_inference/python/imdb_graph_isomorphism/score_data.py: -------------------------------------------------------------------------------- 1 | import avro.io 2 | import avro 3 | from avro.datafile import DataFileReader, DataFileWriter 4 | from io import BytesIO, BufferedWriter 5 | import requests 6 | import pandas as pd 7 | 8 | def load_schema(schema_path): 9 | schema = avro.schema.parse(open(schema_path, "rb").read()) 10 | return schema 11 | 12 | 13 | def score(graphs, schema, url, port): 14 | """ 15 | graphs is expected to be a list of dictionaries, where each entry in the 16 | list represents a graph with 17 | * key idx -> index value 18 | * key nodes -> list of ints representing vertices of the graph 19 | * key edges -> list of list of ints representing edges of graph 20 | """ 21 | 22 | stream = BufferedWriter(BytesIO()) 23 | writer = DataFileWriter(stream, avro.io.DatumWriter(), schema) 24 | # writer = DataFileWriter(open("imdb-graph.avro", "wb"), DatumWriter(), schema) 25 | for graph in graphs: 26 | writer.append({"edges": graph["edges"], "vertices": graph["vertices"], "index": graph["idx"], "label": graph.get("label")}) 27 | writer.flush() 28 | raw_bytes = stream.raw.getvalue() 29 | writer.close() 30 | 31 | url = "{}:{}/predictUnstructured/?ret_mode=binary".format(url.strip("/"), port) 32 | 33 | payload = raw_bytes 34 | headers = { 35 | 'Content-Type': 'application/octet-stream' 36 | } 37 | 38 | response = requests.request("POST", url, headers=headers, data = payload) 39 | 40 | return response 41 | -------------------------------------------------------------------------------- 
/custom_inference/python/insurance_pricing/README.md: -------------------------------------------------------------------------------- 1 | # Insurance Pricing Example 2 | 3 | This folder includes an example of a custom model using DRUM to predict loss on insurance claims. For simplicity, you can use Google Colab to follow along with `Main_Script.ipynb`. 4 | 5 | The point of this example is to highlight DRUM's unstructured mode. In unstructured mode, DRUM relaxes validation of the input and output, allowing for flexibility in the kind of model that can be hosted and, moreover, in the way data is sent to and returned from DRUM. 6 | 7 | In unstructured mode, this model returns SHAP values and a loss prediction, leveraging Avro for serialization of the data. 8 | 9 | Avro has a JSON-like data model, but can be represented either as JSON or in a compact binary form. 10 | * It comes with a very sophisticated schema description language that describes data. 11 | * It has a direct mapping to and from JSON. 12 | * It has a very compact format. 13 | * The bulk of JSON, repeating every field name with every single record, is what makes JSON inefficient for high-volume usage. 14 | 15 | -------------------------------------------------------------------------------- /custom_inference/python/insurance_pricing/artifacts/lgbm.joblib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_inference/python/insurance_pricing/artifacts/lgbm.joblib -------------------------------------------------------------------------------- /custom_inference/python/insurance_pricing/artifacts/ordinalEncoder.joblib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_inference/python/insurance_pricing/artifacts/ordinalEncoder.joblib -------------------------------------------------------------------------------- /custom_inference/python/insurance_pricing/artifacts/simpleImputerCat.joblib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_inference/python/insurance_pricing/artifacts/simpleImputerCat.joblib -------------------------------------------------------------------------------- /custom_inference/python/insurance_pricing/artifacts/simpleImputerNum.joblib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_inference/python/insurance_pricing/artifacts/simpleImputerNum.joblib -------------------------------------------------------------------------------- /custom_inference/python/insurance_pricing/custom.py: -------------------------------------------------------------------------------- 1 | 2 | import pickle 3 | from typing import List, Optional, Any, Dict 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | from category_encoders import OrdinalEncoder 9 | from sklearn.impute import SimpleImputer 10 | import lightgbm as lgb 11 | from joblib import load 12 | import yaml 13 | import numpy as np 14 | import io 15 | import avro.io 16 | import avro 17 | from avro.datafile import DataFileReader 18 | 19 | from mymodel import MyModel 20 | 21 | import shap 22 | import io 23 | 
from io import BytesIO 24 | import avro.io 25 | import avro 26 | from avro.datafile import DataFileReader, DataFileWriter 27 | 28 | # 29 | #""" 30 | #This file was autogenerated by: drum new model --language python 31 | #Generation date: Mon Nov 9 14:31:54 2020 32 | # 33 | #Note: this is an example of custom.py file. 34 | # Below are all the hooks you can use to provide your own implementation. 35 | # All hooks are currently commented out so uncomment a hook function in 36 | # order to use it. 37 | #""" 38 | # 39 | # 40 | #def init(**kwargs): 41 | # """ 42 | # This hook can be implemented to adjust logic in the training and scoring mode. 43 | # init is called once the code is started. 44 | # 45 | # :param kwargs: additional keyword arguments to the function. 46 | # code_dir - code folder passed in --code_dir argument 47 | # """ 48 | # pass 49 | # 50 | # 51 | def load_model(input_dir): 52 | """ 53 | This hook can be implemented to adjust logic in the scoring mode. 54 | 55 | load_model hook provides a way to implement model loading your self. 56 | This function should return an object that represents your model. This object will 57 | be passed to the predict hook for performing predictions. 58 | This hook can be used to load supported models if your model has multiple artifacts, or 59 | for loading models that drum does not natively support 60 | 61 | :param input_dir: the directory to load serialized models from 62 | :returns: Object containing the model - the predict hook will get this object as a parameter 63 | """ 64 | 65 | # Returning a string with value "dummy" as the model. 66 | return MyModel(input_dir) 67 | # 68 | # 69 | #def transform(data, model): 70 | # """ 71 | # This hook can be implemented to adjust logic in the scoring mode. 72 | # 73 | # transform(data: DataFrame, model: Any) -> DataFrame 74 | # 75 | # Intended to apply transformations to the prediction data before making predictions. 76 | # This is most useful if drum supports the model's library, but your model requires additional 77 | # data processing before it can make predictions 78 | # 79 | # :param data: dataframe given to drum to make predictions on 80 | # :param model: is the deserialized model loaded by drum or by load_model hook , if supplied 81 | # :returns: a dataframe after transformation needed 82 | # """ 83 | # return data 84 | # 85 | # 86 | def score(data, model, **kwargs): 87 | """ 88 | This hook can be implemented to adjust logic in the scoring mode. 89 | 90 | This method should return predictions as a dataframe with the following format: 91 | 92 | Binary Classification: 93 | Must have columns for each class label with floating-point class probabilities as values. 94 | Each row should sum to 1.0 95 | 96 | Regression: 97 | Must have a single column called "Predictions" with numerical values 98 | 99 | This hook is only needed if you would like to use drum with a framework not natively 100 | supported by the tool. 101 | 102 | :param data: the dataframe to make predictions against. If transform is supplied, data 103 | will be the transformed data. 104 | :param model: is the deserialized model loaded by drum or by load_model hook, if supplied 105 | :param kwargs: additional keyword arguments to the function. If model is binary classification, 106 | positive_class_label and negative_class_label will be provided in kwargs. If the model is multiclass 107 | classification (at least 3 classes), a class_labels list will be provided as a parameter. 
108 | :returns: a dataframe, see documentation above on the structure of the dataframe to return. 109 | """ 110 | 111 | return model.predict(data) 112 | 113 | def score_unstructured(model, data, query, **kwargs): 114 | print("Incoming content type params: ", kwargs) 115 | print("Incoming data type: ", type(data)) 116 | print("Incoming query params: ", query) 117 | 118 | 119 | # writer = avro.io.DatumWriter(schema) 120 | # bytes_writer = BytesIO() 121 | # encoder = avro.io.BinaryEncoder(bytes_writer) 122 | X = pd.read_csv(BytesIO(data)) 123 | shap_values_dict = model.explain(X) 124 | predictions = model.predict(X).values 125 | # for p, s in zip(predictions, shap_values_dict): 126 | # writer.write({"prediction": p[0],"shap_values": s}, encoder) 127 | stream = io.BufferedWriter(io.BytesIO()) 128 | writer = DataFileWriter(stream, avro.io.DatumWriter(), model.schema) 129 | for p, s in zip(predictions, shap_values_dict): 130 | writer.append({"prediction": p[0],"shap_values": s}) 131 | writer.flush() 132 | ret_bytes = stream.raw.getvalue() 133 | writer.close() 134 | return ret_bytes -------------------------------------------------------------------------------- /custom_inference/python/insurance_pricing/data/loss_cost_short.csv: -------------------------------------------------------------------------------- 1 | Policy_ID,Eff_Dt,IncurredClaims,Exposure,Zipcode,VehicleCostNew,VehicleMake,VehicleModel,EngineCapacity,VehicleAge,ClientType,DriverAge,NumberOfDrivers,CustomerTenure,Gender,MaritalStatus,Zipcode_Aged_18_24,Zipcode_Aged_25_29,Zipcode_Aged_30_39,Zipcode_Aged_40_44,Zipcode_Aged_45_49,Zipcode_Aged_50_59,Zipcode_Aged_60,Zipcode_PersonsPerHousehold,Zipcode_annualMileage,Zipcode_VehiclesPerHousehold,Zipcode_CommuteViaCar,DistributionChannel,PartitionColumn 2 | 10001,5/1/17,14448.33,1.0,33672,30000.0,MERCEDES-BENZ,C200,122,15,Retail,49.0,2,4,F,Unknown,0.157653021,0.144983768,0.246694117,0.087061525,0.078153456,0.125386016,0.160068097,2.039300999,2668.979414,0.908915835,0.4,Insurance Broker,0 3 | 10002,4/29/17,0.0,1.0,29706,27000.0,FORD,Ranger,152,11,Commercial,,1,4,C,Unknown,0.116033871,0.09194209199999999,0.184922152,0.104834745,0.095329145,0.162141491,0.244796504,2.4452525659999997,4005.0689729999995,1.4122745840000002,0.6,Insurance Agency,0 4 | 10003,8/18/17,18549.37,1.0,35555,29000.0,FORD,Ranger,152,10,Retail,52.0,2,2,F,Married,0.149935575,0.11128030900000001,0.202295888,0.097575261,0.09411971400000001,0.176525712,0.16826754100000002,2.616979485,5751.1052740000005,1.196202532,0.59,Insurance Agency,0 5 | 10004,8/2/17,2152.92,0.78,26155,97000.0,TOYOTA,Camry,122,4,Retail,58.0,2,2,M,Unknown,0.10081084400000001,0.12610432,0.24204284199999998,0.092339344,0.08060026599999999,0.160232361,0.197870023,1.999605523,2962.848071,1.092110454,0.51,Vehicle Dealership,0 6 | 10005,3/24/17,0.0,0.43,40363,12000.0,HONDA,Accord,122,17,Retail,51.0,1,10,M,Unknown,0.12900731599999998,0.123713793,0.231011717,0.10075536800000001,0.086897044,0.126033427,0.20258133600000003,2.186629834,3224.389464,0.806077348,0.37,Vehicle Dealership,0 7 | -------------------------------------------------------------------------------- /custom_inference/python/insurance_pricing/feature_detail.yaml: -------------------------------------------------------------------------------- 1 | Categorical: 2 | - DistributionChannel 3 | - VehicleModel 4 | - Zipcode 5 | - VehicleMake 6 | - ClientType 7 | - MaritalStatus 8 | Date: None 9 | Numeric: 10 | - CustomerTenure 11 | - DriverAge 12 | - EngineCapacity 13 | - NumberOfDrivers 14 | - VehicleAge 15 
| - VehicleCostNew 16 | - Zipcode_Aged_18_24 17 | - Zipcode_Aged_25_29 18 | - Zipcode_Aged_30_39 19 | - Zipcode_Aged_40_44 20 | - Zipcode_Aged_45_49 21 | - Zipcode_Aged_50_59 22 | - Zipcode_Aged_60 23 | - Zipcode_CommuteViaCar 24 | - Zipcode_PersonsPerHousehold 25 | - Zipcode_VehiclesPerHousehold 26 | - Zipcode_annualMileage 27 | Offset: Exposure 28 | Target: IncurredClaims 29 | Text: None 30 | -------------------------------------------------------------------------------- /custom_inference/python/insurance_pricing/mymodel.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from typing import List, Optional, Any, Dict 3 | import numpy as np 4 | import pandas as pd 5 | from category_encoders import OrdinalEncoder 6 | from sklearn.impute import SimpleImputer 7 | import lightgbm as lgb 8 | from joblib import load 9 | import yaml 10 | import numpy as np 11 | import shap 12 | import avro 13 | import os 14 | 15 | class MyModel(object): 16 | def __init__(self, code_dir): 17 | """Load the model pickle file.""" 18 | # This supports both Python 2 and 3 19 | with open( code_dir + "/artifacts/lgbm.joblib", "rb") as picklefile: 20 | try: 21 | self.model = load(picklefile, encoding="latin1") 22 | except TypeError: 23 | self.model = load(picklefile) 24 | with open(code_dir + "/artifacts/ordinalEncoder.joblib", "rb") as f: 25 | self.oe = load(f) 26 | with open(code_dir + "/artifacts/simpleImputerNum.joblib", "rb") as f: 27 | self.siNum = load(f) 28 | with open(code_dir + "/artifacts/simpleImputerCat.joblib", "rb") as f: 29 | self.siCat = load(f) 30 | with open(code_dir + "/feature_detail.yaml", "r") as f: 31 | self.feature_detail = yaml.load(f, Loader=yaml.FullLoader) 32 | self.numeric_features = self.feature_detail["Numeric"] 33 | self.categorical_features = self.feature_detail["Categorical"] 34 | self.offset = self.feature_detail["Offset"] 35 | self.explainer = shap.TreeExplainer(self.model) 36 | self.shap_headers = ["SHAP_{}".format(i) for i in self.numeric_features + self.categorical_features] 37 | schema_path = os.path.join(code_dir, "schema.avsc") 38 | self.schema = avro.schema.parse(open(schema_path, "rb").read()) 39 | 40 | def preprocess_features(self, X): 41 | offset = X[self.offset].values 42 | x_num = X[self.numeric_features].values 43 | x_num = self.siNum.transform(x_num) 44 | x_cat = X[self.categorical_features].values 45 | x_cat = self.siCat.transform(x_cat) 46 | x_cat = self.oe.transform(x_cat) 47 | x = np.concatenate([x_cat, x_num], axis=1) 48 | return (x, offset) 49 | 50 | def explain(self, X): 51 | X_t = self.preprocess_features(X)[0] 52 | shap_values = self.explainer.shap_values( X_t ) 53 | return pd.DataFrame(shap_values, columns = self.shap_headers).to_dict(orient="records") 54 | 55 | 56 | 57 | def predict( 58 | self, X, positive_class_label=None, negative_class_label=None, **kwargs 59 | ): 60 | """ 61 | Predict with the pickled custom model. 
62 | 63 | If your model is for classification, you likely want to ensure this function 64 | calls `predict_proba()`, whereas for regression it should use `predict()` 65 | """ 66 | X, offset = self.preprocess_features(X) 67 | predictions = np.exp(self.model.predict(X, raw_score=True)) * offset 68 | 69 | return pd.DataFrame(predictions, columns = ["Predictions"]) 70 | -------------------------------------------------------------------------------- /custom_inference/python/insurance_pricing/requirements.txt: -------------------------------------------------------------------------------- 1 | joblib==0.16.0 2 | lightgbm==2.3.1 3 | numpy==1.19.0 4 | pandas==1.1.0 5 | scikit-learn==0.23.1 6 | scipy==1.5.1 7 | PyYAML==5.1.1 8 | category-encoders==2.2.2 9 | shap==0.37.0 10 | avro==1.10.0 -------------------------------------------------------------------------------- /custom_inference/python/insurance_pricing/schema.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "fields": [ 3 | { 4 | "name": "prediction", 5 | "type": "float" 6 | }, 7 | { 8 | "name": "shap_values", 9 | "type": [ 10 | { 11 | "type": "map", 12 | "values": "float", 13 | "default": {} 14 | } 15 | ] 16 | } 17 | ], 18 | "namespace": "shap.avro", 19 | "type": "record", 20 | "name": "predictions" 21 | } -------------------------------------------------------------------------------- /custom_inference/python/mnist/drum/classlabels.txt: -------------------------------------------------------------------------------- 1 | 0 2 | 1 3 | 2 4 | 3 5 | 4 6 | 5 7 | 6 8 | 7 9 | 8 10 | 9 11 | -------------------------------------------------------------------------------- /custom_inference/python/mnist/drum/custom.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from tensorflow import keras 4 | from tensorflow.keras import layers 5 | from tensorflow.keras.models import Sequential 6 | import os 7 | import io 8 | from io import StringIO 9 | 10 | 11 | def load_model(code_dir): 12 | model_path = 'mnist.h5' 13 | print ("model ready to load") 14 | model = keras.models.load_model(os.path.join(code_dir, model_path), compile=False) 15 | print ("model loaded") 16 | return model 17 | 18 | def score(data, model, **kwargs): 19 | print (data.shape) 20 | X=data.drop(data.columns[0],axis=1) 21 | X=X.values.reshape(X.shape[0],28,28,1) 22 | predictions = model.predict(X) 23 | print (predictions) 24 | s = pd.DataFrame(predictions) 25 | return s 26 | 27 | def score_unstructured(model, data, query, **kwargs): 28 | print("Incoming content type params: ", kwargs) 29 | print("Incoming data type: ", type(data)) 30 | print("Incoming query params: ", query) 31 | input = io.StringIO(data) 32 | X = pd.read_csv(input) 33 | print (X.shape) 34 | X=X.drop(X.columns[0],axis=1) 35 | X=X.values.reshape(X.shape[0],28,28,1) 36 | predictions = model.predict(X) 37 | print (predictions) 38 | s = pd.DataFrame(predictions) 39 | t = s.to_csv(index=False) 40 | return t 41 | 42 | 43 | -------------------------------------------------------------------------------- /custom_inference/python/mnist/drum/mnist.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_inference/python/mnist/drum/mnist.h5 -------------------------------------------------------------------------------- 
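The MNIST `score_unstructured` hook above expects the raw request body to be a CSV string whose first column (the label) is dropped before the pixels are reshaped to 28x28x1, and it returns the class probabilities back as CSV text. Below is a minimal client sketch for exercising it, assuming DRUM is serving this code directory locally (for example with `drum server --code-dir custom_inference/python/mnist/drum --target-type unstructured --address localhost:6789`) and that your DRUM version exposes the `/predictUnstructured/` route; the port and file path are illustrative assumptions, not part of the repo.

```python
# Hypothetical client for the locally served MNIST unstructured model.
# The URL, port, and input path below are assumptions for illustration only.
import io

import pandas as pd
import requests

with open("custom_inference/python/mnist/data/test.csv", "rb") as f:
    payload = f.read()

resp = requests.post(
    "http://localhost:6789/predictUnstructured/",
    data=payload,
    # A text content type asks DRUM to hand `data` to score_unstructured as a str,
    # which is what the io.StringIO(data) call in custom.py expects.
    headers={"Content-Type": "text/plain; charset=UTF-8"},
)
resp.raise_for_status()

# score_unstructured returns a CSV string with one probability column per digit.
probs = pd.read_csv(io.StringIO(resp.text))
print(probs.head())
```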
/custom_inference/python/mnist/drum/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.19.2 2 | tensorflow>=2.4.0 3 | -------------------------------------------------------------------------------- /custom_inference/python/movie_recommender/read_me.md: -------------------------------------------------------------------------------- 1 | # Recommender System Unstructured Example 2 | 3 | This folder includes an example of a Keras movie recommender system. The model was built using the Notebook [here](https://keras.io/examples/structured_data/collaborative_filtering_movielens/) 4 | 5 | Using the saved model from that notebook, we then use DRUM to validate and then score using the saved model. 6 | 7 | In the `custom.py` script, we use the hook functions including the `load_model` and `score_unstructured` functions. 8 | 9 | Additional modifications have been made to the `score_unstructured` function to output the actual names of the movies 10 | as opposed to the index of the movieId. 11 | 12 | -------------------------------------------------------------------------------- /custom_inference/python/movie_recommender/recommender_model/custom.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import tensorflow as tf 4 | from tensorflow import keras 5 | from tensorflow.keras import layers 6 | from keras.models import Sequential 7 | import os 8 | import io 9 | from io import StringIO 10 | 11 | 12 | def load_model(code_dir): 13 | model_path = 'saved_model.pb' 14 | model = tf.keras.models.load_model('./recommender_model/', 15 | custom_objects = None, 16 | compile = True, 17 | options = None) 18 | print ("model loaded") 19 | return model 20 | 21 | 22 | def score(data, model, **kwargs): 23 | predictions = model.predict(data) 24 | print (predictions) 25 | s = pd.DataFrame(predictions) 26 | return s 27 | 28 | 29 | def score_unstructured(model, data, query, **kwargs): 30 | 31 | """ 32 | Load Movie/User Info 33 | 34 | This will be used to output the Actual movie list 35 | instead of just the indexes of the movies 36 | """ 37 | 38 | movie_df = pd.read_csv('movies.csv') 39 | df = pd.read_csv('ratings_file.csv') 40 | data_rec = pd.read_csv('predict.csv') 41 | 42 | user_id = data_rec.userID.iloc[0] 43 | movies_watched_by_user = df[df.userId == user_id] 44 | 45 | 46 | # Find Movies Not Watched 47 | movies_not_watched = movie_df[ 48 | ~movie_df["movieId"].isin(movies_watched_by_user.movieId.values) 49 | ]["movieId"] 50 | 51 | user_ids = df["userId"].unique().tolist() 52 | user2user_encoded = {x: i for i, x in enumerate(user_ids)} 53 | userencoded2user = {i: x for i, x in enumerate(user_ids)} 54 | movie_ids = df["movieId"].unique().tolist() 55 | movie2movie_encoded = {x: i for i, x in enumerate(movie_ids)} 56 | movie_encoded2movie = {i: x for i, x in enumerate(movie_ids)} 57 | 58 | movies_not_watched = list( 59 | set(movies_not_watched).intersection(set(movie2movie_encoded.keys())) 60 | ) 61 | 62 | movies_not_watched = [[movie2movie_encoded.get(x)] for x in movies_not_watched] 63 | user_encoder = user2user_encoded.get(user_id) 64 | user_movie_array = np.hstack( 65 | ([[user_encoder]] * len(movies_not_watched), movies_not_watched) 66 | ) 67 | 68 | """ 69 | Finished Loading Movie data 70 | """ 71 | 72 | input = io.StringIO(data) 73 | X = pd.read_csv(input) 74 | 75 | # Fill NA 76 | # Cast Inputs as INTs to properly handle NULL value imputation 77 | # and prevent from being 
cast as Floats 78 | X['userID'].fillna(user_id, inplace=True) 79 | X['movies'].fillna(3678, inplace=True) 80 | X["userID"] = X["userID"].astype(int) 81 | X["movies"] = X["movies"].astype(int) 82 | 83 | 84 | ratings = model.predict(X).flatten() 85 | 86 | # Take the Top 10 Movie Recommendations 87 | top_ratings_indices = ratings.argsort()[-10:][::-1] 88 | recommended_movie_ids = [ 89 | movie_encoded2movie.get(movies_not_watched[x][0]) for x in top_ratings_indices 90 | ] 91 | 92 | recommended_movies = movie_df[movie_df["movieId"].isin(recommended_movie_ids)] 93 | for row in recommended_movies.itertuples(): 94 | print(row.title, ":", row.genres) 95 | 96 | # print (recommended_movies) 97 | s = pd.DataFrame(recommended_movies) 98 | t = s.to_csv(index=False) 99 | return t 100 | 101 | 102 | -------------------------------------------------------------------------------- /custom_inference/python/movie_recommender/recommender_model/saved_model.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_inference/python/movie_recommender/recommender_model/saved_model.pb -------------------------------------------------------------------------------- /custom_inference/python/movie_recommender/recommender_model/variables/variables.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_inference/python/movie_recommender/recommender_model/variables/variables.data-00000-of-00001 -------------------------------------------------------------------------------- /custom_inference/python/movie_recommender/recommender_model/variables/variables.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_inference/python/movie_recommender/recommender_model/variables/variables.index -------------------------------------------------------------------------------- /custom_inference/python/readmissions/README.md: -------------------------------------------------------------------------------- 1 | # Custom Model Examples 2 | 3 | This folder includes various examples of custom models using the well-known `Hospital Readmissions` dataset. There are three examples (Levels 1 - 3), with each level presenting a more complicated custom model than the previous one. 4 | 5 | In the `custom.py` scripts, we use all of the available hook functions including the `fit` function. This means that these examples can be used as both `custom inference` and `custom training` models. To see the difference between the two, check the official DataRobot DRUM package documentation on GitHub [here](https://github.com/datarobot/datarobot-user-models). 6 | 7 | If you upload the models to DataRobot using the UI, make sure to choose the `Scikit-Learn Drop-In` environment. Lastly, make sure you upload the items saved inside the `custom_model` folder, not the `custom_model` folder itself. 8 | 9 | ## Creating the environment 10 | The easiest way to create an environment to both train and test these models with DRUM is to execute the commands below after you install conda.
11 | 12 | `conda create --name your-env-name python=3.7.0` 13 | `conda activate your-env-name` 14 | `pip install -r requirements.txt` 15 | 16 | ## Important Links 17 | 18 | For more information on how to use DRUM to test and deploy your custom models, follow the [link](https://github.com/datarobot-community/custom-models/tree/master/drum_overview). 19 | 20 | ## Contents 21 | 22 | #### Readmission_level_1 23 | 24 | *High Level Overview:* 25 | 26 | 1. Fit a CatBoost classifier on top of the `Hospital Readmissions` dataset. 27 | 2. `custom.py` script needs to include null value handling. 28 | 3. `custom.py` script needs to search for the keyword `Diabetes|diabetes` in the `diag_1_desc` column and create a new Boolean column. 29 | 30 | #### Readmission_level_2 31 | 32 | *High Level Overview:* 33 | 34 | 1. Preprocess data using a scikit-learn pipeline 35 | 2. Fit an XGBoost model on the data 36 | 3. `custom.py` script needs to preprocess using the scikit-learn pipeline 37 | 4. `custom.py` script needs to score using the XGBoost model. 38 | 39 | The extra difficulty here is that we need to tell DRUM where to find both the preprocessing pipeline and the XGBoost model in order for this custom model to work. 40 | 41 | #### Readmission_level_3 42 | 43 | The biostatisticians at ABC labs have a legacy model that they use to predict the probability of being readmitted to the hospital. They found that an ensemble of their own model and the XGBoost model yields the best outcome. The result needs to be the average probability between the two models (a sketch of this averaging follows the overview below). 44 | 45 | *High Level Overview:* 46 | 47 | 1. Preprocess data using a scikit-learn pipeline 48 | 2. Fit an XGBoost model on the data 49 | 3. `custom.py` script needs to preprocess using the scikit-learn pipeline 50 | 4. `custom.py` script needs to score using the XGBoost model. 51 | 5. `custom.py` script needs to return the average probability as calculated from the XGBoost and legacy models.
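The Level 3 averaging described above is implemented in the `post_process` hook of `Readmission_level_3/custom_model/custom.py`, which appears later in this document. The sketch below isolates just that ensembling step, reusing the legacy logistic formula from that file; the standalone function name and signature are illustrative assumptions, not part of the repo.

```python
# Illustrative sketch of the Level 3 ensembling step. The helper name is
# hypothetical; the repo performs this inside the post_process hook instead.
import pandas as pd
from scipy.special import expit


def ensemble_with_legacy(predictions: pd.DataFrame, original_data: pd.DataFrame) -> pd.DataFrame:
    # Legacy logistic score (same coefficients as the Level 3 custom.py).
    legacy_true = expit(
        (
            0.59
            + 0.55 * original_data["number_inpatient"].fillna(0)
            + 0.36 * original_data["number_outpatient"].fillna(0)
        ).to_numpy()
    )
    # Average the XGBoost probability with the legacy probability, then
    # recompute the complementary column so each row still sums to 1.
    predictions["True"] = (predictions["True"].to_numpy() + legacy_true) / 2
    predictions["False"] = 1 - predictions["True"]
    return predictions
```

Averaging in probability space keeps the ensembled output a valid probability, because both inputs are already bounded between 0 and 1.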
-------------------------------------------------------------------------------- /custom_inference/python/readmissions/Readmission_level_1/catboost_info/catboost_training.json: -------------------------------------------------------------------------------- 1 | { 2 | "meta":{"test_sets":[],"test_metrics":[],"learn_metrics":[{"best_value":"Min","name":"Logloss"}],"launch_mode":"Train","parameters":"","iteration_count":2,"learn_sets":["learn"],"name":"experiment"}, 3 | "iterations":[ 4 | {"learn":[0.645309907],"iteration":0,"passed_time":0.06549675921,"remaining_time":0.06549675921}, 5 | {"learn":[0.632149336],"iteration":1,"passed_time":0.07959602195,"remaining_time":0} 6 | ]} -------------------------------------------------------------------------------- /custom_inference/python/readmissions/Readmission_level_1/catboost_info/learn/events.out.tfevents: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_inference/python/readmissions/Readmission_level_1/catboost_info/learn/events.out.tfevents -------------------------------------------------------------------------------- /custom_inference/python/readmissions/Readmission_level_1/catboost_info/learn_error.tsv: -------------------------------------------------------------------------------- 1 | iter Logloss 2 | 0 0.645309907 3 | 1 0.632149336 4 | -------------------------------------------------------------------------------- /custom_inference/python/readmissions/Readmission_level_1/catboost_info/time_left.tsv: -------------------------------------------------------------------------------- 1 | iter Passed Remaining 2 | 0 65 65 3 | 1 79 0 4 | -------------------------------------------------------------------------------- /custom_inference/python/readmissions/Readmission_level_1/custom_model/custom.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import pandas as pd 4 | import numpy as np 5 | from typing import List, Optional 6 | from catboost import CatBoostClassifier 7 | import io 8 | 9 | def read_input_data(input_binary_data): 10 | return pd.read_csv(io.BytesIO(input_binary_data)) 11 | 12 | 13 | def transform(data,model): 14 | """ 15 | Note: This hook may not have to be implemented for your model. 16 | In this case implemented for the model used in the example. 17 | Modify this method to add data transformation before scoring calls. For example, this can be 18 | used to implement one-hot encoding for models that don't include it on their own. 19 | Parameters 20 | ---------- 21 | data: pd.DataFrame 22 | model: object, the deserialized model 23 | Returns 24 | ------- 25 | pd.DataFrame 26 | """ 27 | # Execute any steps you need to do before scoring 28 | 29 | def find_diabetes_text(txt): 30 | try: 31 | if 'diabetes' in txt.lower(): 32 | return 1 33 | else: 34 | return 0 35 | except: 36 | 0 37 | 38 | #DataRobot Drum will check what happens when values are imputed. 
That is why I explicetely define cat_features 39 | cat_features = ['race', 'gender', 'age', 'weight', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'payer_code', 'medical_specialty', 'diag_1', 40 | 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 41 | 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 42 | 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin', 43 | 'glyburide_metformin', 'glipizide_metformin', 'glimepiride_pioglitazone', 'metformin_rosiglitazone', 44 | 'metformin_pioglitazone', 'change', 'diabetesMed'] 45 | 46 | # Fill null values for Categorical Features 47 | for c in cat_features: 48 | data[c] = data[c].fillna('unknown') 49 | 50 | #Some categorical features (diag_1), have float values which in reality are categories. Catboost takes either int or object as input 51 | #so I am casting. 52 | try: 53 | data[c] = data[c].astype(int) 54 | except: 55 | pass 56 | 57 | # Find out if `Diabetes|`diabetes` exists in diag_1_desc column 58 | data['diabetes'] = data['diag_1_desc'].apply(lambda x: find_diabetes_text(x)) 59 | data.drop('diag_1_desc',axis=1,inplace=True) 60 | 61 | # Fill null values for numeric features 62 | data = data.fillna(0) 63 | 64 | return data 65 | 66 | def score(data, model, **kwargs): 67 | 68 | results = model.predict_proba(data) 69 | 70 | #Create two columns with probability results 71 | predictions = pd.DataFrame({'True': results[:, 0]}) 72 | predictions['False'] = 1 - predictions['True'] 73 | 74 | return predictions 75 | -------------------------------------------------------------------------------- /custom_inference/python/readmissions/Readmission_level_1/custom_model/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_inference/python/readmissions/Readmission_level_1/custom_model/model.pkl -------------------------------------------------------------------------------- /custom_inference/python/readmissions/Readmission_level_1/custom_model/requirements.txt: -------------------------------------------------------------------------------- 1 | joblib>=0.14.0 2 | pandas>=0.25.1 3 | catboost>=0.24.2 4 | scipy>=1.5.3 5 | scikit_learn>=0.22.0 6 | -------------------------------------------------------------------------------- /custom_inference/python/readmissions/Readmission_level_2/custom_model/custom.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import joblib 4 | from xgboost import XGBClassifier 5 | 6 | from sklearn.pipeline import Pipeline 7 | from sklearn.compose import ColumnTransformer 8 | from sklearn.preprocessing import StandardScaler,OneHotEncoder 9 | 10 | import os 11 | import io 12 | from typing import List, Optional 13 | from scipy.special import expit 14 | g_code_dir = None 15 | 16 | schema = {"race": "object", "gender": "object", "age": "object", "weight": "object", "admission_type_id": "object", "discharge_disposition_id": "object", "admission_source_id": "object", "time_in_hospital": "int64", "payer_code": "object", "medical_specialty": "object", "num_lab_procedures": "int64", "num_procedures": "int64", "num_medications": "int64", "number_outpatient": "int64", "number_emergency": "int64", "number_inpatient": "int64", 
"number_diagnoses": "int64", "max_glu_serum": "object", "A1Cresult": "object", "metformin": "object", "repaglinide": "object", "nateglinide": "object", "chlorpropamide": "object", "glimepiride": "object", "acetohexamide": "object", "glipizide": "object", "glyburide": "object", "tolbutamide": "object", "pioglitazone": "object", "rosiglitazone": "object", "acarbose": "object", "miglitol": "object", "troglitazone": "object", "tolazamide": "object", "examide": "object", "citoglipton": "object", "insulin": "object", "glyburide_metformin": "object", "glipizide_metformin": "object", "glimepiride_pioglitazone": "object", "metformin_rosiglitazone": "object", "metformin_pioglitazone": "object", "change": "object", "diabetesMed": "object"} 17 | 18 | def init(code_dir): 19 | global g_code_dir 20 | g_code_dir = code_dir 21 | 22 | def read_input_data(input_binary_data): 23 | data = pd.read_csv(io.BytesIO(input_binary_data)) 24 | data.drop(['diag_1_desc', 'diag_1', 'diag_2', 'diag_3'],axis=1,inplace=True) 25 | 26 | #Saving this for later 27 | return data 28 | 29 | def transform(data, model): 30 | """ 31 | Note: This hook may not have to be implemented for your model. 32 | In this case implemented for the model used in the example. 33 | Modify this method to add data transformation before scoring calls. For example, this can be 34 | used to implement one-hot encoding for models that don't include it on their own. 35 | Parameters 36 | ---------- 37 | data: pd.DataFrame 38 | model: object, the deserialized model 39 | Returns 40 | ------- 41 | pd.DataFrame 42 | """ 43 | 44 | #Handle null values in categories and numerics 45 | for c,dt in schema.items(): 46 | if dt =='object': 47 | data[c] = data[c].fillna('missing') 48 | else: 49 | data[c] = data[c].fillna(0) 50 | 51 | pipeline_path = 'preprocessing.pkl' 52 | pipeline = joblib.load(os.path.join(g_code_dir, pipeline_path)) 53 | preprocessed = pipeline.transform(data) 54 | preprocessed = pd.DataFrame.sparse.from_spmatrix(preprocessed) 55 | 56 | return preprocessed 57 | 58 | def load_model(code_dir): 59 | model_path = 'model.pkl' 60 | model = joblib.load(os.path.join(code_dir, model_path)) 61 | return model 62 | 63 | def score(data, model, **kwargs): 64 | results = model.predict_proba(data) 65 | predictions = pd.DataFrame({'True': results[:, 0], 'False':results[:, 1]}) 66 | 67 | return predictions 68 | -------------------------------------------------------------------------------- /custom_inference/python/readmissions/Readmission_level_2/custom_model/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_inference/python/readmissions/Readmission_level_2/custom_model/model.pkl -------------------------------------------------------------------------------- /custom_inference/python/readmissions/Readmission_level_2/custom_model/preprocessing.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_inference/python/readmissions/Readmission_level_2/custom_model/preprocessing.pkl -------------------------------------------------------------------------------- /custom_inference/python/readmissions/Readmission_level_2/custom_model/requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.11.0 2 | astor==0.8.1 3 | 
cached-property==1.5.1 4 | certifi==2020.11.8 5 | gast==0.3.3 6 | google-pasta==0.2.0 7 | grpcio==1.33.2 8 | importlib-metadata==1.7.0 9 | numpy==1.18 10 | protobuf==3.13.0 11 | PyYAML==5.3.1 12 | six==1.15.0 13 | xgboost==1.2.1 14 | termcolor==1.1.0 15 | Theano==0.8.2 16 | Werkzeug==2.0.1 17 | wrapt==1.12.1 18 | zipp==3.4.0 19 | joblib==0.14.0 20 | pandas==0.25.1 21 | scipy==1.5.3 22 | scikit_learn==0.22.0 -------------------------------------------------------------------------------- /custom_inference/python/readmissions/Readmission_level_3/custom_model/custom.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import joblib 4 | from xgboost import XGBClassifier 5 | 6 | from sklearn.pipeline import Pipeline 7 | from sklearn.compose import ColumnTransformer 8 | from sklearn.preprocessing import StandardScaler,OneHotEncoder 9 | from sklearn.impute import SimpleImputer 10 | 11 | import os 12 | from typing import List, Optional 13 | from scipy.special import expit 14 | import io 15 | 16 | g_input_filename = None 17 | g_code_dir = None 18 | 19 | schema = {"race": "object", "gender": "object", "age": "object", "weight": "object", "admission_type_id": "object", "discharge_disposition_id": "object", "admission_source_id": "object", "time_in_hospital": "int64", "payer_code": "object", "medical_specialty": "object", "num_lab_procedures": "int64", "num_procedures": "int64", "num_medications": "int64", "number_outpatient": "int64", "number_emergency": "int64", "number_inpatient": "int64", "number_diagnoses": "int64", "max_glu_serum": "object", "A1Cresult": "object", "metformin": "object", "repaglinide": "object", "nateglinide": "object", "chlorpropamide": "object", "glimepiride": "object", "acetohexamide": "object", "glipizide": "object", "glyburide": "object", "tolbutamide": "object", "pioglitazone": "object", "rosiglitazone": "object", "acarbose": "object", "miglitol": "object", "troglitazone": "object", "tolazamide": "object", "examide": "object", "citoglipton": "object", "insulin": "object", "glyburide_metformin": "object", "glipizide_metformin": "object", "glimepiride_pioglitazone": "object", "metformin_rosiglitazone": "object", "metformin_pioglitazone": "object", "change": "object", "diabetesMed": "object"} 20 | 21 | def init(code_dir): 22 | global g_code_dir 23 | g_code_dir = code_dir 24 | 25 | def read_input_data(input_binary_data): 26 | data = pd.read_csv(io.BytesIO(input_binary_data)) 27 | data.drop(['diag_1_desc', 'diag_1', 'diag_2', 'diag_3'],axis=1,inplace=True) 28 | 29 | #Saving this for later 30 | global g_input_filename 31 | g_input_filename = input_binary_data 32 | return data 33 | 34 | def transform(data, model): 35 | """ 36 | Note: This hook may not have to be implemented for your model. 37 | In this case implemented for the model used in the example. 38 | Modify this method to add data transformation before scoring calls. For example, this can be 39 | used to implement one-hot encoding for models that don't include it on their own. 
40 | Parameters 41 | ---------- 42 | data: pd.DataFrame 43 | model: object, the deserialized model 44 | Returns 45 | ------- 46 | pd.DataFrame 47 | """ 48 | 49 | #Handle null values in categories and numerics 50 | for c,dt in schema.items(): 51 | if dt =='object': 52 | data[c] = data[c].fillna('missing') 53 | else: 54 | data[c] = data[c].fillna(0) 55 | 56 | pipeline_path = 'preprocessing.pkl' 57 | pipeline = joblib.load(os.path.join(g_code_dir, pipeline_path)) 58 | preprocessed = pipeline.transform(data) 59 | preprocessed = pd.DataFrame.sparse.from_spmatrix(preprocessed) 60 | 61 | return preprocessed 62 | 63 | def load_model(code_dir): 64 | model_path = 'model.pkl' 65 | model = joblib.load(os.path.join(code_dir, model_path)) 66 | return model 67 | 68 | def score(data, model, **kwargs): 69 | results = model.predict_proba(data) 70 | predictions = pd.DataFrame({'True': results[:, 0], 'False':results[:, 1]}) 71 | 72 | return predictions 73 | 74 | #Adding post_process to use legacy model together with Keras model 75 | def post_process(predictions,model): 76 | original_data = pd.read_csv(io.BytesIO(g_input_filename)) 77 | original_data.fillna(0,inplace=True) 78 | 79 | def legacy_score(row): 80 | try: 81 | return expit(0.59 + 0.55 * row['number_inpatient'] + 0.36 * row['number_outpatient']) 82 | except: 83 | return 0.38 84 | 85 | predictions['True_legacy'] = original_data.apply(lambda row: legacy_score(row), axis=1) 86 | predictions['True'] = (predictions['True'] + predictions['True_legacy']) 87 | predictions['True'] = predictions['True']/2 88 | predictions['False'] = 1 - predictions['True'] 89 | 90 | predictions.drop('True_legacy',axis=1,inplace=True) 91 | 92 | return predictions 93 | -------------------------------------------------------------------------------- /custom_inference/python/readmissions/Readmission_level_3/custom_model/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_inference/python/readmissions/Readmission_level_3/custom_model/model.pkl -------------------------------------------------------------------------------- /custom_inference/python/readmissions/Readmission_level_3/custom_model/preprocessing.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_inference/python/readmissions/Readmission_level_3/custom_model/preprocessing.pkl -------------------------------------------------------------------------------- /custom_inference/python/readmissions/Readmission_level_3/custom_model/requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.11.0 2 | astor==0.8.1 3 | cached-property==1.5.1 4 | certifi==2020.11.8 5 | gast==0.3.3 6 | google-pasta==0.2.0 7 | grpcio==1.33.2 8 | importlib-metadata==1.7.0 9 | numpy==1.18 10 | protobuf==3.13.0 11 | PyYAML==5.3.1 12 | six==1.15.0 13 | xgboost==1.2.1 14 | termcolor==1.1.0 15 | Theano==0.8.2 16 | Werkzeug==2.0.1 17 | wrapt==1.12.1 18 | zipp==3.4.0 19 | joblib==0.14.0 20 | pandas==0.25.1 21 | scipy==1.5.3 22 | scikit_learn==0.22.0 -------------------------------------------------------------------------------- /custom_inference/python/readmissions/requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.11.0 2 | astor==0.8.1 3 | 
cached-property==1.5.1 4 | certifi==2020.11.8 5 | gast==0.3.3 6 | google-pasta==0.2.0 7 | grpcio==1.33.2 8 | importlib-metadata==1.7.0 9 | numpy==1.18 10 | protobuf==3.13.0 11 | PyYAML==5.3.1 12 | six==1.15.0 13 | termcolor==1.1.0 14 | Theano==0.8.2 15 | Werkzeug==2.0.1 16 | wrapt==1.12.1 17 | zipp==3.4.0 18 | joblib==0.14.0 19 | pandas==0.25.1 20 | scipy==1.5.3 21 | scikit_learn==0.22.0 22 | catboost==0.24.2 23 | datarobot-drum==1.5.4 24 | xgboost==1.2.1 -------------------------------------------------------------------------------- /custom_inference/r/README.md: -------------------------------------------------------------------------------- 1 | README 2 | -------------------------------------------------------------------------------- /custom_inference/r/r_glm_noncaret_basic/README.md: -------------------------------------------------------------------------------- 1 | ## R Fit Template 2 | 3 | The custom.R template includes a basic fit and init method that can be used to train a regression, binary or multilabel classification model. 4 | The expected arguments to the fit method should remain the same, although the internal functionality can be tweaked to 5 | use different modeling or preprocessing techniques. 6 | 7 | Inside you will find several commented out methods related to prediction behavior. Uncomment and implement provided methods to modify this behavior from the default. 8 | 9 | Change the target value in the code below to Species to test multilabel classification. 10 | 11 | ### To run locally using 'drum' 12 | Paths are relative to `datarobot-user-models` root: 13 | `drum fit --code-dir model_templates/training/r_glm_noncaret_basic --input tests/testdata/iris.csv --target-type regression --target 'Petal.Width'` 14 | If the command succeeds, your code is ready to be uploaded. 15 | 16 | -------------------------------------------------------------------------------- /custom_inference/r/r_glm_noncaret_basic/create_pipeline.R: -------------------------------------------------------------------------------- 1 | # This is the simplest POC that Composable ML / DRUM can 2 | # work with non-caret R models. 3 | # Here I just take the caret example and replace the 4 | # caret model with a very naive GLM() that uses the default 5 | # settings for family, link, prediction scale, etc. 
6 | # - Jason 7 | 8 | create_pipeline<-function(X, y, model_type='regression') { 9 | 10 | # set up dataframe for modeling 11 | train_df <- X 12 | train_df$target <- unlist(y) 13 | if (model_type == 'classification'){ 14 | train_df$target <- as.factor(train_df$target) 15 | } 16 | 17 | 18 | # Run the model using builtin glm to see if we can get around using caret 19 | model <- glm(target~.,data=train_df) 20 | return(model) 21 | } 22 | -------------------------------------------------------------------------------- /custom_inference/r/r_glm_noncaret_basic/custom.R: -------------------------------------------------------------------------------- 1 | init <- function(code_dir) { 2 | # custom init function to load required libraries 3 | library(tidyverse) 4 | library(caret) 5 | library(recipes) 6 | library(e1071) 7 | library(gbm) 8 | source(file.path(code_dir, 'create_pipeline.R')) 9 | } 10 | 11 | fit <- function(X, y, output_dir, class_order=NULL, row_weights=NULL, ...){ 12 | #' User-provided fit method, required for custom training 13 | #' 14 | #' Trains a regression or classification model using gbm (via caret) 15 | #' @param X data.frame - training data to perform fit on 16 | #' @param y data.frame column or array - target data to perform fit on 17 | #' @param output_dir the path to write output. This is the path provided in '--output' parameter of the 18 | #' 'drum fit' command. 19 | #' @param class_order : A two element long list dictating the order of classes which should be used for 20 | #' modeling. Class order will always be passed to fit by DataRobot for classification tasks, 21 | #' and never otherwise. When models predict, they output a likelihood of one class, with a 22 | #' value from 0 to 1. The likelihood of the other class is 1 - this likelihood. Class order 23 | #' dictates that the first element in the list will be the 0 class, and the second will be the 24 | #' 1 class. 25 | #' @param row_weights: An array of non-negative numeric values which can be used to dictate how important 26 | #' a row is. Row weights is only optionally used, and there will be no filtering for which 27 | #' custom models support this. There are two situations when values will be passed into 28 | #' row_weights, during smart downsampling and when weights are explicitly provided by the user 29 | #' @param ...: Added for forwards compatibility 30 | #' @return Nothing 31 | 32 | if (!is.null(class_order)){ 33 | model <- create_pipeline(X, y, 'classification') 34 | } else { 35 | model <- create_pipeline(X, y, 'regression') 36 | } 37 | # Save model 38 | model_path <- file.path(output_dir, 'artifact.rds') 39 | saveRDS(model, file = model_path) 40 | } 41 | -------------------------------------------------------------------------------- /custom_inference/r/r_glm_noncaret_feateng/README.md: -------------------------------------------------------------------------------- 1 | ## WORK IN PROCESS - don't run yet 2 | I'm working on an issue with DR finding my helper functions. Until that is resolved this isn't ready to run yet. 3 | 4 | ## R Fit Template 5 | 6 | The custom.R template includes a basic fit and init method that can be used to train a regression, binary or multilabel classification model. 7 | The expected arguments to the fit method should remain the same, although the internal functionality can be tweaked to 8 | use different modeling or preprocessing techniques. This version uses several custom functions for data cleaning and quality-checking, including best practices for factors (categorical) levels. 
The max number of factor levels is 30. 9 | 10 | Inside you will find several commented out methods related to prediction behavior. Uncomment and implement provided methods to modify this behavior from the default. 11 | 12 | Change the target value in the code below to Species to test multilabel classification. 13 | 14 | ### To run locally using 'drum' 15 | Paths are relative to `datarobot-user-models` root: 16 | `drum fit --code-dir model_templates/training/r_glm_noncaret_feateng --input tests/testdata/iris.csv --target-type regression --target 'Petal.Width'` 17 | If the command succeeds, your code is ready to be uploaded. 18 | 19 | -------------------------------------------------------------------------------- /custom_inference/r/r_glm_noncaret_feateng/create_pipeline.R: -------------------------------------------------------------------------------- 1 | source("preprocess.R") # not working in DR yet! 2 | source("rmcons.R") 3 | source("rmident.R") 4 | 5 | create_pipeline<-function(X, y, model_type='regression') { 6 | # Clean 7 | X <- rm_ident(X) 8 | X <- rm_cons(X) 9 | X <- preprocess(X) 10 | 11 | # set up dataframe for modeling 12 | train_df <- X 13 | train_df$target <- unlist(y) 14 | if (model_type == 'classification'){ 15 | train_df$target <- as.factor(train_df$target) 16 | } 17 | 18 | # Run the model using builtin glm function 19 | model <- glm(target~.,data=train_df) 20 | return(model) 21 | } 22 | -------------------------------------------------------------------------------- /custom_inference/r/r_glm_noncaret_feateng/custom.R: -------------------------------------------------------------------------------- 1 | init <- function(code_dir) { 2 | # custom init function to load required libraries 3 | library(tidyverse) 4 | library(caret) 5 | library(recipes) 6 | library(e1071) 7 | library(gbm) 8 | source(file.path(code_dir, 'create_pipeline.R')) 9 | } 10 | 11 | fit <- function(X, y, output_dir, class_order=NULL, row_weights=NULL, ...){ 12 | #' User-provided fit method, required for custom training 13 | #' 14 | #' Trains a regression or classification model using gbm (via caret) 15 | #' @param X data.frame - training data to perform fit on 16 | #' @param y data.frame column or array - target data to perform fit on 17 | #' @param output_dir the path to write output. This is the path provided in '--output' parameter of the 18 | #' 'drum fit' command. 19 | #' @param class_order : A two element long list dictating the order of classes which should be used for 20 | #' modeling. Class order will always be passed to fit by DataRobot for classification tasks, 21 | #' and never otherwise. When models predict, they output a likelihood of one class, with a 22 | #' value from 0 to 1. The likelihood of the other class is 1 - this likelihood. Class order 23 | #' dictates that the first element in the list will be the 0 class, and the second will be the 24 | #' 1 class. 25 | #' @param row_weights: An array of non-negative numeric values which can be used to dictate how important 26 | #' a row is. Row weights is only optionally used, and there will be no filtering for which 27 | #' custom models support this. 
There are two situations when values will be passed into 28 | #' row_weights, during smart downsampling and when weights are explicitly provided by the user 29 | #' @param ...: Added for forwards compatibility 30 | #' @return Nothing 31 | 32 | if (!is.null(class_order)){ 33 | model <- create_pipeline(X, y, 'classification') 34 | } else { 35 | model <- create_pipeline(X, y, 'regression') 36 | } 37 | # Save model 38 | model_path <- file.path(output_dir, 'artifact.rds') 39 | saveRDS(model, file = model_path) 40 | } 41 | -------------------------------------------------------------------------------- /custom_inference/r/r_glm_noncaret_feateng/preprocess.R: -------------------------------------------------------------------------------- 1 | #' Provides factor handling + mean imputation for numeric, etc 2 | #' 3 | #' @param train data.frame or similar classes 4 | #' @param test data.frame or similar 5 | #' @param exclude character scalar or vector of any column names to ignore 6 | #' @return None 7 | #' @export 8 | #' 9 | 10 | # Purge / Impute Missing Data --------------------------------------------- 11 | preprocess <- function(train, test, exclude){ 12 | # preprocess() provides factor handling and mean imputation for numeric 13 | 14 | # Coherent handling of factors when unknown new levels may occur 15 | # 16 | # Replaces NA's in numeric values 17 | # 18 | # Character are converted to factor 19 | # 20 | # Integer are converted to numeric 21 | # 22 | # Supports up to 30 factor levels, for > 30 please re-encode as a best practice 23 | # 24 | # Ignores other data classes (date, etc) 25 | 26 | if(missing(exclude)){ 27 | exclude <- c("next_term_retention", "group", "oos_Date", 28 | "Term_Start_Date") 29 | } else{ 30 | if(!class(exclude)=="character"){ 31 | cat("exclude must be a character or character vector") 32 | 33 | } 34 | } 35 | 36 | for(i in 1:length(colnames(train))){ 37 | if(class(train[,i]) %in% c("Date", "POSIXct", "POSIXt")){ 38 | next 39 | } 40 | if("integer" %in% class(train[,i])){ 41 | train[,i] <- as.numeric(train[,i]) 42 | } 43 | if("numeric" %in% class(train[,i])){ 44 | train[,i] <- na.roughfix(train[,i]) 45 | } 46 | if(("factor" %in% class(train[,i])) | 47 | ("character" %in% class(train[,i]))){ 48 | if(colnames(train)[i] %in% exclude){ 49 | next 50 | } else{ 51 | train[,i] <- as.character(train[,i]) 52 | train[1,i] <- NA 53 | train[,i][is.na(train[,i])] <- "Missing" 54 | train[,i] <- as.factor(train[,i]) 55 | } 56 | if(length(levels(train[,i])) > 30){ 57 | print("TOO MANY LEVELS!") 58 | print(colnames(train)[i]) 59 | print(length(levels(train[,i]))) 60 | } 61 | } 62 | } 63 | 64 | test <- test[,colnames(test) %in% colnames(train)] 65 | 66 | cn <- colnames(test) 67 | for(i in 1:length(cn)){ 68 | if("integer" %in% class(test[,i])){ 69 | test[,i] <- as.numeric(test[,i]) 70 | } 71 | if("numeric" %in% class(test[,i])){ 72 | test[,i] <- na.roughfix(test[,i]) 73 | } 74 | if(("factor" %in% class(test[,i])) | 75 | ("character" %in% class(test[,i]))){ 76 | if(colnames(test)[i] %in% exclude){ 77 | next 78 | } else{ 79 | test[,i] <- as.character(test[,i]) 80 | test[1,i] <- NA 81 | test[,i][is.na(test[,i])] <- "Missing" 82 | test[,i] <- as.factor(test[,i]) 83 | } 84 | 85 | if(!identical(levels(test[,i]), 86 | levels(train[,colnames(test)[i]]))){ 87 | l1 <- levels(train[,colnames(test)[i]]) 88 | l2 <- levels(test[,i]) 89 | 90 | test[,i] <- as.character(test[,i]) 91 | test[,i] <- ifelse(test[,i] %in% l1, 92 | as.character(test[,i]), 93 | "Missing") 94 | 95 | if(length(l2) < length(l1) | 96 | 
(length(l2) == length(l1) & 97 | !(identical(levels(as.factor(test[,i])), 98 | levels(train[,colnames(test)[i]])))) 99 | ){ 100 | for(n in 1:length(l1[!l1 %in% l2])){ 101 | test[10+n,i] <- l1[!l1 %in% l2][n] 102 | } 103 | } 104 | 105 | test[,i] <- as.factor(test[,i]) 106 | 107 | if(!identical(levels(test[,i]), 108 | levels(train[,colnames(test)[i]]))){ 109 | print("There is a problem with factor levels!") 110 | print(i) 111 | } 112 | } 113 | 114 | if(length(levels(test[,i])) > 30){ 115 | print(colnames(test)[i]) 116 | print(length(levels(test[,i]))) 117 | } 118 | } 119 | } 120 | 121 | for(i in 1:ncol(train)){ 122 | if(!(identical( 123 | colnames(train)[i], 124 | colnames(test)[i] 125 | ))){ 126 | print(colnames(train)[i]) 127 | print(i) 128 | } 129 | if(!(identical( 130 | class(train)[i], 131 | class(test)[i] 132 | ))){ 133 | print("There is a problem with:") 134 | print(colnames(train)[i]) 135 | print("which is col number:") 136 | print(i) 137 | } 138 | if(!(identical( 139 | levels(train)[i], 140 | levels(test)[i] 141 | ))){ 142 | print("There is a problem with:") 143 | print(colnames(train)[i]) 144 | print(i) 145 | } 146 | } 147 | if(sum(is.na(test)) > 0){print("!!!WARNING!!! THERE IS A PROBLEM WITH MISSING DATA IN test")} 148 | } 149 | -------------------------------------------------------------------------------- /custom_inference/r/r_glm_noncaret_feateng/rmcons.R: -------------------------------------------------------------------------------- 1 | #' Removes constant columns 2 | #' 3 | #' @param x data.frame or similar classes 4 | #' 5 | #' @return None 6 | #' @export 7 | #' 8 | #' 9 | rm_cons <- function(x){ 10 | cat("\n## Removing the constant features.\n") 11 | for (f in names(x)) { 12 | if (length(unique(x[[f]])) == 1) { 13 | cat(f, "is constant. It is has been deleted.\n") 14 | x[[f]] <- NULL 15 | } 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /custom_inference/r/r_glm_noncaret_feateng/rmident.R: -------------------------------------------------------------------------------- 1 | 2 | #' Removing identical features 3 | #' 4 | #' @param train required. data.frame or similar classes 5 | #' @param test optional. data.frame or similar classes 6 | #' 7 | #' @return None 8 | #' @export 9 | #' 10 | #' 11 | 12 | 13 | 14 | rm_ident <- function(train, test=NULL) { 15 | features_pair <- combn(names(train), 2, simplify = F) 16 | toRemove <- c() 17 | for(pair in features_pair) { 18 | f1 <- pair[1] 19 | f2 <- pair[2] 20 | 21 | if (!(f1 %in% toRemove) & !(f2 %in% toRemove)) { 22 | if (all(train[[f1]] == train[[f2]])) { 23 | cat(f1, "and", f2, "are equals.\n") 24 | toRemove <- c(toRemove, f2) 25 | } 26 | } 27 | } 28 | train <- train[,!colnames(train) %in% toRemove] 29 | if(!missing(test)){ 30 | test <- test[,!colnames(test) %in% toRemove] 31 | } 32 | } 33 | 34 | -------------------------------------------------------------------------------- /custom_inference/r/r_glm_noncaret_gamma/README.md: -------------------------------------------------------------------------------- 1 | ## R Fit Template 2 | 3 | The custom.R template includes a basic fit and init method that can be used to train a regression, binary or multilabel classification model. 4 | The expected arguments to the fit method should remain the same, although the internal functionality can be tweaked to 5 | use different modeling or preprocessing techniques. 6 | 7 | Inside you will find several commented out methods related to prediction behavior. 
Uncomment and implement provided methods to modify this behavior from the default. 8 | 9 | Change the target value in the code below to Species to test multilabel classification. 10 | 11 | ### To run locally using 'drum' 12 | Paths are relative to `datarobot-user-models` root: 13 | `drum fit --code-dir model_templates/training/r_glm_noncaret_gamma --input tests/testdata/iris.csv --target-type regression --target 'Petal.Width'` 14 | If the command succeeds, your code is ready to be uploaded. 15 | 16 | -------------------------------------------------------------------------------- /custom_inference/r/r_glm_noncaret_gamma/create_pipeline.R: -------------------------------------------------------------------------------- 1 | create_pipeline<-function(X, y, model_type='regression') { 2 | 3 | # set up data.frame for modeling 4 | train_df <- X 5 | train_df$target <- unlist(y) 6 | if (model_type == 'classification'){ 7 | train_df$target <- as.factor(train_df$target) 8 | } 9 | 10 | 11 | # Run a logistic regression using builtin glm 12 | model <- glm(target~., data=train_df, family = Gamma) 13 | return(model) 14 | } 15 | -------------------------------------------------------------------------------- /custom_inference/r/r_glm_noncaret_gamma/custom.R: -------------------------------------------------------------------------------- 1 | init <- function(code_dir) { 2 | # custom init function to load required libraries 3 | library(tidyverse) 4 | library(caret) 5 | library(recipes) 6 | library(e1071) 7 | library(gbm) 8 | source(file.path(code_dir, 'create_pipeline.R')) 9 | } 10 | 11 | fit <- function(X, y, output_dir, class_order=NULL, row_weights=NULL, ...){ 12 | #' User-provided fit method, required for custom training 13 | #' 14 | #' Trains a regression or classification model using gbm (via caret) 15 | #' @param X data.frame - training data to perform fit on 16 | #' @param y data.frame column or array - target data to perform fit on 17 | #' @param output_dir the path to write output. This is the path provided in '--output' parameter of the 18 | #' 'drum fit' command. 19 | #' @param class_order : A two element long list dictating the order of classes which should be used for 20 | #' modeling. Class order will always be passed to fit by DataRobot for classification tasks, 21 | #' and never otherwise. When models predict, they output a likelihood of one class, with a 22 | #' value from 0 to 1. The likelihood of the other class is 1 - this likelihood. Class order 23 | #' dictates that the first element in the list will be the 0 class, and the second will be the 24 | #' 1 class. 25 | #' @param row_weights: An array of non-negative numeric values which can be used to dictate how important 26 | #' a row is. Row weights is only optionally used, and there will be no filtering for which 27 | #' custom models support this. 
There are two situations when values will be passed into 28 | #' row_weights, during smart downsampling and when weights are explicitly provided by the user 29 | #' @param ...: Added for forwards compatibility 30 | #' @return Nothing 31 | 32 | if (!is.null(class_order)){ 33 | model <- create_pipeline(X, y, 'classification') 34 | } else { 35 | model <- create_pipeline(X, y, 'regression') 36 | } 37 | # Save model 38 | model_path <- file.path(output_dir, 'artifact.rds') 39 | saveRDS(model, file = model_path) 40 | } 41 | -------------------------------------------------------------------------------- /custom_inference/r/r_glm_noncaret_logit/README.md: -------------------------------------------------------------------------------- 1 | ## R Fit Template 2 | 3 | The custom.R template includes a basic fit and init method that can be used to train a regression, binary or multilabel classification model. 4 | The expected arguments to the fit method should remain the same, although the internal functionality can be tweaked to 5 | use different modeling or preprocessing techniques. 6 | 7 | Inside you will find several commented out methods related to prediction behavior. Uncomment and implement provided methods to modify this behavior from the default. 8 | 9 | Change the target value in the code below to Species to test multilabel classification. 10 | 11 | ### To run locally using 'drum' 12 | Paths are relative to `datarobot-user-models` root: 13 | `drum fit --code-dir model_templates/training/r_lang --input tests/testdata/iris.csv --target-type regression --target 'Petal.Width'` 14 | If the command succeeds, your code is ready to be uploaded. 15 | 16 | -------------------------------------------------------------------------------- /custom_inference/r/r_glm_noncaret_logit/create_pipeline.R: -------------------------------------------------------------------------------- 1 | create_pipeline<-function(X, y, model_type='regression') { 2 | 3 | # set up data.frame for modeling 4 | train_df <- X 5 | train_df$target <- unlist(y) 6 | if (model_type == 'classification'){ 7 | train_df$target <- as.factor(train_df$target) 8 | } 9 | 10 | 11 | # Run a logistic regression using builtin glm 12 | model <- glm(target~., data=train_df, family = "binomial") 13 | return(model) 14 | } 15 | -------------------------------------------------------------------------------- /custom_inference/r/r_glm_noncaret_logit/custom.R: -------------------------------------------------------------------------------- 1 | init <- function(code_dir) { 2 | # custom init function to load required libraries 3 | library(tidyverse) 4 | library(caret) 5 | library(recipes) 6 | library(e1071) 7 | library(gbm) 8 | source(file.path(code_dir, 'create_pipeline.R')) 9 | } 10 | 11 | fit <- function(X, y, output_dir, class_order=NULL, row_weights=NULL, ...){ 12 | #' User-provided fit method, required for custom training 13 | #' 14 | #' Trains a regression or classification model using gbm (via caret) 15 | #' @param X data.frame - training data to perform fit on 16 | #' @param y data.frame column or array - target data to perform fit on 17 | #' @param output_dir the path to write output. This is the path provided in '--output' parameter of the 18 | #' 'drum fit' command. 19 | #' @param class_order : A two element long list dictating the order of classes which should be used for 20 | #' modeling. Class order will always be passed to fit by DataRobot for classification tasks, 21 | #' and never otherwise. 
When models predict, they output a likelihood of one class, with a 22 | #' value from 0 to 1. The likelihood of the other class is 1 - this likelihood. Class order 23 | #' dictates that the first element in the list will be the 0 class, and the second will be the 24 | #' 1 class. 25 | #' @param row_weights: An array of non-negative numeric values which can be used to dictate how important 26 | #' a row is. Row weights is only optionally used, and there will be no filtering for which 27 | #' custom models support this. There are two situations when values will be passed into 28 | #' row_weights, during smart downsampling and when weights are explicitly provided by the user 29 | #' @param ...: Added for forwards compatibility 30 | #' @return Nothing 31 | 32 | if (!is.null(class_order)){ 33 | model <- create_pipeline(X, y, 'classification') 34 | } else { 35 | model <- create_pipeline(X, y, 'regression') 36 | } 37 | # Save model 38 | model_path <- file.path(output_dir, 'artifact.rds') 39 | saveRDS(model, file = model_path) 40 | } 41 | -------------------------------------------------------------------------------- /custom_inference/r/r_glm_noncaret_recipe/README.md: -------------------------------------------------------------------------------- 1 | ## R Fit Template 2 | 3 | The custom.R template includes a basic fit and init method that can be used to train a regression, binary or multilabel classification model. 4 | The expected arguments to the fit method should remain the same, although the internal functionality can be tweaked to 5 | use different modeling or preprocessing techniques. This version uses the recipe function (recipes library) to perform normalization and check for constant columns. 6 | 7 | Inside you will find several commented out methods related to prediction behavior. Uncomment and implement provided methods to modify this behavior from the default. 8 | 9 | Change the target value in the code below to Species to test multilabel classification. 10 | 11 | ### To run locally using 'drum' 12 | Paths are relative to `datarobot-user-models` root: 13 | `drum fit --code-dir model_templates/training/r_glm_noncaret_recipe --input tests/testdata/iris.csv --target-type regression --target 'Petal.Width'` 14 | If the command succeeds, your code is ready to be uploaded. 15 | 16 | -------------------------------------------------------------------------------- /custom_inference/r/r_glm_noncaret_recipe/create_pipeline.R: -------------------------------------------------------------------------------- 1 | # This is the simplest POC that Composable ML / DRUM can 2 | # work with non-caret R models. 3 | # Here I just take the caret example and replace the 4 | # caret model with a very naive GLM() that uses the default 5 | # settings for family, link, prediction scale, etc. 
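# Note: the recipe assembled below is illustrative only; it is never prep()-ed
# or baked, so the glm() at the end of create_pipeline() is fit on the raw train_df.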
6 | # - Jason 7 | 8 | create_pipeline<-function(X, y, model_type='regression') { 9 | 10 | # set up dataframe for modeling 11 | train_df <- X 12 | train_df$target <- unlist(y) 13 | if (model_type == 'classification'){ 14 | train_df$target <- as.factor(train_df$target) 15 | } 16 | 17 | # set up the modeling pipeline 18 | model_recipe <- recipe(target ~ ., data = train_df) %>% 19 | # Drop constant columns 20 | step_zv(all_predictors()) %>% 21 | 22 | # Numeric preprocessing 23 | step_medianimpute(all_numeric()) %>% 24 | step_normalize(all_numeric(), -all_outcomes()) %>% 25 | 26 | # Categorical preprocessing 27 | step_other(all_nominal(), -all_outcomes()) %>% 28 | step_dummy(all_nominal(), -all_outcomes()) 29 | 30 | # Run the model using the builtin glm function 31 | model <- glm(target~.,data=train_df) 32 | return(model) 33 | } 34 | -------------------------------------------------------------------------------- /custom_inference/r/r_glm_noncaret_recipe/custom.R: -------------------------------------------------------------------------------- 1 | init <- function(code_dir) { 2 | # custom init function to load required libraries 3 | library(tidyverse) 4 | library(caret) 5 | library(recipes) 6 | library(e1071) 7 | library(gbm) 8 | source(file.path(code_dir, 'create_pipeline.R')) 9 | } 10 | 11 | fit <- function(X, y, output_dir, class_order=NULL, row_weights=NULL, ...){ 12 | #' User-provided fit method, required for custom training 13 | #' 14 | #' Trains a regression or classification model using the builtin glm (see create_pipeline.R) 15 | #' @param X data.frame - training data to perform fit on 16 | #' @param y data.frame column or array - target data to perform fit on 17 | #' @param output_dir the path to write output. This is the path provided in '--output' parameter of the 18 | #' 'drum fit' command. 19 | #' @param class_order : A two element long list dictating the order of classes which should be used for 20 | #' modeling. Class order will always be passed to fit by DataRobot for classification tasks, 21 | #' and never otherwise. When models predict, they output a likelihood of one class, with a 22 | #' value from 0 to 1. The likelihood of the other class is 1 - this likelihood. Class order 23 | #' dictates that the first element in the list will be the 0 class, and the second will be the 24 | #' 1 class. 25 | #' @param row_weights: An array of non-negative numeric values which can be used to dictate how important 26 | #' a row is. Row weights is only optionally used, and there will be no filtering for which 27 | #' custom models support this. There are two situations when values will be passed into 28 | #' row_weights, during smart downsampling and when weights are explicitly provided by the user 29 | #' @param ...: Added for forwards compatibility 30 | #' @return Nothing 31 | 32 | if (!is.null(class_order)){ 33 | model <- create_pipeline(X, y, 'classification') 34 | } else { 35 | model <- create_pipeline(X, y, 'regression') 36 | } 37 | # Save model 38 | model_path <- file.path(output_dir, 'artifact.rds') 39 | saveRDS(model, file = model_path) 40 | } 41 | -------------------------------------------------------------------------------- /custom_inference/scala/iris_binary/README.md: -------------------------------------------------------------------------------- 1 | ## Scala Custom Inference 2 | 3 | #### Owner: Tim Whittaker (timothy.whittaker@datarobot.com) 4 | 5 | This custom inference model was written in Scala, using XGBoost4j to predict Iris Species (Binary Version).
The main class, `XGBoostPredictor` inherits the `BasePredictor` class from DRUM. You can see the code [here](src/main/scala/XGBoostPredictor.scala). Training Code was included as well. 6 | 7 | The serialized version of the model is already available, but if you would like to train and save it to `custom-model/xgb-model` run the following from commend line. 8 | 9 | `java -jar custom-model/custom-scala-assembly-0.1.0.jar ./custom-model/xgb-model` 10 | 11 | To run this model with `DRUM` export the following environment variables. 12 | 13 | `export DRUM_JAVA_CUSTOM_CLASS_PATH=/full/path/to/custom-model/custom-scala-assembly-0.1.0.jar` 14 | 15 | `export DRUM_JAVA_CUSTOM_PREDICTOR_CLASS=custom.XGBoostPredictor` 16 | 17 | Now run with DRUM 18 | 19 | `drum score --code-dir ./custom-model --input data/iris_binary_training.csv --target-type binary --positive-class-label 1 --negative-class-label 0` 20 | 21 | ## requirements 22 | 23 | * java 11 24 | 25 | To build the jar youl will need sbt -------------------------------------------------------------------------------- /custom_inference/scala/iris_binary/build.sbt: -------------------------------------------------------------------------------- 1 | name := "custom-scala" 2 | 3 | scalaVersion := "2.12.8" 4 | 5 | version := "0.1.0" 6 | 7 | libraryDependencies += "ml.dmlc" %% "xgboost4j" % "1.4.1" -------------------------------------------------------------------------------- /custom_inference/scala/iris_binary/custom-model/custom-scala-assembly-0.1.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_inference/scala/iris_binary/custom-model/custom-scala-assembly-0.1.0.jar -------------------------------------------------------------------------------- /custom_inference/scala/iris_binary/custom-model/xgb-model/model.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_inference/scala/iris_binary/custom-model/xgb-model/model.bin -------------------------------------------------------------------------------- /custom_inference/scala/iris_binary/data/iris_binary_training.csv: -------------------------------------------------------------------------------- 1 | Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species 2 | 1,5.1,3.5,1.4,0.2,Iris-setosa 3 | 2,4.9,3.0,1.4,0.2,Iris-setosa 4 | 3,4.7,3.2,1.3,0.2,Iris-setosa 5 | 4,4.6,3.1,1.5,0.2,Iris-setosa 6 | 5,5.0,3.6,1.4,0.2,Iris-setosa 7 | 6,5.4,3.9,1.7,0.4,Iris-setosa 8 | 7,4.6,3.4,1.4,0.3,Iris-setosa 9 | 8,5.0,3.4,1.5,0.2,Iris-setosa 10 | 9,4.4,2.9,1.4,0.2,Iris-setosa 11 | 10,4.9,3.1,1.5,0.1,Iris-setosa 12 | 11,5.4,3.7,1.5,0.2,Iris-setosa 13 | 12,4.8,3.4,1.6,0.2,Iris-setosa 14 | 13,4.8,3.0,1.4,0.1,Iris-setosa 15 | 14,4.3,3.0,1.1,0.1,Iris-setosa 16 | 15,5.8,4.0,1.2,0.2,Iris-setosa 17 | 16,5.7,4.4,1.5,0.4,Iris-setosa 18 | 17,5.4,3.9,1.3,0.4,Iris-setosa 19 | 18,5.1,3.5,1.4,0.3,Iris-setosa 20 | 19,5.7,3.8,1.7,0.3,Iris-setosa 21 | 20,5.1,3.8,1.5,0.3,Iris-setosa 22 | 21,5.4,3.4,1.7,0.2,Iris-setosa 23 | 22,5.1,3.7,1.5,0.4,Iris-setosa 24 | 23,4.6,3.6,1.0,0.2,Iris-setosa 25 | 24,5.1,3.3,1.7,0.5,Iris-setosa 26 | 25,4.8,3.4,1.9,0.2,Iris-setosa 27 | 26,5.0,3.0,1.6,0.2,Iris-setosa 28 | 27,5.0,3.4,1.6,0.4,Iris-setosa 29 | 28,5.2,3.5,1.5,0.2,Iris-setosa 30 | 29,5.2,3.4,1.4,0.2,Iris-setosa 31 | 
30,4.7,3.2,1.6,0.2,Iris-setosa 32 | 31,4.8,3.1,1.6,0.2,Iris-setosa 33 | 32,5.4,3.4,1.5,0.4,Iris-setosa 34 | 33,5.2,4.1,1.5,0.1,Iris-setosa 35 | 34,5.5,4.2,1.4,0.2,Iris-setosa 36 | 35,4.9,3.1,1.5,0.1,Iris-setosa 37 | 36,5.0,3.2,1.2,0.2,Iris-setosa 38 | 37,5.5,3.5,1.3,0.2,Iris-setosa 39 | 38,4.9,3.1,1.5,0.1,Iris-setosa 40 | 39,4.4,3.0,1.3,0.2,Iris-setosa 41 | 51,7.0,3.2,4.7,1.4,Iris-versicolor 42 | 52,6.4,3.2,4.5,1.5,Iris-versicolor 43 | 53,6.9,3.1,4.9,1.5,Iris-versicolor 44 | 54,5.5,2.3,4.0,1.3,Iris-versicolor 45 | 55,6.5,2.8,4.6,1.5,Iris-versicolor 46 | 56,5.7,2.8,4.5,1.3,Iris-versicolor 47 | 57,6.3,3.3,4.7,1.6,Iris-versicolor 48 | 58,4.9,2.4,3.3,1.0,Iris-versicolor 49 | 59,6.6,2.9,4.6,1.3,Iris-versicolor 50 | 60,5.2,2.7,3.9,1.4,Iris-versicolor 51 | 61,5.0,2.0,3.5,1.0,Iris-versicolor 52 | 62,5.9,3.0,4.2,1.5,Iris-versicolor 53 | 63,6.0,2.2,4.0,1.0,Iris-versicolor 54 | 64,6.1,2.9,4.7,1.4,Iris-versicolor 55 | 65,5.6,2.9,3.6,1.3,Iris-versicolor 56 | 66,6.7,3.1,4.4,1.4,Iris-versicolor 57 | 67,5.6,3.0,4.5,1.5,Iris-versicolor 58 | 68,5.8,2.7,4.1,1.0,Iris-versicolor 59 | 69,6.2,2.2,4.5,1.5,Iris-versicolor 60 | 70,5.6,2.5,3.9,1.1,Iris-versicolor 61 | 71,5.9,3.2,4.8,1.8,Iris-versicolor 62 | 72,6.1,2.8,4.0,1.3,Iris-versicolor 63 | 73,6.3,2.5,4.9,1.5,Iris-versicolor 64 | 74,6.1,2.8,4.7,1.2,Iris-versicolor 65 | 75,6.4,2.9,4.3,1.3,Iris-versicolor 66 | 76,6.6,3.0,4.4,1.4,Iris-versicolor 67 | 77,6.8,2.8,4.8,1.4,Iris-versicolor 68 | 78,6.7,3.0,5.0,1.7,Iris-versicolor 69 | 79,6.0,2.9,4.5,1.5,Iris-versicolor 70 | 1,5.1,3.5,1.4,0.2,Iris-setosa 71 | 2,4.9,3.0,1.4,0.2,Iris-setosa 72 | 3,4.7,3.2,1.3,0.2,Iris-setosa 73 | 4,4.6,3.1,1.5,0.2,Iris-setosa 74 | 5,5.0,3.6,1.4,0.2,Iris-setosa 75 | 6,5.4,3.9,1.7,0.4,Iris-setosa 76 | 7,4.6,3.4,1.4,0.3,Iris-setosa 77 | 8,5.0,3.4,1.5,0.2,Iris-setosa 78 | 9,4.4,2.9,1.4,0.2,Iris-setosa 79 | 10,4.9,3.1,1.5,0.1,Iris-setosa 80 | 11,5.4,3.7,1.5,0.2,Iris-setosa 81 | 12,4.8,3.4,1.6,0.2,Iris-setosa 82 | 13,4.8,3.0,1.4,0.1,Iris-setosa 83 | 14,4.3,3.0,1.1,0.1,Iris-setosa 84 | 15,5.8,4.0,1.2,0.2,Iris-setosa 85 | 16,5.7,4.4,1.5,0.4,Iris-setosa 86 | 17,5.4,3.9,1.3,0.4,Iris-setosa 87 | 18,5.1,3.5,1.4,0.3,Iris-setosa 88 | 19,5.7,3.8,1.7,0.3,Iris-setosa 89 | 20,5.1,3.8,1.5,0.3,Iris-setosa 90 | 21,5.4,3.4,1.7,0.2,Iris-setosa 91 | 22,5.1,3.7,1.5,0.4,Iris-setosa 92 | 23,4.6,3.6,1.0,0.2,Iris-setosa 93 | 24,5.1,3.3,1.7,0.5,Iris-setosa 94 | 25,4.8,3.4,1.9,0.2,Iris-setosa 95 | 26,5.0,3.0,1.6,0.2,Iris-setosa 96 | 27,5.0,3.4,1.6,0.4,Iris-setosa 97 | 28,5.2,3.5,1.5,0.2,Iris-setosa 98 | 29,5.2,3.4,1.4,0.2,Iris-setosa 99 | 30,4.7,3.2,1.6,0.2,Iris-setosa 100 | 31,4.8,3.1,1.6,0.2,Iris-setosa 101 | 32,5.4,3.4,1.5,0.4,Iris-setosa 102 | 33,5.2,4.1,1.5,0.1,Iris-setosa 103 | 34,5.5,4.2,1.4,0.2,Iris-setosa 104 | 73,6.3,2.5,4.9,1.5,Iris-versicolor 105 | 74,6.1,2.8,4.7,1.2,Iris-versicolor 106 | 75,6.4,2.9,4.3,1.3,Iris-versicolor 107 | 76,6.6,3.0,4.4,1.4,Iris-versicolor 108 | 77,6.8,2.8,4.8,1.4,Iris-versicolor 109 | 78,6.7,3.0,5.0,1.7,Iris-versicolor 110 | 79,6.0,2.9,4.5,1.5,Iris-versicolor 111 | -------------------------------------------------------------------------------- /custom_inference/scala/iris_binary/lib/predictors.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_inference/scala/iris_binary/lib/predictors.jar -------------------------------------------------------------------------------- /custom_inference/scala/iris_binary/project/build.properties: 
-------------------------------------------------------------------------------- 1 | sbt.version=1.3.0 2 | -------------------------------------------------------------------------------- /custom_inference/scala/iris_binary/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.10") 2 | 3 | 4 | 5 | // addSbtPlugin("com.github.nyavro" % "sbt-spi-plugin" % "1.0.4") -------------------------------------------------------------------------------- /custom_inference/scala/iris_binary/src/main/scala/Main.scala: -------------------------------------------------------------------------------- 1 | import ml.dmlc.xgboost4j.scala.DMatrix 2 | import ml.dmlc.xgboost4j.scala.Booster; 3 | import ml.dmlc.xgboost4j.scala.XGBoost; 4 | import ml.dmlc.xgboost4j.LabeledPoint 5 | import java.io._ 6 | import java.nio.file.Paths 7 | 8 | object TrainXGB extends App { 9 | 10 | val dir = new File(args(0)) 11 | dir.exists match { 12 | case true => null 13 | case false => dir.mkdirs 14 | } 15 | 16 | val data = scala.io.Source 17 | .fromFile( 18 | "data/iris_binary_training.csv" 19 | ) 20 | .getLines 21 | val positiveClassLabel = "Iris-versicolor" 22 | val negativeClassLabel = "Iris-setosa" 23 | val headers = data.next 24 | val nullArray = null.asInstanceOf[Array[Int]] 25 | val dataIter = data.map { row => 26 | val d = row.split(",") 27 | val len = d.length - 1 28 | val (features, label) = d.splitAt(len) 29 | val label_bin = label.apply(0) match { 30 | case "Iris-setosa" => 0f 31 | case "Iris-versicolor" => 1f 32 | case _ => throw new Exception("not set for multiclass") 33 | } 34 | LabeledPoint( 35 | label = label_bin, 36 | len - 1, 37 | indices = nullArray, 38 | values = features.map { _.toFloat }.tail 39 | ) 40 | } 41 | 42 | val dmatrix = new DMatrix(dataIter.toIterator, cacheInfo = null) 43 | val paramMap = List( 44 | "eta" -> 0.1, 45 | "max_depth" -> 5, 46 | "objective" -> "binary:logistic", 47 | "verbosity" -> 1 48 | ).toMap 49 | // number of iterations 50 | val round = 100 51 | val booster = 52 | XGBoost.train(dmatrix, paramMap, round, earlyStoppingRound = 200) 53 | 54 | val modelPath = Paths.get(dir.toString, "model.bin").toFile 55 | booster.saveModel(modelPath.toString) 56 | 57 | } 58 | -------------------------------------------------------------------------------- /custom_inference/scala/iris_binary/src/main/scala/XGBoostPredictor.scala: -------------------------------------------------------------------------------- 1 | package custom 2 | 3 | import com.datarobot.drum._ 4 | import collection.JavaConverters._ 5 | import ml.dmlc.xgboost4j.scala.DMatrix 6 | import ml.dmlc.xgboost4j.scala.Booster; 7 | import ml.dmlc.xgboost4j.scala.XGBoost; 8 | import ml.dmlc.xgboost4j.LabeledPoint 9 | 10 | import org.apache.commons.csv.CSVFormat; 11 | 12 | import java.io.{BufferedReader, ByteArrayInputStream, InputStreamReader} 13 | 14 | import java.util.HashMap 15 | 16 | import util.{Try, Success, Failure} 17 | import java.nio.file.Paths 18 | 19 | 20 | class XGBoostPredictor(name: String) extends BasePredictor(name) 21 | { 22 | 23 | var customModelPath: String = null 24 | var negativeClassLabel: String = null 25 | var positiveClassLabel: String = null 26 | var booster: Booster = null 27 | val features = Array("SepalLengthCm","SepalWidthCm","PetalLengthCm","PetalWidthCm") 28 | val numFeatures = features.length 29 | 30 | override def configure( 31 | params: java.util.Map[String, AnyRef] = new java.util.HashMap[String, AnyRef]() 
32 | ) = { 33 | customModelPath = params.get("__custom_model_path__").asInstanceOf[String] 34 | negativeClassLabel = params.get("negativeClassLabel").asInstanceOf[String] 35 | positiveClassLabel = params.get("positiveClassLabel").asInstanceOf[String] 36 | val modelPath = Paths.get(customModelPath, "xgb-model", "model.bin").toFile 37 | modelPath exists match { 38 | case false => throw new Exception(s"${modelPath} does not exist") 39 | case true => null 40 | } 41 | booster = XGBoost.loadModel(modelPath.toString) 42 | } 43 | override def predict(inputBytes: Array[Byte]): String = { 44 | val reader = new BufferedReader(new InputStreamReader(new ByteArrayInputStream(inputBytes))) 45 | val csvFormat = CSVFormat.DEFAULT.withFirstRecordAsHeader; 46 | val parser = csvFormat.parse(reader) 47 | val sParser = parser.iterator.asScala.map { _.toMap } 48 | val dataIter = sParser.map{ row => 49 | val rs = row.asScala.filter{ case(k,v) => features.contains(k)}.map{ _._2} 50 | LabeledPoint(0f, numFeatures, null, rs.toArray.map{_.toFloat}) 51 | }.toIterator 52 | val dmatrix = new DMatrix(dataIter) 53 | val predictions = booster.predict(dmatrix).map{ p => 54 | val p1 = p(0) 55 | val p0 = 1 - p1 56 | s"${p0},${p1}" 57 | } 58 | predictions.mkString(s"${negativeClassLabel},${positiveClassLabel}\n", "\n", "") 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /custom_tasks/models/classification/python/README.md: -------------------------------------------------------------------------------- 1 | ## Custom Tasks for Classification with Python 2 | 3 | This directory contains examples of Custom Tasks for Classification written in Python. This examples can work with DataRobot Composable ML 4 | -------------------------------------------------------------------------------- /custom_tasks/models/classification/python/catboost/catboost_pipeline.py: -------------------------------------------------------------------------------- 1 | 2 | from sklearn.pipeline import Pipeline 3 | from sklearn.compose import ColumnTransformer 4 | from sklearn.impute import SimpleImputer 5 | import numpy as np 6 | import pandas as pd 7 | from catboost import CatBoostClassifier 8 | from typing import List, Optional 9 | from feature_selection import DataSelector 10 | 11 | 12 | class CatBoostClassifier_wrapper: 13 | """ 14 | A wrapper is not required in typical cases, but is valuable for catboost 15 | it allows to automatically identify and pass categorical/text features 16 | to CatBoostClassifier.fit while working with DR custom tasks logic 17 | """ 18 | 19 | model = None 20 | 21 | def model(self): 22 | return self.model 23 | 24 | def fit(self, X, y): 25 | data = pd.DataFrame(X, columns=map(str, range(len(X[0])))) 26 | cat_features = DataSelector.CatSelector(data) 27 | text_features = DataSelector.TxtSelector(data) 28 | self.model = CatBoostClassifier( 29 | allow_writing_files=False, 30 | #train_dir="catboost_info", 31 | iterations=50 32 | ).fit( 33 | X, y, cat_features=cat_features, text_features=text_features 34 | ) 35 | return self.model 36 | 37 | def predict_proba(self, data: pd.DataFrame): 38 | return pd.DataFrame( 39 | data=self.model.predict_proba(data), columns=self.model.classes_ 40 | ) 41 | 42 | 43 | def catboost_pipeline(X): 44 | catboost_preprocessing = ColumnTransformer( 45 | transformers=[ 46 | ("num", "passthrough", DataSelector.NumSelector), 47 | ( 48 | "cat", 49 | SimpleImputer(strategy="constant", fill_value=""), 50 | DataSelector.CatSelector, 51 | ), 52 | # ("txt", 
SimpleImputer(strategy="constant", fill_value=""), DataSelector.TxtSelector), 53 | ], 54 | remainder="drop", 55 | ) 56 | 57 | return Pipeline( 58 | steps=[ 59 | ("preprocessing", catboost_preprocessing), 60 | ("model", CatBoostClassifier_wrapper()), 61 | ] 62 | ) 63 | -------------------------------------------------------------------------------- /custom_tasks/models/classification/python/catboost/custom.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | import pickle 3 | import pandas as pd 4 | import numpy as np 5 | from pathlib import Path 6 | from sklearn.preprocessing import LabelEncoder 7 | from catboost_pipeline import catboost_pipeline 8 | 9 | 10 | def fit( 11 | X: pd.DataFrame, 12 | y: pd.Series, 13 | output_dir: str, 14 | class_order: Optional[List[str]] = None, 15 | row_weights: Optional[np.ndarray] = None, 16 | **kwargs, 17 | ) -> None: 18 | 19 | 20 | estimator = catboost_pipeline(X) 21 | estimator.fit(X, y) 22 | 23 | 24 | output_dir_path = Path(output_dir) 25 | if output_dir_path.exists() and output_dir_path.is_dir(): 26 | with open("{}/artifact.pkl".format(output_dir), "wb") as fp: 27 | pickle.dump(estimator, fp) 28 | 29 | 30 | def score(data: pd.DataFrame, model, **kwargs) -> pd.DataFrame: 31 | return model.predict_proba(data) 32 | -------------------------------------------------------------------------------- /custom_tasks/models/classification/python/catboost/feature_selection.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | def is_numeric(x: pd.Series): 5 | try: 6 | sum(x) 7 | return True 8 | except: 9 | return False 10 | 11 | 12 | # Helper function to use in text_selector 13 | def is_text(x: pd.Series): 14 | """ 15 | Decide if a pandas series is text, using a very simple heuristic: 16 | - Count the number of elements in the series that contain 1 or more whitespace character 17 | - If >75% of the elements have whitespace and # unique / # all values >0.1, the Series is text. 
18 | - If # unique / # all values >0.8 then the series is text 19 | otherwise - non-text 20 | Parameters 21 | ---------- 22 | x: Series to be analyzed for text 23 | Returns 24 | ------- 25 | boolean: True for is text, False for not text 26 | """ 27 | if pd.api.types.is_string_dtype(x): 28 | x_values = x.dropna() 29 | pct_rows_with_whitespace = (x_values.str.count(r"\s") > 0).sum() / x_values.shape[0] 30 | pct_unique = float(x_values.unique().shape[0]) / x_values.shape[0] 31 | if pct_unique > 0.8: 32 | return True 33 | if pct_rows_with_whitespace > 0.75 and pct_unique > 0.1: 34 | return True 35 | return False 36 | 37 | def is_datetime(x: pd.Series): 38 | if x.dtype != np.object: 39 | return False 40 | 41 | try: 42 | pd.to_datetime(x) 43 | return True 44 | except: 45 | return False 46 | 47 | 48 | def get_columns_by_type(X: pd.DataFrame): 49 | """" 50 | Creates a dictionary with a list of features for each data type 51 | """ 52 | data = X.copy() 53 | dict = {} 54 | dict["num"] = data.columns[list(data.apply(is_numeric, result_type="expand"))].tolist() 55 | data.drop(dict["num"], axis=1, inplace=True) 56 | 57 | dict["txt"] = data.columns[list(data.apply(is_text, result_type="expand"))].tolist() 58 | data.drop(dict["txt"], axis=1, inplace=True) 59 | 60 | dict["dat"] = data.columns[list(data.apply(is_datetime, result_type="expand"))].tolist() 61 | data.drop(dict["dat"], axis=1, inplace=True) 62 | 63 | dict["cat"] = data.columns.tolist() 64 | return dict 65 | 66 | 67 | 68 | class DataSelector(): 69 | """ 70 | Valueable for catboost 71 | Each method returns a list of column indices for a specific data type 72 | """ 73 | 74 | def NumSelector(X: pd.DataFrame): 75 | return [X.columns.get_loc(c) for c in get_columns_by_type(X)['num']] 76 | 77 | def CatSelector(X: pd.DataFrame): 78 | return [X.columns.get_loc(c) for c in get_columns_by_type(X)['cat']] 79 | 80 | def TxtSelector(X: pd.DataFrame): 81 | return [X.columns.get_loc(c) for c in get_columns_by_type(X)['txt']] 82 | 83 | 84 | -------------------------------------------------------------------------------- /custom_tasks/models/classification/python/catboost/requirements.txt: -------------------------------------------------------------------------------- 1 | catboost==0.24.4 2 | -------------------------------------------------------------------------------- /custom_tasks/models/classification/python/graph_isomorphism_network/README.md: -------------------------------------------------------------------------------- 1 | ## Custom Task 2 | 3 | #### Owner: Tim Whittaker (timothy.whittaker@datarobot.com) 4 | 5 | Please see the associated [notebook](custom_tasks/models/classification/python/graph_isomorphism_network/GNN_Custom_Task.ipynb) in this directory for details on this example. 
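The `custom_task_gin/custom.py` shown further below expects the training data to carry a `dgl_graph` column whose cells hold the `repr()` of pickled DGL graphs; it recovers them with `pickle.loads(eval(x))`. The sketch below illustrates that serialization convention only; the toy graphs, labels, and the `imdb_graphs.csv` file name are placeholders rather than code or data from this repository.

```python
# Hypothetical helper: write a CSV whose "dgl_graph" column matches the
# convention that custom_task_gin/custom.py reads back with pickle.loads(eval(cell)).
import pickle

import dgl
import pandas as pd
import torch

# Two toy graphs (edge lists) standing in for the IMDB graphs.
graphs = [
    dgl.graph((torch.tensor([0, 1, 2]), torch.tensor([1, 2, 0]))),
    dgl.graph((torch.tensor([0, 0, 1]), torch.tensor([1, 2, 2]))),
]
labels = [0, 1]

df = pd.DataFrame(
    {
        # repr() of the pickle bytes round-trips through eval() on the scoring side.
        "dgl_graph": [repr(pickle.dumps(g)) for g in graphs],
        "label": labels,
    }
)
df.to_csv("imdb_graphs.csv", index=False)  # placeholder file name
```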
-------------------------------------------------------------------------------- /custom_tasks/models/classification/python/graph_isomorphism_network/custom_task_gin/custom.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | from pathlib import Path 4 | import pickle 5 | import dgl 6 | import torch 7 | from graph_isomorphism_network import * 8 | from torch.utils.data import DataLoader 9 | 10 | # def init(code_dir): 11 | 12 | def collate(samples): 13 | graphs, labels = map(list, zip(*samples)) 14 | batched_graph = dgl.batch(graphs) 15 | batched_labels = torch.tensor(labels) 16 | return batched_graph, batched_labels 17 | 18 | 19 | 20 | def load_model(code_dir): 21 | model = GIN(2, 2, 1, 20, 2, 0, 0.01, "sum", "sum") 22 | model.load_state_dict(torch.load(os.path.join(code_dir, "gin_model.h5"))) 23 | return model 24 | 25 | def fit(X, y, output_dir, **kwargs): 26 | 27 | model = GIN(2, 2, 1, 20, 2, 0, 0.01, "sum", "sum") 28 | dgl_graphs = X["dgl_graph"].values 29 | dgl_graphs = list( map ( lambda x: pickle.loads(eval(x)), dgl_graphs)) 30 | 31 | dataset = [] 32 | for g, l in zip(dgl_graphs, y.values): 33 | num_nodes = g.num_nodes() 34 | g.ndata["attr"] = torch.ones(g.num_nodes(), 1) 35 | g.ndata["label"] = torch.ones(num_nodes, ) if l == 1 else torch.zeros(num_nodes, ) 36 | dataset.append((g, torch.tensor(l))) 37 | 38 | 39 | dataloader = DataLoader(dataset,batch_size=1024,collate_fn=collate,drop_last=False,shuffle=True) 40 | 41 | opt = torch.optim.Adam(model.parameters(),lr=0.01) 42 | 43 | for epoch in range(500): 44 | for batched_graph, label in dataloader: 45 | feats = batched_graph.ndata['attr'].float() 46 | logits = model(batched_graph, feats) 47 | loss = F.cross_entropy(logits, label) 48 | # print(loss) 49 | opt.zero_grad() 50 | loss.backward() 51 | opt.step() 52 | if epoch % 100 == 0: 53 | print('Epoch %d | Loss: %.4f' % (epoch, loss.item())) 54 | 55 | output_dir_path = Path(output_dir) 56 | if output_dir_path.exists() and output_dir_path.is_dir(): 57 | torch.save(model.state_dict(), "{}/gin_model.h5".format(output_dir)) 58 | 59 | def score(data, model, **kwargs): 60 | dgl_graphs = data["dgl_graph"].values 61 | pos_class = kwargs["positive_class_label"] 62 | neg_class = kwargs["negative_class_label"] 63 | dgl_graphs = list( map ( lambda x: pickle.loads(eval(x)), dgl_graphs)) 64 | for g in dgl_graphs: 65 | g.ndata["attr"] = torch.ones(g.num_nodes(), 1) 66 | batched_graph = dgl.batch(dgl_graphs) 67 | feats = batched_graph.ndata['attr'].float() 68 | preds = F.softmax(model(batched_graph, feats), dim=1).detach().numpy() 69 | return pd.DataFrame(preds, columns = [neg_class, pos_class]) 70 | -------------------------------------------------------------------------------- /custom_tasks/models/classification/python/graph_isomorphism_network/custom_task_gin/requirements.txt: -------------------------------------------------------------------------------- 1 | networkx==2.5 2 | dgl==0.5.2 3 | datarobot-drum -------------------------------------------------------------------------------- /custom_tasks/models/classification/python/graph_isomorphism_network/env/Dockerfile: -------------------------------------------------------------------------------- 1 | # This is the default base image for use with user models and workflows. 2 | # It contains a variety of common useful data-science packages and tools. 3 | FROM datarobot/python3-dropin-env-base 4 | 5 | # Install the list of core requirements, e.g. sklearn, numpy, pandas, flask. 
6 | # **Don't modify this file!** 7 | COPY dr_requirements.txt dr_requirements.txt 8 | 9 | # '--upgrade-strategy eager' will upgrade installed dependencies 10 | # according to package requirements or to the latest 11 | RUN pip3 install -U --upgrade-strategy eager --no-cache-dir --prefer-binary -r dr_requirements.txt && \ 12 | rm -rf dr_requirements.txt 13 | 14 | # Install the list of custom Python requirements, e.g. keras, xgboost, etc. 15 | COPY requirements.txt requirements.txt 16 | RUN pip3 install -r requirements.txt --no-cache-dir && \ 17 | rm -rf requirements.txt 18 | 19 | RUN mkdir -p /opt/.dgl && chmod -R 777 /opt/.dgl 20 | 21 | # Copy the drop-in environment code into the correct directory 22 | # Code from the custom model tarball can overwrite the code here 23 | ENV HOME=/opt CODE_DIR=/opt/code ADDRESS=0.0.0.0:8080 24 | WORKDIR ${CODE_DIR} 25 | COPY ./ ${CODE_DIR} 26 | 27 | ENV WITH_ERROR_SERVER=1 28 | # Uncomment the following line to switch from Flask to uwsgi server 29 | #ENV PRODUCTION=1 MAX_WORKERS=1 SHOW_STACKTRACE=1 30 | 31 | ENTRYPOINT ["${CODE_DIR}/start_server.sh"] 32 | -------------------------------------------------------------------------------- /custom_tasks/models/classification/python/graph_isomorphism_network/env/README.md: -------------------------------------------------------------------------------- 1 | # Python 3 PyTorch Drop-In Template Environment 2 | 3 | This template environment can be used to create artifact-only PyTorch custom models. 4 | Your custom model directory needs only contain your model artifact if you use the 5 | environment correctly. 6 | 7 | ## Supported Libraries 8 | 9 | This environment has built for python 3 and has support for the following scientific libraries. 10 | For specific version information, see [requirements](requirements.txt). 11 | 12 | - PyTorch 13 | 14 | ## Instructions 15 | 16 | 1. From the terminal, run `tar -czvf py_dropin.tar.gz -C /path/to/public_dropin_environments/python3_pytorch/ .` 17 | 2. Using either the API or from the UI create a new Custom Environment with the tarball created 18 | in step 1. 19 | 20 | ### Creating models for this environment 21 | 22 | To use this environment, your custom model archive must contain a single serialized model artifact 23 | with `.pth` file extension as well as any other custom code needed to use your serialized model, including 24 | the file that defines your torch network. 25 | 26 | 27 | This environment makes the following assumption about your serialized model: 28 | - The data sent to custom model can be used to make predictions without 29 | additional pre-processing 30 | - Regression models return a single floating point per row of prediction data 31 | - Binary classification models return one floating point value <= 1.0 or two floating point values that sum to 1.0 per row of prediction data. 32 | - Single value output is assumed to be the positive class probability 33 | - Multi value it is assumed that the first value is the negative class probability, the second is the positive class probability 34 | - There is a single .pth file present 35 | 36 | If these assumptions are incorrect for your model, you should make a copy of [custom.py](https://github.com/datarobot/datarobot-user-models/blob/master/model_templates/python3_pytorch/custom.py), modify it as needed, and include in your custom model archive. 
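If you do need to override the defaults, a minimal sketch of such a `custom.py` is shown below. It is only an illustration under stated assumptions: the artifact name `artifact.pth`, the placeholder one-layer network, and the fill-missing-values step are not part of this environment and should be replaced with whatever matches your own model.

```python
# Hypothetical custom.py override; swap in the real network definition and
# preprocessing used when the .pth artifact was created.
import os

import pandas as pd
import torch


def load_model(code_dir):
    # Rebuild the training-time architecture, then load the serialized weights.
    model = torch.nn.Sequential(torch.nn.Linear(4, 1))  # placeholder architecture
    model.load_state_dict(torch.load(os.path.join(code_dir, "artifact.pth")))
    model.eval()
    return model


def score(data: pd.DataFrame, model, **kwargs):
    # Example of extra pre-processing that the default hooks would not do for you.
    features = torch.tensor(data.fillna(0).values, dtype=torch.float32)
    with torch.no_grad():
        positive = torch.sigmoid(model(features)).numpy().ravel()
    return pd.DataFrame(
        {
            kwargs["negative_class_label"]: 1.0 - positive,
            kwargs["positive_class_label"]: positive,
        }
    )
```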
37 | 38 | The structure of your custom model archive should look like: 39 | 40 | - custom_model.tar.gz 41 | - artifact.pth 42 | - custom.py (if needed) 43 | 44 | Please read [datarobot-cmrunner](https://github.com/datarobot/datarobot-user-models/blob/master/custom_model_runner/README.md) documentation on how to assemble **custom.py**. 45 | -------------------------------------------------------------------------------- /custom_tasks/models/classification/python/graph_isomorphism_network/env/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_tasks/models/classification/python/graph_isomorphism_network/env/__init__.py -------------------------------------------------------------------------------- /custom_tasks/models/classification/python/graph_isomorphism_network/env/dr_requirements.txt: -------------------------------------------------------------------------------- 1 | pyarrow==0.14.1 2 | datarobot-drum==1.5.7 3 | -------------------------------------------------------------------------------- /custom_tasks/models/classification/python/graph_isomorphism_network/env/env_info.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "5e8c888007389fe0f466c72b", 3 | "name": "[DataRobot] Python 3 PyTorch Drop-In", 4 | "description": "This template environment can be used to create artifact-only PyTorch custom models. This environment contains PyTorch and requires only your model artifact as a .pth file, any other code needed to deserialize your model, and optionally a custom.py file.", 5 | "programmingLanguage": "python", 6 | "environmentVersionId": "60ee9b889f5641ab36ae5823", 7 | "isPublic": true 8 | } 9 | -------------------------------------------------------------------------------- /custom_tasks/models/classification/python/graph_isomorphism_network/env/fit.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | # You probably don't want to modify this file 3 | cd "${CODEPATH}" || exit 1 4 | export PYTHONPATH="${CODEPATH}":"${PYTHONPATH}" 5 | 6 | export X="${INPUT_DIRECTORY}/X${TRAINING_DATA_EXTENSION:-.csv}" 7 | export weights="${INPUT_DIRECTORY}/weights.csv" 8 | export sparse_colnames="${INPUT_DIRECTORY}/X.colnames" 9 | export parameters="${INPUT_DIRECTORY}/parameters.json" 10 | 11 | CMD="drum fit --target-type ${TARGET_TYPE} --input ${X} --num-rows ALL --output ${ARTIFACT_DIRECTORY} \ 12 | --code-dir ${CODEPATH} --verbose" 13 | 14 | if [ "${TARGET_TYPE}" != "anomaly" ]; then 15 | CMD="${CMD} --target-csv ${INPUT_DIRECTORY}/y.csv" 16 | fi 17 | 18 | if [ -f "${weights}" ]; then 19 | CMD="${CMD} --row-weights-csv ${weights}" 20 | fi 21 | 22 | if [ -f "${sparse_colnames}" ]; then 23 | CMD="${CMD} --sparse-column-file ${sparse_colnames}" 24 | fi 25 | 26 | if [ -f "${parameters}" ]; then 27 | CMD="${CMD} --parameter-file ${parameters}" 28 | fi 29 | 30 | echo "Environment variables:" 31 | env 32 | echo "${CMD}" 33 | sh -c "${CMD}" 34 | -------------------------------------------------------------------------------- /custom_tasks/models/classification/python/graph_isomorphism_network/env/requirements.txt: -------------------------------------------------------------------------------- 1 | torch==1.2.0 2 | numpy==1.19.5 3 | pandas==1.0.5 4 | scikit-learn==0.23.1 5 | sagemaker-scikit-learn-extension==1.1.0 
-------------------------------------------------------------------------------- /custom_tasks/models/classification/python/graph_isomorphism_network/env/start_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | echo "Starting Custom Model environment with DRUM prediction server" 3 | echo "Environment variables:" 4 | env 5 | echo 6 | 7 | CMD="drum server $@" 8 | echo "Executing command: ${CMD}" 9 | echo 10 | exec ${CMD} 11 | -------------------------------------------------------------------------------- /custom_tasks/models/classification/r/README.md: -------------------------------------------------------------------------------- 1 | ## Custom Tasks for Classification with R 2 | 3 | This directory contains examples of Custom Tasks for Classification written in R. These examples can work with DataRobot Composable ML -------------------------------------------------------------------------------- /custom_tasks/models/regression/python/README.md: -------------------------------------------------------------------------------- 1 | ## Custom Tasks for Regression with Python 2 | 3 | This directory contains examples of Custom Tasks for Regression written in Python. These examples can work with DataRobot Composable ML 4 | -------------------------------------------------------------------------------- /custom_tasks/models/regression/r/README.md: -------------------------------------------------------------------------------- 1 | README 2 | -------------------------------------------------------------------------------- /custom_tasks/models/unsupervised/python/README.md: -------------------------------------------------------------------------------- 1 | ## Custom Tasks for Unsupervised Models with Python 2 | 3 | This directory contains examples of Custom Tasks for Unsupervised Models written in Python. These examples can work with DataRobot Composable ML 4 | -------------------------------------------------------------------------------- /custom_tasks/models/unsupervised/r/README.md: -------------------------------------------------------------------------------- 1 | ## Custom Tasks for Unsupervised Models with R 2 | 3 | This directory contains examples of Custom Tasks for Unsupervised Models written in R. These examples can work with DataRobot Composable ML 4 | -------------------------------------------------------------------------------- /custom_tasks/other/README.md: -------------------------------------------------------------------------------- 1 | ## Other Custom Tasks 2 | 3 | This directory contains examples of Other Custom Tasks.
These examples can work with DataRobot Composable ML 4 | -------------------------------------------------------------------------------- /custom_tasks/other/python/README.md: -------------------------------------------------------------------------------- 1 | README 2 | -------------------------------------------------------------------------------- /custom_tasks/other/python/round_predictions/custom.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | import pickle 3 | import pandas as pd 4 | import numpy as np 5 | from pathlib import Path 6 | from sklearn.pipeline import Pipeline 7 | 8 | 9 | def fit( 10 | X: pd.DataFrame, 11 | y: pd.Series, 12 | output_dir: str, 13 | class_order: Optional[List[str]] = None, 14 | row_weights: Optional[np.ndarray] = None, 15 | **kwargs, 16 | ) -> None: 17 | 18 | estimator = pipeline(X) 19 | estimator.fit(X, y) 20 | 21 | output_dir_path = Path(output_dir) 22 | if output_dir_path.exists() and output_dir_path.is_dir(): 23 | with open("{}/artifact.pkl".format(output_dir), "wb") as fp: 24 | pickle.dump(estimator, fp) 25 | 26 | 27 | class RoundInput(): 28 | """ 29 | Goal is to round the output of a prior model, so using those unrounded 30 | predictions as inputs here. 31 | """ 32 | 33 | def __init__(self, X): 34 | self.X = X 35 | 36 | def fit(self, X, y=None, **kwargs): 37 | self.X = round(X) 38 | return self 39 | 40 | def transform(self, X): 41 | return np.array(round(X[X.columns[0]])).reshape(-1, 1) 42 | 43 | 44 | class EmptyEstimator(): 45 | """ 46 | This is empty because the rounding is done in the above step of the pipeline. 47 | Still need this for the pipeline to run though. 48 | """ 49 | 50 | def fit(self, X, y): 51 | return self 52 | 53 | def predict(self, data: pd.DataFrame): 54 | return data[:,0] 55 | 56 | 57 | def pipeline(X): 58 | return Pipeline(steps=[("preprocessing", RoundInput(X)), ("model", EmptyEstimator())]) 59 | 60 | 61 | def score(data: pd.DataFrame, model, **kwargs) -> pd.DataFrame: 62 | return pd.DataFrame(data=model.predict(data), columns = ['Predictions']) 63 | -------------------------------------------------------------------------------- /custom_tasks/other/r/README.md: -------------------------------------------------------------------------------- 1 | README 2 | -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/any_target/all_enc_catboost/README.md: -------------------------------------------------------------------------------- 1 | # Catboost Target Encoding 2 | 3 | ### Overview 4 | ``` 5 | [ ] accepts numeric inputs 6 | [x] accepts categorical inputs 7 | [ ] outputs missing values 8 | [x] works for binary targets (tested in DataRobot and confirmed) 9 | [?] 
works for multiclass targets 10 | [x] works for numeric targets (tested in DataRobot and confirmed) 11 | ``` 12 | ### Description 13 | 14 | tbd 15 | 16 | ### References 17 | 18 | https://contrib.scikit-learn.org/category_encoders/catboost.html# 19 | 20 | https://tech.yandex.com/catboost/doc/dg/concepts/algorithm-main-stages_cat-to-numberic-docpage/ 21 | 22 | https://arxiv.org/abs/1706.09516 -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/any_target/all_enc_catboost/custom.py: -------------------------------------------------------------------------------- 1 | import category_encoders as ce 2 | import pandas as pd 3 | from pathlib import Path 4 | import pickle 5 | 6 | 7 | def fit(X, y, output_dir, **kwargs): 8 | """ This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data 9 | DataRobot runs this hook when the task is being trained inside a blueprint. 10 | As an output, this hook is expected to create an artifact containg a trained object [in this example - a pre-fit CatBoost encoder], that is then used to transform new data. 11 | The input parameters are passed by DataRobot based on project and blueprint configuration. 12 | 13 | Parameters 14 | ------- 15 | X: pd.DataFrame 16 | Training data that DataRobot passes when this task is being trained. 17 | y: pd.Series 18 | Project's target column (None is passed for unsupervised projects). 19 | output_dir: str 20 | A path to the output folder; the artifact [a pickle file containing a pre-fit CatBoost Encoder] must be saved into this folder to be re-used in transform(). 21 | 22 | Returns 23 | ------- 24 | None 25 | fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir 26 | so that the trained object can be used during scoring inside transform() 27 | """ 28 | 29 | # Transform categorical columns into a numeric transformation using Weight of Evidence 30 | encoder_catboost = ce.CatBoostEncoder(cols=X.columns) 31 | encoder_catboost.fit(X,y) 32 | 33 | # dump the trained object 34 | # into an artifact [in this example - woe.pkl] 35 | # and save it into output_dir so that it can be used later to impute on new data 36 | output_dir_path = Path(output_dir) 37 | if output_dir_path.exists() and output_dir_path.is_dir(): 38 | with open("{}/catboost.pkl".format(output_dir), "wb") as fp: 39 | pickle.dump(encoder_catboost, fp) 40 | 41 | 42 | def transform(data, transformer): 43 | """ This hook defines how DataRobot will use the trained object from fit() to transform new data. 44 | DataRobot runs this hook when the task is used for scoring inside a blueprint. 45 | As an output, this hook is expected to return the transformed data. 46 | The input parameters are passed by DataRobot based on dataset and blueprint configuration. 47 | 48 | Parameters 49 | ------- 50 | data: pd.DataFrame 51 | Data that DataRobot passes for transformation. 52 | transformer: Any 53 | Trained object, extracted by DataRobot from the artifact created inside fit(). 54 | In this example, it's a pickle file containing a pre-fit CatBoost Encoder. 55 | 56 | Returns 57 | ------- 58 | pd.DataFrame 59 | Returns a dataframe with transformed data. 
60 | """ 61 | 62 | return transformer.transform(data).fillna(0) -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/any_target/all_enc_catboost/requirements.txt: -------------------------------------------------------------------------------- 1 | category_encoders==2.2.2 -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/any_target/all_enc_hashing/README.md: -------------------------------------------------------------------------------- 1 | # Hash Encoding 2 | 3 | ### Overview 4 | 5 | [ ] accepts numeric inputs 6 | 7 | [x] accepts categorical inputs 8 | 9 | [ ] outputs missing values 10 | 11 | [x] works for binary targets (tested in DataRobot and confirmed) 12 | 13 | [?] works for multiclass targets 14 | 15 | [x] works for numeric targets (tested in DataRobot and confirmed) 16 | 17 | ### Description 18 | 19 | tbd 20 | 21 | ### References 22 | 23 | https://alex.smola.org/papers/2009/Weinbergeretal09.pdf 24 | 25 | https://booking.ai/dont-be-tricked-by-the-hashing-trick-192a6aae3087 -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/any_target/all_enc_hashing/custom.py: -------------------------------------------------------------------------------- 1 | import category_encoders as ce 2 | import pandas as pd 3 | from pathlib import Path 4 | import pickle 5 | 6 | 7 | def fit(X, y, output_dir, **kwargs): 8 | """ This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data 9 | DataRobot runs this hook when the task is being trained inside a blueprint. 10 | As an output, this hook is expected to create an artifact containg a trained object [in this example - a pre-fit HashingEncoder], that is then used to transform new data. 11 | The input parameters are passed by DataRobot based on project and blueprint configuration. 12 | 13 | Parameters 14 | ------- 15 | X: pd.DataFrame 16 | Training data that DataRobot passes when this task is being trained. 17 | y: pd.Series 18 | Project's target column (None is passed for unsupervised projects). 19 | output_dir: str 20 | A path to the output folder; the artifact [in this example - a pickle file containing a pre-fit HashingEncoder] must be saved into this folder to be re-used in transform(). 21 | 22 | Returns 23 | ------- 24 | None 25 | fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir 26 | so that the trained object can be used during scoring inside transform() 27 | """ 28 | 29 | # Transform categorical columns into a numeric transformation using Weight of Evidence 30 | encoder_hash = ce.HashingEncoder(cols=X.columns, hash_method='md5', max_process=2, n_components=16) 31 | encoder_hash.fit(X,y) 32 | 33 | # dump the trained object 34 | # into an artifact [in this example - woe.pkl] 35 | # and save it into output_dir so that it can be used later to impute on new data 36 | output_dir_path = Path(output_dir) 37 | if output_dir_path.exists() and output_dir_path.is_dir(): 38 | with open("{}/hash.pkl".format(output_dir), "wb") as fp: 39 | pickle.dump(encoder_hash, fp) 40 | 41 | 42 | def transform(data, transformer): 43 | """ This hook defines how DataRobot will use the trained object from fit() to transform new data. 
44 | DataRobot runs this hook when the task is used for scoring inside a blueprint. 45 | As an output, this hook is expected to return the transformed data. 46 | The input parameters are passed by DataRobot based on dataset and blueprint configuration. 47 | 48 | Parameters 49 | ------- 50 | data: pd.DataFrame 51 | Data that DataRobot passes for transformation. 52 | transformer: Any 53 | Trained object, extracted by DataRobot from the artifact created inside fit(). 54 | In this example, it's a pickle file containing a pre-fit HashingEncoder. 55 | 56 | Returns 57 | ------- 58 | pd.DataFrame 59 | Returns a dataframe with transformed data. 60 | """ 61 | 62 | return transformer.transform(data).fillna(0) -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/any_target/all_enc_hashing/requirements.txt: -------------------------------------------------------------------------------- 1 | category_encoders==2.2.2 -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/any_target/all_enc_mest/README.md: -------------------------------------------------------------------------------- 1 | # M-estimate Target Encoding 2 | 3 | ### Overview 4 | 5 | [ ] accepts numeric inputs 6 | 7 | [x] accepts categorical inputs 8 | 9 | [ ] outputs missing values 10 | 11 | [x] works for binary targets (tested in DataRobot and confirmed) 12 | 13 | [?] works for multiclass targets 14 | 15 | [x] works for numeric targets (tested in DataRobot and confirmed) 16 | 17 | ### Description 18 | 19 | tbd 20 | 21 | ### References 22 | 23 | https://dl.acm.org/citation.cfm?id=507538 24 | 25 | https://link.springer.com/chapter/10.1007/BFb0017010 26 | 27 | https://en.wikipedia.org/wiki/Additive_smoothing#Generalized_to_the_case_of_known_incidence_rates -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/any_target/all_enc_mest/custom.py: -------------------------------------------------------------------------------- 1 | import category_encoders as ce 2 | import pandas as pd 3 | from pathlib import Path 4 | import pickle 5 | 6 | 7 | def fit(X, y, output_dir, **kwargs): 8 | """ This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data 9 | DataRobot runs this hook when the task is being trained inside a blueprint. 10 | As an output, this hook is expected to create an artifact containg a trained object [in this example - a pre-fit M-Estimate target encoder], that is then used to transform new data. 11 | The input parameters are passed by DataRobot based on project and blueprint configuration. 12 | 13 | Parameters 14 | ------- 15 | X: pd.DataFrame 16 | Training data that DataRobot passes when this task is being trained. 17 | y: pd.Series 18 | Project's target column (None is passed for unsupervised projects). 19 | output_dir: str 20 | A path to the output folder; the artifact [in this example - a pickle file containing a pre-fit M-Estimate target encoder] must be saved into this folder to be re-used in transform(). 
21 | 22 | Returns 23 | ------- 24 | None 25 | fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir 26 | so that the trained object can be used during scoring inside transform() 27 | """ 28 | 29 | # Transform categorical columns into a numeric transformation using Weight of Evidence 30 | encoder_mest = ce.MEstimateEncoder(cols=X.columns, randomized=True, m=0.50) 31 | encoder_mest.fit(X,y) 32 | 33 | # dump the trained object 34 | # into an artifact [in this example - woe.pkl] 35 | # and save it into output_dir so that it can be used later to impute on new data 36 | output_dir_path = Path(output_dir) 37 | if output_dir_path.exists() and output_dir_path.is_dir(): 38 | with open("{}/mest.pkl".format(output_dir), "wb") as fp: 39 | pickle.dump(encoder_mest, fp) 40 | 41 | 42 | def transform(data, transformer): 43 | """ This hook defines how DataRobot will use the trained object from fit() to transform new data. 44 | DataRobot runs this hook when the task is used for scoring inside a blueprint. 45 | As an output, this hook is expected to return the transformed data. 46 | The input parameters are passed by DataRobot based on dataset and blueprint configuration. 47 | 48 | Parameters 49 | ------- 50 | data: pd.DataFrame 51 | Data that DataRobot passes for transformation. 52 | transformer: Any 53 | Trained object, extracted by DataRobot from the artifact created inside fit(). 54 | In this example, it's a pickle file containing a pre-fit M-Estimate target encoder. 55 | 56 | Returns 57 | ------- 58 | pd.DataFrame 59 | Returns a dataframe with transformed data. 60 | """ 61 | 62 | return transformer.transform(data).fillna(0) -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/any_target/all_enc_mest/requirements.txt: -------------------------------------------------------------------------------- 1 | category_encoders==2.2.2 -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/binary_target/binary_enc_backward_differencing/README.md: -------------------------------------------------------------------------------- 1 | # Backward Difference Target Encoding 2 | 3 | ### Overview 4 | ``` 5 | [ ] accepts numeric inputs 6 | [x] accepts categorical inputs 7 | [ ] outputs missing values 8 | [x] works for binary targets (tested in DataRobot and confirmed) 9 | [ ] works for multiclass targets 10 | [ ] works for numeric targets 11 | ``` 12 | ### Description 13 | 14 | tbd 15 | 16 | ### References 17 | 18 | https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/ 19 | 20 | http://psych.colorado.edu/~carey/Courses/PSYC5741/handouts/Coding%20Categorical%20Variables%202006-03-03.pdf -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/binary_target/binary_enc_backward_differencing/custom.py: -------------------------------------------------------------------------------- 1 | import category_encoders as ce 2 | import pandas as pd 3 | from pathlib import Path 4 | import pickle 5 | 6 | 7 | def fit(X, y, output_dir, **kwargs): 8 | """ This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data 9 | DataRobot runs this hook when the task is being trained inside a blueprint. 
10 | As an output, this hook is expected to create an artifact containg a trained object [in this example - median of each numeric column], that is then used to transform new data. 11 | The input parameters are passed by DataRobot based on project and blueprint configuration. 12 | 13 | Parameters 14 | ------- 15 | X: pd.DataFrame 16 | Training data that DataRobot passes when this task is being trained. 17 | y: pd.Series 18 | Project's target column (None is passed for unsupervised projects). 19 | output_dir: str 20 | A path to the output folder; the artifact [in this example - containing median of each numeric column] must be saved into this folder to be re-used in transform(). 21 | 22 | Returns 23 | ------- 24 | None 25 | fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir 26 | so that the trained object can be used during scoring inside transform() 27 | """ 28 | 29 | # Transform categorical columns into a numeric transformation using Weight of Evidence 30 | encoder_bwdiff = ce.BackwardDifferenceEncoder(cols=X.columns) 31 | encoder_bwdiff.fit(X,y) 32 | 33 | # dump the trained object 34 | # into an artifact [in this example - woe.pkl] 35 | # and save it into output_dir so that it can be used later to impute on new data 36 | output_dir_path = Path(output_dir) 37 | if output_dir_path.exists() and output_dir_path.is_dir(): 38 | with open("{}/backdiff.pkl".format(output_dir), "wb") as fp: 39 | pickle.dump(encoder_bwdiff, fp) 40 | 41 | 42 | def transform(data, transformer): 43 | """ This hook defines how DataRobot will use the trained object from fit() to transform new data. 44 | DataRobot runs this hook when the task is used for scoring inside a blueprint. 45 | As an output, this hook is expected to return the transformed data. 46 | The input parameters are passed by DataRobot based on dataset and blueprint configuration. 47 | 48 | Parameters 49 | ------- 50 | data: pd.DataFrame 51 | Data that DataRobot passes for transformation. 52 | transformer: Any 53 | Trained object, extracted by DataRobot from the artifact created inside fit(). 54 | In this example, it's a dictionary with medians per column extracted from artifact.pkl. 55 | 56 | Returns 57 | ------- 58 | pd.DataFrame 59 | Returns a dataframe with transformed data. 60 | """ 61 | 62 | return transformer.transform(data).fillna(0) -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/binary_target/binary_enc_backward_differencing/requirements.txt: -------------------------------------------------------------------------------- 1 | category_encoders==2.2.2 -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/binary_target/binary_enc_glm/README.md: -------------------------------------------------------------------------------- 1 | # GLM Target Encoding 2 | 3 | ### Overview 4 | ``` 5 | [ ] accepts numeric inputs 6 | [x] accepts categorical inputs 7 | [ ] outputs missing values 8 | [x] works for binary targets (tested in DataRobot and confirmed) 9 | [ ] works for multiclass targets 10 | [ ] works for numeric targets 11 | ``` 12 | ### Description 13 | 14 | **This implementation is hard-coded to handle binary targets ONLY!** 15 | 16 | This is a supervised encoder similar to TargetEncoder or MEstimateEncoder, but there are some advantages: 17 | 18 | 1) Solid statistical theory behind the technique. 
Mixed effects models are a mature branch of statistics. 19 | 2) No hyper-parameters to tune. The amount of shrinkage is automatically determined through the estimation process. In short, the less observations a category has and/or the more the outcome varies for a category then the higher the regularization towards “the prior” or “grand mean”. 20 | 3) The technique is applicable for both continuous and binomial targets. If the target is continuous, the encoder returns regularized difference of the observation’s category from the global mean. If the target is binomial, the encoder returns regularized log odds per category. 21 | 22 | In comparison to JamesSteinEstimator, this encoder utilizes generalized linear mixed models from statsmodels library. 23 | 24 | ### References 25 | 26 | https://faculty.psau.edu.sa/filedownload/doc-12-pdf-a1997d0d31f84d13c1cdc44ac39a8f2c-original.pdf -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/binary_target/binary_enc_glm/custom.py: -------------------------------------------------------------------------------- 1 | import category_encoders as ce 2 | import pandas as pd 3 | from pathlib import Path 4 | import pickle 5 | 6 | 7 | def fit(X, y, output_dir, **kwargs): 8 | """ This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data 9 | DataRobot runs this hook when the task is being trained inside a blueprint. 10 | As an output, this hook is expected to create an artifact containg a trained object [in this example - median of each numeric column], that is then used to transform new data. 11 | The input parameters are passed by DataRobot based on project and blueprint configuration. 12 | 13 | Parameters 14 | ------- 15 | X: pd.DataFrame 16 | Training data that DataRobot passes when this task is being trained. 17 | y: pd.Series 18 | Project's target column (None is passed for unsupervised projects). 19 | output_dir: str 20 | A path to the output folder; the artifact [in this example - containing median of each numeric column] must be saved into this folder to be re-used in transform(). 21 | 22 | Returns 23 | ------- 24 | None 25 | fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir 26 | so that the trained object can be used during scoring inside transform() 27 | """ 28 | 29 | # Transform categorical columns into a numeric transformation using Weight of Evidence 30 | encoder_glm = ce.GLMMEncoder(cols=X.columns, binomial_target=True, randomized=True) 31 | encoder_glm.fit(X,y) 32 | 33 | # dump the trained object 34 | # into an artifact [in this example - woe.pkl] 35 | # and save it into output_dir so that it can be used later to impute on new data 36 | output_dir_path = Path(output_dir) 37 | if output_dir_path.exists() and output_dir_path.is_dir(): 38 | with open("{}/binomialglm.pkl".format(output_dir), "wb") as fp: 39 | pickle.dump(encoder_glm, fp) 40 | 41 | 42 | def transform(data, transformer): 43 | """ This hook defines how DataRobot will use the trained object from fit() to transform new data. 44 | DataRobot runs this hook when the task is used for scoring inside a blueprint. 45 | As an output, this hook is expected to return the transformed data. 46 | The input parameters are passed by DataRobot based on dataset and blueprint configuration. 
47 | 48 | Parameters 49 | ------- 50 | data: pd.DataFrame 51 | Data that DataRobot passes for transformation. 52 | transformer: Any 53 | Trained object, extracted by DataRobot from the artifact created inside fit(). 54 | In this example, it's a dictionary with medians per column extracted from artifact.pkl. 55 | 56 | Returns 57 | ------- 58 | pd.DataFrame 59 | Returns a dataframe with transformed data. 60 | """ 61 | 62 | return transformer.transform(data).fillna(0) -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/binary_target/binary_enc_glm/requirements.txt: -------------------------------------------------------------------------------- 1 | category_encoders==2.2.2 -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/binary_target/binary_enc_helmert/README.md: -------------------------------------------------------------------------------- 1 | # Helmert Target Encoding 2 | 3 | ### Overview 4 | ``` 5 | [ ] accepts numeric inputs 6 | [x] accepts categorical inputs 7 | [ ] outputs missing values 8 | [x] works for binary targets (tested in DataRobot and confirmed) 9 | [ ] works for multiclass targets 10 | [ ] works for numeric targets 11 | ``` 12 | ### Description 13 | 14 | tbd 15 | 16 | ### References 17 | 18 | https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/ 19 | 20 | http://psych.colorado.edu/~carey/Courses/PSYC5741/handouts/Coding%20Categorical%20Variables%202006-03-03.pdf -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/binary_target/binary_enc_helmert/custom.py: -------------------------------------------------------------------------------- 1 | import category_encoders as ce 2 | import pandas as pd 3 | from pathlib import Path 4 | import pickle 5 | 6 | 7 | def fit(X, y, output_dir, **kwargs): 8 | """ This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data 9 | DataRobot runs this hook when the task is being trained inside a blueprint. 10 | As an output, this hook is expected to create an artifact containg a trained object [in this example - median of each numeric column], that is then used to transform new data. 11 | The input parameters are passed by DataRobot based on project and blueprint configuration. 12 | 13 | Parameters 14 | ------- 15 | X: pd.DataFrame 16 | Training data that DataRobot passes when this task is being trained. 17 | y: pd.Series 18 | Project's target column (None is passed for unsupervised projects). 19 | output_dir: str 20 | A path to the output folder; the artifact [in this example - containing median of each numeric column] must be saved into this folder to be re-used in transform(). 
21 | 22 | Returns 23 | ------- 24 | None 25 | fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir 26 | so that the trained object can be used during scoring inside transform() 27 | """ 28 | 29 | # Transform categorical columns into a numeric transformation using Weight of Evidence 30 | encoder_helmert = ce.HelmertEncoder(cols=X.columns) 31 | encoder_helmert.fit(X,y) 32 | 33 | # dump the trained object 34 | # into an artifact [in this example - woe.pkl] 35 | # and save it into output_dir so that it can be used later to impute on new data 36 | output_dir_path = Path(output_dir) 37 | if output_dir_path.exists() and output_dir_path.is_dir(): 38 | with open("{}/helmert.pkl".format(output_dir), "wb") as fp: 39 | pickle.dump(encoder_helmert, fp) 40 | 41 | 42 | def transform(data, transformer): 43 | """ This hook defines how DataRobot will use the trained object from fit() to transform new data. 44 | DataRobot runs this hook when the task is used for scoring inside a blueprint. 45 | As an output, this hook is expected to return the transformed data. 46 | The input parameters are passed by DataRobot based on dataset and blueprint configuration. 47 | 48 | Parameters 49 | ------- 50 | data: pd.DataFrame 51 | Data that DataRobot passes for transformation. 52 | transformer: Any 53 | Trained object, extracted by DataRobot from the artifact created inside fit(). 54 | In this example, it's a dictionary with medians per column extracted from artifact.pkl. 55 | 56 | Returns 57 | ------- 58 | pd.DataFrame 59 | Returns a dataframe with transformed data. 60 | """ 61 | 62 | return transformer.transform(data).fillna(0) -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/binary_target/binary_enc_helmert/requirements.txt: -------------------------------------------------------------------------------- 1 | category_encoders==2.2.2 -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/binary_target/binary_enc_leaveonout/README.md: -------------------------------------------------------------------------------- 1 | # Leave-One-Out Target Encoding 2 | 3 | ### Overview 4 | ``` 5 | [ ] accepts numeric inputs 6 | [x] accepts categorical inputs 7 | [ ] outputs missing values 8 | [x] works for binary targets (tested in DataRobot and confirmed) 9 | [ ] works for multiclass targets 10 | [ ] works for numeric targets 11 | ``` 12 | ### Description 13 | 14 | This is very similar to target encoding but excludes the current row’s target when calculating the mean target for a level to reduce the effect of outliers. 15 | 16 | ### References 17 | 18 | https://www.kaggle.com/c/caterpillar-tube-pricing/discussion/15748#143154 -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/binary_target/binary_enc_leaveonout/custom.py: -------------------------------------------------------------------------------- 1 | import category_encoders as ce 2 | import pandas as pd 3 | from pathlib import Path 4 | import pickle 5 | 6 | 7 | def fit(X, y, output_dir, **kwargs): 8 | """ This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data 9 | DataRobot runs this hook when the task is being trained inside a blueprint. 
10 | As an output, this hook is expected to create an artifact containg a trained object [in this example - median of each numeric column], that is then used to transform new data. 11 | The input parameters are passed by DataRobot based on project and blueprint configuration. 12 | 13 | Parameters 14 | ------- 15 | X: pd.DataFrame 16 | Training data that DataRobot passes when this task is being trained. 17 | y: pd.Series 18 | Project's target column (None is passed for unsupervised projects). 19 | output_dir: str 20 | A path to the output folder; the artifact [in this example - containing median of each numeric column] must be saved into this folder to be re-used in transform(). 21 | 22 | Returns 23 | ------- 24 | None 25 | fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir 26 | so that the trained object can be used during scoring inside transform() 27 | """ 28 | 29 | # Transform categorical columns into a numeric transformation using Weight of Evidence 30 | encoder_loo = ce.LeaveOneOutEncoder(cols=X.columns) 31 | encoder_loo.fit(X,y) 32 | 33 | # dump the trained object 34 | # into an artifact [in this example - woe.pkl] 35 | # and save it into output_dir so that it can be used later to impute on new data 36 | output_dir_path = Path(output_dir) 37 | if output_dir_path.exists() and output_dir_path.is_dir(): 38 | with open("{}/loo.pkl".format(output_dir), "wb") as fp: 39 | pickle.dump(encoder_loo, fp) 40 | 41 | 42 | def transform(data, transformer): 43 | """ This hook defines how DataRobot will use the trained object from fit() to transform new data. 44 | DataRobot runs this hook when the task is used for scoring inside a blueprint. 45 | As an output, this hook is expected to return the transformed data. 46 | The input parameters are passed by DataRobot based on dataset and blueprint configuration. 47 | 48 | Parameters 49 | ------- 50 | data: pd.DataFrame 51 | Data that DataRobot passes for transformation. 52 | transformer: Any 53 | Trained object, extracted by DataRobot from the artifact created inside fit(). 54 | In this example, it's a dictionary with medians per column extracted from artifact.pkl. 55 | 56 | Returns 57 | ------- 58 | pd.DataFrame 59 | Returns a dataframe with transformed data. 
60 | """ 61 | 62 | return transformer.transform(data).fillna(0) -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/binary_target/binary_enc_leaveonout/requirements.txt: -------------------------------------------------------------------------------- 1 | category_encoders==2.2.2 -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/binary_target/binary_enc_woe/README.md: -------------------------------------------------------------------------------- 1 | # Weight-Of-Evidence Target Encoding 2 | 3 | ### Overview 4 | ``` 5 | [ ] accepts numeric inputs 6 | [x] accepts categorical inputs 7 | [ ] outputs missing values 8 | [x] works for binary targets (tested in DataRobot and confirmed) 9 | [ ] works for multiclass targets 10 | [ ] works for numeric targets 11 | ``` 12 | ### Description 13 | 14 | tbd 15 | 16 | ### References 17 | 18 | https://www.listendata.com/2015/03/weight-of-evidence-woe-and-information.html -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/binary_target/binary_enc_woe/custom.py: -------------------------------------------------------------------------------- 1 | import category_encoders as ce 2 | import pandas as pd 3 | from pathlib import Path 4 | import pickle 5 | 6 | 7 | def fit(X, y, output_dir, **kwargs): 8 | """ This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data 9 | DataRobot runs this hook when the task is being trained inside a blueprint. 10 | As an output, this hook is expected to create an artifact containg a trained object [in this example - median of each numeric column], that is then used to transform new data. 11 | The input parameters are passed by DataRobot based on project and blueprint configuration. 12 | 13 | Parameters 14 | ------- 15 | X: pd.DataFrame 16 | Training data that DataRobot passes when this task is being trained. 17 | y: pd.Series 18 | Project's target column (None is passed for unsupervised projects). 19 | output_dir: str 20 | A path to the output folder; the artifact [in this example - containing median of each numeric column] must be saved into this folder to be re-used in transform(). 21 | 22 | Returns 23 | ------- 24 | None 25 | fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir 26 | so that the trained object can be used during scoring inside transform() 27 | """ 28 | 29 | # Transform categorical columns into a numeric transformation using Weight of Evidence 30 | encoder_woe = ce.WOEEncoder(cols=X.columns, randomized=True, handle_missing='value', handle_unknown='value') 31 | encoder_woe.fit(X,y) 32 | 33 | # dump the trained object 34 | # into an artifact [in this example - woe.pkl] 35 | # and save it into output_dir so that it can be used later to impute on new data 36 | output_dir_path = Path(output_dir) 37 | if output_dir_path.exists() and output_dir_path.is_dir(): 38 | with open("{}/woe.pkl".format(output_dir), "wb") as fp: 39 | pickle.dump(encoder_woe, fp) 40 | 41 | 42 | def transform(data, transformer): 43 | """ This hook defines how DataRobot will use the trained object from fit() to transform new data. 44 | DataRobot runs this hook when the task is used for scoring inside a blueprint. 
45 | As an output, this hook is expected to return the transformed data. 46 | The input parameters are passed by DataRobot based on dataset and blueprint configuration. 47 | 48 | Parameters 49 | ------- 50 | data: pd.DataFrame 51 | Data that DataRobot passes for transformation. 52 | transformer: Any 53 | Trained object, extracted by DataRobot from the artifact created inside fit(). 54 | In this example, it's a pickle file containing a pre-fit Weight of Evidence encoder. 55 | 56 | Returns 57 | ------- 58 | pd.DataFrame 59 | Returns a dataframe with transformed data. 60 | """ 61 | 62 | return transformer.transform(data).fillna(0) -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/binary_target/binary_enc_woe/requirements.txt: -------------------------------------------------------------------------------- 1 | category_encoders==2.2.2 -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/multiclass_target/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_tasks/preprocessing/categorical/python/encoding/multiclass_target/.gitignore -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/regression_target/regression_enc_glm/README.md: -------------------------------------------------------------------------------- 1 | # GLM Target Encoding 2 | 3 | ### Overview 4 | ``` 5 | [ ] accepts numeric inputs 6 | [x] accepts categorical inputs 7 | [ ] outputs missing values 8 | [ ] works for binary targets 9 | [ ] works for multiclass targets 10 | [x] works for numeric targets 11 | ``` 12 | ### Description 13 | 14 | **This implementation is hard-coded to handle continuous targets ONLY!** 15 | 16 | This is a supervised encoder similar to TargetEncoder or MEstimateEncoder, but it has some advantages: 17 | 18 | 1) Solid statistical theory behind the technique. Mixed effects models are a mature branch of statistics. 19 | 2) No hyper-parameters to tune. The amount of shrinkage is determined automatically through the estimation process. In short, the fewer observations a category has and/or the more the outcome varies for a category, the higher the regularization towards “the prior” or “grand mean”. 20 | 3) The technique is applicable to both continuous and binomial targets. If the target is continuous, the encoder returns the regularized difference of the observation’s category from the global mean. If the target is binomial, the encoder returns regularized log odds per category. 21 | 22 | In comparison to the JamesSteinEncoder, this encoder utilizes generalized linear mixed models from the statsmodels library.
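For illustration, here is a minimal, self-contained sketch of this encoder on a continuous target (toy data; the `GLMMEncoder` call mirrors the one used in `custom.py` below):

```
import pandas as pd
import category_encoders as ce

# Toy data: one categorical column and a continuous target (illustrative values only)
X = pd.DataFrame({"city": ["NY", "NY", "SF", "SF", "LA", "LA"]})
y = pd.Series([10.0, 12.0, 30.0, 28.0, 20.0, 22.0])

# binomial_target=False treats y as continuous, so each category is encoded as its
# regularized deviation from the global mean (the shrinkage is estimated automatically)
encoder = ce.GLMMEncoder(cols=X.columns, binomial_target=False, randomized=True)
encoder.fit(X, y)
print(encoder.transform(X))
```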
23 | 24 | ### References 25 | 26 | https://faculty.psau.edu.sa/filedownload/doc-12-pdf-a1997d0d31f84d13c1cdc44ac39a8f2c-original.pdf -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/regression_target/regression_enc_glm/custom.py: -------------------------------------------------------------------------------- 1 | import category_encoders as ce 2 | import pandas as pd 3 | from pathlib import Path 4 | import pickle 5 | 6 | 7 | def fit(X, y, output_dir, **kwargs): 8 | """ This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data 9 | DataRobot runs this hook when the task is being trained inside a blueprint. 10 | As an output, this hook is expected to create an artifact containg a trained object [in this example - median of each numeric column], that is then used to transform new data. 11 | The input parameters are passed by DataRobot based on project and blueprint configuration. 12 | 13 | Parameters 14 | ------- 15 | X: pd.DataFrame 16 | Training data that DataRobot passes when this task is being trained. 17 | y: pd.Series 18 | Project's target column (None is passed for unsupervised projects). 19 | output_dir: str 20 | A path to the output folder; the artifact [in this example - containing median of each numeric column] must be saved into this folder to be re-used in transform(). 21 | 22 | Returns 23 | ------- 24 | None 25 | fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir 26 | so that the trained object can be used during scoring inside transform() 27 | """ 28 | 29 | # Transform categorical columns into a numeric transformation using Weight of Evidence 30 | encoder_glm = ce.GLMMEncoder(cols=X.columns, binomial_target=False, randomized=True) 31 | encoder_glm.fit(X,y) 32 | 33 | # dump the trained object 34 | # into an artifact [in this example - woe.pkl] 35 | # and save it into output_dir so that it can be used later to impute on new data 36 | output_dir_path = Path(output_dir) 37 | if output_dir_path.exists() and output_dir_path.is_dir(): 38 | with open("{}/glm.pkl".format(output_dir), "wb") as fp: 39 | pickle.dump(encoder_glm, fp) 40 | 41 | 42 | def transform(data, transformer): 43 | """ This hook defines how DataRobot will use the trained object from fit() to transform new data. 44 | DataRobot runs this hook when the task is used for scoring inside a blueprint. 45 | As an output, this hook is expected to return the transformed data. 46 | The input parameters are passed by DataRobot based on dataset and blueprint configuration. 47 | 48 | Parameters 49 | ------- 50 | data: pd.DataFrame 51 | Data that DataRobot passes for transformation. 52 | transformer: Any 53 | Trained object, extracted by DataRobot from the artifact created inside fit(). 54 | In this example, it's a dictionary with medians per column extracted from artifact.pkl. 55 | 56 | Returns 57 | ------- 58 | pd.DataFrame 59 | Returns a dataframe with transformed data. 
60 | """ 61 | 62 | return transformer.transform(data).fillna(0) -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/regression_target/regression_enc_glm/requirements.txt: -------------------------------------------------------------------------------- 1 | category_encoders==2.2.2 -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/imputing/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_tasks/preprocessing/categorical/python/imputing/.gitignore -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/r/README.md: -------------------------------------------------------------------------------- 1 | README 2 | -------------------------------------------------------------------------------- /custom_tasks/preprocessing/images/r/README.md: -------------------------------------------------------------------------------- 1 | README 2 | -------------------------------------------------------------------------------- /custom_tasks/preprocessing/numeric/python/encoding/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_tasks/preprocessing/numeric/python/encoding/.gitignore -------------------------------------------------------------------------------- /custom_tasks/preprocessing/numeric/python/imputing/knn_imputer_fixed_n/README.md: -------------------------------------------------------------------------------- 1 | # KNN (Nearest Neighbors) Missing Imputation 2 | ## For Numerics 3 | 4 | ### Overview 5 | ``` 6 | [x] accepts numeric inputs 7 | [ ] accepts categorical inputs 8 | [ ] outputs missing values 9 | [x] works for binary targets (tested in DataRobot and confirmed) 10 | [x] works for multiclass targets 11 | [x] works for numeric targets 12 | ``` 13 | ### Description 14 | 15 | KNN Imputer was first supported by Scikit-Learn in December 2019 when it released its version 0.22. This imputer utilizes the k-Nearest Neighbors method to replace the missing values in the datasets with the mean value from the parameter ‘n_neighbors’ nearest neighbors found in the training set. By default, it uses a Euclidean distance metric to impute the missing values. 16 | 17 | ### References 18 | 19 | https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3668100/ 20 | 21 | http://www.stat.columbia.edu/~gelman/arm/missing.pdf 22 | 23 | https://machinelearningmastery.com/knn-imputation-for-missing-values-in-machine-learning/ 24 | 25 | https://scikit-learn.org/stable/modules/generated/sklearn.impute.KNNImputer.html 26 | 27 | https://www.iriseekhout.com/missing-data/ -------------------------------------------------------------------------------- /custom_tasks/preprocessing/numeric/python/imputing/knn_imputer_fixed_n/custom.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from pathlib import Path 3 | import pickle 4 | from sklearn.impute import KNNImputer 5 | import numpy as np 6 | 7 | def fit(X, y, output_dir, **kwargs): 8 | """ This hook defines how DataRobot will train this task. 
Even transform tasks need to be trained to learn/store information from training data 9 | DataRobot runs this hook when the task is being trained inside a blueprint. 10 | As an output, this hook is expected to create an artifact containg a trained object [in this example - median of each numeric column], that is then used to transform new data. 11 | The input parameters are passed by DataRobot based on project and blueprint configuration. 12 | 13 | Parameters 14 | ------- 15 | X: pd.DataFrame 16 | Training data that DataRobot passes when this task is being trained. 17 | y: pd.Series 18 | Project's target column (None is passed for unsupervised projects). 19 | output_dir: str 20 | A path to the output folder; the artifact [in this example - containing median of each numeric column] must be saved into this folder to be re-used in transform(). 21 | 22 | Returns 23 | ------- 24 | None 25 | fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir 26 | so that the trained object can be used during scoring inside transform() 27 | """ 28 | 29 | # Transform categorical columns into a numeric transformation using Weight of Evidence 30 | knn = KNNImputer(n_neighbors=5, add_indicator=False) 31 | knn.fit(X.values) 32 | 33 | # dump the trained object 34 | # into an artifact [in this example - woe.pkl] 35 | # and save it into output_dir so that it can be used later to impute on new data 36 | output_dir_path = Path(output_dir) 37 | if output_dir_path.exists() and output_dir_path.is_dir(): 38 | with open("{}/knn.pkl".format(output_dir), "wb") as fp: 39 | pickle.dump(knn, fp) 40 | 41 | 42 | def transform(data, transformer): 43 | """ This hook defines how DataRobot will use the trained object from fit() to transform new data. 44 | DataRobot runs this hook when the task is used for scoring inside a blueprint. 45 | As an output, this hook is expected to return the transformed data. 46 | The input parameters are passed by DataRobot based on dataset and blueprint configuration. 47 | 48 | Parameters 49 | ------- 50 | data: pd.DataFrame 51 | Data that DataRobot passes for transformation. 52 | transformer: Any 53 | Trained object, extracted by DataRobot from the artifact created inside fit(). 54 | 55 | Returns 56 | ------- 57 | pd.DataFrame 58 | Returns a dataframe with transformed data. 59 | """ 60 | 61 | return pd.DataFrame(transformer.transform(data.values), columns=data.columns) -------------------------------------------------------------------------------- /custom_tasks/preprocessing/numeric/python/imputing/median_impute/custom.py: -------------------------------------------------------------------------------- 1 | # This custom transform task implements missing values imputation using a median 2 | 3 | import pickle 4 | import pandas as pd 5 | import numpy as np 6 | from pathlib import Path 7 | 8 | 9 | def fit(X, y, output_dir, **kwargs): 10 | """ This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data 11 | DataRobot runs this hook when the task is being trained inside a blueprint. 12 | As an output, this hook is expected to create an artifact containg a trained object [in this example - median of each numeric column], that is then used to transform new data. 13 | The input parameters are passed by DataRobot based on project and blueprint configuration. 
14 | 15 | Parameters 16 | ------- 17 | X: pd.DataFrame 18 | Training data that DataRobot passes when this task is being trained. 19 | y: pd.Series 20 | Project's target column (None is passed for unsupervised projects). 21 | output_dir: str 22 | A path to the output folder; the artifact [in this example - containing median of each numeric column] must be saved into this folder to be re-used in transform(). 23 | 24 | Returns 25 | ------- 26 | None 27 | fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir 28 | so that the trained object can be used during scoring inside transform() 29 | """ 30 | 31 | # compute medians for all numeric features on training data, store them in a dictionary 32 | median = X.median(axis = 0, numeric_only = True, skipna = True).to_dict() 33 | 34 | 35 | # dump the trained object [in this example - dictionary with medians per column] 36 | # into an artifact [in this example - artifact.pkl] 37 | # and save it into output_dir so that it can be used later to impute on new data 38 | output_dir_path = Path(output_dir) 39 | if output_dir_path.exists() and output_dir_path.is_dir(): 40 | with open("{}/artifact.pkl".format(output_dir), "wb") as fp: 41 | pickle.dump(median, fp) 42 | 43 | 44 | def transform(data, transformer): 45 | """ This hook defines how DataRobot will use the trained object from fit() to transform new data. 46 | DataRobot runs this hook when the task is used for scoring inside a blueprint. 47 | As an output, this hook is expected to return the transformed data. 48 | The input parameters are passed by DataRobot based on dataset and blueprint configuration. 49 | 50 | Parameters 51 | ------- 52 | data: pd.DataFrame 53 | Data that DataRobot passes for transformation. 54 | transformer: Any 55 | Trained object, extracted by DataRobot from the artifact created inside fit(). 56 | In this example, it's a dictionary with medians per column extracted from artifact.pkl. 57 | 58 | Returns 59 | ------- 60 | pd.DataFrame 61 | Returns a dataframe with transformed data. 62 | """ 63 | 64 | return data.fillna(transformer) -------------------------------------------------------------------------------- /custom_tasks/preprocessing/numeric/python/monotonic transforms/power_transformer/custom.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from pathlib import Path 3 | import pickle 4 | from sklearn.preprocessing import PowerTransformer 5 | import numpy as np 6 | 7 | def fit(X, y, output_dir, **kwargs): 8 | """ This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data 9 | DataRobot runs this hook when the task is being trained inside a blueprint. 10 | As an output, this hook is expected to create an artifact containg a trained object [in this example - power transform], that is then used to transform new data. 11 | The input parameters are passed by DataRobot based on project and blueprint configuration. 12 | 13 | Parameters 14 | ------- 15 | X: pd.DataFrame 16 | Training data that DataRobot passes when this task is being trained. 17 | y: pd.Series 18 | Project's target column (None is passed for unsupervised projects). 19 | output_dir: str 20 | A path to the output folder; the artifact [in this example - containing a power transformer] must be saved into this folder to be re-used in transform(). 
21 | 22 | Returns 23 | ------- 24 | None 25 | fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir 26 | so that the trained object can be used during scoring inside transform() 27 | """ 28 | 29 | # Transform categorical columns into a numeric transformation using Weight of Evidence 30 | pt = PowerTransformer() 31 | pt.fit(X.values) 32 | 33 | # dump the trained object 34 | # into an artifact [in this example - woe.pkl] 35 | # and save it into output_dir so that it can be used later to impute on new data 36 | output_dir_path = Path(output_dir) 37 | if output_dir_path.exists() and output_dir_path.is_dir(): 38 | with open("{}/pt.pkl".format(output_dir), "wb") as fp: 39 | pickle.dump(pt, fp) 40 | 41 | 42 | def transform(data, transformer): 43 | """ This hook defines how DataRobot will use the trained object from fit() to transform new data. 44 | DataRobot runs this hook when the task is used for scoring inside a blueprint. 45 | As an output, this hook is expected to return the transformed data. 46 | The input parameters are passed by DataRobot based on dataset and blueprint configuration. 47 | 48 | Parameters 49 | ------- 50 | data: pd.DataFrame 51 | Data that DataRobot passes for transformation. 52 | transformer: Any 53 | Trained object, extracted by DataRobot from the artifact created inside fit(). 54 | 55 | Returns 56 | ------- 57 | pd.DataFrame 58 | Returns a dataframe with transformed data. 59 | """ 60 | 61 | return pd.DataFrame(transformer.transform(data.values), columns=data.columns).fillna(np.nan) -------------------------------------------------------------------------------- /custom_tasks/preprocessing/numeric/python/scaling/minmaxscaler/custom.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from sklearn.preprocessing import MinMaxScaler 8 | 9 | 10 | def fit(X, y, output_dir, **kwargs): 11 | """This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data 12 | DataRobot runs this hook when the task is being trained inside a blueprint. 13 | As an output, this hook is expected to create an artifact containg a trained object, that is then used to transform new data. 14 | The input parameters are passed by DataRobot based on project and blueprint configuration. 15 | 16 | Parameters 17 | ------- 18 | X: pd.DataFrame 19 | Training data that DataRobot passes when this task is being trained. 20 | y: pd.Series 21 | Project's target column (None is passed for unsupervised projects). 22 | output_dir: str 23 | A path to the output folder; the artifact [in this example - containing a power transformer] must be saved into this folder to be re-used in transform(). 
24 | 25 | Returns 26 | ------- 27 | None 28 | fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir 29 | so that the trained object can be used during scoring inside transform() 30 | """ 31 | 32 | # Transform numeric values to [0,1] based on sklearn's MinMaxScaler() 33 | scaler = MinMaxScaler() 34 | scaler.fit(X.values) 35 | 36 | # Dump fit scaler to artifact to use for transforms 37 | with open(os.path.join(output_dir, "minmaxscaler.pkl"), "wb") as fp: 38 | pickle.dump(scaler, fp) 39 | 40 | # Save the transformed input df as an object to inspect and confirm the scaler is working 41 | transformed = pd.DataFrame(scaler.transform(X.values)) 42 | 43 | transformed.to_csv(os.path.join(output_dir, "transformed.csv"), index=False) 44 | 45 | 46 | def transform(data, transformer): 47 | """This hook defines how DataRobot will use the trained object from fit() to transform new data. 48 | DataRobot runs this hook when the task is used for scoring inside a blueprint. 49 | As an output, this hook is expected to return the transformed data. 50 | The input parameters are passed by DataRobot based on dataset and blueprint configuration. 51 | 52 | Parameters 53 | ------- 54 | data: pd.DataFrame 55 | Data that DataRobot passes for transformation. 56 | transformer: Any 57 | Trained object, extracted by DataRobot from the artifact created inside fit(). 58 | 59 | Returns 60 | ------- 61 | pd.DataFrame 62 | Returns a dataframe with transformed data. 63 | """ 64 | 65 | transformed = pd.DataFrame( 66 | transformer.transform(data.values), columns=data.columns 67 | ).fillna(np.nan) 68 | 69 | return transformed 70 | -------------------------------------------------------------------------------- /custom_tasks/preprocessing/numeric/python/signal/butter_10_15_hp_1000/custom.py: -------------------------------------------------------------------------------- 1 | # Project: Custom Transform - Butterworth Filter 2 | # Project Cerebro Hackathon Team 3 | 4 | # See: https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.cheby1.html 5 | 6 | from scipy import signal 7 | import pickle 8 | import pandas as pd 9 | import numpy as np 10 | from pathlib import Path 11 | 12 | 13 | def fit(X, y, output_dir, **kwargs): 14 | """ This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data 15 | DataRobot runs this hook when the task is being trained inside a blueprint. 16 | As an output, this hook is expected to create an artifact containg a trained object [in this example - median of each numeric column], that is then used to transform new data. 17 | The input parameters are passed by DataRobot based on project and blueprint configuration. 18 | Parameters 19 | ------- 20 | X: pd.DataFrame 21 | Training data that DataRobot passes when this task is being trained. 22 | y: pd.Series 23 | Project's target column (None is passed for unsupervised projects). 24 | output_dir: str 25 | A path to the output folder; the artifact must be saved into this folder to be re-used in transform(). 
26 | Returns 27 | ------- 28 | None 29 | fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir 30 | so that the trained object can be used during scoring inside transform() 31 | """ 32 | 33 | sos = signal.butter(10, 15, 'hp', fs=1000, output='sos') 34 | 35 | # dump the trained object [in this example - dictionary with medians per column] 36 | # into an artifact [in this example - artifact.pkl] 37 | # and save it into output_dir so that it can be used later to impute on new data 38 | output_dir_path = Path(output_dir) 39 | if output_dir_path.exists() and output_dir_path.is_dir(): 40 | with open("{}/artifact.pkl".format(output_dir), "wb") as fp: 41 | pickle.dump(sos, fp) 42 | 43 | 44 | def transform(data, transformer): 45 | """ This hook defines how DataRobot will use the trained object from fit() to transform new data. 46 | DataRobot runs this hook when the task is used for scoring inside a blueprint. 47 | As an output, this hook is expected to return the transformed data. 48 | The input parameters are passed by DataRobot based on dataset and blueprint configuration. 49 | Parameters 50 | ------- 51 | data: pd.DataFrame 52 | Data that DataRobot passes for transformation. 53 | transformer: Any 54 | Trained object, extracted by DataRobot from the artifact created inside fit(). 55 | In this example, it's a dictionary with medians per column extracted from artifact.pkl. 56 | 57 | Returns 58 | ------- 59 | pd.DataFrame 60 | Returns a dataframe with transformed data. 61 | """ 62 | array = signal.sosfilt(transformer, x=data) 63 | df = pd.DataFrame(array) 64 | return df -------------------------------------------------------------------------------- /custom_tasks/preprocessing/numeric/python/signal/butter_4_100_lowpass/custom.py: -------------------------------------------------------------------------------- 1 | # Project: Custom Transform - Butterworth Filter 2 | # Project Cerebro Hackathon Team 3 | 4 | # See: https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.cheby1.html 5 | 6 | from scipy import signal 7 | import pickle 8 | import pandas as pd 9 | import numpy as np 10 | from pathlib import Path 11 | 12 | 13 | def fit(X, y, output_dir, **kwargs): 14 | """ This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data 15 | DataRobot runs this hook when the task is being trained inside a blueprint. 16 | As an output, this hook is expected to create an artifact containg a trained object [in this example - median of each numeric column], that is then used to transform new data. 17 | The input parameters are passed by DataRobot based on project and blueprint configuration. 18 | Parameters 19 | ------- 20 | X: pd.DataFrame 21 | Training data that DataRobot passes when this task is being trained. 22 | y: pd.Series 23 | Project's target column (None is passed for unsupervised projects). 24 | output_dir: str 25 | A path to the output folder; the artifact must be saved into this folder to be re-used in transform(). 
26 | Returns 27 | ------- 28 | None 29 | fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir 30 | so that the trained object can be used during scoring inside transform() 31 | """ 32 | 33 | sos = signal.butter(4, 100, 'low', analog=True, output='sos') 34 | 35 | # dump the trained object [in this example - dictionary with medians per column] 36 | # into an artifact [in this example - artifact.pkl] 37 | # and save it into output_dir so that it can be used later to impute on new data 38 | output_dir_path = Path(output_dir) 39 | if output_dir_path.exists() and output_dir_path.is_dir(): 40 | with open("{}/artifact.pkl".format(output_dir), "wb") as fp: 41 | pickle.dump(sos, fp) 42 | 43 | 44 | def transform(data, transformer): 45 | """ This hook defines how DataRobot will use the trained object from fit() to transform new data. 46 | DataRobot runs this hook when the task is used for scoring inside a blueprint. 47 | As an output, this hook is expected to return the transformed data. 48 | The input parameters are passed by DataRobot based on dataset and blueprint configuration. 49 | Parameters 50 | ------- 51 | data: pd.DataFrame 52 | Data that DataRobot passes for transformation. 53 | transformer: Any 54 | Trained object, extracted by DataRobot from the artifact created inside fit(). 55 | In this example, it's a dictionary with medians per column extracted from artifact.pkl. 56 | 57 | Returns 58 | ------- 59 | pd.DataFrame 60 | Returns a dataframe with transformed data. 61 | """ 62 | array = signal.sosfilt(transformer, x=data) 63 | df = pd.DataFrame(array) 64 | return df -------------------------------------------------------------------------------- /custom_tasks/preprocessing/numeric/python/signal/cheby1_sos_10_1_15/custom.py: -------------------------------------------------------------------------------- 1 | # Project: Custom Transform - Chebychev Filter 2 | # Project Cerebro Hackathon Team 3 | 4 | # See: https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.cheby1.html 5 | 6 | from scipy import signal 7 | import pickle 8 | import pandas as pd 9 | import numpy as np 10 | from pathlib import Path 11 | 12 | 13 | def fit(X, y, output_dir, **kwargs): 14 | """ This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data 15 | DataRobot runs this hook when the task is being trained inside a blueprint. 16 | As an output, this hook is expected to create an artifact containg a trained object [in this example - median of each numeric column], that is then used to transform new data. 17 | The input parameters are passed by DataRobot based on project and blueprint configuration. 18 | Parameters 19 | ------- 20 | X: pd.DataFrame 21 | Training data that DataRobot passes when this task is being trained. 22 | y: pd.Series 23 | Project's target column (None is passed for unsupervised projects). 24 | output_dir: str 25 | A path to the output folder; the artifact must be saved into this folder to be re-used in transform(). 
26 | Returns 27 | ------- 28 | None 29 | fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir 30 | so that the trained object can be used during scoring inside transform() 31 | """ 32 | 33 | sos = signal.cheby1(10, 1, 15, 'hp', fs=1000, output='sos') 34 | #filtered = signal.sosfilt(sos, x=X) 35 | 36 | # dump the trained object [in this example - dictionary with medians per column] 37 | # into an artifact [in this example - artifact.pkl] 38 | # and save it into output_dir so that it can be used later to impute on new data 39 | output_dir_path = Path(output_dir) 40 | if output_dir_path.exists() and output_dir_path.is_dir(): 41 | with open("{}/artifact.pkl".format(output_dir), "wb") as fp: 42 | pickle.dump(sos, fp) 43 | 44 | 45 | def transform(data, transformer): 46 | """ This hook defines how DataRobot will use the trained object from fit() to transform new data. 47 | DataRobot runs this hook when the task is used for scoring inside a blueprint. 48 | As an output, this hook is expected to return the transformed data. 49 | The input parameters are passed by DataRobot based on dataset and blueprint configuration. 50 | Parameters 51 | ------- 52 | data: pd.DataFrame 53 | Data that DataRobot passes for transformation. 54 | transformer: Any 55 | Trained object, extracted by DataRobot from the artifact created inside fit(). 56 | In this example, it's a dictionary with medians per column extracted from artifact.pkl. 57 | 58 | Returns 59 | ------- 60 | pd.DataFrame 61 | Returns a dataframe with transformed data. 62 | """ 63 | array = signal.sosfilt(transformer, x=data) 64 | df = pd.DataFrame(array) 65 | return df -------------------------------------------------------------------------------- /custom_tasks/preprocessing/numeric/python/signal/cheby1_sos_5_1_15/custom.py: -------------------------------------------------------------------------------- 1 | # Project: Custom Transform - Chebychev Filter 2 | # Project Cerebro Hackathon Team 3 | 4 | # See: https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.cheby1.html 5 | 6 | from scipy import signal 7 | import pickle 8 | import pandas as pd 9 | import numpy as np 10 | from pathlib import Path 11 | 12 | 13 | def fit(X, y, output_dir, **kwargs): 14 | """ This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data 15 | DataRobot runs this hook when the task is being trained inside a blueprint. 16 | As an output, this hook is expected to create an artifact containg a trained object [in this example - median of each numeric column], that is then used to transform new data. 17 | The input parameters are passed by DataRobot based on project and blueprint configuration. 18 | Parameters 19 | ------- 20 | X: pd.DataFrame 21 | Training data that DataRobot passes when this task is being trained. 22 | y: pd.Series 23 | Project's target column (None is passed for unsupervised projects). 24 | output_dir: str 25 | A path to the output folder; the artifact must be saved into this folder to be re-used in transform(). 
26 | Returns 27 | ------- 28 | None 29 | fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir 30 | so that the trained object can be used during scoring inside transform() 31 | """ 32 | 33 | sos = signal.cheby1(5, 1, 15, 'hp', fs=1000, output='sos') 34 | #filtered = signal.sosfilt(sos, x=X) 35 | 36 | # dump the trained object [in this example - dictionary with medians per column] 37 | # into an artifact [in this example - artifact.pkl] 38 | # and save it into output_dir so that it can be used later to impute on new data 39 | output_dir_path = Path(output_dir) 40 | if output_dir_path.exists() and output_dir_path.is_dir(): 41 | with open("{}/artifact.pkl".format(output_dir), "wb") as fp: 42 | pickle.dump(sos, fp) 43 | 44 | 45 | def transform(data, transformer): 46 | """ This hook defines how DataRobot will use the trained object from fit() to transform new data. 47 | DataRobot runs this hook when the task is used for scoring inside a blueprint. 48 | As an output, this hook is expected to return the transformed data. 49 | The input parameters are passed by DataRobot based on dataset and blueprint configuration. 50 | Parameters 51 | ------- 52 | data: pd.DataFrame 53 | Data that DataRobot passes for transformation. 54 | transformer: Any 55 | Trained object, extracted by DataRobot from the artifact created inside fit(). 56 | In this example, it's a dictionary with medians per column extracted from artifact.pkl. 57 | 58 | Returns 59 | ------- 60 | pd.DataFrame 61 | Returns a dataframe with transformed data. 62 | """ 63 | array = signal.sosfilt(transformer, x=data) 64 | df = pd.DataFrame(array) 65 | return df -------------------------------------------------------------------------------- /custom_tasks/preprocessing/numeric/python/signal/cheby2_12_20_17/custom.py: -------------------------------------------------------------------------------- 1 | # Project: Custom Transform - Chebychev Filter 2 | # Project Cerebro Hackathon Team 3 | 4 | # See: https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.cheby1.html 5 | 6 | from scipy import signal 7 | import pickle 8 | import pandas as pd 9 | import numpy as np 10 | from pathlib import Path 11 | 12 | 13 | def fit(X, y, output_dir, **kwargs): 14 | """ This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data 15 | DataRobot runs this hook when the task is being trained inside a blueprint. 16 | As an output, this hook is expected to create an artifact containg a trained object [in this example - median of each numeric column], that is then used to transform new data. 17 | The input parameters are passed by DataRobot based on project and blueprint configuration. 18 | Parameters 19 | ------- 20 | X: pd.DataFrame 21 | Training data that DataRobot passes when this task is being trained. 22 | y: pd.Series 23 | Project's target column (None is passed for unsupervised projects). 24 | output_dir: str 25 | A path to the output folder; the artifact must be saved into this folder to be re-used in transform(). 
26 | Returns 27 | ------- 28 | None 29 | fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir 30 | so that the trained object can be used during scoring inside transform() 31 | """ 32 | 33 | sos = signal.cheby2(12, 20, 17, 'hp', fs=1000, output='sos') 34 | #filtered = signal.sosfilt(sos, x=X) 35 | 36 | # dump the trained object [in this example - dictionary with medians per column] 37 | # into an artifact [in this example - artifact.pkl] 38 | # and save it into output_dir so that it can be used later to impute on new data 39 | output_dir_path = Path(output_dir) 40 | if output_dir_path.exists() and output_dir_path.is_dir(): 41 | with open("{}/artifact.pkl".format(output_dir), "wb") as fp: 42 | pickle.dump(sos, fp) 43 | 44 | 45 | def transform(data, transformer): 46 | """ This hook defines how DataRobot will use the trained object from fit() to transform new data. 47 | DataRobot runs this hook when the task is used for scoring inside a blueprint. 48 | As an output, this hook is expected to return the transformed data. 49 | The input parameters are passed by DataRobot based on dataset and blueprint configuration. 50 | Parameters 51 | ------- 52 | data: pd.DataFrame 53 | Data that DataRobot passes for transformation. 54 | transformer: Any 55 | Trained object, extracted by DataRobot from the artifact created inside fit(). 56 | In this example, it's a dictionary with medians per column extracted from artifact.pkl. 57 | 58 | Returns 59 | ------- 60 | pd.DataFrame 61 | Returns a dataframe with transformed data. 62 | """ 63 | array = signal.sosfilt(transformer, x=data) 64 | df = pd.DataFrame(array) 65 | return df -------------------------------------------------------------------------------- /custom_tasks/preprocessing/numeric/python/signal/fft/custom.py: -------------------------------------------------------------------------------- 1 | # Project: Custom Transform - Chebychev Filter 2 | # Project Cerebro Hackathon Team 3 | 4 | # by: Marshall 5 | 6 | from scipy.fft import fft2 7 | from scipy.fft import fft 8 | import pickle 9 | import pandas as pd 10 | import numpy as np 11 | from pathlib import Path 12 | 13 | 14 | def fit(X, y, output_dir, **kwargs): 15 | """ This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data 16 | DataRobot runs this hook when the task is being trained inside a blueprint. 17 | As an output, this hook is expected to create an artifact containing a trained object [in this example - median of each numeric column], that is then used to transform new data. 18 | The input parameters are passed by DataRobot based on project and blueprint configuration. 19 | Parameters 20 | ------- 21 | X: pd.DataFrame 22 | Training data that DataRobot passes when this task is being trained. 23 | y: pd.Series 24 | Project's target column (None is passed for unsupervised projects). 25 | output_dir: str 26 | A path to the output folder; the artifact must be saved into this folder to be re-used in transform(). 27 | Returns 28 | ------- 29 | None 30 | fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir 31 | so that the trained object can be used during scoring inside transform() 32 | """ 33 | 34 | # Placeholder so thing works. 
Equation is just to test 35 | fourier = X.median(axis=0, numeric_only=True, skipna=True).to_dict() 36 | 37 | # dump the trained object [in this example - dictionary with medians per column] 38 | # into an artifact [in this example - artifact.pkl] 39 | # and save it into output_dir so that it can be used later to impute on new data 40 | output_dir_path = Path(output_dir) 41 | if output_dir_path.exists() and output_dir_path.is_dir(): 42 | with open("{}/artifact.pkl".format(output_dir), "wb") as fp: 43 | pickle.dump(fourier, fp) 44 | 45 | 46 | def transform(data, transformer): 47 | """ This hook defines how DataRobot will use the trained object from fit() to transform new data. 48 | DataRobot runs this hook when the task is used for scoring inside a blueprint. 49 | As an output, this hook is expected to return the transformed data. 50 | The input parameters are passed by DataRobot based on dataset and blueprint configuration. 51 | Parameters 52 | ------- 53 | data: pd.DataFrame 54 | Data that DataRobot passes for transformation. 55 | transformer: Any 56 | Trained object, extracted by DataRobot from the artifact created inside fit(). 57 | In this example, it's a dictionary with medians per column extracted from artifact.pkl. 58 | 59 | Returns 60 | ------- 61 | pd.DataFrame 62 | Returns a dataframe with transformed data. 63 | """ 64 | df = data.copy() 65 | for i in data.columns: 66 | df[i] = fft(np.array(df.loc[:, i])).astype(float) 67 | 68 | return df 69 | -------------------------------------------------------------------------------- /custom_tasks/preprocessing/numeric/r/README.md: -------------------------------------------------------------------------------- 1 | README 2 | -------------------------------------------------------------------------------- /custom_tasks/preprocessing/other/r/README.md: -------------------------------------------------------------------------------- 1 | README 2 | -------------------------------------------------------------------------------- /custom_tasks/preprocessing/text/r/README.md: -------------------------------------------------------------------------------- 1 | README 2 | -------------------------------------------------------------------------------- /drum_overview/custom_model_reg/custom.py: -------------------------------------------------------------------------------- 1 | 2 | import pandas as pd 3 | def transform(data, model): 4 | """ 5 | Note: This hook may not have to be implemented for your model. 6 | In this case implemented for the model used in the example. 7 | Modify this method to add data transformation before scoring calls. For example, this can be 8 | used to implement one-hot encoding for models that don't include it on their own. 
9 | Parameters 10 | ---------- 11 | data: pd.DataFrame 12 | model: object, the deserialized model 13 | Returns 14 | ------- 15 | pd.DataFrame 16 | """ 17 | # Execute any steps you need to do before scoring 18 | # Remove target columns if they're in the dataset 19 | if "concrete_compressive_strength" in data: 20 | data.pop("concrete_compressive_strength") 21 | if "Species" in data: 22 | data.pop("Species") 23 | data = data.fillna(0) 24 | return data -------------------------------------------------------------------------------- /drum_overview/custom_model_reg/reg_rf_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/drum_overview/custom_model_reg/reg_rf_model.pkl -------------------------------------------------------------------------------- /drum_overview/readme.MD: -------------------------------------------------------------------------------- 1 | # Using MLOps DRUM to test your custom models 2 | 3 | This notebook provides an example of how you can use the MLOps DataRobot Model Runner (DRUM) library to test your custom models before uploading them to DataRobot and deploying them. 4 | 5 | ### Getting Started 6 | Open `Main_Script.ipynb` and follow the instructions. 7 | 8 | ### Requirements 9 | 10 | You can create the environment needed with the below commands (requires conda): 11 | 12 | `conda create --name my-env python=3.7.0` 13 | 14 | `conda activate my-env` 15 | 16 | `pip install -r requirements.txt` 17 | 18 | To learn about DRUM: 19 | 20 | Start by following the Quickstart instructions on https://github.com/datarobot/datarobot-user-models. 21 | 22 | Additional information can be found here: 23 | - https://github.com/datarobot/datarobot-user-models/tree/master/custom_model_runner 24 | - https://pypi.org/project/datarobot-drum/ 25 | 26 | 27 | 28 | ### Problem Type 29 | Regression, Binary Classification -------------------------------------------------------------------------------- /drum_overview/requirements.txt: -------------------------------------------------------------------------------- 1 | PyYAML==5.3.1 2 | xgboost==1.2.1 3 | datarobot-drum 4 | pandas==1.1.5 5 | scikit-learn==0.23.2 6 | tensorflow==2.5.0 -------------------------------------------------------------------------------- /tracking_agents/python/readme.MD: -------------------------------------------------------------------------------- 1 | # MLOps Agent - Python End to End 2 | 3 | This notebook provides an example of how you can use the MLOps Agents to monitor external deployments using DataRobot. 4 | 5 | ### Getting Started 6 | Open `Main_Script.ipynb` and follow the instructions. You can also execute this notebook through Google Colab.
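The snippet below is a rough sketch of what agent-based reporting typically looks like in Python. It assumes the `datarobot-mlops` package with a filesystem spooler; the deployment ID, model ID, spool directory, and the stand-in scoring data and predictions are placeholders, and `Main_Script.ipynb` contains the exact calls used in this example.

```python
import time

import pandas as pd
from datarobot.mlops.mlops import MLOps

# Placeholder identifiers; substitute the external deployment and model created in the notebook.
DEPLOYMENT_ID = "<DEPLOYMENT_ID>"
MODEL_ID = "<MODEL_ID>"

mlops = (
    MLOps()
    .set_deployment_id(DEPLOYMENT_ID)
    .set_model_id(MODEL_ID)
    .set_filesystem_spooler("/tmp/ta")  # directory the monitoring agent is configured to watch
    .init()
)

# Score with the external model, timing the call so execution stats can be reported.
scoring_data = pd.DataFrame({"feature_1": [1.0, 2.0, 3.0]})  # stand-in for real scoring data
start = time.time()
predictions = [0.1, 0.7, 0.4]                                # stand-in for real model output
elapsed_ms = (time.time() - start) * 1000

# Report service health and the prediction data; the tracking agent, started separately,
# picks up the spooled records from the spool directory and forwards them to DataRobot.
mlops.report_deployment_stats(num_predictions=len(predictions), execution_time_ms=elapsed_ms)
mlops.report_predictions_data(features_df=scoring_data, predictions=predictions)
mlops.shutdown()
```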
7 | 8 | ### Requirements 9 | You can create the environment needed with the below commands (requires conda): 10 | 11 | `conda create --name my-env python=3.7.0` 12 | 13 | `conda activate my-env` 14 | 15 | `pip install -r requirements.txt` -------------------------------------------------------------------------------- /tracking_agents/python/requirements.txt: -------------------------------------------------------------------------------- 1 | 2 | attrs==19.3.0 3 | backcall==0.2.0 4 | boto3==1.11.4 5 | botocore==1.14.4 6 | certifi==2020.12.5 7 | chardet==3.0.4 8 | contextlib2==0.6.0.post1 9 | datarobot==2.22.1 10 | decorator==4.4.2 11 | deprecation==2.1.0 12 | docutils==0.15.2 13 | future==0.18.2 14 | idna==2.10 15 | jmespath==0.10.0 16 | joblib==0.17.0 17 | numpy==1.19.4 18 | packaging==20.7 19 | pandas==1.1.5 20 | parso==0.7.0 21 | pika==0.13.1 22 | py4j==0.10.9 23 | pyparsing==2.4.7 24 | python-dateutil==2.8.1 25 | pytz==2020.4 26 | PyYAML==5.3.1 27 | pyzmq==20.0.0 28 | requests==2.25.0 29 | requests-toolbelt==0.9.1 30 | s3transfer==0.3.3 31 | scikit-learn==0.23.2 32 | scipy==1.5.4 33 | threadpoolctl==2.1.0 34 | trafaret==1.2.0 35 | urllib3==1.25.11 36 | --------------------------------------------------------------------------------