├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── custom_inference ├── python │ ├── cats_and_dogs │ │ ├── Main_Script.ipynb │ │ └── cats-and-dogs-README.md │ ├── gan_mnist │ │ ├── Colab - MNIST GAN Unstructured Model.ipynb │ │ ├── README.md │ │ ├── custom.py │ │ └── gan_weights.h5 │ ├── image_object_detection │ │ ├── README.md │ │ ├── analyze_image_object_detection_deployment.ipynb │ │ ├── coco_90.json │ │ └── model │ │ │ ├── custom.py │ │ │ ├── efficientdet-d1.h5 │ │ │ ├── efficientnet.py │ │ │ ├── initializers.py │ │ │ ├── layers.py │ │ │ ├── model_load_utils.py │ │ │ ├── object_detection_model.py │ │ │ ├── object_detection_utils.py │ │ │ ├── requirements.txt │ │ │ └── tfkeras.py │ ├── imdb_graph_isomorphism │ │ ├── Colab - Graph Isomorphism Network.ipynb │ │ ├── README.md │ │ ├── custom.py │ │ ├── gin.py │ │ ├── gin_model.h5 │ │ ├── requirements.txt │ │ ├── schema_graph.avsc │ │ └── score_data.py │ ├── insurance_pricing │ │ ├── Main_Script.ipynb │ │ ├── README.md │ │ ├── artifacts │ │ │ ├── lgbm.joblib │ │ │ ├── ordinalEncoder.joblib │ │ │ ├── simpleImputerCat.joblib │ │ │ └── simpleImputerNum.joblib │ │ ├── custom.py │ │ ├── data │ │ │ └── loss_cost_short.csv │ │ ├── feature_detail.yaml │ │ ├── mymodel.py │ │ ├── requirements.txt │ │ └── schema.avsc │ ├── mnist │ │ ├── MNIST for DRUM.ipynb │ │ ├── data │ │ │ ├── mnist.csv │ │ │ └── test.csv │ │ └── drum │ │ │ ├── classlabels.txt │ │ │ ├── custom.py │ │ │ ├── mnist.h5 │ │ │ └── requirements.txt │ ├── movie_recommender │ │ ├── DRUM_Recommender.ipynb │ │ ├── read_me.md │ │ └── recommender_model │ │ │ ├── custom.py │ │ │ ├── movies.csv │ │ │ ├── predict.csv │ │ │ ├── ratings_file.csv │ │ │ ├── saved_model.pb │ │ │ └── variables │ │ │ ├── variables.data-00000-of-00001 │ │ │ └── variables.index │ └── readmissions │ │ ├── README.md │ │ ├── Readmission_level_1 │ │ ├── Main_Script_Level_1.ipynb │ │ ├── catboost_info │ │ │ ├── catboost_training.json │ │ │ ├── learn │ │ │ │ └── events.out.tfevents │ │ │ ├── learn_error.tsv │ │ │ └── time_left.tsv │ │ └── custom_model │ │ │ ├── custom.py │ │ │ ├── model.pkl │ │ │ └── requirements.txt │ │ ├── Readmission_level_2 │ │ ├── Main_Script_Level_2.ipynb │ │ └── custom_model │ │ │ ├── custom.py │ │ │ ├── model.pkl │ │ │ ├── preprocessing.pkl │ │ │ └── requirements.txt │ │ ├── Readmission_level_3 │ │ ├── Main_Script_Level_3.ipynb │ │ └── custom_model │ │ │ ├── custom.py │ │ │ ├── model.pkl │ │ │ ├── preprocessing.pkl │ │ │ └── requirements.txt │ │ ├── data │ │ ├── readmissions_test.csv │ │ └── readmissions_train.csv │ │ └── requirements.txt ├── r │ ├── README.md │ ├── r_glm_noncaret_basic │ │ ├── README.md │ │ ├── create_pipeline.R │ │ └── custom.R │ ├── r_glm_noncaret_feateng │ │ ├── README.md │ │ ├── create_pipeline.R │ │ ├── custom.R │ │ ├── preprocess.R │ │ ├── rmcons.R │ │ └── rmident.R │ ├── r_glm_noncaret_gamma │ │ ├── README.md │ │ ├── create_pipeline.R │ │ └── custom.R │ ├── r_glm_noncaret_logit │ │ ├── README.md │ │ ├── create_pipeline.R │ │ └── custom.R │ └── r_glm_noncaret_recipe │ │ ├── README.md │ │ ├── create_pipeline.R │ │ └── custom.R └── scala │ └── iris_binary │ ├── README.md │ ├── build.sbt │ ├── custom-model │ ├── custom-scala-assembly-0.1.0.jar │ └── xgb-model │ │ └── model.bin │ ├── data │ └── iris_binary_training.csv │ ├── lib │ └── predictors.jar │ ├── project │ ├── build.properties │ └── plugins.sbt │ └── src │ └── main │ └── scala │ ├── Main.scala │ └── XGBoostPredictor.scala ├── custom_tasks ├── models │ ├── classification │ │ ├── python │ │ │ 
├── README.md │ │ │ ├── catboost │ │ │ │ ├── catboost_pipeline.py │ │ │ │ ├── custom.py │ │ │ │ ├── feature_selection.py │ │ │ │ └── requirements.txt │ │ │ └── graph_isomorphism_network │ │ │ │ ├── GNN_Custom_Task.ipynb │ │ │ │ ├── README.md │ │ │ │ ├── custom_task_gin │ │ │ │ ├── custom.py │ │ │ │ ├── graph_isomorphism_network.py │ │ │ │ └── requirements.txt │ │ │ │ ├── env │ │ │ │ ├── Dockerfile │ │ │ │ ├── README.md │ │ │ │ ├── __init__.py │ │ │ │ ├── dr_requirements.txt │ │ │ │ ├── env_info.json │ │ │ │ ├── fit.sh │ │ │ │ ├── requirements.txt │ │ │ │ └── start_server.sh │ │ │ │ ├── graph.csv │ │ │ │ └── graph2.csv │ │ └── r │ │ │ └── README.md │ ├── regression │ │ ├── python │ │ │ └── README.md │ │ └── r │ │ │ └── README.md │ └── unsupervised │ │ ├── python │ │ └── README.md │ │ └── r │ │ └── README.md ├── other │ ├── README.md │ ├── python │ │ ├── README.md │ │ └── round_predictions │ │ │ └── custom.py │ └── r │ │ └── README.md └── preprocessing │ ├── categorical │ ├── python │ │ ├── encoding │ │ │ ├── any_target │ │ │ │ ├── all_enc_catboost │ │ │ │ │ ├── README.md │ │ │ │ │ ├── custom.py │ │ │ │ │ └── requirements.txt │ │ │ │ ├── all_enc_hashing │ │ │ │ │ ├── README.md │ │ │ │ │ ├── custom.py │ │ │ │ │ └── requirements.txt │ │ │ │ └── all_enc_mest │ │ │ │ │ ├── README.md │ │ │ │ │ ├── custom.py │ │ │ │ │ └── requirements.txt │ │ │ ├── binary_target │ │ │ │ ├── binary_enc_backward_differencing │ │ │ │ │ ├── README.md │ │ │ │ │ ├── custom.py │ │ │ │ │ └── requirements.txt │ │ │ │ ├── binary_enc_glm │ │ │ │ │ ├── README.md │ │ │ │ │ ├── custom.py │ │ │ │ │ └── requirements.txt │ │ │ │ ├── binary_enc_helmert │ │ │ │ │ ├── README.md │ │ │ │ │ ├── custom.py │ │ │ │ │ └── requirements.txt │ │ │ │ ├── binary_enc_leaveonout │ │ │ │ │ ├── README.md │ │ │ │ │ ├── custom.py │ │ │ │ │ └── requirements.txt │ │ │ │ └── binary_enc_woe │ │ │ │ │ ├── README.md │ │ │ │ │ ├── custom.py │ │ │ │ │ └── requirements.txt │ │ │ ├── multiclass_target │ │ │ │ └── .gitignore │ │ │ └── regression_target │ │ │ │ └── regression_enc_glm │ │ │ │ ├── README.md │ │ │ │ ├── custom.py │ │ │ │ └── requirements.txt │ │ └── imputing │ │ │ └── .gitignore │ └── r │ │ └── README.md │ ├── images │ └── r │ │ └── README.md │ ├── numeric │ ├── python │ │ ├── encoding │ │ │ └── .gitignore │ │ ├── imputing │ │ │ ├── knn_imputer_fixed_n │ │ │ │ ├── README.md │ │ │ │ └── custom.py │ │ │ └── median_impute │ │ │ │ └── custom.py │ │ ├── monotonic transforms │ │ │ └── power_transformer │ │ │ │ └── custom.py │ │ ├── scaling │ │ │ └── minmaxscaler │ │ │ │ └── custom.py │ │ └── signal │ │ │ ├── butter_10_15_hp_1000 │ │ │ └── custom.py │ │ │ ├── butter_4_100_lowpass │ │ │ └── custom.py │ │ │ ├── cheby1_sos_10_1_15 │ │ │ └── custom.py │ │ │ ├── cheby1_sos_5_1_15 │ │ │ └── custom.py │ │ │ ├── cheby2_12_20_17 │ │ │ └── custom.py │ │ │ └── fft │ │ │ └── custom.py │ └── r │ │ └── README.md │ ├── other │ └── r │ │ └── README.md │ └── text │ └── r │ └── README.md ├── drum_overview ├── Main_Script.ipynb ├── custom_model_reg │ ├── custom.py │ └── reg_rf_model.pkl ├── data │ ├── concrete_test.csv │ └── concrete_train.csv ├── readme.MD └── requirements.txt └── tracking_agents └── python ├── Main_Script.ipynb ├── readme.MD └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .vscode 3 | .metals 4 | **/.DS_Store 5 | **/.vscode 6 | 7 | **/ipynb_checkpoints 8 | **/__pycache__ -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: 
-------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at community@datarobot.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 
67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Guidelines for developing and contributing to this project. 4 | 5 | ## List of project maintainers 6 | 7 | - [Matthew Cohen](https://github.com/mcohenmcohen) 8 | - [Thodoris Petropoulos](https://github.com/TheoPetropoulos) 9 | 10 | ## Opening new issues 11 | 12 | - Before opening a new issue check if there are any existing FAQ entries (if one exists), issues or pull requests that match your case 13 | - Open an issue, and make sure to label the issue accordingly - bug, improvement, feature request, etc... 14 | - Be as specific and detailed as possible 15 | 16 | ## Making a pull request 17 | 18 | Due to security concerns (API Keys leaking by mistake), please communicate directly with [Thodoris Petropoulos](https://github.com/TheoPetropoulos) and/or [Matthew Cohen](https://github.com/mcohenmcohen) to request any changes in this repository. 19 | 20 | ## Responding to issues and pull requests 21 | 22 | This project's maintainers will make every effort to respond to any open issues as soon as possible. 23 | 24 | If you don't get a response within 7 days of creating your issue or pull request, please send us an email at community@datarobot.com. 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | **Please note:** The code in these repos is sourced from the DataRobot user community and is not owned or maintained by DataRobot, Inc. You may need to make edits or updates for this code to function properly in your environment. 2 | 3 | # Custom Models and External Deployments Monitoring 4 | 5 | ## Important Note 6 | 7 | This repo contains a library of commonly used tasks submitted by the DataRobot community. They tend to have more complex logic and are 8 | meant to be used as-is rather than as a reference. If you are not familiar with DataRobot's Custom Inference Models, Custom Tasks, or Composable ML 9 | please see this repo instead for tutorials / reference examples: 10 | https://github.com/datarobot/datarobot-user-models 11 | 12 | There is also extensive documentation on the platform docs at: https://docs.datarobot.com/ 13 | 14 | ## Usage 15 | 16 | For each respective guide, follow the instructions in its own `.ipynb` or `.py` file. There will also be a `requirements.txt` file in each folder with instructions on how to create an environment to run everything successfully. 17 | 18 | Here is some explanation of the different definitions used throughout: 19 | - **MLOps Tracking Agents**: MLOps Tracking Agents are used when you want to deploy external models and monitor them in DataRobot. For example, you have a custom (or DataRobot) model and you deploy it in your own Kubernetes cluster (or anywhere really). 
In those cases, MLOps tracking agents will send statistics back to DataRobot so that you can still monitor your model's accuracy, service health, data drift, etc. 20 | - **MLOps DRUM overview**: MLOps DRUM is an open-source framework created and managed by DataRobot that allows you to easily deploy custom models. It provides out-of-the-box consistency and validity checks, as well as single-command deployment. DRUM is also seamlessly integrated with the DataRobot platform. If you use the framework, then you can use your custom models directly within the DataRobot platform. Here is the official GitHub repository for [DRUM](https://github.com/datarobot/datarobot-user-models). 21 | - **Custom Inference Models**: End-to-end examples of custom modeling code and how it is structured in order to be deployable using the DataRobot platform. The custom code here essentially takes advantage of the DRUM framework mentioned above. 22 | - **Custom Tasks**: With Composable AI, DataRobot allows you to manipulate DataRobot-created blueprints and add your own custom preprocessing steps. Within custom tasks, there are examples of what your code needs to look like to achieve this. 23 | 24 | Some of the notebooks can also be executed through Google Colab. 25 | 26 | ## Important Links 27 | 28 | - To learn to use DataRobot, visit [DataRobot University](https://university.datarobot.com/) 29 | - For general articles on DataRobot and news, visit [DataRobot Community](https://community.datarobot.com/) 30 | - End-to-end DataRobot API examples: [Tutorials for Data Scientists](https://github.com/datarobot-community/tutorials-for-data-scientists) 31 | - DataRobot API examples: [Examples for Data Scientists](https://github.com/datarobot-community/examples-for-data-scientists) 32 | 33 | ## Contents 34 | 35 | ### MLOps Tracking Agents Overview 36 | - *MLOps Tracking Agent Notebook*: An example of how you can use DataRobot's MLOps Agents functionality to monitor external deployments. [Python](https://github.com/datarobot-community/custom-models/tree/master/tracking_agents/python) 37 | 38 | ### MLOps DRUM Overview 39 | - *MLOps DRUM Notebook*: An example of how you can use the DataRobot Model Runner (DRUM) library to test your custom models before deploying them using DataRobot. [Python](https://github.com/datarobot-community/custom-models/blob/master/drum_overview/Main_Script.ipynb) 40 | 41 | ### Custom Inference Model Examples 42 | - *Custom Inference Models*: Examples in multiple languages on how to create custom inference models. Some of the scripts have been updated to also include the code needed to run them as custom training models: [Multiple Languages](https://github.com/datarobot-community/custom-models/tree/master/custom_inference) 43 | 44 | ### Custom Tasks 45 | - *Custom Tasks*: Examples of custom tasks that you can use directly within the DataRobot platform to manipulate blueprints. Check out what they look like and create your own tasks! [Multiple Languages](https://github.com/datarobot-community/custom-models/tree/master/custom_tasks) 46 | 47 | 48 | ## Setup/Installation 49 | 50 | Each project folder contains its own instructions on setup and requirements. Furthermore, instructions are also conveniently added to the scripts themselves so that users do not need to share the readme file. 51 | 52 | ## Development and Contributing 53 | 54 | If you'd like to report an issue or bug, suggest improvements, or contribute code to this project, please refer to [CONTRIBUTING.md](CONTRIBUTING.md). 
55 | 56 | 57 | # Code of Conduct 58 | 59 | This project has adopted the Contributor Covenant for its Code of Conduct. 60 | See [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md) to read it in full. 61 | 62 | # License 63 | 64 | Licensed under the Apache License 2.0. 65 | See [LICENSE](LICENSE) to read it in full. 66 | 67 | 68 | -------------------------------------------------------------------------------- /custom_inference/python/cats_and_dogs/cats-and-dogs-README.md: -------------------------------------------------------------------------------- 1 | # Cats and Dogs Example 2 | 3 | This folder includes an example of how to use DRUM with a Keras DNN leveraging GPUs for inference. GPU is a costly resource for inference, but the point here is to show how this can be accomplished via DRUM if the need arises. The model was trained to classify an image as a cat or a dog. This model originated in the model templates available within the [DRUM repository](https://github.com/datarobot/datarobot-user-models/tree/master/model_templates/inference/python3_keras_vizai_joblib). 4 | 5 | Use Google Colab to follow along with `Main_Script.ipynb`. 6 | 7 | In this notebook you will: 8 | 9 | * use DRUM to score data in batch 10 | * use DRUM to serve the model as a REST endpoint leveraging GPUs for inference 11 | 12 | Serving the model can be done with either Flask, or Nginx and uWSGI. Using Nginx, you will have to modify some files, but all of the content is highlighted in the notebook. -------------------------------------------------------------------------------- /custom_inference/python/gan_mnist/README.md: -------------------------------------------------------------------------------- 1 | ## GAN MNIST as Unstructured Model 2 | 3 | #### Owner Tim Whittaker => timothy.whittaker@datarobot.com 4 | 5 | This demonstrates serving a GAN as an unstructured model with [DRUM](https://github.com/datarobot/datarobot-user-models/). 6 | 7 | Please see the notebook in this directory for more detail. 
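For quick reference, here is a minimal client sketch for the deployed GAN. The address, port, and output file names are assumptions (use whatever you pass to `drum server`); as shown in `custom.py` below, the `score_unstructured()` hook expects a JSON payload with a `num_digits` field and returns a pickled list of PIL images, so Pillow is needed on the client side to unpickle them.

```python
# Hypothetical client for the GAN served as a DRUM unstructured model, e.g. after:
#   drum server --code-dir . --target-type unstructured --address localhost:6789
import json
import pickle

import requests

resp = requests.post(
    "http://localhost:6789/predictUnstructured/",
    data=json.dumps({"num_digits": 4}),            # score_unstructured() reads "num_digits"
    headers={"Content-Type": "application/json"},
)
resp.raise_for_status()

images = pickle.loads(resp.content)                # pickled list of PIL images
for i, img in enumerate(images):
    img.save("generated_img_{}.png".format(i))
```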
-------------------------------------------------------------------------------- /custom_inference/python/gan_mnist/custom.py: -------------------------------------------------------------------------------- 1 | 2 | #import pandas as pd 3 | import tensorflow as tf 4 | from tensorflow import keras 5 | from tensorflow.keras import layers 6 | import numpy as np 7 | import os 8 | import pickle 9 | import json 10 | 11 | def load_model(input_dir): 12 | generator = keras.Sequential( 13 | [ 14 | keras.Input(shape=(128,)), 15 | # We want to generate 128 coefficients to reshape into a 7x7x128 map 16 | layers.Dense(7 * 7 * 128), 17 | layers.LeakyReLU(alpha=0.2), 18 | layers.Reshape((7, 7, 128)), 19 | layers.Conv2DTranspose(128, (4, 4), strides=(2, 2), padding="same"), 20 | layers.LeakyReLU(alpha=0.2), 21 | layers.Conv2DTranspose(128, (4, 4), strides=(2, 2), padding="same"), 22 | layers.LeakyReLU(alpha=0.2), 23 | layers.Conv2D(1, (7, 7), padding="same", activation="sigmoid"), 24 | ], 25 | name="generator", 26 | ) 27 | generator.load_weights(os.path.join(input_dir, "gan_weights.h5")) 28 | return generator 29 | 30 | def score_unstructured(model, data, query, **kwargs): 31 | print("Incoming content type params: ", kwargs) 32 | print("Incoming data type: ", type(data)) 33 | print("Incoming query params: ", query) 34 | 35 | ## data is expected to be the number of images to generate 36 | random_latent_vectors = 128 37 | 38 | 39 | num_digits = json.loads(data)["num_digits"] 40 | ## need to parse data to int 41 | 42 | random_latent_vectors = tf.random.normal(shape=(num_digits, 128)) 43 | rand_imgs = model(random_latent_vectors) 44 | rand_imgs *= 255 45 | rand_imgs.numpy() 46 | images = [] 47 | # images["num_images": d] 48 | for i in range(num_digits): 49 | img = keras.preprocessing.image.array_to_img(rand_imgs[i]) 50 | images.append(img) 51 | # img.save("generated_img_{i}_{epoch}.png".format(i=i, epoch=epoch)) 52 | return pickle.dumps(images) 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /custom_inference/python/gan_mnist/gan_weights.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_inference/python/gan_mnist/gan_weights.h5 -------------------------------------------------------------------------------- /custom_inference/python/image_object_detection/README.md: -------------------------------------------------------------------------------- 1 | ## Python Keras Image Object Detection Custom Inference Model Template 2 | 3 | This model is intended to work with the [Python 3 Keras Drop-In Environment](../../public_dropin_environments/python3_keras/). 
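Once the model is running under `drum server` (see the Instructions below), it can be scored with a plain HTTP call. The sketch below is a hypothetical client — the address, port, and input file path are assumptions. The `score_unstructured()` hook takes a base64-encoded image string and returns a JSON list of `[x1, y1, x2, y2, label_id]` rows; `coco_90.json` ships alongside the model as a lookup from ids to class names.

```python
# Hypothetical client for the object detection model, e.g. after:
#   drum server --code-dir ./model --target-type unstructured --address localhost:6789
import json

import requests

# Base64-encoded image, e.g. the image_b64.txt file referenced in the Instructions below
with open("image_b64.txt") as f:
    b64_image = f.read()

resp = requests.post(
    "http://localhost:6789/predictUnstructured/",
    data=b64_image,
    headers={"Content-Type": "text/plain"},
)
resp.raise_for_status()

# Each row is [x1, y1, x2, y2, label_id]
for box in json.loads(resp.text):
    print(box)
```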
4 | 5 | ## Instructions 6 | Create a new custom model with `Unstructured` Target Type, add the files in the model folder and use the [Python 3 Keras Drop-In Environment] with it 7 | 8 | Test with custom-models-wip/drum_overview/data/image_b64.txt 9 | 10 | ### To run locally using 'drum' 11 | Paths are relative to `./custom-models`: 12 | `drum score --code-dir ./custom_inference/python/image_object_detection/model --target-type unstructured --input ./drum_overview/data/image_b64.txt --verbose` 13 | -------------------------------------------------------------------------------- /custom_inference/python/image_object_detection/coco_90.json: -------------------------------------------------------------------------------- 1 | { 2 | "1": { 3 | "id": 1, 4 | "name": "person" 5 | }, 6 | "2": { 7 | "id": 2, 8 | "name": "bicycle" 9 | }, 10 | "3": { 11 | "id": 3, 12 | "name": "car" 13 | }, 14 | "4": { 15 | "id": 4, 16 | "name": "motorcycle" 17 | }, 18 | "5": { 19 | "id": 5, 20 | "name": "airplane" 21 | }, 22 | "6": { 23 | "id": 6, 24 | "name": "bus" 25 | }, 26 | "7": { 27 | "id": 7, 28 | "name": "train" 29 | }, 30 | "8": { 31 | "id": 8, 32 | "name": "truck" 33 | }, 34 | "9": { 35 | "id": 9, 36 | "name": "boat" 37 | }, 38 | "10": { 39 | "id": 10, 40 | "name": "traffic light" 41 | }, 42 | "11": { 43 | "id": 11, 44 | "name": "fire hydrant" 45 | }, 46 | "13": { 47 | "id": 13, 48 | "name": "stop sign" 49 | }, 50 | "14": { 51 | "id": 14, 52 | "name": "parking meter" 53 | }, 54 | "15": { 55 | "id": 15, 56 | "name": "bench" 57 | }, 58 | "16": { 59 | "id": 16, 60 | "name": "bird" 61 | }, 62 | "17": { 63 | "id": 17, 64 | "name": "cat" 65 | }, 66 | "18": { 67 | "id": 18, 68 | "name": "dog" 69 | }, 70 | "19": { 71 | "id": 19, 72 | "name": "horse" 73 | }, 74 | "20": { 75 | "id": 20, 76 | "name": "sheep" 77 | }, 78 | "21": { 79 | "id": 21, 80 | "name": "cow" 81 | }, 82 | "22": { 83 | "id": 22, 84 | "name": "elephant" 85 | }, 86 | "23": { 87 | "id": 23, 88 | "name": "bear" 89 | }, 90 | "24": { 91 | "id": 24, 92 | "name": "zebra" 93 | }, 94 | "25": { 95 | "id": 25, 96 | "name": "giraffe" 97 | }, 98 | "27": { 99 | "id": 27, 100 | "name": "backpack" 101 | }, 102 | "28": { 103 | "id": 28, 104 | "name": "umbrella" 105 | }, 106 | "31": { 107 | "id": 31, 108 | "name": "handbag" 109 | }, 110 | "32": { 111 | "id": 32, 112 | "name": "tie" 113 | }, 114 | "33": { 115 | "id": 33, 116 | "name": "suitcase" 117 | }, 118 | "34": { 119 | "id": 34, 120 | "name": "frisbee" 121 | }, 122 | "35": { 123 | "id": 35, 124 | "name": "skis" 125 | }, 126 | "36": { 127 | "id": 36, 128 | "name": "snowboard" 129 | }, 130 | "37": { 131 | "id": 37, 132 | "name": "sports ball" 133 | }, 134 | "38": { 135 | "id": 38, 136 | "name": "kite" 137 | }, 138 | "39": { 139 | "id": 39, 140 | "name": "baseball bat" 141 | }, 142 | "40": { 143 | "id": 40, 144 | "name": "baseball glove" 145 | }, 146 | "41": { 147 | "id": 41, 148 | "name": "skateboard" 149 | }, 150 | "42": { 151 | "id": 42, 152 | "name": "surfboard" 153 | }, 154 | "43": { 155 | "id": 43, 156 | "name": "tennis racket" 157 | }, 158 | "44": { 159 | "id": 44, 160 | "name": "bottle" 161 | }, 162 | "46": { 163 | "id": 46, 164 | "name": "wine glass" 165 | }, 166 | "47": { 167 | "id": 47, 168 | "name": "cup" 169 | }, 170 | "48": { 171 | "id": 48, 172 | "name": "fork" 173 | }, 174 | "49": { 175 | "id": 49, 176 | "name": "knife" 177 | }, 178 | "50": { 179 | "id": 50, 180 | "name": "spoon" 181 | }, 182 | "51": { 183 | "id": 51, 184 | "name": "bowl" 185 | }, 186 | "52": { 187 | "id": 52, 188 | "name": "banana" 
189 | }, 190 | "53": { 191 | "id": 53, 192 | "name": "apple" 193 | }, 194 | "54": { 195 | "id": 54, 196 | "name": "sandwich" 197 | }, 198 | "55": { 199 | "id": 55, 200 | "name": "orange" 201 | }, 202 | "56": { 203 | "id": 56, 204 | "name": "broccoli" 205 | }, 206 | "57": { 207 | "id": 57, 208 | "name": "carrot" 209 | }, 210 | "58": { 211 | "id": 58, 212 | "name": "hot dog" 213 | }, 214 | "59": { 215 | "id": 59, 216 | "name": "pizza" 217 | }, 218 | "60": { 219 | "id": 60, 220 | "name": "donut" 221 | }, 222 | "61": { 223 | "id": 61, 224 | "name": "cake" 225 | }, 226 | "62": { 227 | "id": 62, 228 | "name": "chair" 229 | }, 230 | "63": { 231 | "id": 63, 232 | "name": "couch" 233 | }, 234 | "64": { 235 | "id": 64, 236 | "name": "potted plant" 237 | }, 238 | "65": { 239 | "id": 65, 240 | "name": "bed" 241 | }, 242 | "67": { 243 | "id": 67, 244 | "name": "dining table" 245 | }, 246 | "70": { 247 | "id": 70, 248 | "name": "toilet" 249 | }, 250 | "72": { 251 | "id": 72, 252 | "name": "tv" 253 | }, 254 | "73": { 255 | "id": 73, 256 | "name": "laptop" 257 | }, 258 | "74": { 259 | "id": 74, 260 | "name": "mouse" 261 | }, 262 | "75": { 263 | "id": 75, 264 | "name": "remote" 265 | }, 266 | "76": { 267 | "id": 76, 268 | "name": "keyboard" 269 | }, 270 | "77": { 271 | "id": 77, 272 | "name": "cell phone" 273 | }, 274 | "78": { 275 | "id": 78, 276 | "name": "microwave" 277 | }, 278 | "79": { 279 | "id": 79, 280 | "name": "oven" 281 | }, 282 | "80": { 283 | "id": 80, 284 | "name": "toaster" 285 | }, 286 | "81": { 287 | "id": 81, 288 | "name": "sink" 289 | }, 290 | "82": { 291 | "id": 82, 292 | "name": "refrigerator" 293 | }, 294 | "84": { 295 | "id": 84, 296 | "name": "book" 297 | }, 298 | "85": { 299 | "id": 85, 300 | "name": "clock" 301 | }, 302 | "86": { 303 | "id": 86, 304 | "name": "vase" 305 | }, 306 | "87": { 307 | "id": 87, 308 | "name": "scissors" 309 | }, 310 | "88": { 311 | "id": 88, 312 | "name": "teddy bear" 313 | }, 314 | "89": { 315 | "id": 89, 316 | "name": "hair drier" 317 | }, 318 | "90": { 319 | "id": 90, 320 | "name": "toothbrush" 321 | } 322 | } -------------------------------------------------------------------------------- /custom_inference/python/image_object_detection/model/custom.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from sklearn.pipeline import Pipeline 4 | import json 5 | 6 | from model_load_utils import ( 7 | load_image_object_detection_inference_pipeline, 8 | predict_with_preprocessing, 9 | ) 10 | 11 | 12 | def load_model(input_dir: str) -> Pipeline: 13 | """ 14 | Note: This hook may not have to be implemented for your model. 15 | In this case implemented for the model used in the example. 16 | 17 | This keras estimator requires 'load_model()' to be overridden. Coz as it involves pipeline of 18 | preprocessor and estimator bundled together, it requires a special handling (oppose to usually 19 | simple keras.models.load_model() or unpickling) to load the model. Currently there is no elegant 20 | default method to save the keras classifier/regressor along with the sklearn pipeline. Hence we 21 | use deserialize_estimator_pipeline() to load the model pipeline to predict. 
22 | 23 | Parameters 24 | ---------- 25 | input_dir: str 26 | 27 | Returns 28 | ------- 29 | pipelined_model: Pipeline 30 | Estimator pipeline obj 31 | """ 32 | model = load_image_object_detection_inference_pipeline(input_dir) 33 | return model 34 | 35 | 36 | def transform(b64_image_array: str, model: Any) -> list: 37 | """ 38 | Intended to apply transformations to the prediction data before making predictions. This is 39 | most useful if DRUM supports the model's library, but your model requires additional data 40 | processing before it can make predictions 41 | 42 | Parameters 43 | ---------- 44 | data : is the dataframe given to DRUM to make predictions on 45 | model : is the deserialized model loaded by DRUM or by `load_model`, if supplied 46 | 47 | Returns 48 | ------- 49 | Transformed data: np.ndarray 50 | """ 51 | predicted_labels = predict_with_preprocessing(model, b64_image_array) 52 | return predicted_labels 53 | 54 | 55 | def score_unstructured(model, data, query, **kwargs): 56 | print("Model: ", model) 57 | print("Incoming content type params: ", kwargs) 58 | print("Incoming data type: ", type(data)) 59 | print("Incoming data: ", data) 60 | 61 | print("Incoming query params: ", query) 62 | if isinstance(data, bytes): 63 | data = data.decode("utf8") 64 | ret = transform(data, model).astype(int).tolist() 65 | 66 | ret_mode = query.get("ret_mode", "") 67 | if ret_mode == "binary": 68 | ret_data = ret.tobytes() 69 | ret_kwargs = {"mimetype": "application/octet-stream"} 70 | ret = ret_data, ret_kwargs 71 | else: 72 | ret = json.dumps(ret) 73 | return ret 74 | 75 | 76 | # def score(data: pd.DataFrame, model: Any, **kwargs: Dict[str, Any]) -> pd.DataFrame: 77 | # """ 78 | # This hook is only needed if you would like to use DRUM with a framework not natively 79 | # supported by the tool. 80 | # 81 | # Parameters 82 | # ---------- 83 | # data : is the dataframe to make predictions against. If `transform` is supplied, 84 | # `data` will be the transformed data. 85 | # model : is the deserialized model loaded by DRUM or by `load_model`, if supplied 86 | # kwargs : additional keyword arguments to the method 87 | # In case of classification model class labels will be provided as the following arguments: 88 | # - `positive_class_label` is the positive class label for a binary classification model 89 | # - `negative_class_label` is the negative class label for a binary classification model 90 | # 91 | # Returns 92 | # ------- 93 | # This method should return predictions as a dataframe with the following format: 94 | # Binary Classification: must have columns for each class label with floating- point class 95 | # probabilities as values. Each row should sum to 1.0 96 | # Regression: must have a single column called `Predictions` with numerical values 97 | # 98 | # """ 99 | 100 | # def post_process(predictions: pd.DataFrame, model: Any) -> pd.DataFrame: 101 | # """ 102 | # This method is only needed if your model's output does not match the above expectations 103 | # 104 | # Parameters 105 | # ---------- 106 | # predictions : is the dataframe of predictions produced by DRUM or by 107 | # the `score` hook, if supplied 108 | # model : is the deserialized model loaded by DRUM or by `load_model`, if supplied 109 | # 110 | # Returns 111 | # ------- 112 | # This method should return predictions as a dataframe with the following format: 113 | # Binary Classification: must have columns for each class label with floating- point class 114 | # probabilities as values. 
Each row 115 | # should sum to 1.0 116 | # Regression: must have a single column called `Predictions` with numerical values 117 | # 118 | # """ 119 | -------------------------------------------------------------------------------- /custom_inference/python/image_object_detection/model/efficientdet-d1.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_inference/python/image_object_detection/model/efficientdet-d1.h5 -------------------------------------------------------------------------------- /custom_inference/python/image_object_detection/model/initializers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2017-2018 Fizyr (https://fizyr.com) 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | 17 | from tensorflow import keras 18 | 19 | import numpy as np 20 | import math 21 | 22 | 23 | class PriorProbability(keras.initializers.Initializer): 24 | """ Apply a prior probability to the weights. 25 | """ 26 | 27 | def __init__(self, probability=0.01): 28 | self.probability = probability 29 | 30 | def get_config(self): 31 | return {"probability": self.probability} 32 | 33 | def __call__(self, shape, dtype=None): 34 | # set bias to -log((1 - p)/p) for foreground 35 | result = np.ones(shape, dtype=np.float32) * -math.log( 36 | (1 - self.probability) / self.probability 37 | ) 38 | 39 | return result 40 | -------------------------------------------------------------------------------- /custom_inference/python/image_object_detection/model/model_load_utils.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import io 3 | import os 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from PIL import Image 8 | from sklearn.pipeline import Pipeline 9 | 10 | from object_detection_model import efficientdet 11 | from object_detection_utils import preprocess_image, postprocess_boxes 12 | 13 | IMG_SIZE = 150 14 | IMG_SHAPE = (IMG_SIZE, IMG_SIZE, 3) 15 | PHI = 1 16 | WEIGHTED_BIFPN = True 17 | MODEL_PATH = "efficientdet-d1.h5" 18 | IMAGE_SIZES = (512, 640, 768, 896, 1024, 1280, 1408) 19 | IMAGE_SIZE = IMAGE_SIZES[PHI] 20 | NUM_CLASSES = 90 # from coco 21 | SCORE_THRESHOLD = 0.3 22 | 23 | 24 | def img_preprocessing(image: Image) -> np.ndarray: 25 | """ given a PIL.Image object resize, convert to RGB and return as np.array """ 26 | image, scale = preprocess_image(image, image_size=IMAGE_SIZE) 27 | return image, scale 28 | 29 | 30 | def get_img_obj_from_base64_str(b64_img_str: str) -> Image: 31 | """ given a base64 encoded image str get the PIL.Image object """ 32 | b64_img = base64.b64decode(b64_img_str) 33 | image_bytes = io.BytesIO(b64_img) 34 | return Image.open(image_bytes) 35 | 36 | 37 | def get_base64_str_from_PIL_img(pillowed_img: Image) -> str: 38 | """ given a PIL.Image object return base64 encoded str of the image 
object """ 39 | buffer = io.BytesIO() 40 | pillowed_img.save(buffer, format="JPEG") 41 | return base64.b64encode(buffer.getvalue()) 42 | 43 | 44 | def load_and_preprocess_image_data(x_data: np.ndarray) -> pd.DataFrame: 45 | """ Apply the preprocessing methods on the data before prediction for the model to work on """ 46 | try: 47 | image = get_img_obj_from_base64_str(x_data) 48 | except: 49 | image = get_imputation_img() 50 | return img_preprocessing(image) 51 | 52 | 53 | def apply_image_data_preprocessing(x_data: np.ndarray) -> np.ndarray: 54 | """ Image data preprocessing before fit """ 55 | x_data, scale = load_and_preprocess_image_data(x_data) 56 | return x_data, scale 57 | 58 | 59 | def get_imputation_img() -> str: 60 | """ Black image in base64 str for data imputation filling """ 61 | black_PIL_img = Image.fromarray(np.zeros(IMG_SHAPE, dtype="float32"), "RGB") 62 | return black_PIL_img 63 | 64 | 65 | def predict_with_preprocessing(model, b64_image_string: str) -> np.ndarray: 66 | """ Apply necessary preprocessing to conver b64 image string to image values, preprocessing to 67 | the image values and finally predict bounding boxes and labels with the preprocessed image 68 | values 69 | """ 70 | image, scale = apply_image_data_preprocessing(b64_image_string) 71 | w, h = image.shape[:2] 72 | boxes, scores, labels = model.predict(np.expand_dims(image, axis=0)) 73 | boxes, scores, labels = np.squeeze(boxes), np.squeeze(scores), np.squeeze(labels) 74 | boxes = postprocess_boxes(boxes=boxes, scale=scale, height=h, width=w) 75 | # select indices which have a score above the threshold 76 | indices = np.where(scores[:] > SCORE_THRESHOLD)[0] 77 | # select those detections 78 | boxes = boxes[indices] 79 | labels = labels[indices] 80 | return np.hstack([boxes, np.expand_dims(labels, 1)]) 81 | 82 | 83 | def load_image_object_detection_inference_pipeline(input_dir: str) -> Pipeline: 84 | """ Load keras based image object detection model used to predict bounding boxes and labels """ 85 | _, model = efficientdet( 86 | phi=PHI, 87 | weighted_bifpn=WEIGHTED_BIFPN, 88 | num_classes=NUM_CLASSES, 89 | score_threshold=SCORE_THRESHOLD, 90 | ) 91 | model.load_weights(os.path.join(input_dir, MODEL_PATH), by_name=True) 92 | return model 93 | -------------------------------------------------------------------------------- /custom_inference/python/image_object_detection/model/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow==1.15.0 2 | -------------------------------------------------------------------------------- /custom_inference/python/image_object_detection/model/tfkeras.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 4 | Copyright 2017 xuannianz github user 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | THE FOLLOWING IS THE COPYRIGHT OF THE ORIGINAL DOCUMENT: 13 | https://github.com/xuannianz/EfficientDet/blob/master/tfkeras.py 14 | """ 15 | from object_detection_utils import inject_tfkeras_modules, init_tfkeras_custom_objects 16 | import efficientnet as model 17 | 18 | EfficientNetB0 = inject_tfkeras_modules(model.EfficientNetB0) 19 | EfficientNetB1 = inject_tfkeras_modules(model.EfficientNetB1) 20 | EfficientNetB2 = inject_tfkeras_modules(model.EfficientNetB2) 21 | EfficientNetB3 = inject_tfkeras_modules(model.EfficientNetB3) 22 | EfficientNetB4 = inject_tfkeras_modules(model.EfficientNetB4) 23 | EfficientNetB5 = inject_tfkeras_modules(model.EfficientNetB5) 24 | EfficientNetB6 = inject_tfkeras_modules(model.EfficientNetB6) 25 | EfficientNetB7 = inject_tfkeras_modules(model.EfficientNetB7) 26 | 27 | preprocess_input = inject_tfkeras_modules(model.preprocess_input) 28 | 29 | init_tfkeras_custom_objects() 30 | -------------------------------------------------------------------------------- /custom_inference/python/imdb_graph_isomorphism/README.md: -------------------------------------------------------------------------------- 1 | ## Graph Isomorphism Network 2 | 3 | #### Owner Tim Whittaker => timothy.whittaker@datarobot.com 4 | 5 | This demonstrates deploying a GNN as an unstructured model with [DRUM](https://github.com/datarobot/datarobot-user-models/). 6 | 7 | Please see the notebook in this directory for more detail. 8 | 9 | -------------------------------------------------------------------------------- /custom_inference/python/imdb_graph_isomorphism/custom.py: -------------------------------------------------------------------------------- 1 | #import pandas as pd 2 | import torch 3 | import os 4 | import io 5 | from io import BytesIO 6 | import avro.io 7 | import avro 8 | from avro.datafile import DataFileReader, DataFileWriter 9 | from gin import * 10 | import dgl 11 | import pandas as pd 12 | 13 | def load_model(input_dir): 14 | """ 15 | This hook can be implemented to adjust logic in the scoring mode. 16 | 17 | The load_model hook provides a way to implement model loading yourself. 18 | This function should return an object that represents your model. This object will 19 | be passed to the predict hook for performing predictions. 
20 | This hook can be used to load supported models if your model has multiple artifacts, or 21 | for loading models that drum does not natively support 22 | 23 | :param input_dir: the directory to load serialized models from 24 | :returns: Object containing the model - the predict hook will get this object as a parameter 25 | """ 26 | model = GIN(2, 2, 1, 20, 2, 0, 0.01, "sum", "sum") 27 | model.load_state_dict(torch.load(os.path.join(input_dir, "gin_model.h5"))) 28 | return model 29 | 30 | def score_unstructured(model, data, query, **kwargs): 31 | print("Incoming content type params: ", kwargs) 32 | print("Incoming data type: ", type(data)) 33 | print("Incoming query params: ", query) 34 | 35 | bytes_reader = io.BytesIO(data) 36 | parsed_data = DataFileReader(bytes_reader, avro.io.DatumReader()) 37 | 38 | gs = [] 39 | for graph in parsed_data: 40 | e = graph["edges"] 41 | u,v = list(zip(*e)) 42 | g = dgl.graph((u,v)) 43 | g.ndata["attr"] = torch.ones(g.num_nodes(), 1) 44 | gs.append(g) 45 | batched_graph = dgl.batch(gs) 46 | feats = batched_graph.ndata['attr'].float() 47 | preds = F.softmax(model(batched_graph, feats), dim=1).detach().numpy() 48 | return pd.DataFrame(preds, columns = ["neg_class", "pos_class"]).to_json(orient="records") 49 | -------------------------------------------------------------------------------- /custom_inference/python/imdb_graph_isomorphism/gin.py: -------------------------------------------------------------------------------- 1 | """ 2 | How Powerful are Graph Neural Networks 3 | https://arxiv.org/abs/1810.00826 4 | https://openreview.net/forum?id=ryGs6iA5Km 5 | Author's implementation: https://github.com/weihua916/powerful-gnns 6 | """ 7 | 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | from dgl.nn.pytorch.conv import GINConv 13 | from dgl.nn.pytorch.glob import SumPooling, AvgPooling, MaxPooling 14 | 15 | 16 | class ApplyNodeFunc(nn.Module): 17 | """Update the node feature hv with MLP, BN and ReLU.""" 18 | def __init__(self, mlp): 19 | super(ApplyNodeFunc, self).__init__() 20 | self.mlp = mlp 21 | self.bn = nn.BatchNorm1d(self.mlp.output_dim) 22 | 23 | def forward(self, h): 24 | h = self.mlp(h) 25 | h = self.bn(h) 26 | h = F.relu(h) 27 | return h 28 | 29 | 30 | class MLP(nn.Module): 31 | """MLP with linear output""" 32 | def __init__(self, num_layers, input_dim, hidden_dim, output_dim): 33 | """MLP layers construction 34 | 35 | Paramters 36 | --------- 37 | num_layers: int 38 | The number of linear layers 39 | input_dim: int 40 | The dimensionality of input features 41 | hidden_dim: int 42 | The dimensionality of hidden units at ALL layers 43 | output_dim: int 44 | The number of classes for prediction 45 | 46 | """ 47 | super(MLP, self).__init__() 48 | self.linear_or_not = False # default is linear model 49 | self.num_layers = num_layers 50 | self.output_dim = output_dim 51 | 52 | if num_layers < 1: 53 | raise ValueError("number of layers should be positive!") 54 | elif num_layers == 1: 55 | # Linear model 56 | self.linear = nn.Linear(input_dim, output_dim) 57 | else: 58 | # Multi-layer model 59 | self.linear_or_not = False 60 | self.linears = torch.nn.ModuleList() 61 | self.batch_norms = torch.nn.ModuleList() 62 | 63 | self.linears.append(nn.Linear(input_dim, hidden_dim)) 64 | for layer in range(num_layers - 2): 65 | self.linears.append(nn.Linear(hidden_dim, hidden_dim)) 66 | self.linears.append(nn.Linear(hidden_dim, output_dim)) 67 | 68 | for layer in range(num_layers - 1): 69 | 
self.batch_norms.append(nn.BatchNorm1d((hidden_dim))) 70 | 71 | def forward(self, x): 72 | if self.linear_or_not: 73 | # If linear model 74 | return self.linear(x) 75 | else: 76 | # If MLP 77 | h = x 78 | for i in range(self.num_layers - 1): 79 | h = F.relu(self.batch_norms[i](self.linears[i](h))) 80 | return self.linears[-1](h) 81 | 82 | 83 | class GIN(nn.Module): 84 | """GIN model""" 85 | def __init__(self, num_layers, num_mlp_layers, input_dim, hidden_dim, 86 | output_dim, final_dropout, learn_eps, graph_pooling_type, 87 | neighbor_pooling_type): 88 | """model parameters setting 89 | 90 | Paramters 91 | --------- 92 | num_layers: int 93 | The number of linear layers in the neural network 94 | num_mlp_layers: int 95 | The number of linear layers in mlps 96 | input_dim: int 97 | The dimensionality of input features 98 | hidden_dim: int 99 | The dimensionality of hidden units at ALL layers 100 | output_dim: int 101 | The number of classes for prediction 102 | final_dropout: float 103 | dropout ratio on the final linear layer 104 | learn_eps: boolean 105 | If True, learn epsilon to distinguish center nodes from neighbors 106 | If False, aggregate neighbors and center nodes altogether. 107 | neighbor_pooling_type: str 108 | how to aggregate neighbors (sum, mean, or max) 109 | graph_pooling_type: str 110 | how to aggregate entire nodes in a graph (sum, mean or max) 111 | 112 | """ 113 | super(GIN, self).__init__() 114 | self.num_layers = num_layers 115 | self.learn_eps = learn_eps 116 | 117 | # List of MLPs 118 | self.ginlayers = torch.nn.ModuleList() 119 | self.batch_norms = torch.nn.ModuleList() 120 | 121 | for layer in range(self.num_layers - 1): 122 | if layer == 0: 123 | mlp = MLP(num_mlp_layers, input_dim, hidden_dim, hidden_dim) 124 | else: 125 | mlp = MLP(num_mlp_layers, hidden_dim, hidden_dim, hidden_dim) 126 | 127 | self.ginlayers.append( 128 | GINConv(ApplyNodeFunc(mlp), neighbor_pooling_type, 0, self.learn_eps)) 129 | self.batch_norms.append(nn.BatchNorm1d(hidden_dim)) 130 | 131 | # Linear function for graph poolings of output of each layer 132 | # which maps the output of different layers into a prediction score 133 | self.linears_prediction = torch.nn.ModuleList() 134 | 135 | for layer in range(num_layers): 136 | if layer == 0: 137 | self.linears_prediction.append( 138 | nn.Linear(input_dim, output_dim)) 139 | else: 140 | self.linears_prediction.append( 141 | nn.Linear(hidden_dim, output_dim)) 142 | 143 | self.drop = nn.Dropout(final_dropout) 144 | 145 | if graph_pooling_type == 'sum': 146 | self.pool = SumPooling() 147 | elif graph_pooling_type == 'mean': 148 | self.pool = AvgPooling() 149 | elif graph_pooling_type == 'max': 150 | self.pool = MaxPooling() 151 | else: 152 | raise NotImplementedError 153 | 154 | def forward(self, g, h): 155 | # list of hidden representation at each layer (including input) 156 | hidden_rep = [h] 157 | 158 | for i in range(self.num_layers - 1): 159 | h = self.ginlayers[i](g, h) 160 | h = self.batch_norms[i](h) 161 | h = F.relu(h) 162 | hidden_rep.append(h) 163 | 164 | score_over_layer = 0 165 | 166 | # perform pooling over all nodes in each graph in every layer 167 | for i, h in enumerate(hidden_rep): 168 | pooled_h = self.pool(g, h) 169 | score_over_layer += self.drop(self.linears_prediction[i](pooled_h)) 170 | 171 | return score_over_layer -------------------------------------------------------------------------------- /custom_inference/python/imdb_graph_isomorphism/gin_model.h5: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_inference/python/imdb_graph_isomorphism/gin_model.h5 -------------------------------------------------------------------------------- /custom_inference/python/imdb_graph_isomorphism/requirements.txt: -------------------------------------------------------------------------------- 1 | networkx==2.5 2 | dgl==0.5.2 3 | avro==1.10.0 4 | datarobot-drum==1.4.2 5 | -------------------------------------------------------------------------------- /custom_inference/python/imdb_graph_isomorphism/schema_graph.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "namespace": "imdb.graph.avro", 3 | "type": "record", 4 | "name": "graph", 5 | "fields": [ 6 | { 7 | "name": "index", 8 | "type": "int" 9 | }, 10 | { 11 | "name": "edges", 12 | "type": [ 13 | { 14 | "type": "array", 15 | "items": { 16 | "type": "array", 17 | "items": "int", 18 | "default": [] 19 | }, 20 | "default": [] 21 | }, 22 | "null" 23 | ] 24 | }, 25 | { 26 | "name": "vertices", 27 | "type": [ 28 | { 29 | "type": "array", 30 | "items": "int", 31 | "default": [] 32 | }, "null" 33 | ] 34 | }, 35 | { 36 | "name": "label", 37 | "type": [ 38 | "int", 39 | "null" 40 | ] 41 | }, 42 | { 43 | "name": "edge_features", 44 | "type": [ 45 | { 46 | "type": "array", 47 | "items": { 48 | "type": "array", 49 | "items": "float", 50 | "default": [] 51 | }, 52 | "default": [] 53 | }, 54 | "null" 55 | ] 56 | }, 57 | { 58 | "name": "vertex_features", 59 | "type": [ 60 | { 61 | "type": "array", 62 | "items": { 63 | "type": "array", 64 | "items": "float", 65 | "default": [] 66 | }, 67 | "default": [] 68 | }, 69 | "null" 70 | ] 71 | } 72 | ] 73 | } -------------------------------------------------------------------------------- /custom_inference/python/imdb_graph_isomorphism/score_data.py: -------------------------------------------------------------------------------- 1 | import avro.io 2 | import avro 3 | from avro.datafile import DataFileReader, DataFileWriter 4 | from io import BytesIO, BufferedWriter 5 | import requests 6 | import pandas as pd 7 | 8 | def load_schema(schema_path): 9 | schema = avro.schema.parse(open(schema_path, "rb").read()) 10 | return schema 11 | 12 | 13 | def score(graphs, schema, url, port): 14 | """ 15 | graphs is expected to be a list of dictionaries, where each entry in the 16 | list represents a graph with 17 | * key idx -> index value 18 | * key nodes -> list of ints representing vertices of the graph 19 | * key edges -> list of list of ints representing edges of graph 20 | """ 21 | 22 | stream = BufferedWriter(BytesIO()) 23 | writer = DataFileWriter(stream, avro.io.DatumWriter(), schema) 24 | # writer = DataFileWriter(open("imdb-graph.avro", "wb"), DatumWriter(), schema) 25 | for graph in graphs: 26 | writer.append({"edges": graph["edges"], "vertices": graph["vertices"], "index": graph["idx"], "label": graph.get("label")}) 27 | writer.flush() 28 | raw_bytes = stream.raw.getvalue() 29 | writer.close() 30 | 31 | url = "{}:{}/predictUnstructured/?ret_mode=binary".format(url.strip("/"), port) 32 | 33 | payload = raw_bytes 34 | headers = { 35 | 'Content-Type': 'application/octet-stream' 36 | } 37 | 38 | response = requests.request("POST", url, headers=headers, data = payload) 39 | 40 | return response 41 | -------------------------------------------------------------------------------- 
/custom_inference/python/insurance_pricing/README.md: -------------------------------------------------------------------------------- 1 | # Insurance Pricing Example 2 | 3 | This folder includes an example of a custom model using DRUM to predict loss on insurance claims. For simplicity, you can use Google Colab to follow along with `Main_Script.ipynb`. 4 | 5 | The point of this example is to highlight DRUM's unstructured mode. In unstructured mode, DRUM relaxes validation of the input and output, allowing for flexibility in the kind of model that can be hosted and, moreover, in the way data is sent to and returned from DRUM. 6 | 7 | In unstructured mode, this model returns SHAP values and a loss prediction, leveraging Avro for serialization of the data. 8 | 9 | Avro has a JSON-like data model, but can be represented either as JSON or in a compact binary form. 10 | * It comes with a very sophisticated schema description language that describes data. 11 | * It has a direct mapping to and from JSON. 12 | * It has a very compact format. 13 | * The bulk of JSON, repeating every field name with every single record, is what makes JSON inefficient for high-volume usage. 14 | 15 | -------------------------------------------------------------------------------- /custom_inference/python/insurance_pricing/artifacts/lgbm.joblib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_inference/python/insurance_pricing/artifacts/lgbm.joblib -------------------------------------------------------------------------------- /custom_inference/python/insurance_pricing/artifacts/ordinalEncoder.joblib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_inference/python/insurance_pricing/artifacts/ordinalEncoder.joblib -------------------------------------------------------------------------------- /custom_inference/python/insurance_pricing/artifacts/simpleImputerCat.joblib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_inference/python/insurance_pricing/artifacts/simpleImputerCat.joblib -------------------------------------------------------------------------------- /custom_inference/python/insurance_pricing/artifacts/simpleImputerNum.joblib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_inference/python/insurance_pricing/artifacts/simpleImputerNum.joblib -------------------------------------------------------------------------------- /custom_inference/python/insurance_pricing/custom.py: -------------------------------------------------------------------------------- 1 | 2 | import pickle 3 | from typing import List, Optional, Any, Dict 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | from category_encoders import OrdinalEncoder 9 | from sklearn.impute import SimpleImputer 10 | import lightgbm as lgb 11 | from joblib import load 12 | import yaml 13 | import numpy as np 14 | import io 15 | import avro.io 16 | import avro 17 | from avro.datafile import DataFileReader 18 | 19 | from mymodel import MyModel 20 | 21 | import shap 22 | import io 23 | 
from io import BytesIO 24 | import avro.io 25 | import avro 26 | from avro.datafile import DataFileReader, DataFileWriter 27 | 28 | # 29 | #""" 30 | #This file was autogenerated by: drum new model --language python 31 | #Generation date: Mon Nov 9 14:31:54 2020 32 | # 33 | #Note: this is an example of custom.py file. 34 | # Below are all the hooks you can use to provide your own implementation. 35 | # All hooks are currently commented out so uncomment a hook function in 36 | # order to use it. 37 | #""" 38 | # 39 | # 40 | #def init(**kwargs): 41 | # """ 42 | # This hook can be implemented to adjust logic in the training and scoring mode. 43 | # init is called once the code is started. 44 | # 45 | # :param kwargs: additional keyword arguments to the function. 46 | # code_dir - code folder passed in --code_dir argument 47 | # """ 48 | # pass 49 | # 50 | # 51 | def load_model(input_dir): 52 | """ 53 | This hook can be implemented to adjust logic in the scoring mode. 54 | 55 | load_model hook provides a way to implement model loading your self. 56 | This function should return an object that represents your model. This object will 57 | be passed to the predict hook for performing predictions. 58 | This hook can be used to load supported models if your model has multiple artifacts, or 59 | for loading models that drum does not natively support 60 | 61 | :param input_dir: the directory to load serialized models from 62 | :returns: Object containing the model - the predict hook will get this object as a parameter 63 | """ 64 | 65 | # Returning a string with value "dummy" as the model. 66 | return MyModel(input_dir) 67 | # 68 | # 69 | #def transform(data, model): 70 | # """ 71 | # This hook can be implemented to adjust logic in the scoring mode. 72 | # 73 | # transform(data: DataFrame, model: Any) -> DataFrame 74 | # 75 | # Intended to apply transformations to the prediction data before making predictions. 76 | # This is most useful if drum supports the model's library, but your model requires additional 77 | # data processing before it can make predictions 78 | # 79 | # :param data: dataframe given to drum to make predictions on 80 | # :param model: is the deserialized model loaded by drum or by load_model hook , if supplied 81 | # :returns: a dataframe after transformation needed 82 | # """ 83 | # return data 84 | # 85 | # 86 | def score(data, model, **kwargs): 87 | """ 88 | This hook can be implemented to adjust logic in the scoring mode. 89 | 90 | This method should return predictions as a dataframe with the following format: 91 | 92 | Binary Classification: 93 | Must have columns for each class label with floating-point class probabilities as values. 94 | Each row should sum to 1.0 95 | 96 | Regression: 97 | Must have a single column called "Predictions" with numerical values 98 | 99 | This hook is only needed if you would like to use drum with a framework not natively 100 | supported by the tool. 101 | 102 | :param data: the dataframe to make predictions against. If transform is supplied, data 103 | will be the transformed data. 104 | :param model: is the deserialized model loaded by drum or by load_model hook, if supplied 105 | :param kwargs: additional keyword arguments to the function. If model is binary classification, 106 | positive_class_label and negative_class_label will be provided in kwargs. If the model is multiclass 107 | classification (at least 3 classes), a class_labels list will be provided as a parameter. 
108 | :returns: a dataframe, see documentation above on the structure of the dataframe to return. 109 | """ 110 | 111 | return model.predict(data) 112 | 113 | def score_unstructured(model, data, query, **kwargs): 114 | print("Incoming content type params: ", kwargs) 115 | print("Incoming data type: ", type(data)) 116 | print("Incoming query params: ", query) 117 | 118 | 119 | # writer = avro.io.DatumWriter(schema) 120 | # bytes_writer = BytesIO() 121 | # encoder = avro.io.BinaryEncoder(bytes_writer) 122 | X = pd.read_csv(BytesIO(data)) 123 | shap_values_dict = model.explain(X) 124 | predictions = model.predict(X).values 125 | # for p, s in zip(predictions, shap_values_dict): 126 | # writer.write({"prediction": p[0],"shap_values": s}, encoder) 127 | stream = io.BufferedWriter(io.BytesIO()) 128 | writer = DataFileWriter(stream, avro.io.DatumWriter(), model.schema) 129 | for p, s in zip(predictions, shap_values_dict): 130 | writer.append({"prediction": p[0],"shap_values": s}) 131 | writer.flush() 132 | ret_bytes = stream.raw.getvalue() 133 | writer.close() 134 | return ret_bytes -------------------------------------------------------------------------------- /custom_inference/python/insurance_pricing/data/loss_cost_short.csv: -------------------------------------------------------------------------------- 1 | Policy_ID,Eff_Dt,IncurredClaims,Exposure,Zipcode,VehicleCostNew,VehicleMake,VehicleModel,EngineCapacity,VehicleAge,ClientType,DriverAge,NumberOfDrivers,CustomerTenure,Gender,MaritalStatus,Zipcode_Aged_18_24,Zipcode_Aged_25_29,Zipcode_Aged_30_39,Zipcode_Aged_40_44,Zipcode_Aged_45_49,Zipcode_Aged_50_59,Zipcode_Aged_60,Zipcode_PersonsPerHousehold,Zipcode_annualMileage,Zipcode_VehiclesPerHousehold,Zipcode_CommuteViaCar,DistributionChannel,PartitionColumn 2 | 10001,5/1/17,14448.33,1.0,33672,30000.0,MERCEDES-BENZ,C200,122,15,Retail,49.0,2,4,F,Unknown,0.157653021,0.144983768,0.246694117,0.087061525,0.078153456,0.125386016,0.160068097,2.039300999,2668.979414,0.908915835,0.4,Insurance Broker,0 3 | 10002,4/29/17,0.0,1.0,29706,27000.0,FORD,Ranger,152,11,Commercial,,1,4,C,Unknown,0.116033871,0.09194209199999999,0.184922152,0.104834745,0.095329145,0.162141491,0.244796504,2.4452525659999997,4005.0689729999995,1.4122745840000002,0.6,Insurance Agency,0 4 | 10003,8/18/17,18549.37,1.0,35555,29000.0,FORD,Ranger,152,10,Retail,52.0,2,2,F,Married,0.149935575,0.11128030900000001,0.202295888,0.097575261,0.09411971400000001,0.176525712,0.16826754100000002,2.616979485,5751.1052740000005,1.196202532,0.59,Insurance Agency,0 5 | 10004,8/2/17,2152.92,0.78,26155,97000.0,TOYOTA,Camry,122,4,Retail,58.0,2,2,M,Unknown,0.10081084400000001,0.12610432,0.24204284199999998,0.092339344,0.08060026599999999,0.160232361,0.197870023,1.999605523,2962.848071,1.092110454,0.51,Vehicle Dealership,0 6 | 10005,3/24/17,0.0,0.43,40363,12000.0,HONDA,Accord,122,17,Retail,51.0,1,10,M,Unknown,0.12900731599999998,0.123713793,0.231011717,0.10075536800000001,0.086897044,0.126033427,0.20258133600000003,2.186629834,3224.389464,0.806077348,0.37,Vehicle Dealership,0 7 | -------------------------------------------------------------------------------- /custom_inference/python/insurance_pricing/feature_detail.yaml: -------------------------------------------------------------------------------- 1 | Categorical: 2 | - DistributionChannel 3 | - VehicleModel 4 | - Zipcode 5 | - VehicleMake 6 | - ClientType 7 | - MaritalStatus 8 | Date: None 9 | Numeric: 10 | - CustomerTenure 11 | - DriverAge 12 | - EngineCapacity 13 | - NumberOfDrivers 14 | - VehicleAge 15 
| - VehicleCostNew 16 | - Zipcode_Aged_18_24 17 | - Zipcode_Aged_25_29 18 | - Zipcode_Aged_30_39 19 | - Zipcode_Aged_40_44 20 | - Zipcode_Aged_45_49 21 | - Zipcode_Aged_50_59 22 | - Zipcode_Aged_60 23 | - Zipcode_CommuteViaCar 24 | - Zipcode_PersonsPerHousehold 25 | - Zipcode_VehiclesPerHousehold 26 | - Zipcode_annualMileage 27 | Offset: Exposure 28 | Target: IncurredClaims 29 | Text: None 30 | -------------------------------------------------------------------------------- /custom_inference/python/insurance_pricing/mymodel.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from typing import List, Optional, Any, Dict 3 | import numpy as np 4 | import pandas as pd 5 | from category_encoders import OrdinalEncoder 6 | from sklearn.impute import SimpleImputer 7 | import lightgbm as lgb 8 | from joblib import load 9 | import yaml 10 | import numpy as np 11 | import shap 12 | import avro 13 | import os 14 | 15 | class MyModel(object): 16 | def __init__(self, code_dir): 17 | """Load the model pickle file.""" 18 | # This supports both Python 2 and 3 19 | with open( code_dir + "/artifacts/lgbm.joblib", "rb") as picklefile: 20 | try: 21 | self.model = load(picklefile, encoding="latin1") 22 | except TypeError: 23 | self.model = load(picklefile) 24 | with open(code_dir + "/artifacts/ordinalEncoder.joblib", "rb") as f: 25 | self.oe = load(f) 26 | with open(code_dir + "/artifacts/simpleImputerNum.joblib", "rb") as f: 27 | self.siNum = load(f) 28 | with open(code_dir + "/artifacts/simpleImputerCat.joblib", "rb") as f: 29 | self.siCat = load(f) 30 | with open(code_dir + "/feature_detail.yaml", "r") as f: 31 | self.feature_detail = yaml.load(f, Loader=yaml.FullLoader) 32 | self.numeric_features = self.feature_detail["Numeric"] 33 | self.categorical_features = self.feature_detail["Categorical"] 34 | self.offset = self.feature_detail["Offset"] 35 | self.explainer = shap.TreeExplainer(self.model) 36 | self.shap_headers = ["SHAP_{}".format(i) for i in self.numeric_features + self.categorical_features] 37 | schema_path = os.path.join(code_dir, "schema.avsc") 38 | self.schema = avro.schema.parse(open(schema_path, "rb").read()) 39 | 40 | def preprocess_features(self, X): 41 | offset = X[self.offset].values 42 | x_num = X[self.numeric_features].values 43 | x_num = self.siNum.transform(x_num) 44 | x_cat = X[self.categorical_features].values 45 | x_cat = self.siCat.transform(x_cat) 46 | x_cat = self.oe.transform(x_cat) 47 | x = np.concatenate([x_cat, x_num], axis=1) 48 | return (x, offset) 49 | 50 | def explain(self, X): 51 | X_t = self.preprocess_features(X)[0] 52 | shap_values = self.explainer.shap_values( X_t ) 53 | return pd.DataFrame(shap_values, columns = self.shap_headers).to_dict(orient="records") 54 | 55 | 56 | 57 | def predict( 58 | self, X, positive_class_label=None, negative_class_label=None, **kwargs 59 | ): 60 | """ 61 | Predict with the pickled custom model. 
62 | 63 | If your model is for classification, you likely want to ensure this function 64 | calls `predict_proba()`, whereas for regression it should use `predict()` 65 | """ 66 | X, offset = self.preprocess_features(X) 67 | predictions = np.exp(self.model.predict(X, raw_score=True)) * offset 68 | 69 | return pd.DataFrame(predictions, columns = ["Predictions"]) 70 | -------------------------------------------------------------------------------- /custom_inference/python/insurance_pricing/requirements.txt: -------------------------------------------------------------------------------- 1 | joblib==0.16.0 2 | lightgbm==2.3.1 3 | numpy==1.19.0 4 | pandas==1.1.0 5 | scikit-learn==0.23.1 6 | scipy==1.5.1 7 | PyYAML==5.1.1 8 | category-encoders==2.2.2 9 | shap==0.37.0 10 | avro==1.10.0 -------------------------------------------------------------------------------- /custom_inference/python/insurance_pricing/schema.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "fields": [ 3 | { 4 | "name": "prediction", 5 | "type": "float" 6 | }, 7 | { 8 | "name": "shap_values", 9 | "type": [ 10 | { 11 | "type": "map", 12 | "values": "float", 13 | "default": {} 14 | } 15 | ] 16 | } 17 | ], 18 | "namespace": "shap.avro", 19 | "type": "record", 20 | "name": "predictions" 21 | } -------------------------------------------------------------------------------- /custom_inference/python/mnist/drum/classlabels.txt: -------------------------------------------------------------------------------- 1 | 0 2 | 1 3 | 2 4 | 3 5 | 4 6 | 5 7 | 6 8 | 7 9 | 8 10 | 9 11 | -------------------------------------------------------------------------------- /custom_inference/python/mnist/drum/custom.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from tensorflow import keras 4 | from tensorflow.keras import layers 5 | from tensorflow.keras.models import Sequential 6 | import os 7 | import io 8 | from io import StringIO 9 | 10 | 11 | def load_model(code_dir): 12 | model_path = 'mnist.h5' 13 | print ("model ready to load") 14 | model = keras.models.load_model(os.path.join(code_dir, model_path), compile=False) 15 | print ("model loaded") 16 | return model 17 | 18 | def score(data, model, **kwargs): 19 | print (data.shape) 20 | X=data.drop(data.columns[0],axis=1) 21 | X=X.values.reshape(X.shape[0],28,28,1) 22 | predictions = model.predict(X) 23 | print (predictions) 24 | s = pd.DataFrame(predictions) 25 | return s 26 | 27 | def score_unstructured(model, data, query, **kwargs): 28 | print("Incoming content type params: ", kwargs) 29 | print("Incoming data type: ", type(data)) 30 | print("Incoming query params: ", query) 31 | input = io.StringIO(data) 32 | X = pd.read_csv(input) 33 | print (X.shape) 34 | X=X.drop(X.columns[0],axis=1) 35 | X=X.values.reshape(X.shape[0],28,28,1) 36 | predictions = model.predict(X) 37 | print (predictions) 38 | s = pd.DataFrame(predictions) 39 | t = s.to_csv(index=False) 40 | return t 41 | 42 | 43 | -------------------------------------------------------------------------------- /custom_inference/python/mnist/drum/mnist.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_inference/python/mnist/drum/mnist.h5 -------------------------------------------------------------------------------- 
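The MNIST `score_unstructured` hook above expects the raw request body to be a CSV string whose first column (the label) is dropped before the pixels are reshaped to 28x28x1, and it returns the class probabilities back as CSV text. Below is a minimal client sketch for exercising it, assuming DRUM is serving this code directory locally (for example with `drum server --code-dir custom_inference/python/mnist/drum --target-type unstructured --address localhost:6789`) and that your DRUM version exposes the `/predictUnstructured/` route; the port and file path are illustrative assumptions, not part of the repo.

```python
# Hypothetical client for the locally served MNIST unstructured model.
# The URL, port, and input path below are assumptions for illustration only.
import io

import pandas as pd
import requests

with open("custom_inference/python/mnist/data/test.csv", "rb") as f:
    payload = f.read()

resp = requests.post(
    "http://localhost:6789/predictUnstructured/",
    data=payload,
    # A text content type asks DRUM to hand `data` to score_unstructured as a str,
    # which is what the io.StringIO(data) call in custom.py expects.
    headers={"Content-Type": "text/plain; charset=UTF-8"},
)
resp.raise_for_status()

# score_unstructured returns a CSV string with one probability column per digit.
probs = pd.read_csv(io.StringIO(resp.text))
print(probs.head())
```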
/custom_inference/python/mnist/drum/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.19.2 2 | tensorflow>=2.4.0 3 | -------------------------------------------------------------------------------- /custom_inference/python/movie_recommender/read_me.md: -------------------------------------------------------------------------------- 1 | # Recommender System Unstructured Example 2 | 3 | This folder includes an example of a Keras movie recommender system. The model was built using the Notebook [here](https://keras.io/examples/structured_data/collaborative_filtering_movielens/) 4 | 5 | Using the saved model from that notebook, we then use DRUM to validate and then score using the saved model. 6 | 7 | In the `custom.py` script, we use the hook functions including the `load_model` and `score_unstructured` functions. 8 | 9 | Additional modifications have been made to the `score_unstructured` function to output the actual names of the movies 10 | as opposed to the index of the movieId. 11 | 12 | -------------------------------------------------------------------------------- /custom_inference/python/movie_recommender/recommender_model/custom.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import tensorflow as tf 4 | from tensorflow import keras 5 | from tensorflow.keras import layers 6 | from keras.models import Sequential 7 | import os 8 | import io 9 | from io import StringIO 10 | 11 | 12 | def load_model(code_dir): 13 | model_path = 'saved_model.pb' 14 | model = tf.keras.models.load_model('./recommender_model/', 15 | custom_objects = None, 16 | compile = True, 17 | options = None) 18 | print ("model loaded") 19 | return model 20 | 21 | 22 | def score(data, model, **kwargs): 23 | predictions = model.predict(data) 24 | print (predictions) 25 | s = pd.DataFrame(predictions) 26 | return s 27 | 28 | 29 | def score_unstructured(model, data, query, **kwargs): 30 | 31 | """ 32 | Load Movie/User Info 33 | 34 | This will be used to output the Actual movie list 35 | instead of just the indexes of the movies 36 | """ 37 | 38 | movie_df = pd.read_csv('movies.csv') 39 | df = pd.read_csv('ratings_file.csv') 40 | data_rec = pd.read_csv('predict.csv') 41 | 42 | user_id = data_rec.userID.iloc[0] 43 | movies_watched_by_user = df[df.userId == user_id] 44 | 45 | 46 | # Find Movies Not Watched 47 | movies_not_watched = movie_df[ 48 | ~movie_df["movieId"].isin(movies_watched_by_user.movieId.values) 49 | ]["movieId"] 50 | 51 | user_ids = df["userId"].unique().tolist() 52 | user2user_encoded = {x: i for i, x in enumerate(user_ids)} 53 | userencoded2user = {i: x for i, x in enumerate(user_ids)} 54 | movie_ids = df["movieId"].unique().tolist() 55 | movie2movie_encoded = {x: i for i, x in enumerate(movie_ids)} 56 | movie_encoded2movie = {i: x for i, x in enumerate(movie_ids)} 57 | 58 | movies_not_watched = list( 59 | set(movies_not_watched).intersection(set(movie2movie_encoded.keys())) 60 | ) 61 | 62 | movies_not_watched = [[movie2movie_encoded.get(x)] for x in movies_not_watched] 63 | user_encoder = user2user_encoded.get(user_id) 64 | user_movie_array = np.hstack( 65 | ([[user_encoder]] * len(movies_not_watched), movies_not_watched) 66 | ) 67 | 68 | """ 69 | Finished Loading Movie data 70 | """ 71 | 72 | input = io.StringIO(data) 73 | X = pd.read_csv(input) 74 | 75 | # Fill NA 76 | # Cast Inputs as INTs to properly handle NULL value imputation 77 | # and prevent from being 
cast as Floats 78 | X['userID'].fillna(user_id, inplace=True) 79 | X['movies'].fillna(3678, inplace=True) 80 | X["userID"] = X["userID"].astype(int) 81 | X["movies"] = X["movies"].astype(int) 82 | 83 | 84 | ratings = model.predict(X).flatten() 85 | 86 | # Take the Top 10 Movie Recommendations 87 | top_ratings_indices = ratings.argsort()[-10:][::-1] 88 | recommended_movie_ids = [ 89 | movie_encoded2movie.get(movies_not_watched[x][0]) for x in top_ratings_indices 90 | ] 91 | 92 | recommended_movies = movie_df[movie_df["movieId"].isin(recommended_movie_ids)] 93 | for row in recommended_movies.itertuples(): 94 | print(row.title, ":", row.genres) 95 | 96 | # print (recommended_movies) 97 | s = pd.DataFrame(recommended_movies) 98 | t = s.to_csv(index=False) 99 | return t 100 | 101 | 102 | -------------------------------------------------------------------------------- /custom_inference/python/movie_recommender/recommender_model/saved_model.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_inference/python/movie_recommender/recommender_model/saved_model.pb -------------------------------------------------------------------------------- /custom_inference/python/movie_recommender/recommender_model/variables/variables.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_inference/python/movie_recommender/recommender_model/variables/variables.data-00000-of-00001 -------------------------------------------------------------------------------- /custom_inference/python/movie_recommender/recommender_model/variables/variables.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_inference/python/movie_recommender/recommender_model/variables/variables.index -------------------------------------------------------------------------------- /custom_inference/python/readmissions/README.md: -------------------------------------------------------------------------------- 1 | # Custom Model Examples 2 | 3 | This folder includes various examples of custom models using the well-known `Hospital Readmissions` dataset. There are three examples (Levels 1 - 3), with each level presenting a more complicated custom model than the previous one. 4 | 5 | In the `custom.py` scripts, we use all of the available hook functions including the `fit` function. This means that these examples can be used as both `custom inference` and `custom training` models. To see the difference between the two, check the official DataRobot DRUM package documentation on GitHub [here](https://github.com/datarobot/datarobot-user-models). 6 | 7 | If you upload the models to DataRobot using the UI, make sure to choose the `Scikit-Learn Drop-In` environment. Lastly, make sure you upload the items saved inside the `custom_model` folder, not the `custom_model` folder itself. 8 | 9 | ## Creating the environment 10 | The easiest way to create an environment to both train and test these models with DRUM is to execute the commands below after you install conda.
11 | 12 | `conda create --name your-env-name python=3.7.0` 13 | `conda activate your-env-name` 14 | `pip install -r requirements.txt` 15 | 16 | ## Important Links 17 | 18 | For more information on how to use DRUM to test and deploy your custom models, follow the [link](https://github.com/datarobot-community/custom-models/tree/master/drum_overview). 19 | 20 | ## Contents 21 | 22 | #### Readmission_level_1 23 | 24 | *High Level Overview:* 25 | 26 | 1. Fit a CatBoost classifier on top of the `Hospital Readmissions` dataset. 27 | 2. `custom.py` script needs to include null value handling. 28 | 3. `custom.py` script needs to search for the keyword `Diabetes|diabetes` in the `diag_1_desc` column and create a new Boolean column. 29 | 30 | #### Readmission_level_2 31 | 32 | *High Level Overview:* 33 | 34 | 1. Preprocess data using a scikit-learn pipeline 35 | 2. Fit an XGBoost model on the data 36 | 3. `custom.py` script needs to preprocess using the scikit-learn pipeline 37 | 4. `custom.py` script needs to score using the XGBoost model. 38 | 39 | The extra difficulty here is that we need to tell DRUM where to find both the preprocessing pipeline and the XGBoost model in order for this custom model to work. 40 | 41 | #### Readmission_level_3 42 | 43 | The biostatisticians at ABC labs have a legacy model that they use to predict the probability of being readmitted to the hospital. They found that an ensemble of their own model and the XGBoost model yields the best outcome. The result needs to be the average probability between the two models (a sketch of this averaging follows the overview below). 44 | 45 | *High Level Overview:* 46 | 47 | 1. Preprocess data using a scikit-learn pipeline 48 | 2. Fit an XGBoost model on the data 49 | 3. `custom.py` script needs to preprocess using the scikit-learn pipeline 50 | 4. `custom.py` script needs to score using the XGBoost model. 51 | 5. `custom.py` script needs to return the average probability as calculated from the XGBoost and legacy models.
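The Level 3 averaging described above is implemented in the `post_process` hook of `Readmission_level_3/custom_model/custom.py`, which appears later in this document. The sketch below isolates just that ensembling step, reusing the legacy logistic formula from that file; the standalone function name and signature are illustrative assumptions, not part of the repo.

```python
# Illustrative sketch of the Level 3 ensembling step. The helper name is
# hypothetical; the repo performs this inside the post_process hook instead.
import pandas as pd
from scipy.special import expit


def ensemble_with_legacy(predictions: pd.DataFrame, original_data: pd.DataFrame) -> pd.DataFrame:
    # Legacy logistic score (same coefficients as the Level 3 custom.py).
    legacy_true = expit(
        (
            0.59
            + 0.55 * original_data["number_inpatient"].fillna(0)
            + 0.36 * original_data["number_outpatient"].fillna(0)
        ).to_numpy()
    )
    # Average the XGBoost probability with the legacy probability, then
    # recompute the complementary column so each row still sums to 1.
    predictions["True"] = (predictions["True"].to_numpy() + legacy_true) / 2
    predictions["False"] = 1 - predictions["True"]
    return predictions
```

Averaging in probability space keeps the ensembled output a valid probability, because both inputs are already bounded between 0 and 1.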
-------------------------------------------------------------------------------- /custom_inference/python/readmissions/Readmission_level_1/catboost_info/catboost_training.json: -------------------------------------------------------------------------------- 1 | { 2 | "meta":{"test_sets":[],"test_metrics":[],"learn_metrics":[{"best_value":"Min","name":"Logloss"}],"launch_mode":"Train","parameters":"","iteration_count":2,"learn_sets":["learn"],"name":"experiment"}, 3 | "iterations":[ 4 | {"learn":[0.645309907],"iteration":0,"passed_time":0.06549675921,"remaining_time":0.06549675921}, 5 | {"learn":[0.632149336],"iteration":1,"passed_time":0.07959602195,"remaining_time":0} 6 | ]} -------------------------------------------------------------------------------- /custom_inference/python/readmissions/Readmission_level_1/catboost_info/learn/events.out.tfevents: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_inference/python/readmissions/Readmission_level_1/catboost_info/learn/events.out.tfevents -------------------------------------------------------------------------------- /custom_inference/python/readmissions/Readmission_level_1/catboost_info/learn_error.tsv: -------------------------------------------------------------------------------- 1 | iter Logloss 2 | 0 0.645309907 3 | 1 0.632149336 4 | -------------------------------------------------------------------------------- /custom_inference/python/readmissions/Readmission_level_1/catboost_info/time_left.tsv: -------------------------------------------------------------------------------- 1 | iter Passed Remaining 2 | 0 65 65 3 | 1 79 0 4 | -------------------------------------------------------------------------------- /custom_inference/python/readmissions/Readmission_level_1/custom_model/custom.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import pandas as pd 4 | import numpy as np 5 | from typing import List, Optional 6 | from catboost import CatBoostClassifier 7 | import io 8 | 9 | def read_input_data(input_binary_data): 10 | return pd.read_csv(io.BytesIO(input_binary_data)) 11 | 12 | 13 | def transform(data,model): 14 | """ 15 | Note: This hook may not have to be implemented for your model. 16 | In this case implemented for the model used in the example. 17 | Modify this method to add data transformation before scoring calls. For example, this can be 18 | used to implement one-hot encoding for models that don't include it on their own. 19 | Parameters 20 | ---------- 21 | data: pd.DataFrame 22 | model: object, the deserialized model 23 | Returns 24 | ------- 25 | pd.DataFrame 26 | """ 27 | # Execute any steps you need to do before scoring 28 | 29 | def find_diabetes_text(txt): 30 | try: 31 | if 'diabetes' in txt.lower(): 32 | return 1 33 | else: 34 | return 0 35 | except: 36 | 0 37 | 38 | #DataRobot Drum will check what happens when values are imputed. 
That is why I explicetely define cat_features 39 | cat_features = ['race', 'gender', 'age', 'weight', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'payer_code', 'medical_specialty', 'diag_1', 40 | 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 41 | 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 42 | 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin', 43 | 'glyburide_metformin', 'glipizide_metformin', 'glimepiride_pioglitazone', 'metformin_rosiglitazone', 44 | 'metformin_pioglitazone', 'change', 'diabetesMed'] 45 | 46 | # Fill null values for Categorical Features 47 | for c in cat_features: 48 | data[c] = data[c].fillna('unknown') 49 | 50 | #Some categorical features (diag_1), have float values which in reality are categories. Catboost takes either int or object as input 51 | #so I am casting. 52 | try: 53 | data[c] = data[c].astype(int) 54 | except: 55 | pass 56 | 57 | # Find out if `Diabetes|`diabetes` exists in diag_1_desc column 58 | data['diabetes'] = data['diag_1_desc'].apply(lambda x: find_diabetes_text(x)) 59 | data.drop('diag_1_desc',axis=1,inplace=True) 60 | 61 | # Fill null values for numeric features 62 | data = data.fillna(0) 63 | 64 | return data 65 | 66 | def score(data, model, **kwargs): 67 | 68 | results = model.predict_proba(data) 69 | 70 | #Create two columns with probability results 71 | predictions = pd.DataFrame({'True': results[:, 0]}) 72 | predictions['False'] = 1 - predictions['True'] 73 | 74 | return predictions 75 | -------------------------------------------------------------------------------- /custom_inference/python/readmissions/Readmission_level_1/custom_model/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_inference/python/readmissions/Readmission_level_1/custom_model/model.pkl -------------------------------------------------------------------------------- /custom_inference/python/readmissions/Readmission_level_1/custom_model/requirements.txt: -------------------------------------------------------------------------------- 1 | joblib>=0.14.0 2 | pandas>=0.25.1 3 | catboost>=0.24.2 4 | scipy>=1.5.3 5 | scikit_learn>=0.22.0 6 | -------------------------------------------------------------------------------- /custom_inference/python/readmissions/Readmission_level_2/custom_model/custom.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import joblib 4 | from xgboost import XGBClassifier 5 | 6 | from sklearn.pipeline import Pipeline 7 | from sklearn.compose import ColumnTransformer 8 | from sklearn.preprocessing import StandardScaler,OneHotEncoder 9 | 10 | import os 11 | import io 12 | from typing import List, Optional 13 | from scipy.special import expit 14 | g_code_dir = None 15 | 16 | schema = {"race": "object", "gender": "object", "age": "object", "weight": "object", "admission_type_id": "object", "discharge_disposition_id": "object", "admission_source_id": "object", "time_in_hospital": "int64", "payer_code": "object", "medical_specialty": "object", "num_lab_procedures": "int64", "num_procedures": "int64", "num_medications": "int64", "number_outpatient": "int64", "number_emergency": "int64", "number_inpatient": "int64", 
"number_diagnoses": "int64", "max_glu_serum": "object", "A1Cresult": "object", "metformin": "object", "repaglinide": "object", "nateglinide": "object", "chlorpropamide": "object", "glimepiride": "object", "acetohexamide": "object", "glipizide": "object", "glyburide": "object", "tolbutamide": "object", "pioglitazone": "object", "rosiglitazone": "object", "acarbose": "object", "miglitol": "object", "troglitazone": "object", "tolazamide": "object", "examide": "object", "citoglipton": "object", "insulin": "object", "glyburide_metformin": "object", "glipizide_metformin": "object", "glimepiride_pioglitazone": "object", "metformin_rosiglitazone": "object", "metformin_pioglitazone": "object", "change": "object", "diabetesMed": "object"} 17 | 18 | def init(code_dir): 19 | global g_code_dir 20 | g_code_dir = code_dir 21 | 22 | def read_input_data(input_binary_data): 23 | data = pd.read_csv(io.BytesIO(input_binary_data)) 24 | data.drop(['diag_1_desc', 'diag_1', 'diag_2', 'diag_3'],axis=1,inplace=True) 25 | 26 | #Saving this for later 27 | return data 28 | 29 | def transform(data, model): 30 | """ 31 | Note: This hook may not have to be implemented for your model. 32 | In this case implemented for the model used in the example. 33 | Modify this method to add data transformation before scoring calls. For example, this can be 34 | used to implement one-hot encoding for models that don't include it on their own. 35 | Parameters 36 | ---------- 37 | data: pd.DataFrame 38 | model: object, the deserialized model 39 | Returns 40 | ------- 41 | pd.DataFrame 42 | """ 43 | 44 | #Handle null values in categories and numerics 45 | for c,dt in schema.items(): 46 | if dt =='object': 47 | data[c] = data[c].fillna('missing') 48 | else: 49 | data[c] = data[c].fillna(0) 50 | 51 | pipeline_path = 'preprocessing.pkl' 52 | pipeline = joblib.load(os.path.join(g_code_dir, pipeline_path)) 53 | preprocessed = pipeline.transform(data) 54 | preprocessed = pd.DataFrame.sparse.from_spmatrix(preprocessed) 55 | 56 | return preprocessed 57 | 58 | def load_model(code_dir): 59 | model_path = 'model.pkl' 60 | model = joblib.load(os.path.join(code_dir, model_path)) 61 | return model 62 | 63 | def score(data, model, **kwargs): 64 | results = model.predict_proba(data) 65 | predictions = pd.DataFrame({'True': results[:, 0], 'False':results[:, 1]}) 66 | 67 | return predictions 68 | -------------------------------------------------------------------------------- /custom_inference/python/readmissions/Readmission_level_2/custom_model/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_inference/python/readmissions/Readmission_level_2/custom_model/model.pkl -------------------------------------------------------------------------------- /custom_inference/python/readmissions/Readmission_level_2/custom_model/preprocessing.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_inference/python/readmissions/Readmission_level_2/custom_model/preprocessing.pkl -------------------------------------------------------------------------------- /custom_inference/python/readmissions/Readmission_level_2/custom_model/requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.11.0 2 | astor==0.8.1 3 | 
cached-property==1.5.1 4 | certifi==2020.11.8 5 | gast==0.3.3 6 | google-pasta==0.2.0 7 | grpcio==1.33.2 8 | importlib-metadata==1.7.0 9 | numpy==1.18 10 | protobuf==3.13.0 11 | PyYAML==5.3.1 12 | six==1.15.0 13 | xgboost==1.2.1 14 | termcolor==1.1.0 15 | Theano==0.8.2 16 | Werkzeug==2.0.1 17 | wrapt==1.12.1 18 | zipp==3.4.0 19 | joblib==0.14.0 20 | pandas==0.25.1 21 | scipy==1.5.3 22 | scikit_learn==0.22.0 -------------------------------------------------------------------------------- /custom_inference/python/readmissions/Readmission_level_3/custom_model/custom.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import joblib 4 | from xgboost import XGBClassifier 5 | 6 | from sklearn.pipeline import Pipeline 7 | from sklearn.compose import ColumnTransformer 8 | from sklearn.preprocessing import StandardScaler,OneHotEncoder 9 | from sklearn.impute import SimpleImputer 10 | 11 | import os 12 | from typing import List, Optional 13 | from scipy.special import expit 14 | import io 15 | 16 | g_input_filename = None 17 | g_code_dir = None 18 | 19 | schema = {"race": "object", "gender": "object", "age": "object", "weight": "object", "admission_type_id": "object", "discharge_disposition_id": "object", "admission_source_id": "object", "time_in_hospital": "int64", "payer_code": "object", "medical_specialty": "object", "num_lab_procedures": "int64", "num_procedures": "int64", "num_medications": "int64", "number_outpatient": "int64", "number_emergency": "int64", "number_inpatient": "int64", "number_diagnoses": "int64", "max_glu_serum": "object", "A1Cresult": "object", "metformin": "object", "repaglinide": "object", "nateglinide": "object", "chlorpropamide": "object", "glimepiride": "object", "acetohexamide": "object", "glipizide": "object", "glyburide": "object", "tolbutamide": "object", "pioglitazone": "object", "rosiglitazone": "object", "acarbose": "object", "miglitol": "object", "troglitazone": "object", "tolazamide": "object", "examide": "object", "citoglipton": "object", "insulin": "object", "glyburide_metformin": "object", "glipizide_metformin": "object", "glimepiride_pioglitazone": "object", "metformin_rosiglitazone": "object", "metformin_pioglitazone": "object", "change": "object", "diabetesMed": "object"} 20 | 21 | def init(code_dir): 22 | global g_code_dir 23 | g_code_dir = code_dir 24 | 25 | def read_input_data(input_binary_data): 26 | data = pd.read_csv(io.BytesIO(input_binary_data)) 27 | data.drop(['diag_1_desc', 'diag_1', 'diag_2', 'diag_3'],axis=1,inplace=True) 28 | 29 | #Saving this for later 30 | global g_input_filename 31 | g_input_filename = input_binary_data 32 | return data 33 | 34 | def transform(data, model): 35 | """ 36 | Note: This hook may not have to be implemented for your model. 37 | In this case implemented for the model used in the example. 38 | Modify this method to add data transformation before scoring calls. For example, this can be 39 | used to implement one-hot encoding for models that don't include it on their own. 
40 | Parameters 41 | ---------- 42 | data: pd.DataFrame 43 | model: object, the deserialized model 44 | Returns 45 | ------- 46 | pd.DataFrame 47 | """ 48 | 49 | #Handle null values in categories and numerics 50 | for c,dt in schema.items(): 51 | if dt =='object': 52 | data[c] = data[c].fillna('missing') 53 | else: 54 | data[c] = data[c].fillna(0) 55 | 56 | pipeline_path = 'preprocessing.pkl' 57 | pipeline = joblib.load(os.path.join(g_code_dir, pipeline_path)) 58 | preprocessed = pipeline.transform(data) 59 | preprocessed = pd.DataFrame.sparse.from_spmatrix(preprocessed) 60 | 61 | return preprocessed 62 | 63 | def load_model(code_dir): 64 | model_path = 'model.pkl' 65 | model = joblib.load(os.path.join(code_dir, model_path)) 66 | return model 67 | 68 | def score(data, model, **kwargs): 69 | results = model.predict_proba(data) 70 | predictions = pd.DataFrame({'True': results[:, 0], 'False':results[:, 1]}) 71 | 72 | return predictions 73 | 74 | #Adding post_process to use legacy model together with Keras model 75 | def post_process(predictions,model): 76 | original_data = pd.read_csv(io.BytesIO(g_input_filename)) 77 | original_data.fillna(0,inplace=True) 78 | 79 | def legacy_score(row): 80 | try: 81 | return expit(0.59 + 0.55 * row['number_inpatient'] + 0.36 * row['number_outpatient']) 82 | except: 83 | return 0.38 84 | 85 | predictions['True_legacy'] = original_data.apply(lambda row: legacy_score(row), axis=1) 86 | predictions['True'] = (predictions['True'] + predictions['True_legacy']) 87 | predictions['True'] = predictions['True']/2 88 | predictions['False'] = 1 - predictions['True'] 89 | 90 | predictions.drop('True_legacy',axis=1,inplace=True) 91 | 92 | return predictions 93 | -------------------------------------------------------------------------------- /custom_inference/python/readmissions/Readmission_level_3/custom_model/model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_inference/python/readmissions/Readmission_level_3/custom_model/model.pkl -------------------------------------------------------------------------------- /custom_inference/python/readmissions/Readmission_level_3/custom_model/preprocessing.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_inference/python/readmissions/Readmission_level_3/custom_model/preprocessing.pkl -------------------------------------------------------------------------------- /custom_inference/python/readmissions/Readmission_level_3/custom_model/requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.11.0 2 | astor==0.8.1 3 | cached-property==1.5.1 4 | certifi==2020.11.8 5 | gast==0.3.3 6 | google-pasta==0.2.0 7 | grpcio==1.33.2 8 | importlib-metadata==1.7.0 9 | numpy==1.18 10 | protobuf==3.13.0 11 | PyYAML==5.3.1 12 | six==1.15.0 13 | xgboost==1.2.1 14 | termcolor==1.1.0 15 | Theano==0.8.2 16 | Werkzeug==2.0.1 17 | wrapt==1.12.1 18 | zipp==3.4.0 19 | joblib==0.14.0 20 | pandas==0.25.1 21 | scipy==1.5.3 22 | scikit_learn==0.22.0 -------------------------------------------------------------------------------- /custom_inference/python/readmissions/requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.11.0 2 | astor==0.8.1 3 | 
cached-property==1.5.1 4 | certifi==2020.11.8 5 | gast==0.3.3 6 | google-pasta==0.2.0 7 | grpcio==1.33.2 8 | importlib-metadata==1.7.0 9 | numpy==1.18 10 | protobuf==3.13.0 11 | PyYAML==5.3.1 12 | six==1.15.0 13 | termcolor==1.1.0 14 | Theano==0.8.2 15 | Werkzeug==2.0.1 16 | wrapt==1.12.1 17 | zipp==3.4.0 18 | joblib==0.14.0 19 | pandas==0.25.1 20 | scipy==1.5.3 21 | scikit_learn==0.22.0 22 | catboost==0.24.2 23 | datarobot-drum==1.5.4 24 | xgboost==1.2.1 -------------------------------------------------------------------------------- /custom_inference/r/README.md: -------------------------------------------------------------------------------- 1 | README 2 | -------------------------------------------------------------------------------- /custom_inference/r/r_glm_noncaret_basic/README.md: -------------------------------------------------------------------------------- 1 | ## R Fit Template 2 | 3 | The custom.R template includes a basic fit and init method that can be used to train a regression, binary or multilabel classification model. 4 | The expected arguments to the fit method should remain the same, although the internal functionality can be tweaked to 5 | use different modeling or preprocessing techniques. 6 | 7 | Inside you will find several commented out methods related to prediction behavior. Uncomment and implement provided methods to modify this behavior from the default. 8 | 9 | Change the target value in the code below to Species to test multilabel classification. 10 | 11 | ### To run locally using 'drum' 12 | Paths are relative to `datarobot-user-models` root: 13 | `drum fit --code-dir model_templates/training/r_glm_noncaret_basic --input tests/testdata/iris.csv --target-type regression --target 'Petal.Width'` 14 | If the command succeeds, your code is ready to be uploaded. 15 | 16 | -------------------------------------------------------------------------------- /custom_inference/r/r_glm_noncaret_basic/create_pipeline.R: -------------------------------------------------------------------------------- 1 | # This is the simplest POC that Composable ML / DRUM can 2 | # work with non-caret R models. 3 | # Here I just take the caret example and replace the 4 | # caret model with a very naive GLM() that uses the default 5 | # settings for family, link, prediction scale, etc. 
6 | # - Jason 7 | 8 | create_pipeline<-function(X, y, model_type='regression') { 9 | 10 | # set up dataframe for modeling 11 | train_df <- X 12 | train_df$target <- unlist(y) 13 | if (model_type == 'classification'){ 14 | train_df$target <- as.factor(train_df$target) 15 | } 16 | 17 | 18 | # Run the model using builtin glm to see if we can get around using caret 19 | model <- glm(target~.,data=train_df) 20 | return(model) 21 | } 22 | -------------------------------------------------------------------------------- /custom_inference/r/r_glm_noncaret_basic/custom.R: -------------------------------------------------------------------------------- 1 | init <- function(code_dir) { 2 | # custom init function to load required libraries 3 | library(tidyverse) 4 | library(caret) 5 | library(recipes) 6 | library(e1071) 7 | library(gbm) 8 | source(file.path(code_dir, 'create_pipeline.R')) 9 | } 10 | 11 | fit <- function(X, y, output_dir, class_order=NULL, row_weights=NULL, ...){ 12 | #' User-provided fit method, required for custom training 13 | #' 14 | #' Trains a regression or classification model using gbm (via caret) 15 | #' @param X data.frame - training data to perform fit on 16 | #' @param y data.frame column or array - target data to perform fit on 17 | #' @param output_dir the path to write output. This is the path provided in '--output' parameter of the 18 | #' 'drum fit' command. 19 | #' @param class_order : A two element long list dictating the order of classes which should be used for 20 | #' modeling. Class order will always be passed to fit by DataRobot for classification tasks, 21 | #' and never otherwise. When models predict, they output a likelihood of one class, with a 22 | #' value from 0 to 1. The likelihood of the other class is 1 - this likelihood. Class order 23 | #' dictates that the first element in the list will be the 0 class, and the second will be the 24 | #' 1 class. 25 | #' @param row_weights: An array of non-negative numeric values which can be used to dictate how important 26 | #' a row is. Row weights is only optionally used, and there will be no filtering for which 27 | #' custom models support this. There are two situations when values will be passed into 28 | #' row_weights, during smart downsampling and when weights are explicitly provided by the user 29 | #' @param ...: Added for forwards compatibility 30 | #' @return Nothing 31 | 32 | if (!is.null(class_order)){ 33 | model <- create_pipeline(X, y, 'classification') 34 | } else { 35 | model <- create_pipeline(X, y, 'regression') 36 | } 37 | # Save model 38 | model_path <- file.path(output_dir, 'artifact.rds') 39 | saveRDS(model, file = model_path) 40 | } 41 | -------------------------------------------------------------------------------- /custom_inference/r/r_glm_noncaret_feateng/README.md: -------------------------------------------------------------------------------- 1 | ## WORK IN PROCESS - don't run yet 2 | I'm working on an issue with DR finding my helper functions. Until that is resolved this isn't ready to run yet. 3 | 4 | ## R Fit Template 5 | 6 | The custom.R template includes a basic fit and init method that can be used to train a regression, binary or multilabel classification model. 7 | The expected arguments to the fit method should remain the same, although the internal functionality can be tweaked to 8 | use different modeling or preprocessing techniques. This version uses several custom functions for data cleaning and quality-checking, including best practices for factors (categorical) levels. 
The max number of factor levels is 30. 9 | 10 | Inside you will find several commented out methods related to prediction behavior. Uncomment and implement provided methods to modify this behavior from the default. 11 | 12 | Change the target value in the code below to Species to test multilabel classification. 13 | 14 | ### To run locally using 'drum' 15 | Paths are relative to `datarobot-user-models` root: 16 | `drum fit --code-dir model_templates/training/r_glm_noncaret_feateng --input tests/testdata/iris.csv --target-type regression --target 'Petal.Width'` 17 | If the command succeeds, your code is ready to be uploaded. 18 | 19 | -------------------------------------------------------------------------------- /custom_inference/r/r_glm_noncaret_feateng/create_pipeline.R: -------------------------------------------------------------------------------- 1 | source("preprocess.R") # not working in DR yet! 2 | source("rmcons.R") 3 | source("rmident.R") 4 | 5 | create_pipeline<-function(X, y, model_type='regression') { 6 | # Clean 7 | X <- rm_ident(X) 8 | X <- rm_cons(X) 9 | X <- preprocess(X) 10 | 11 | # set up dataframe for modeling 12 | train_df <- X 13 | train_df$target <- unlist(y) 14 | if (model_type == 'classification'){ 15 | train_df$target <- as.factor(train_df$target) 16 | } 17 | 18 | # Run the model using builtin glm function 19 | model <- glm(target~.,data=train_df) 20 | return(model) 21 | } 22 | -------------------------------------------------------------------------------- /custom_inference/r/r_glm_noncaret_feateng/custom.R: -------------------------------------------------------------------------------- 1 | init <- function(code_dir) { 2 | # custom init function to load required libraries 3 | library(tidyverse) 4 | library(caret) 5 | library(recipes) 6 | library(e1071) 7 | library(gbm) 8 | source(file.path(code_dir, 'create_pipeline.R')) 9 | } 10 | 11 | fit <- function(X, y, output_dir, class_order=NULL, row_weights=NULL, ...){ 12 | #' User-provided fit method, required for custom training 13 | #' 14 | #' Trains a regression or classification model using gbm (via caret) 15 | #' @param X data.frame - training data to perform fit on 16 | #' @param y data.frame column or array - target data to perform fit on 17 | #' @param output_dir the path to write output. This is the path provided in '--output' parameter of the 18 | #' 'drum fit' command. 19 | #' @param class_order : A two element long list dictating the order of classes which should be used for 20 | #' modeling. Class order will always be passed to fit by DataRobot for classification tasks, 21 | #' and never otherwise. When models predict, they output a likelihood of one class, with a 22 | #' value from 0 to 1. The likelihood of the other class is 1 - this likelihood. Class order 23 | #' dictates that the first element in the list will be the 0 class, and the second will be the 24 | #' 1 class. 25 | #' @param row_weights: An array of non-negative numeric values which can be used to dictate how important 26 | #' a row is. Row weights is only optionally used, and there will be no filtering for which 27 | #' custom models support this. 
There are two situations when values will be passed into 28 | #' row_weights, during smart downsampling and when weights are explicitly provided by the user 29 | #' @param ...: Added for forwards compatibility 30 | #' @return Nothing 31 | 32 | if (!is.null(class_order)){ 33 | model <- create_pipeline(X, y, 'classification') 34 | } else { 35 | model <- create_pipeline(X, y, 'regression') 36 | } 37 | # Save model 38 | model_path <- file.path(output_dir, 'artifact.rds') 39 | saveRDS(model, file = model_path) 40 | } 41 | -------------------------------------------------------------------------------- /custom_inference/r/r_glm_noncaret_feateng/preprocess.R: -------------------------------------------------------------------------------- 1 | #' Provides factor handling + mean imputation for numeric, etc 2 | #' 3 | #' @param train data.frame or similar classes 4 | #' @param test data.frame or similar 5 | #' @param exclude character scalar or vector of any column names to ignore 6 | #' @return None 7 | #' @export 8 | #' 9 | 10 | # Purge / Impute Missing Data --------------------------------------------- 11 | preprocess <- function(train, test, exclude){ 12 | # preprocess() provides factor handling and mean imputation for numeric 13 | 14 | # Coherent handling of factors when unknown new levels may occur 15 | # 16 | # Replaces NA's in numeric values 17 | # 18 | # Character are converted to factor 19 | # 20 | # Integer are converted to numeric 21 | # 22 | # Supports up to 30 factor levels, for > 30 please re-encode as a best practice 23 | # 24 | # Ignores other data classes (date, etc) 25 | 26 | if(missing(exclude)){ 27 | exclude <- c("next_term_retention", "group", "oos_Date", 28 | "Term_Start_Date") 29 | } else{ 30 | if(!class(exclude)=="character"){ 31 | cat("exclude must be a character or character vector") 32 | 33 | } 34 | } 35 | 36 | for(i in 1:length(colnames(train))){ 37 | if(class(train[,i]) %in% c("Date", "POSIXct", "POSIXt")){ 38 | next 39 | } 40 | if("integer" %in% class(train[,i])){ 41 | train[,i] <- as.numeric(train[,i]) 42 | } 43 | if("numeric" %in% class(train[,i])){ 44 | train[,i] <- na.roughfix(train[,i]) 45 | } 46 | if(("factor" %in% class(train[,i])) | 47 | ("character" %in% class(train[,i]))){ 48 | if(colnames(train)[i] %in% exclude){ 49 | next 50 | } else{ 51 | train[,i] <- as.character(train[,i]) 52 | train[1,i] <- NA 53 | train[,i][is.na(train[,i])] <- "Missing" 54 | train[,i] <- as.factor(train[,i]) 55 | } 56 | if(length(levels(train[,i])) > 30){ 57 | print("TOO MANY LEVELS!") 58 | print(colnames(train)[i]) 59 | print(length(levels(train[,i]))) 60 | } 61 | } 62 | } 63 | 64 | test <- test[,colnames(test) %in% colnames(train)] 65 | 66 | cn <- colnames(test) 67 | for(i in 1:length(cn)){ 68 | if("integer" %in% class(test[,i])){ 69 | test[,i] <- as.numeric(test[,i]) 70 | } 71 | if("numeric" %in% class(test[,i])){ 72 | test[,i] <- na.roughfix(test[,i]) 73 | } 74 | if(("factor" %in% class(test[,i])) | 75 | ("character" %in% class(test[,i]))){ 76 | if(colnames(test)[i] %in% exclude){ 77 | next 78 | } else{ 79 | test[,i] <- as.character(test[,i]) 80 | test[1,i] <- NA 81 | test[,i][is.na(test[,i])] <- "Missing" 82 | test[,i] <- as.factor(test[,i]) 83 | } 84 | 85 | if(!identical(levels(test[,i]), 86 | levels(train[,colnames(test)[i]]))){ 87 | l1 <- levels(train[,colnames(test)[i]]) 88 | l2 <- levels(test[,i]) 89 | 90 | test[,i] <- as.character(test[,i]) 91 | test[,i] <- ifelse(test[,i] %in% l1, 92 | as.character(test[,i]), 93 | "Missing") 94 | 95 | if(length(l2) < length(l1) | 96 | 
(length(l2) == length(l1) & 97 | !(identical(levels(as.factor(test[,i])), 98 | levels(train[,colnames(test)[i]])))) 99 | ){ 100 | for(n in 1:length(l1[!l1 %in% l2])){ 101 | test[10+n,i] <- l1[!l1 %in% l2][n] 102 | } 103 | } 104 | 105 | test[,i] <- as.factor(test[,i]) 106 | 107 | if(!identical(levels(test[,i]), 108 | levels(train[,colnames(test)[i]]))){ 109 | print("There is a problem with factor levels!") 110 | print(i) 111 | } 112 | } 113 | 114 | if(length(levels(test[,i])) > 30){ 115 | print(colnames(test)[i]) 116 | print(length(levels(test[,i]))) 117 | } 118 | } 119 | } 120 | 121 | for(i in 1:ncol(train)){ 122 | if(!(identical( 123 | colnames(train)[i], 124 | colnames(test)[i] 125 | ))){ 126 | print(colnames(train)[i]) 127 | print(i) 128 | } 129 | if(!(identical( 130 | class(train)[i], 131 | class(test)[i] 132 | ))){ 133 | print("There is a problem with:") 134 | print(colnames(train)[i]) 135 | print("which is col number:") 136 | print(i) 137 | } 138 | if(!(identical( 139 | levels(train)[i], 140 | levels(test)[i] 141 | ))){ 142 | print("There is a problem with:") 143 | print(colnames(train)[i]) 144 | print(i) 145 | } 146 | } 147 | if(sum(is.na(test)) > 0){print("!!!WARNING!!! THERE IS A PROBLEM WITH MISSING DATA IN test")} 148 | } 149 | -------------------------------------------------------------------------------- /custom_inference/r/r_glm_noncaret_feateng/rmcons.R: -------------------------------------------------------------------------------- 1 | #' Removes constant columns 2 | #' 3 | #' @param x data.frame or similar classes 4 | #' 5 | #' @return None 6 | #' @export 7 | #' 8 | #' 9 | rm_cons <- function(x){ 10 | cat("\n## Removing the constant features.\n") 11 | for (f in names(x)) { 12 | if (length(unique(x[[f]])) == 1) { 13 | cat(f, "is constant. It is has been deleted.\n") 14 | x[[f]] <- NULL 15 | } 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /custom_inference/r/r_glm_noncaret_feateng/rmident.R: -------------------------------------------------------------------------------- 1 | 2 | #' Removing identical features 3 | #' 4 | #' @param train required. data.frame or similar classes 5 | #' @param test optional. data.frame or similar classes 6 | #' 7 | #' @return None 8 | #' @export 9 | #' 10 | #' 11 | 12 | 13 | 14 | rm_ident <- function(train, test=NULL) { 15 | features_pair <- combn(names(train), 2, simplify = F) 16 | toRemove <- c() 17 | for(pair in features_pair) { 18 | f1 <- pair[1] 19 | f2 <- pair[2] 20 | 21 | if (!(f1 %in% toRemove) & !(f2 %in% toRemove)) { 22 | if (all(train[[f1]] == train[[f2]])) { 23 | cat(f1, "and", f2, "are equals.\n") 24 | toRemove <- c(toRemove, f2) 25 | } 26 | } 27 | } 28 | train <- train[,!colnames(train) %in% toRemove] 29 | if(!missing(test)){ 30 | test <- test[,!colnames(test) %in% toRemove] 31 | } 32 | } 33 | 34 | -------------------------------------------------------------------------------- /custom_inference/r/r_glm_noncaret_gamma/README.md: -------------------------------------------------------------------------------- 1 | ## R Fit Template 2 | 3 | The custom.R template includes a basic fit and init method that can be used to train a regression, binary or multilabel classification model. 4 | The expected arguments to the fit method should remain the same, although the internal functionality can be tweaked to 5 | use different modeling or preprocessing techniques. 6 | 7 | Inside you will find several commented out methods related to prediction behavior. 
Uncomment and implement provided methods to modify this behavior from the default. 8 | 9 | Change the target value in the code below to Species to test multilabel classification. 10 | 11 | ### To run locally using 'drum' 12 | Paths are relative to `datarobot-user-models` root: 13 | `drum fit --code-dir model_templates/training/r_glm_noncaret_gamma --input tests/testdata/iris.csv --target-type regression --target 'Petal.Width'` 14 | If the command succeeds, your code is ready to be uploaded. 15 | 16 | -------------------------------------------------------------------------------- /custom_inference/r/r_glm_noncaret_gamma/create_pipeline.R: -------------------------------------------------------------------------------- 1 | create_pipeline<-function(X, y, model_type='regression') { 2 | 3 | # set up data.frame for modeling 4 | train_df <- X 5 | train_df$target <- unlist(y) 6 | if (model_type == 'classification'){ 7 | train_df$target <- as.factor(train_df$target) 8 | } 9 | 10 | 11 | # Run a logistic regression using builtin glm 12 | model <- glm(target~., data=train_df, family = Gamma) 13 | return(model) 14 | } 15 | -------------------------------------------------------------------------------- /custom_inference/r/r_glm_noncaret_gamma/custom.R: -------------------------------------------------------------------------------- 1 | init <- function(code_dir) { 2 | # custom init function to load required libraries 3 | library(tidyverse) 4 | library(caret) 5 | library(recipes) 6 | library(e1071) 7 | library(gbm) 8 | source(file.path(code_dir, 'create_pipeline.R')) 9 | } 10 | 11 | fit <- function(X, y, output_dir, class_order=NULL, row_weights=NULL, ...){ 12 | #' User-provided fit method, required for custom training 13 | #' 14 | #' Trains a regression or classification model using gbm (via caret) 15 | #' @param X data.frame - training data to perform fit on 16 | #' @param y data.frame column or array - target data to perform fit on 17 | #' @param output_dir the path to write output. This is the path provided in '--output' parameter of the 18 | #' 'drum fit' command. 19 | #' @param class_order : A two element long list dictating the order of classes which should be used for 20 | #' modeling. Class order will always be passed to fit by DataRobot for classification tasks, 21 | #' and never otherwise. When models predict, they output a likelihood of one class, with a 22 | #' value from 0 to 1. The likelihood of the other class is 1 - this likelihood. Class order 23 | #' dictates that the first element in the list will be the 0 class, and the second will be the 24 | #' 1 class. 25 | #' @param row_weights: An array of non-negative numeric values which can be used to dictate how important 26 | #' a row is. Row weights is only optionally used, and there will be no filtering for which 27 | #' custom models support this. 
There are two situations when values will be passed into 28 | #' row_weights, during smart downsampling and when weights are explicitly provided by the user 29 | #' @param ...: Added for forwards compatibility 30 | #' @return Nothing 31 | 32 | if (!is.null(class_order)){ 33 | model <- create_pipeline(X, y, 'classification') 34 | } else { 35 | model <- create_pipeline(X, y, 'regression') 36 | } 37 | # Save model 38 | model_path <- file.path(output_dir, 'artifact.rds') 39 | saveRDS(model, file = model_path) 40 | } 41 | -------------------------------------------------------------------------------- /custom_inference/r/r_glm_noncaret_logit/README.md: -------------------------------------------------------------------------------- 1 | ## R Fit Template 2 | 3 | The custom.R template includes a basic fit and init method that can be used to train a regression, binary or multilabel classification model. 4 | The expected arguments to the fit method should remain the same, although the internal functionality can be tweaked to 5 | use different modeling or preprocessing techniques. 6 | 7 | Inside you will find several commented out methods related to prediction behavior. Uncomment and implement provided methods to modify this behavior from the default. 8 | 9 | Change the target value in the code below to Species to test multilabel classification. 10 | 11 | ### To run locally using 'drum' 12 | Paths are relative to `datarobot-user-models` root: 13 | `drum fit --code-dir model_templates/training/r_lang --input tests/testdata/iris.csv --target-type regression --target 'Petal.Width'` 14 | If the command succeeds, your code is ready to be uploaded. 15 | 16 | -------------------------------------------------------------------------------- /custom_inference/r/r_glm_noncaret_logit/create_pipeline.R: -------------------------------------------------------------------------------- 1 | create_pipeline<-function(X, y, model_type='regression') { 2 | 3 | # set up data.frame for modeling 4 | train_df <- X 5 | train_df$target <- unlist(y) 6 | if (model_type == 'classification'){ 7 | train_df$target <- as.factor(train_df$target) 8 | } 9 | 10 | 11 | # Run a logistic regression using builtin glm 12 | model <- glm(target~., data=train_df, family = "binomial") 13 | return(model) 14 | } 15 | -------------------------------------------------------------------------------- /custom_inference/r/r_glm_noncaret_logit/custom.R: -------------------------------------------------------------------------------- 1 | init <- function(code_dir) { 2 | # custom init function to load required libraries 3 | library(tidyverse) 4 | library(caret) 5 | library(recipes) 6 | library(e1071) 7 | library(gbm) 8 | source(file.path(code_dir, 'create_pipeline.R')) 9 | } 10 | 11 | fit <- function(X, y, output_dir, class_order=NULL, row_weights=NULL, ...){ 12 | #' User-provided fit method, required for custom training 13 | #' 14 | #' Trains a regression or classification model using gbm (via caret) 15 | #' @param X data.frame - training data to perform fit on 16 | #' @param y data.frame column or array - target data to perform fit on 17 | #' @param output_dir the path to write output. This is the path provided in '--output' parameter of the 18 | #' 'drum fit' command. 19 | #' @param class_order : A two element long list dictating the order of classes which should be used for 20 | #' modeling. Class order will always be passed to fit by DataRobot for classification tasks, 21 | #' and never otherwise. 
When models predict, they output a likelihood of one class, with a 22 | #' value from 0 to 1. The likelihood of the other class is 1 - this likelihood. Class order 23 | #' dictates that the first element in the list will be the 0 class, and the second will be the 24 | #' 1 class. 25 | #' @param row_weights: An array of non-negative numeric values which can be used to dictate how important 26 | #' a row is. Row weights is only optionally used, and there will be no filtering for which 27 | #' custom models support this. There are two situations when values will be passed into 28 | #' row_weights, during smart downsampling and when weights are explicitly provided by the user 29 | #' @param ...: Added for forwards compatibility 30 | #' @return Nothing 31 | 32 | if (!is.null(class_order)){ 33 | model <- create_pipeline(X, y, 'classification') 34 | } else { 35 | model <- create_pipeline(X, y, 'regression') 36 | } 37 | # Save model 38 | model_path <- file.path(output_dir, 'artifact.rds') 39 | saveRDS(model, file = model_path) 40 | } 41 | -------------------------------------------------------------------------------- /custom_inference/r/r_glm_noncaret_recipe/README.md: -------------------------------------------------------------------------------- 1 | ## R Fit Template 2 | 3 | The custom.R template includes a basic fit and init method that can be used to train a regression, binary or multilabel classification model. 4 | The expected arguments to the fit method should remain the same, although the internal functionality can be tweaked to 5 | use different modeling or preprocessing techniques. This version uses the recipe function (recipes library) to perform normalization and check for constant columns. 6 | 7 | Inside you will find several commented out methods related to prediction behavior. Uncomment and implement provided methods to modify this behavior from the default. 8 | 9 | Change the target value in the code below to Species to test multilabel classification. 10 | 11 | ### To run locally using 'drum' 12 | Paths are relative to `datarobot-user-models` root: 13 | `drum fit --code-dir model_templates/training/r_glm_noncaret_recipe --input tests/testdata/iris.csv --target-type regression --target 'Petal.Width'` 14 | If the command succeeds, your code is ready to be uploaded. 15 | 16 | -------------------------------------------------------------------------------- /custom_inference/r/r_glm_noncaret_recipe/create_pipeline.R: -------------------------------------------------------------------------------- 1 | # This is the simplest POC that Composable ML / DRUM can 2 | # work with non-caret R models. 3 | # Here I just take the caret example and replace the 4 | # caret model with a very naive GLM() that uses the default 5 | # settings for family, link, prediction scale, etc. 
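# Note: the recipe assembled below is illustrative only; it is never prep()-ed
# or baked, so the glm() at the end of create_pipeline() is fit on the raw train_df.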
6 | # - Jason 7 | 8 | create_pipeline<-function(X, y, model_type='regression') { 9 | 10 | # set up dataframe for modeling 11 | train_df <- X 12 | train_df$target <- unlist(y) 13 | if (model_type == 'classification'){ 14 | train_df$target <- as.factor(train_df$target) 15 | } 16 | 17 | # set up the modeling pipeline 18 | model_recipe <- recipe(target ~ ., data = train_df) %>% 19 | # Drop constant columns 20 | step_zv(all_predictors()) %>% 21 | 22 | # Numeric preprocessing 23 | step_medianimpute(all_numeric()) %>% 24 | step_normalize(all_numeric(), -all_outcomes()) %>% 25 | 26 | # Categorical preprocessing 27 | step_other(all_nominal(), -all_outcomes()) %>% 28 | step_dummy(all_nominal(), -all_outcomes()) 29 | 30 | # Run the model using the builtin glm function 31 | model <- glm(target~.,data=train_df) 32 | return(model) 33 | } 34 | -------------------------------------------------------------------------------- /custom_inference/r/r_glm_noncaret_recipe/custom.R: -------------------------------------------------------------------------------- 1 | init <- function(code_dir) { 2 | # custom init function to load required libraries 3 | library(tidyverse) 4 | library(caret) 5 | library(recipes) 6 | library(e1071) 7 | library(gbm) 8 | source(file.path(code_dir, 'create_pipeline.R')) 9 | } 10 | 11 | fit <- function(X, y, output_dir, class_order=NULL, row_weights=NULL, ...){ 12 | #' User-provided fit method, required for custom training 13 | #' 14 | #' Trains a regression or classification model using the builtin glm (see create_pipeline.R) 15 | #' @param X data.frame - training data to perform fit on 16 | #' @param y data.frame column or array - target data to perform fit on 17 | #' @param output_dir the path to write output. This is the path provided in '--output' parameter of the 18 | #' 'drum fit' command. 19 | #' @param class_order : A two element long list dictating the order of classes which should be used for 20 | #' modeling. Class order will always be passed to fit by DataRobot for classification tasks, 21 | #' and never otherwise. When models predict, they output a likelihood of one class, with a 22 | #' value from 0 to 1. The likelihood of the other class is 1 - this likelihood. Class order 23 | #' dictates that the first element in the list will be the 0 class, and the second will be the 24 | #' 1 class. 25 | #' @param row_weights: An array of non-negative numeric values which can be used to dictate how important 26 | #' a row is. Row weights is only optionally used, and there will be no filtering for which 27 | #' custom models support this. There are two situations when values will be passed into 28 | #' row_weights, during smart downsampling and when weights are explicitly provided by the user 29 | #' @param ...: Added for forwards compatibility 30 | #' @return Nothing 31 | 32 | if (!is.null(class_order)){ 33 | model <- create_pipeline(X, y, 'classification') 34 | } else { 35 | model <- create_pipeline(X, y, 'regression') 36 | } 37 | # Save model 38 | model_path <- file.path(output_dir, 'artifact.rds') 39 | saveRDS(model, file = model_path) 40 | } 41 | -------------------------------------------------------------------------------- /custom_inference/scala/iris_binary/README.md: -------------------------------------------------------------------------------- 1 | ## Scala Custom Inference 2 | 3 | #### Owner: Tim Whittaker (timothy.whittaker@datarobot.com) 4 | 5 | This custom inference model was written in Scala, using XGBoost4j to predict Iris Species (Binary Version).
The main class, `XGBoostPredictor` inherits the `BasePredictor` class from DRUM. You can see the code [here](src/main/scala/XGBoostPredictor.scala). Training Code was included as well. 6 | 7 | The serialized version of the model is already available, but if you would like to train and save it to `custom-model/xgb-model` run the following from commend line. 8 | 9 | `java -jar custom-model/custom-scala-assembly-0.1.0.jar ./custom-model/xgb-model` 10 | 11 | To run this model with `DRUM` export the following environment variables. 12 | 13 | `export DRUM_JAVA_CUSTOM_CLASS_PATH=/full/path/to/custom-model/custom-scala-assembly-0.1.0.jar` 14 | 15 | `export DRUM_JAVA_CUSTOM_PREDICTOR_CLASS=custom.XGBoostPredictor` 16 | 17 | Now run with DRUM 18 | 19 | `drum score --code-dir ./custom-model --input data/iris_binary_training.csv --target-type binary --positive-class-label 1 --negative-class-label 0` 20 | 21 | ## requirements 22 | 23 | * java 11 24 | 25 | To build the jar youl will need sbt -------------------------------------------------------------------------------- /custom_inference/scala/iris_binary/build.sbt: -------------------------------------------------------------------------------- 1 | name := "custom-scala" 2 | 3 | scalaVersion := "2.12.8" 4 | 5 | version := "0.1.0" 6 | 7 | libraryDependencies += "ml.dmlc" %% "xgboost4j" % "1.4.1" -------------------------------------------------------------------------------- /custom_inference/scala/iris_binary/custom-model/custom-scala-assembly-0.1.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_inference/scala/iris_binary/custom-model/custom-scala-assembly-0.1.0.jar -------------------------------------------------------------------------------- /custom_inference/scala/iris_binary/custom-model/xgb-model/model.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_inference/scala/iris_binary/custom-model/xgb-model/model.bin -------------------------------------------------------------------------------- /custom_inference/scala/iris_binary/data/iris_binary_training.csv: -------------------------------------------------------------------------------- 1 | Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species 2 | 1,5.1,3.5,1.4,0.2,Iris-setosa 3 | 2,4.9,3.0,1.4,0.2,Iris-setosa 4 | 3,4.7,3.2,1.3,0.2,Iris-setosa 5 | 4,4.6,3.1,1.5,0.2,Iris-setosa 6 | 5,5.0,3.6,1.4,0.2,Iris-setosa 7 | 6,5.4,3.9,1.7,0.4,Iris-setosa 8 | 7,4.6,3.4,1.4,0.3,Iris-setosa 9 | 8,5.0,3.4,1.5,0.2,Iris-setosa 10 | 9,4.4,2.9,1.4,0.2,Iris-setosa 11 | 10,4.9,3.1,1.5,0.1,Iris-setosa 12 | 11,5.4,3.7,1.5,0.2,Iris-setosa 13 | 12,4.8,3.4,1.6,0.2,Iris-setosa 14 | 13,4.8,3.0,1.4,0.1,Iris-setosa 15 | 14,4.3,3.0,1.1,0.1,Iris-setosa 16 | 15,5.8,4.0,1.2,0.2,Iris-setosa 17 | 16,5.7,4.4,1.5,0.4,Iris-setosa 18 | 17,5.4,3.9,1.3,0.4,Iris-setosa 19 | 18,5.1,3.5,1.4,0.3,Iris-setosa 20 | 19,5.7,3.8,1.7,0.3,Iris-setosa 21 | 20,5.1,3.8,1.5,0.3,Iris-setosa 22 | 21,5.4,3.4,1.7,0.2,Iris-setosa 23 | 22,5.1,3.7,1.5,0.4,Iris-setosa 24 | 23,4.6,3.6,1.0,0.2,Iris-setosa 25 | 24,5.1,3.3,1.7,0.5,Iris-setosa 26 | 25,4.8,3.4,1.9,0.2,Iris-setosa 27 | 26,5.0,3.0,1.6,0.2,Iris-setosa 28 | 27,5.0,3.4,1.6,0.4,Iris-setosa 29 | 28,5.2,3.5,1.5,0.2,Iris-setosa 30 | 29,5.2,3.4,1.4,0.2,Iris-setosa 31 | 
30,4.7,3.2,1.6,0.2,Iris-setosa 32 | 31,4.8,3.1,1.6,0.2,Iris-setosa 33 | 32,5.4,3.4,1.5,0.4,Iris-setosa 34 | 33,5.2,4.1,1.5,0.1,Iris-setosa 35 | 34,5.5,4.2,1.4,0.2,Iris-setosa 36 | 35,4.9,3.1,1.5,0.1,Iris-setosa 37 | 36,5.0,3.2,1.2,0.2,Iris-setosa 38 | 37,5.5,3.5,1.3,0.2,Iris-setosa 39 | 38,4.9,3.1,1.5,0.1,Iris-setosa 40 | 39,4.4,3.0,1.3,0.2,Iris-setosa 41 | 51,7.0,3.2,4.7,1.4,Iris-versicolor 42 | 52,6.4,3.2,4.5,1.5,Iris-versicolor 43 | 53,6.9,3.1,4.9,1.5,Iris-versicolor 44 | 54,5.5,2.3,4.0,1.3,Iris-versicolor 45 | 55,6.5,2.8,4.6,1.5,Iris-versicolor 46 | 56,5.7,2.8,4.5,1.3,Iris-versicolor 47 | 57,6.3,3.3,4.7,1.6,Iris-versicolor 48 | 58,4.9,2.4,3.3,1.0,Iris-versicolor 49 | 59,6.6,2.9,4.6,1.3,Iris-versicolor 50 | 60,5.2,2.7,3.9,1.4,Iris-versicolor 51 | 61,5.0,2.0,3.5,1.0,Iris-versicolor 52 | 62,5.9,3.0,4.2,1.5,Iris-versicolor 53 | 63,6.0,2.2,4.0,1.0,Iris-versicolor 54 | 64,6.1,2.9,4.7,1.4,Iris-versicolor 55 | 65,5.6,2.9,3.6,1.3,Iris-versicolor 56 | 66,6.7,3.1,4.4,1.4,Iris-versicolor 57 | 67,5.6,3.0,4.5,1.5,Iris-versicolor 58 | 68,5.8,2.7,4.1,1.0,Iris-versicolor 59 | 69,6.2,2.2,4.5,1.5,Iris-versicolor 60 | 70,5.6,2.5,3.9,1.1,Iris-versicolor 61 | 71,5.9,3.2,4.8,1.8,Iris-versicolor 62 | 72,6.1,2.8,4.0,1.3,Iris-versicolor 63 | 73,6.3,2.5,4.9,1.5,Iris-versicolor 64 | 74,6.1,2.8,4.7,1.2,Iris-versicolor 65 | 75,6.4,2.9,4.3,1.3,Iris-versicolor 66 | 76,6.6,3.0,4.4,1.4,Iris-versicolor 67 | 77,6.8,2.8,4.8,1.4,Iris-versicolor 68 | 78,6.7,3.0,5.0,1.7,Iris-versicolor 69 | 79,6.0,2.9,4.5,1.5,Iris-versicolor 70 | 1,5.1,3.5,1.4,0.2,Iris-setosa 71 | 2,4.9,3.0,1.4,0.2,Iris-setosa 72 | 3,4.7,3.2,1.3,0.2,Iris-setosa 73 | 4,4.6,3.1,1.5,0.2,Iris-setosa 74 | 5,5.0,3.6,1.4,0.2,Iris-setosa 75 | 6,5.4,3.9,1.7,0.4,Iris-setosa 76 | 7,4.6,3.4,1.4,0.3,Iris-setosa 77 | 8,5.0,3.4,1.5,0.2,Iris-setosa 78 | 9,4.4,2.9,1.4,0.2,Iris-setosa 79 | 10,4.9,3.1,1.5,0.1,Iris-setosa 80 | 11,5.4,3.7,1.5,0.2,Iris-setosa 81 | 12,4.8,3.4,1.6,0.2,Iris-setosa 82 | 13,4.8,3.0,1.4,0.1,Iris-setosa 83 | 14,4.3,3.0,1.1,0.1,Iris-setosa 84 | 15,5.8,4.0,1.2,0.2,Iris-setosa 85 | 16,5.7,4.4,1.5,0.4,Iris-setosa 86 | 17,5.4,3.9,1.3,0.4,Iris-setosa 87 | 18,5.1,3.5,1.4,0.3,Iris-setosa 88 | 19,5.7,3.8,1.7,0.3,Iris-setosa 89 | 20,5.1,3.8,1.5,0.3,Iris-setosa 90 | 21,5.4,3.4,1.7,0.2,Iris-setosa 91 | 22,5.1,3.7,1.5,0.4,Iris-setosa 92 | 23,4.6,3.6,1.0,0.2,Iris-setosa 93 | 24,5.1,3.3,1.7,0.5,Iris-setosa 94 | 25,4.8,3.4,1.9,0.2,Iris-setosa 95 | 26,5.0,3.0,1.6,0.2,Iris-setosa 96 | 27,5.0,3.4,1.6,0.4,Iris-setosa 97 | 28,5.2,3.5,1.5,0.2,Iris-setosa 98 | 29,5.2,3.4,1.4,0.2,Iris-setosa 99 | 30,4.7,3.2,1.6,0.2,Iris-setosa 100 | 31,4.8,3.1,1.6,0.2,Iris-setosa 101 | 32,5.4,3.4,1.5,0.4,Iris-setosa 102 | 33,5.2,4.1,1.5,0.1,Iris-setosa 103 | 34,5.5,4.2,1.4,0.2,Iris-setosa 104 | 73,6.3,2.5,4.9,1.5,Iris-versicolor 105 | 74,6.1,2.8,4.7,1.2,Iris-versicolor 106 | 75,6.4,2.9,4.3,1.3,Iris-versicolor 107 | 76,6.6,3.0,4.4,1.4,Iris-versicolor 108 | 77,6.8,2.8,4.8,1.4,Iris-versicolor 109 | 78,6.7,3.0,5.0,1.7,Iris-versicolor 110 | 79,6.0,2.9,4.5,1.5,Iris-versicolor 111 | -------------------------------------------------------------------------------- /custom_inference/scala/iris_binary/lib/predictors.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_inference/scala/iris_binary/lib/predictors.jar -------------------------------------------------------------------------------- /custom_inference/scala/iris_binary/project/build.properties: 
-------------------------------------------------------------------------------- 1 | sbt.version=1.3.0 2 | -------------------------------------------------------------------------------- /custom_inference/scala/iris_binary/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.10") 2 | 3 | 4 | 5 | // addSbtPlugin("com.github.nyavro" % "sbt-spi-plugin" % "1.0.4") -------------------------------------------------------------------------------- /custom_inference/scala/iris_binary/src/main/scala/Main.scala: -------------------------------------------------------------------------------- 1 | import ml.dmlc.xgboost4j.scala.DMatrix 2 | import ml.dmlc.xgboost4j.scala.Booster; 3 | import ml.dmlc.xgboost4j.scala.XGBoost; 4 | import ml.dmlc.xgboost4j.LabeledPoint 5 | import java.io._ 6 | import java.nio.file.Paths 7 | 8 | object TrainXGB extends App { 9 | 10 | val dir = new File(args(0)) 11 | dir.exists match { 12 | case true => null 13 | case false => dir.mkdirs 14 | } 15 | 16 | val data = scala.io.Source 17 | .fromFile( 18 | "data/iris_binary_training.csv" 19 | ) 20 | .getLines 21 | val positiveClassLabel = "Iris-versicolor" 22 | val negativeClassLabel = "Iris-setosa" 23 | val headers = data.next 24 | val nullArray = null.asInstanceOf[Array[Int]] 25 | val dataIter = data.map { row => 26 | val d = row.split(",") 27 | val len = d.length - 1 28 | val (features, label) = d.splitAt(len) 29 | val label_bin = label.apply(0) match { 30 | case "Iris-setosa" => 0f 31 | case "Iris-versicolor" => 1f 32 | case _ => throw new Exception("not set for multiclass") 33 | } 34 | LabeledPoint( 35 | label = label_bin, 36 | len - 1, 37 | indices = nullArray, 38 | values = features.map { _.toFloat }.tail 39 | ) 40 | } 41 | 42 | val dmatrix = new DMatrix(dataIter.toIterator, cacheInfo = null) 43 | val paramMap = List( 44 | "eta" -> 0.1, 45 | "max_depth" -> 5, 46 | "objective" -> "binary:logistic", 47 | "verbosity" -> 1 48 | ).toMap 49 | // number of iterations 50 | val round = 100 51 | val booster = 52 | XGBoost.train(dmatrix, paramMap, round, earlyStoppingRound = 200) 53 | 54 | val modelPath = Paths.get(dir.toString, "model.bin").toFile 55 | booster.saveModel(modelPath.toString) 56 | 57 | } 58 | -------------------------------------------------------------------------------- /custom_inference/scala/iris_binary/src/main/scala/XGBoostPredictor.scala: -------------------------------------------------------------------------------- 1 | package custom 2 | 3 | import com.datarobot.drum._ 4 | import collection.JavaConverters._ 5 | import ml.dmlc.xgboost4j.scala.DMatrix 6 | import ml.dmlc.xgboost4j.scala.Booster; 7 | import ml.dmlc.xgboost4j.scala.XGBoost; 8 | import ml.dmlc.xgboost4j.LabeledPoint 9 | 10 | import org.apache.commons.csv.CSVFormat; 11 | 12 | import java.io.{BufferedReader, ByteArrayInputStream, InputStreamReader} 13 | 14 | import java.util.HashMap 15 | 16 | import util.{Try, Success, Failure} 17 | import java.nio.file.Paths 18 | 19 | 20 | class XGBoostPredictor(name: String) extends BasePredictor(name) 21 | { 22 | 23 | var customModelPath: String = null 24 | var negativeClassLabel: String = null 25 | var positiveClassLabel: String = null 26 | var booster: Booster = null 27 | val features = Array("SepalLengthCm","SepalWidthCm","PetalLengthCm","PetalWidthCm") 28 | val numFeatures = features.length 29 | 30 | override def configure( 31 | params: java.util.Map[String, AnyRef] = new java.util.HashMap[String, AnyRef]() 
32 | ) = { 33 | customModelPath = params.get("__custom_model_path__").asInstanceOf[String] 34 | negativeClassLabel = params.get("negativeClassLabel").asInstanceOf[String] 35 | positiveClassLabel = params.get("positiveClassLabel").asInstanceOf[String] 36 | val modelPath = Paths.get(customModelPath, "xgb-model", "model.bin").toFile 37 | modelPath exists match { 38 | case false => throw new Exception(s"${modelPath} does not exist") 39 | case true => null 40 | } 41 | booster = XGBoost.loadModel(modelPath.toString) 42 | } 43 | override def predict(inputBytes: Array[Byte]): String = { 44 | val reader = new BufferedReader(new InputStreamReader(new ByteArrayInputStream(inputBytes))) 45 | val csvFormat = CSVFormat.DEFAULT.withFirstRecordAsHeader; 46 | val parser = csvFormat.parse(reader) 47 | val sParser = parser.iterator.asScala.map { _.toMap } 48 | val dataIter = sParser.map{ row => 49 | val rs = row.asScala.filter{ case(k,v) => features.contains(k)}.map{ _._2} 50 | LabeledPoint(0f, numFeatures, null, rs.toArray.map{_.toFloat}) 51 | }.toIterator 52 | val dmatrix = new DMatrix(dataIter) 53 | val predictions = booster.predict(dmatrix).map{ p => 54 | val p1 = p(0) 55 | val p0 = 1 - p1 56 | s"${p0},${p1}" 57 | } 58 | predictions.mkString(s"${negativeClassLabel},${positiveClassLabel}\n", "\n", "") 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /custom_tasks/models/classification/python/README.md: -------------------------------------------------------------------------------- 1 | ## Custom Tasks for Classification with Python 2 | 3 | This directory contains examples of Custom Tasks for Classification written in Python. This examples can work with DataRobot Composable ML 4 | -------------------------------------------------------------------------------- /custom_tasks/models/classification/python/catboost/catboost_pipeline.py: -------------------------------------------------------------------------------- 1 | 2 | from sklearn.pipeline import Pipeline 3 | from sklearn.compose import ColumnTransformer 4 | from sklearn.impute import SimpleImputer 5 | import numpy as np 6 | import pandas as pd 7 | from catboost import CatBoostClassifier 8 | from typing import List, Optional 9 | from feature_selection import DataSelector 10 | 11 | 12 | class CatBoostClassifier_wrapper: 13 | """ 14 | A wrapper is not required in typical cases, but is valuable for catboost 15 | it allows to automatically identify and pass categorical/text features 16 | to CatBoostClassifier.fit while working with DR custom tasks logic 17 | """ 18 | 19 | model = None 20 | 21 | def model(self): 22 | return self.model 23 | 24 | def fit(self, X, y): 25 | data = pd.DataFrame(X, columns=map(str, range(len(X[0])))) 26 | cat_features = DataSelector.CatSelector(data) 27 | text_features = DataSelector.TxtSelector(data) 28 | self.model = CatBoostClassifier( 29 | allow_writing_files=False, 30 | #train_dir="catboost_info", 31 | iterations=50 32 | ).fit( 33 | X, y, cat_features=cat_features, text_features=text_features 34 | ) 35 | return self.model 36 | 37 | def predict_proba(self, data: pd.DataFrame): 38 | return pd.DataFrame( 39 | data=self.model.predict_proba(data), columns=self.model.classes_ 40 | ) 41 | 42 | 43 | def catboost_pipeline(X): 44 | catboost_preprocessing = ColumnTransformer( 45 | transformers=[ 46 | ("num", "passthrough", DataSelector.NumSelector), 47 | ( 48 | "cat", 49 | SimpleImputer(strategy="constant", fill_value=""), 50 | DataSelector.CatSelector, 51 | ), 52 | # ("txt", 
SimpleImputer(strategy="constant", fill_value=""), DataSelector.TxtSelector), 53 | ], 54 | remainder="drop", 55 | ) 56 | 57 | return Pipeline( 58 | steps=[ 59 | ("preprocessing", catboost_preprocessing), 60 | ("model", CatBoostClassifier_wrapper()), 61 | ] 62 | ) 63 | -------------------------------------------------------------------------------- /custom_tasks/models/classification/python/catboost/custom.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | import pickle 3 | import pandas as pd 4 | import numpy as np 5 | from pathlib import Path 6 | from sklearn.preprocessing import LabelEncoder 7 | from catboost_pipeline import catboost_pipeline 8 | 9 | 10 | def fit( 11 | X: pd.DataFrame, 12 | y: pd.Series, 13 | output_dir: str, 14 | class_order: Optional[List[str]] = None, 15 | row_weights: Optional[np.ndarray] = None, 16 | **kwargs, 17 | ) -> None: 18 | 19 | 20 | estimator = catboost_pipeline(X) 21 | estimator.fit(X, y) 22 | 23 | 24 | output_dir_path = Path(output_dir) 25 | if output_dir_path.exists() and output_dir_path.is_dir(): 26 | with open("{}/artifact.pkl".format(output_dir), "wb") as fp: 27 | pickle.dump(estimator, fp) 28 | 29 | 30 | def score(data: pd.DataFrame, model, **kwargs) -> pd.DataFrame: 31 | return model.predict_proba(data) 32 | -------------------------------------------------------------------------------- /custom_tasks/models/classification/python/catboost/feature_selection.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | def is_numeric(x: pd.Series): 5 | try: 6 | sum(x) 7 | return True 8 | except: 9 | return False 10 | 11 | 12 | # Helper function to use in text_selector 13 | def is_text(x: pd.Series): 14 | """ 15 | Decide if a pandas series is text, using a very simple heuristic: 16 | - Count the number of elements in the series that contain 1 or more whitespace character 17 | - If >75% of the elements have whitespace and # unique / # all values >0.1, the Series is text. 
18 | - If # unique / # all values >0.8 then the series is text 19 | otherwise - non-text 20 | Parameters 21 | ---------- 22 | x: Series to be analyzed for text 23 | Returns 24 | ------- 25 | boolean: True for is text, False for not text 26 | """ 27 | if pd.api.types.is_string_dtype(x): 28 | x_values = x.dropna() 29 | pct_rows_with_whitespace = (x_values.str.count(r"\s") > 0).sum() / x_values.shape[0] 30 | pct_unique = float(x_values.unique().shape[0]) / x_values.shape[0] 31 | if pct_unique > 0.8: 32 | return True 33 | if pct_rows_with_whitespace > 0.75 and pct_unique > 0.1: 34 | return True 35 | return False 36 | 37 | def is_datetime(x: pd.Series): 38 | if x.dtype != np.object: 39 | return False 40 | 41 | try: 42 | pd.to_datetime(x) 43 | return True 44 | except: 45 | return False 46 | 47 | 48 | def get_columns_by_type(X: pd.DataFrame): 49 | """" 50 | Creates a dictionary with a list of features for each data type 51 | """ 52 | data = X.copy() 53 | dict = {} 54 | dict["num"] = data.columns[list(data.apply(is_numeric, result_type="expand"))].tolist() 55 | data.drop(dict["num"], axis=1, inplace=True) 56 | 57 | dict["txt"] = data.columns[list(data.apply(is_text, result_type="expand"))].tolist() 58 | data.drop(dict["txt"], axis=1, inplace=True) 59 | 60 | dict["dat"] = data.columns[list(data.apply(is_datetime, result_type="expand"))].tolist() 61 | data.drop(dict["dat"], axis=1, inplace=True) 62 | 63 | dict["cat"] = data.columns.tolist() 64 | return dict 65 | 66 | 67 | 68 | class DataSelector(): 69 | """ 70 | Valueable for catboost 71 | Each method returns a list of column indices for a specific data type 72 | """ 73 | 74 | def NumSelector(X: pd.DataFrame): 75 | return [X.columns.get_loc(c) for c in get_columns_by_type(X)['num']] 76 | 77 | def CatSelector(X: pd.DataFrame): 78 | return [X.columns.get_loc(c) for c in get_columns_by_type(X)['cat']] 79 | 80 | def TxtSelector(X: pd.DataFrame): 81 | return [X.columns.get_loc(c) for c in get_columns_by_type(X)['txt']] 82 | 83 | 84 | -------------------------------------------------------------------------------- /custom_tasks/models/classification/python/catboost/requirements.txt: -------------------------------------------------------------------------------- 1 | catboost==0.24.4 2 | -------------------------------------------------------------------------------- /custom_tasks/models/classification/python/graph_isomorphism_network/README.md: -------------------------------------------------------------------------------- 1 | ## Custom Task 2 | 3 | #### Owner: Tim Whittaker (timothy.whittaker@datarobot.com) 4 | 5 | Please see the associated [notebook](custom_tasks/models/classification/python/graph_isomorphism_network/GNN_Custom_Task.ipynb) in this directory for details on this example. 
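The `custom_task_gin/custom.py` shown further below expects the training data to carry a `dgl_graph` column whose cells hold the `repr()` of pickled DGL graphs; it recovers them with `pickle.loads(eval(x))`. The sketch below illustrates that serialization convention only; the toy graphs, labels, and the `imdb_graphs.csv` file name are placeholders rather than code or data from this repository.

```python
# Hypothetical helper: write a CSV whose "dgl_graph" column matches the
# convention that custom_task_gin/custom.py reads back with pickle.loads(eval(cell)).
import pickle

import dgl
import pandas as pd
import torch

# Two toy graphs (edge lists) standing in for the IMDB graphs.
graphs = [
    dgl.graph((torch.tensor([0, 1, 2]), torch.tensor([1, 2, 0]))),
    dgl.graph((torch.tensor([0, 0, 1]), torch.tensor([1, 2, 2]))),
]
labels = [0, 1]

df = pd.DataFrame(
    {
        # repr() of the pickle bytes round-trips through eval() on the scoring side.
        "dgl_graph": [repr(pickle.dumps(g)) for g in graphs],
        "label": labels,
    }
)
df.to_csv("imdb_graphs.csv", index=False)  # placeholder file name
```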
-------------------------------------------------------------------------------- /custom_tasks/models/classification/python/graph_isomorphism_network/custom_task_gin/custom.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | from pathlib import Path 4 | import pickle 5 | import dgl 6 | import torch 7 | from graph_isomorphism_network import * 8 | from torch.utils.data import DataLoader 9 | 10 | # def init(code_dir): 11 | 12 | def collate(samples): 13 | graphs, labels = map(list, zip(*samples)) 14 | batched_graph = dgl.batch(graphs) 15 | batched_labels = torch.tensor(labels) 16 | return batched_graph, batched_labels 17 | 18 | 19 | 20 | def load_model(code_dir): 21 | model = GIN(2, 2, 1, 20, 2, 0, 0.01, "sum", "sum") 22 | model.load_state_dict(torch.load(os.path.join(code_dir, "gin_model.h5"))) 23 | return model 24 | 25 | def fit(X, y, output_dir, **kwargs): 26 | 27 | model = GIN(2, 2, 1, 20, 2, 0, 0.01, "sum", "sum") 28 | dgl_graphs = X["dgl_graph"].values 29 | dgl_graphs = list( map ( lambda x: pickle.loads(eval(x)), dgl_graphs)) 30 | 31 | dataset = [] 32 | for g, l in zip(dgl_graphs, y.values): 33 | num_nodes = g.num_nodes() 34 | g.ndata["attr"] = torch.ones(g.num_nodes(), 1) 35 | g.ndata["label"] = torch.ones(num_nodes, ) if l == 1 else torch.zeros(num_nodes, ) 36 | dataset.append((g, torch.tensor(l))) 37 | 38 | 39 | dataloader = DataLoader(dataset,batch_size=1024,collate_fn=collate,drop_last=False,shuffle=True) 40 | 41 | opt = torch.optim.Adam(model.parameters(),lr=0.01) 42 | 43 | for epoch in range(500): 44 | for batched_graph, label in dataloader: 45 | feats = batched_graph.ndata['attr'].float() 46 | logits = model(batched_graph, feats) 47 | loss = F.cross_entropy(logits, label) 48 | # print(loss) 49 | opt.zero_grad() 50 | loss.backward() 51 | opt.step() 52 | if epoch % 100 == 0: 53 | print('Epoch %d | Loss: %.4f' % (epoch, loss.item())) 54 | 55 | output_dir_path = Path(output_dir) 56 | if output_dir_path.exists() and output_dir_path.is_dir(): 57 | torch.save(model.state_dict(), "{}/gin_model.h5".format(output_dir)) 58 | 59 | def score(data, model, **kwargs): 60 | dgl_graphs = data["dgl_graph"].values 61 | pos_class = kwargs["positive_class_label"] 62 | neg_class = kwargs["negative_class_label"] 63 | dgl_graphs = list( map ( lambda x: pickle.loads(eval(x)), dgl_graphs)) 64 | for g in dgl_graphs: 65 | g.ndata["attr"] = torch.ones(g.num_nodes(), 1) 66 | batched_graph = dgl.batch(dgl_graphs) 67 | feats = batched_graph.ndata['attr'].float() 68 | preds = F.softmax(model(batched_graph, feats), dim=1).detach().numpy() 69 | return pd.DataFrame(preds, columns = [neg_class, pos_class]) 70 | -------------------------------------------------------------------------------- /custom_tasks/models/classification/python/graph_isomorphism_network/custom_task_gin/requirements.txt: -------------------------------------------------------------------------------- 1 | networkx==2.5 2 | dgl==0.5.2 3 | datarobot-drum -------------------------------------------------------------------------------- /custom_tasks/models/classification/python/graph_isomorphism_network/env/Dockerfile: -------------------------------------------------------------------------------- 1 | # This is the default base image for use with user models and workflows. 2 | # It contains a variety of common useful data-science packages and tools. 3 | FROM datarobot/python3-dropin-env-base 4 | 5 | # Install the list of core requirements, e.g. sklearn, numpy, pandas, flask. 
6 | # **Don't modify this file!** 7 | COPY dr_requirements.txt dr_requirements.txt 8 | 9 | # '--upgrade-strategy eager' will upgrade installed dependencies 10 | # according to package requirements or to the latest 11 | RUN pip3 install -U --upgrade-strategy eager --no-cache-dir --prefer-binary -r dr_requirements.txt && \ 12 | rm -rf dr_requirements.txt 13 | 14 | # Install the list of custom Python requirements, e.g. keras, xgboost, etc. 15 | COPY requirements.txt requirements.txt 16 | RUN pip3 install -r requirements.txt --no-cache-dir && \ 17 | rm -rf requirements.txt 18 | 19 | RUN mkdir -p /opt/.dgl && chmod -R 777 /opt/.dgl 20 | 21 | # Copy the drop-in environment code into the correct directory 22 | # Code from the custom model tarball can overwrite the code here 23 | ENV HOME=/opt CODE_DIR=/opt/code ADDRESS=0.0.0.0:8080 24 | WORKDIR ${CODE_DIR} 25 | COPY ./ ${CODE_DIR} 26 | 27 | ENV WITH_ERROR_SERVER=1 28 | # Uncomment the following line to switch from Flask to uwsgi server 29 | #ENV PRODUCTION=1 MAX_WORKERS=1 SHOW_STACKTRACE=1 30 | 31 | ENTRYPOINT ["${CODE_DIR}/start_server.sh"] 32 | -------------------------------------------------------------------------------- /custom_tasks/models/classification/python/graph_isomorphism_network/env/README.md: -------------------------------------------------------------------------------- 1 | # Python 3 PyTorch Drop-In Template Environment 2 | 3 | This template environment can be used to create artifact-only PyTorch custom models. 4 | Your custom model directory needs only contain your model artifact if you use the 5 | environment correctly. 6 | 7 | ## Supported Libraries 8 | 9 | This environment has built for python 3 and has support for the following scientific libraries. 10 | For specific version information, see [requirements](requirements.txt). 11 | 12 | - PyTorch 13 | 14 | ## Instructions 15 | 16 | 1. From the terminal, run `tar -czvf py_dropin.tar.gz -C /path/to/public_dropin_environments/python3_pytorch/ .` 17 | 2. Using either the API or from the UI create a new Custom Environment with the tarball created 18 | in step 1. 19 | 20 | ### Creating models for this environment 21 | 22 | To use this environment, your custom model archive must contain a single serialized model artifact 23 | with `.pth` file extension as well as any other custom code needed to use your serialized model, including 24 | the file that defines your torch network. 25 | 26 | 27 | This environment makes the following assumption about your serialized model: 28 | - The data sent to custom model can be used to make predictions without 29 | additional pre-processing 30 | - Regression models return a single floating point per row of prediction data 31 | - Binary classification models return one floating point value <= 1.0 or two floating point values that sum to 1.0 per row of prediction data. 32 | - Single value output is assumed to be the positive class probability 33 | - Multi value it is assumed that the first value is the negative class probability, the second is the positive class probability 34 | - There is a single .pth file present 35 | 36 | If these assumptions are incorrect for your model, you should make a copy of [custom.py](https://github.com/datarobot/datarobot-user-models/blob/master/model_templates/python3_pytorch/custom.py), modify it as needed, and include in your custom model archive. 
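If you do need to override the defaults, a minimal sketch of such a `custom.py` is shown below. It is only an illustration under stated assumptions: the artifact name `artifact.pth`, the placeholder one-layer network, and the fill-missing-values step are not part of this environment and should be replaced with whatever matches your own model.

```python
# Hypothetical custom.py override; swap in the real network definition and
# preprocessing used when the .pth artifact was created.
import os

import pandas as pd
import torch


def load_model(code_dir):
    # Rebuild the training-time architecture, then load the serialized weights.
    model = torch.nn.Sequential(torch.nn.Linear(4, 1))  # placeholder architecture
    model.load_state_dict(torch.load(os.path.join(code_dir, "artifact.pth")))
    model.eval()
    return model


def score(data: pd.DataFrame, model, **kwargs):
    # Example of extra pre-processing that the default hooks would not do for you.
    features = torch.tensor(data.fillna(0).values, dtype=torch.float32)
    with torch.no_grad():
        positive = torch.sigmoid(model(features)).numpy().ravel()
    return pd.DataFrame(
        {
            kwargs["negative_class_label"]: 1.0 - positive,
            kwargs["positive_class_label"]: positive,
        }
    )
```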
37 | 38 | The structure of your custom model archive should look like: 39 | 40 | - custom_model.tar.gz 41 | - artifact.pth 42 | - custom.py (if needed) 43 | 44 | Please read [datarobot-cmrunner](https://github.com/datarobot/datarobot-user-models/blob/master/custom_model_runner/README.md) documentation on how to assemble **custom.py**. 45 | -------------------------------------------------------------------------------- /custom_tasks/models/classification/python/graph_isomorphism_network/env/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_tasks/models/classification/python/graph_isomorphism_network/env/__init__.py -------------------------------------------------------------------------------- /custom_tasks/models/classification/python/graph_isomorphism_network/env/dr_requirements.txt: -------------------------------------------------------------------------------- 1 | pyarrow==0.14.1 2 | datarobot-drum==1.5.7 3 | -------------------------------------------------------------------------------- /custom_tasks/models/classification/python/graph_isomorphism_network/env/env_info.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "5e8c888007389fe0f466c72b", 3 | "name": "[DataRobot] Python 3 PyTorch Drop-In", 4 | "description": "This template environment can be used to create artifact-only PyTorch custom models. This environment contains PyTorch and requires only your model artifact as a .pth file, any other code needed to deserialize your model, and optionally a custom.py file.", 5 | "programmingLanguage": "python", 6 | "environmentVersionId": "60ee9b889f5641ab36ae5823", 7 | "isPublic": true 8 | } 9 | -------------------------------------------------------------------------------- /custom_tasks/models/classification/python/graph_isomorphism_network/env/fit.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | # You probably don't want to modify this file 3 | cd "${CODEPATH}" || exit 1 4 | export PYTHONPATH="${CODEPATH}":"${PYTHONPATH}" 5 | 6 | export X="${INPUT_DIRECTORY}/X${TRAINING_DATA_EXTENSION:-.csv}" 7 | export weights="${INPUT_DIRECTORY}/weights.csv" 8 | export sparse_colnames="${INPUT_DIRECTORY}/X.colnames" 9 | export parameters="${INPUT_DIRECTORY}/parameters.json" 10 | 11 | CMD="drum fit --target-type ${TARGET_TYPE} --input ${X} --num-rows ALL --output ${ARTIFACT_DIRECTORY} \ 12 | --code-dir ${CODEPATH} --verbose" 13 | 14 | if [ "${TARGET_TYPE}" != "anomaly" ]; then 15 | CMD="${CMD} --target-csv ${INPUT_DIRECTORY}/y.csv" 16 | fi 17 | 18 | if [ -f "${weights}" ]; then 19 | CMD="${CMD} --row-weights-csv ${weights}" 20 | fi 21 | 22 | if [ -f "${sparse_colnames}" ]; then 23 | CMD="${CMD} --sparse-column-file ${sparse_colnames}" 24 | fi 25 | 26 | if [ -f "${parameters}" ]; then 27 | CMD="${CMD} --parameter-file ${parameters}" 28 | fi 29 | 30 | echo "Environment variables:" 31 | env 32 | echo "${CMD}" 33 | sh -c "${CMD}" 34 | -------------------------------------------------------------------------------- /custom_tasks/models/classification/python/graph_isomorphism_network/env/requirements.txt: -------------------------------------------------------------------------------- 1 | torch==1.2.0 2 | numpy==1.19.5 3 | pandas==1.0.5 4 | scikit-learn==0.23.1 5 | sagemaker-scikit-learn-extension==1.1.0 
-------------------------------------------------------------------------------- /custom_tasks/models/classification/python/graph_isomorphism_network/env/start_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | echo "Starting Custom Model environment with DRUM prediction server" 3 | echo "Environment variables:" 4 | env 5 | echo 6 | 7 | CMD="drum server $@" 8 | echo "Executing command: ${CMD}" 9 | echo 10 | exec ${CMD} 11 | -------------------------------------------------------------------------------- /custom_tasks/models/classification/r/README.md: -------------------------------------------------------------------------------- 1 | ## Custom Tasks for Classification with R 2 | 3 | This directory contains examples of Custom Tasks for Classification written in R. These examples can work with DataRobot Composable ML -------------------------------------------------------------------------------- /custom_tasks/models/regression/python/README.md: -------------------------------------------------------------------------------- 1 | ## Custom Tasks for Regression with Python 2 | 3 | This directory contains examples of Custom Tasks for Regression written in Python. These examples can work with DataRobot Composable ML 4 | -------------------------------------------------------------------------------- /custom_tasks/models/regression/r/README.md: -------------------------------------------------------------------------------- 1 | README 2 | -------------------------------------------------------------------------------- /custom_tasks/models/unsupervised/python/README.md: -------------------------------------------------------------------------------- 1 | ## Custom Tasks for Unsupervised Models with Python 2 | 3 | This directory contains examples of Custom Tasks for Unsupervised Models written in Python. These examples can work with DataRobot Composable ML 4 | -------------------------------------------------------------------------------- /custom_tasks/models/unsupervised/r/README.md: -------------------------------------------------------------------------------- 1 | ## Custom Tasks for Unsupervised Models with R 2 | 3 | This directory contains examples of Custom Tasks for Unsupervised Models written in R. These examples can work with DataRobot Composable ML 4 | -------------------------------------------------------------------------------- /custom_tasks/other/README.md: -------------------------------------------------------------------------------- 1 | ## Other Custom Tasks 2 | 3 | This directory contains examples of Other Custom Tasks.
These examples can work with DataRobot Composable ML 4 | -------------------------------------------------------------------------------- /custom_tasks/other/python/README.md: -------------------------------------------------------------------------------- 1 | README 2 | -------------------------------------------------------------------------------- /custom_tasks/other/python/round_predictions/custom.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | import pickle 3 | import pandas as pd 4 | import numpy as np 5 | from pathlib import Path 6 | from sklearn.pipeline import Pipeline 7 | 8 | 9 | def fit( 10 | X: pd.DataFrame, 11 | y: pd.Series, 12 | output_dir: str, 13 | class_order: Optional[List[str]] = None, 14 | row_weights: Optional[np.ndarray] = None, 15 | **kwargs, 16 | ) -> None: 17 | 18 | estimator = pipeline(X) 19 | estimator.fit(X, y) 20 | 21 | output_dir_path = Path(output_dir) 22 | if output_dir_path.exists() and output_dir_path.is_dir(): 23 | with open("{}/artifact.pkl".format(output_dir), "wb") as fp: 24 | pickle.dump(estimator, fp) 25 | 26 | 27 | class RoundInput(): 28 | """ 29 | Goal is to round the output of a prior model, so using those unrounded 30 | predictions as inputs here. 31 | """ 32 | 33 | def __init__(self, X): 34 | self.X = X 35 | 36 | def fit(self, X, y=None, **kwargs): 37 | self.X = round(X) 38 | return self 39 | 40 | def transform(self, X): 41 | return np.array(round(X[X.columns[0]])).reshape(-1, 1) 42 | 43 | 44 | class EmptyEstimator(): 45 | """ 46 | This is empty because the rounding is done in the above step of the pipeline. 47 | Still need this for the pipeline to run though. 48 | """ 49 | 50 | def fit(self, X, y): 51 | return self 52 | 53 | def predict(self, data: pd.DataFrame): 54 | return data[:,0] 55 | 56 | 57 | def pipeline(X): 58 | return Pipeline(steps=[("preprocessing", RoundInput(X)), ("model", EmptyEstimator())]) 59 | 60 | 61 | def score(data: pd.DataFrame, model, **kwargs) -> pd.DataFrame: 62 | return pd.DataFrame(data=model.predict(data), columns = ['Predictions']) 63 | -------------------------------------------------------------------------------- /custom_tasks/other/r/README.md: -------------------------------------------------------------------------------- 1 | README 2 | -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/any_target/all_enc_catboost/README.md: -------------------------------------------------------------------------------- 1 | # Catboost Target Encoding 2 | 3 | ### Overview 4 | ``` 5 | [ ] accepts numeric inputs 6 | [x] accepts categorical inputs 7 | [ ] outputs missing values 8 | [x] works for binary targets (tested in DataRobot and confirmed) 9 | [?] 
works for multiclass targets 10 | [x] works for numeric targets (tested in DataRobot and confirmed) 11 | ``` 12 | ### Description 13 | 14 | tbd 15 | 16 | ### References 17 | 18 | https://contrib.scikit-learn.org/category_encoders/catboost.html# 19 | 20 | https://tech.yandex.com/catboost/doc/dg/concepts/algorithm-main-stages_cat-to-numberic-docpage/ 21 | 22 | https://arxiv.org/abs/1706.09516 -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/any_target/all_enc_catboost/custom.py: -------------------------------------------------------------------------------- 1 | import category_encoders as ce 2 | import pandas as pd 3 | from pathlib import Path 4 | import pickle 5 | 6 | 7 | def fit(X, y, output_dir, **kwargs): 8 | """ This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data 9 | DataRobot runs this hook when the task is being trained inside a blueprint. 10 | As an output, this hook is expected to create an artifact containg a trained object [in this example - a pre-fit CatBoost encoder], that is then used to transform new data. 11 | The input parameters are passed by DataRobot based on project and blueprint configuration. 12 | 13 | Parameters 14 | ------- 15 | X: pd.DataFrame 16 | Training data that DataRobot passes when this task is being trained. 17 | y: pd.Series 18 | Project's target column (None is passed for unsupervised projects). 19 | output_dir: str 20 | A path to the output folder; the artifact [a pickle file containing a pre-fit CatBoost Encoder] must be saved into this folder to be re-used in transform(). 21 | 22 | Returns 23 | ------- 24 | None 25 | fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir 26 | so that the trained object can be used during scoring inside transform() 27 | """ 28 | 29 | # Transform categorical columns into a numeric transformation using Weight of Evidence 30 | encoder_catboost = ce.CatBoostEncoder(cols=X.columns) 31 | encoder_catboost.fit(X,y) 32 | 33 | # dump the trained object 34 | # into an artifact [in this example - woe.pkl] 35 | # and save it into output_dir so that it can be used later to impute on new data 36 | output_dir_path = Path(output_dir) 37 | if output_dir_path.exists() and output_dir_path.is_dir(): 38 | with open("{}/catboost.pkl".format(output_dir), "wb") as fp: 39 | pickle.dump(encoder_catboost, fp) 40 | 41 | 42 | def transform(data, transformer): 43 | """ This hook defines how DataRobot will use the trained object from fit() to transform new data. 44 | DataRobot runs this hook when the task is used for scoring inside a blueprint. 45 | As an output, this hook is expected to return the transformed data. 46 | The input parameters are passed by DataRobot based on dataset and blueprint configuration. 47 | 48 | Parameters 49 | ------- 50 | data: pd.DataFrame 51 | Data that DataRobot passes for transformation. 52 | transformer: Any 53 | Trained object, extracted by DataRobot from the artifact created inside fit(). 54 | In this example, it's a pickle file containing a pre-fit CatBoost Encoder. 55 | 56 | Returns 57 | ------- 58 | pd.DataFrame 59 | Returns a dataframe with transformed data. 
60 | """ 61 | 62 | return transformer.transform(data).fillna(0) -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/any_target/all_enc_catboost/requirements.txt: -------------------------------------------------------------------------------- 1 | category_encoders==2.2.2 -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/any_target/all_enc_hashing/README.md: -------------------------------------------------------------------------------- 1 | # Hash Encoding 2 | 3 | ### Overview 4 | 5 | [ ] accepts numeric inputs 6 | 7 | [x] accepts categorical inputs 8 | 9 | [ ] outputs missing values 10 | 11 | [x] works for binary targets (tested in DataRobot and confirmed) 12 | 13 | [?] works for multiclass targets 14 | 15 | [x] works for numeric targets (tested in DataRobot and confirmed) 16 | 17 | ### Description 18 | 19 | tbd 20 | 21 | ### References 22 | 23 | https://alex.smola.org/papers/2009/Weinbergeretal09.pdf 24 | 25 | https://booking.ai/dont-be-tricked-by-the-hashing-trick-192a6aae3087 -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/any_target/all_enc_hashing/custom.py: -------------------------------------------------------------------------------- 1 | import category_encoders as ce 2 | import pandas as pd 3 | from pathlib import Path 4 | import pickle 5 | 6 | 7 | def fit(X, y, output_dir, **kwargs): 8 | """ This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data 9 | DataRobot runs this hook when the task is being trained inside a blueprint. 10 | As an output, this hook is expected to create an artifact containg a trained object [in this example - a pre-fit HashingEncoder], that is then used to transform new data. 11 | The input parameters are passed by DataRobot based on project and blueprint configuration. 12 | 13 | Parameters 14 | ------- 15 | X: pd.DataFrame 16 | Training data that DataRobot passes when this task is being trained. 17 | y: pd.Series 18 | Project's target column (None is passed for unsupervised projects). 19 | output_dir: str 20 | A path to the output folder; the artifact [in this example - a pickle file containing a pre-fit HashingEncoder] must be saved into this folder to be re-used in transform(). 21 | 22 | Returns 23 | ------- 24 | None 25 | fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir 26 | so that the trained object can be used during scoring inside transform() 27 | """ 28 | 29 | # Transform categorical columns into a numeric transformation using Weight of Evidence 30 | encoder_hash = ce.HashingEncoder(cols=X.columns, hash_method='md5', max_process=2, n_components=16) 31 | encoder_hash.fit(X,y) 32 | 33 | # dump the trained object 34 | # into an artifact [in this example - woe.pkl] 35 | # and save it into output_dir so that it can be used later to impute on new data 36 | output_dir_path = Path(output_dir) 37 | if output_dir_path.exists() and output_dir_path.is_dir(): 38 | with open("{}/hash.pkl".format(output_dir), "wb") as fp: 39 | pickle.dump(encoder_hash, fp) 40 | 41 | 42 | def transform(data, transformer): 43 | """ This hook defines how DataRobot will use the trained object from fit() to transform new data. 
44 | DataRobot runs this hook when the task is used for scoring inside a blueprint. 45 | As an output, this hook is expected to return the transformed data. 46 | The input parameters are passed by DataRobot based on dataset and blueprint configuration. 47 | 48 | Parameters 49 | ------- 50 | data: pd.DataFrame 51 | Data that DataRobot passes for transformation. 52 | transformer: Any 53 | Trained object, extracted by DataRobot from the artifact created inside fit(). 54 | In this example, it's a pickle file containing a pre-fit HashingEncoder. 55 | 56 | Returns 57 | ------- 58 | pd.DataFrame 59 | Returns a dataframe with transformed data. 60 | """ 61 | 62 | return transformer.transform(data).fillna(0) -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/any_target/all_enc_hashing/requirements.txt: -------------------------------------------------------------------------------- 1 | category_encoders==2.2.2 -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/any_target/all_enc_mest/README.md: -------------------------------------------------------------------------------- 1 | # M-estimate Target Encoding 2 | 3 | ### Overview 4 | 5 | [ ] accepts numeric inputs 6 | 7 | [x] accepts categorical inputs 8 | 9 | [ ] outputs missing values 10 | 11 | [x] works for binary targets (tested in DataRobot and confirmed) 12 | 13 | [?] works for multiclass targets 14 | 15 | [x] works for numeric targets (tested in DataRobot and confirmed) 16 | 17 | ### Description 18 | 19 | tbd 20 | 21 | ### References 22 | 23 | https://dl.acm.org/citation.cfm?id=507538 24 | 25 | https://link.springer.com/chapter/10.1007/BFb0017010 26 | 27 | https://en.wikipedia.org/wiki/Additive_smoothing#Generalized_to_the_case_of_known_incidence_rates -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/any_target/all_enc_mest/custom.py: -------------------------------------------------------------------------------- 1 | import category_encoders as ce 2 | import pandas as pd 3 | from pathlib import Path 4 | import pickle 5 | 6 | 7 | def fit(X, y, output_dir, **kwargs): 8 | """ This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data 9 | DataRobot runs this hook when the task is being trained inside a blueprint. 10 | As an output, this hook is expected to create an artifact containg a trained object [in this example - a pre-fit M-Estimate target encoder], that is then used to transform new data. 11 | The input parameters are passed by DataRobot based on project and blueprint configuration. 12 | 13 | Parameters 14 | ------- 15 | X: pd.DataFrame 16 | Training data that DataRobot passes when this task is being trained. 17 | y: pd.Series 18 | Project's target column (None is passed for unsupervised projects). 19 | output_dir: str 20 | A path to the output folder; the artifact [in this example - a pickle file containing a pre-fit M-Estimate target encoder] must be saved into this folder to be re-used in transform(). 
21 | 22 | Returns 23 | ------- 24 | None 25 | fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir 26 | so that the trained object can be used during scoring inside transform() 27 | """ 28 | 29 | # Transform categorical columns into a numeric transformation using Weight of Evidence 30 | encoder_mest = ce.MEstimateEncoder(cols=X.columns, randomized=True, m=0.50) 31 | encoder_mest.fit(X,y) 32 | 33 | # dump the trained object 34 | # into an artifact [in this example - woe.pkl] 35 | # and save it into output_dir so that it can be used later to impute on new data 36 | output_dir_path = Path(output_dir) 37 | if output_dir_path.exists() and output_dir_path.is_dir(): 38 | with open("{}/mest.pkl".format(output_dir), "wb") as fp: 39 | pickle.dump(encoder_mest, fp) 40 | 41 | 42 | def transform(data, transformer): 43 | """ This hook defines how DataRobot will use the trained object from fit() to transform new data. 44 | DataRobot runs this hook when the task is used for scoring inside a blueprint. 45 | As an output, this hook is expected to return the transformed data. 46 | The input parameters are passed by DataRobot based on dataset and blueprint configuration. 47 | 48 | Parameters 49 | ------- 50 | data: pd.DataFrame 51 | Data that DataRobot passes for transformation. 52 | transformer: Any 53 | Trained object, extracted by DataRobot from the artifact created inside fit(). 54 | In this example, it's a pickle file containing a pre-fit M-Estimate target encoder. 55 | 56 | Returns 57 | ------- 58 | pd.DataFrame 59 | Returns a dataframe with transformed data. 60 | """ 61 | 62 | return transformer.transform(data).fillna(0) -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/any_target/all_enc_mest/requirements.txt: -------------------------------------------------------------------------------- 1 | category_encoders==2.2.2 -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/binary_target/binary_enc_backward_differencing/README.md: -------------------------------------------------------------------------------- 1 | # Backward Difference Target Encoding 2 | 3 | ### Overview 4 | ``` 5 | [ ] accepts numeric inputs 6 | [x] accepts categorical inputs 7 | [ ] outputs missing values 8 | [x] works for binary targets (tested in DataRobot and confirmed) 9 | [ ] works for multiclass targets 10 | [ ] works for numeric targets 11 | ``` 12 | ### Description 13 | 14 | tbd 15 | 16 | ### References 17 | 18 | https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/ 19 | 20 | http://psych.colorado.edu/~carey/Courses/PSYC5741/handouts/Coding%20Categorical%20Variables%202006-03-03.pdf -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/binary_target/binary_enc_backward_differencing/custom.py: -------------------------------------------------------------------------------- 1 | import category_encoders as ce 2 | import pandas as pd 3 | from pathlib import Path 4 | import pickle 5 | 6 | 7 | def fit(X, y, output_dir, **kwargs): 8 | """ This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data 9 | DataRobot runs this hook when the task is being trained inside a blueprint. 
10 | As an output, this hook is expected to create an artifact containg a trained object [in this example - median of each numeric column], that is then used to transform new data. 11 | The input parameters are passed by DataRobot based on project and blueprint configuration. 12 | 13 | Parameters 14 | ------- 15 | X: pd.DataFrame 16 | Training data that DataRobot passes when this task is being trained. 17 | y: pd.Series 18 | Project's target column (None is passed for unsupervised projects). 19 | output_dir: str 20 | A path to the output folder; the artifact [in this example - containing median of each numeric column] must be saved into this folder to be re-used in transform(). 21 | 22 | Returns 23 | ------- 24 | None 25 | fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir 26 | so that the trained object can be used during scoring inside transform() 27 | """ 28 | 29 | # Transform categorical columns into a numeric transformation using Weight of Evidence 30 | encoder_bwdiff = ce.BackwardDifferenceEncoder(cols=X.columns) 31 | encoder_bwdiff.fit(X,y) 32 | 33 | # dump the trained object 34 | # into an artifact [in this example - woe.pkl] 35 | # and save it into output_dir so that it can be used later to impute on new data 36 | output_dir_path = Path(output_dir) 37 | if output_dir_path.exists() and output_dir_path.is_dir(): 38 | with open("{}/backdiff.pkl".format(output_dir), "wb") as fp: 39 | pickle.dump(encoder_bwdiff, fp) 40 | 41 | 42 | def transform(data, transformer): 43 | """ This hook defines how DataRobot will use the trained object from fit() to transform new data. 44 | DataRobot runs this hook when the task is used for scoring inside a blueprint. 45 | As an output, this hook is expected to return the transformed data. 46 | The input parameters are passed by DataRobot based on dataset and blueprint configuration. 47 | 48 | Parameters 49 | ------- 50 | data: pd.DataFrame 51 | Data that DataRobot passes for transformation. 52 | transformer: Any 53 | Trained object, extracted by DataRobot from the artifact created inside fit(). 54 | In this example, it's a dictionary with medians per column extracted from artifact.pkl. 55 | 56 | Returns 57 | ------- 58 | pd.DataFrame 59 | Returns a dataframe with transformed data. 60 | """ 61 | 62 | return transformer.transform(data).fillna(0) -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/binary_target/binary_enc_backward_differencing/requirements.txt: -------------------------------------------------------------------------------- 1 | category_encoders==2.2.2 -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/binary_target/binary_enc_glm/README.md: -------------------------------------------------------------------------------- 1 | # GLM Target Encoding 2 | 3 | ### Overview 4 | ``` 5 | [ ] accepts numeric inputs 6 | [x] accepts categorical inputs 7 | [ ] outputs missing values 8 | [x] works for binary targets (tested in DataRobot and confirmed) 9 | [ ] works for multiclass targets 10 | [ ] works for numeric targets 11 | ``` 12 | ### Description 13 | 14 | **This implementation is hard-coded to handle binary targets ONLY!** 15 | 16 | This is a supervised encoder similar to TargetEncoder or MEstimateEncoder, but there are some advantages: 17 | 18 | 1) Solid statistical theory behind the technique. 
Mixed effects models are a mature branch of statistics. 19 | 2) No hyper-parameters to tune. The amount of shrinkage is automatically determined through the estimation process. In short, the less observations a category has and/or the more the outcome varies for a category then the higher the regularization towards “the prior” or “grand mean”. 20 | 3) The technique is applicable for both continuous and binomial targets. If the target is continuous, the encoder returns regularized difference of the observation’s category from the global mean. If the target is binomial, the encoder returns regularized log odds per category. 21 | 22 | In comparison to JamesSteinEstimator, this encoder utilizes generalized linear mixed models from statsmodels library. 23 | 24 | ### References 25 | 26 | https://faculty.psau.edu.sa/filedownload/doc-12-pdf-a1997d0d31f84d13c1cdc44ac39a8f2c-original.pdf -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/binary_target/binary_enc_glm/custom.py: -------------------------------------------------------------------------------- 1 | import category_encoders as ce 2 | import pandas as pd 3 | from pathlib import Path 4 | import pickle 5 | 6 | 7 | def fit(X, y, output_dir, **kwargs): 8 | """ This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data 9 | DataRobot runs this hook when the task is being trained inside a blueprint. 10 | As an output, this hook is expected to create an artifact containg a trained object [in this example - median of each numeric column], that is then used to transform new data. 11 | The input parameters are passed by DataRobot based on project and blueprint configuration. 12 | 13 | Parameters 14 | ------- 15 | X: pd.DataFrame 16 | Training data that DataRobot passes when this task is being trained. 17 | y: pd.Series 18 | Project's target column (None is passed for unsupervised projects). 19 | output_dir: str 20 | A path to the output folder; the artifact [in this example - containing median of each numeric column] must be saved into this folder to be re-used in transform(). 21 | 22 | Returns 23 | ------- 24 | None 25 | fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir 26 | so that the trained object can be used during scoring inside transform() 27 | """ 28 | 29 | # Transform categorical columns into a numeric transformation using Weight of Evidence 30 | encoder_glm = ce.GLMMEncoder(cols=X.columns, binomial_target=True, randomized=True) 31 | encoder_glm.fit(X,y) 32 | 33 | # dump the trained object 34 | # into an artifact [in this example - woe.pkl] 35 | # and save it into output_dir so that it can be used later to impute on new data 36 | output_dir_path = Path(output_dir) 37 | if output_dir_path.exists() and output_dir_path.is_dir(): 38 | with open("{}/binomialglm.pkl".format(output_dir), "wb") as fp: 39 | pickle.dump(encoder_glm, fp) 40 | 41 | 42 | def transform(data, transformer): 43 | """ This hook defines how DataRobot will use the trained object from fit() to transform new data. 44 | DataRobot runs this hook when the task is used for scoring inside a blueprint. 45 | As an output, this hook is expected to return the transformed data. 46 | The input parameters are passed by DataRobot based on dataset and blueprint configuration. 
47 | 48 | Parameters 49 | ------- 50 | data: pd.DataFrame 51 | Data that DataRobot passes for transformation. 52 | transformer: Any 53 | Trained object, extracted by DataRobot from the artifact created inside fit(). 54 | In this example, it's a dictionary with medians per column extracted from artifact.pkl. 55 | 56 | Returns 57 | ------- 58 | pd.DataFrame 59 | Returns a dataframe with transformed data. 60 | """ 61 | 62 | return transformer.transform(data).fillna(0) -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/binary_target/binary_enc_glm/requirements.txt: -------------------------------------------------------------------------------- 1 | category_encoders==2.2.2 -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/binary_target/binary_enc_helmert/README.md: -------------------------------------------------------------------------------- 1 | # Helmert Target Encoding 2 | 3 | ### Overview 4 | ``` 5 | [ ] accepts numeric inputs 6 | [x] accepts categorical inputs 7 | [ ] outputs missing values 8 | [x] works for binary targets (tested in DataRobot and confirmed) 9 | [ ] works for multiclass targets 10 | [ ] works for numeric targets 11 | ``` 12 | ### Description 13 | 14 | tbd 15 | 16 | ### References 17 | 18 | https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/ 19 | 20 | http://psych.colorado.edu/~carey/Courses/PSYC5741/handouts/Coding%20Categorical%20Variables%202006-03-03.pdf -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/binary_target/binary_enc_helmert/custom.py: -------------------------------------------------------------------------------- 1 | import category_encoders as ce 2 | import pandas as pd 3 | from pathlib import Path 4 | import pickle 5 | 6 | 7 | def fit(X, y, output_dir, **kwargs): 8 | """ This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data 9 | DataRobot runs this hook when the task is being trained inside a blueprint. 10 | As an output, this hook is expected to create an artifact containg a trained object [in this example - median of each numeric column], that is then used to transform new data. 11 | The input parameters are passed by DataRobot based on project and blueprint configuration. 12 | 13 | Parameters 14 | ------- 15 | X: pd.DataFrame 16 | Training data that DataRobot passes when this task is being trained. 17 | y: pd.Series 18 | Project's target column (None is passed for unsupervised projects). 19 | output_dir: str 20 | A path to the output folder; the artifact [in this example - containing median of each numeric column] must be saved into this folder to be re-used in transform(). 
21 | 22 | Returns 23 | ------- 24 | None 25 | fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir 26 | so that the trained object can be used during scoring inside transform() 27 | """ 28 | 29 | # Transform categorical columns into a numeric transformation using Weight of Evidence 30 | encoder_helmert = ce.HelmertEncoder(cols=X.columns) 31 | encoder_helmert.fit(X,y) 32 | 33 | # dump the trained object 34 | # into an artifact [in this example - woe.pkl] 35 | # and save it into output_dir so that it can be used later to impute on new data 36 | output_dir_path = Path(output_dir) 37 | if output_dir_path.exists() and output_dir_path.is_dir(): 38 | with open("{}/helmert.pkl".format(output_dir), "wb") as fp: 39 | pickle.dump(encoder_helmert, fp) 40 | 41 | 42 | def transform(data, transformer): 43 | """ This hook defines how DataRobot will use the trained object from fit() to transform new data. 44 | DataRobot runs this hook when the task is used for scoring inside a blueprint. 45 | As an output, this hook is expected to return the transformed data. 46 | The input parameters are passed by DataRobot based on dataset and blueprint configuration. 47 | 48 | Parameters 49 | ------- 50 | data: pd.DataFrame 51 | Data that DataRobot passes for transformation. 52 | transformer: Any 53 | Trained object, extracted by DataRobot from the artifact created inside fit(). 54 | In this example, it's a dictionary with medians per column extracted from artifact.pkl. 55 | 56 | Returns 57 | ------- 58 | pd.DataFrame 59 | Returns a dataframe with transformed data. 60 | """ 61 | 62 | return transformer.transform(data).fillna(0) -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/binary_target/binary_enc_helmert/requirements.txt: -------------------------------------------------------------------------------- 1 | category_encoders==2.2.2 -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/binary_target/binary_enc_leaveonout/README.md: -------------------------------------------------------------------------------- 1 | # Leave-One-Out Target Encoding 2 | 3 | ### Overview 4 | ``` 5 | [ ] accepts numeric inputs 6 | [x] accepts categorical inputs 7 | [ ] outputs missing values 8 | [x] works for binary targets (tested in DataRobot and confirmed) 9 | [ ] works for multiclass targets 10 | [ ] works for numeric targets 11 | ``` 12 | ### Description 13 | 14 | This is very similar to target encoding but excludes the current row’s target when calculating the mean target for a level to reduce the effect of outliers. 15 | 16 | ### References 17 | 18 | https://www.kaggle.com/c/caterpillar-tube-pricing/discussion/15748#143154 -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/binary_target/binary_enc_leaveonout/custom.py: -------------------------------------------------------------------------------- 1 | import category_encoders as ce 2 | import pandas as pd 3 | from pathlib import Path 4 | import pickle 5 | 6 | 7 | def fit(X, y, output_dir, **kwargs): 8 | """ This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data 9 | DataRobot runs this hook when the task is being trained inside a blueprint. 
10 | As an output, this hook is expected to create an artifact containg a trained object [in this example - median of each numeric column], that is then used to transform new data. 11 | The input parameters are passed by DataRobot based on project and blueprint configuration. 12 | 13 | Parameters 14 | ------- 15 | X: pd.DataFrame 16 | Training data that DataRobot passes when this task is being trained. 17 | y: pd.Series 18 | Project's target column (None is passed for unsupervised projects). 19 | output_dir: str 20 | A path to the output folder; the artifact [in this example - containing median of each numeric column] must be saved into this folder to be re-used in transform(). 21 | 22 | Returns 23 | ------- 24 | None 25 | fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir 26 | so that the trained object can be used during scoring inside transform() 27 | """ 28 | 29 | # Transform categorical columns into a numeric transformation using Weight of Evidence 30 | encoder_loo = ce.LeaveOneOutEncoder(cols=X.columns) 31 | encoder_loo.fit(X,y) 32 | 33 | # dump the trained object 34 | # into an artifact [in this example - woe.pkl] 35 | # and save it into output_dir so that it can be used later to impute on new data 36 | output_dir_path = Path(output_dir) 37 | if output_dir_path.exists() and output_dir_path.is_dir(): 38 | with open("{}/loo.pkl".format(output_dir), "wb") as fp: 39 | pickle.dump(encoder_loo, fp) 40 | 41 | 42 | def transform(data, transformer): 43 | """ This hook defines how DataRobot will use the trained object from fit() to transform new data. 44 | DataRobot runs this hook when the task is used for scoring inside a blueprint. 45 | As an output, this hook is expected to return the transformed data. 46 | The input parameters are passed by DataRobot based on dataset and blueprint configuration. 47 | 48 | Parameters 49 | ------- 50 | data: pd.DataFrame 51 | Data that DataRobot passes for transformation. 52 | transformer: Any 53 | Trained object, extracted by DataRobot from the artifact created inside fit(). 54 | In this example, it's a dictionary with medians per column extracted from artifact.pkl. 55 | 56 | Returns 57 | ------- 58 | pd.DataFrame 59 | Returns a dataframe with transformed data. 
60 | """ 61 | 62 | return transformer.transform(data).fillna(0) -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/binary_target/binary_enc_leaveonout/requirements.txt: -------------------------------------------------------------------------------- 1 | category_encoders==2.2.2 -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/binary_target/binary_enc_woe/README.md: -------------------------------------------------------------------------------- 1 | # Weight-Of-Evidence Target Encoding 2 | 3 | ### Overview 4 | ``` 5 | [ ] accepts numeric inputs 6 | [x] accepts categorical inputs 7 | [ ] outputs missing values 8 | [x] works for binary targets (tested in DataRobot and confirmed) 9 | [ ] works for multiclass targets 10 | [ ] works for numeric targets 11 | ``` 12 | ### Description 13 | 14 | tbd 15 | 16 | ### References 17 | 18 | https://www.listendata.com/2015/03/weight-of-evidence-woe-and-information.html -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/binary_target/binary_enc_woe/custom.py: -------------------------------------------------------------------------------- 1 | import category_encoders as ce 2 | import pandas as pd 3 | from pathlib import Path 4 | import pickle 5 | 6 | 7 | def fit(X, y, output_dir, **kwargs): 8 | """ This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data 9 | DataRobot runs this hook when the task is being trained inside a blueprint. 10 | As an output, this hook is expected to create an artifact containg a trained object [in this example - median of each numeric column], that is then used to transform new data. 11 | The input parameters are passed by DataRobot based on project and blueprint configuration. 12 | 13 | Parameters 14 | ------- 15 | X: pd.DataFrame 16 | Training data that DataRobot passes when this task is being trained. 17 | y: pd.Series 18 | Project's target column (None is passed for unsupervised projects). 19 | output_dir: str 20 | A path to the output folder; the artifact [in this example - containing median of each numeric column] must be saved into this folder to be re-used in transform(). 21 | 22 | Returns 23 | ------- 24 | None 25 | fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir 26 | so that the trained object can be used during scoring inside transform() 27 | """ 28 | 29 | # Transform categorical columns into a numeric transformation using Weight of Evidence 30 | encoder_woe = ce.WOEEncoder(cols=X.columns, randomized=True, handle_missing='value', handle_unknown='value') 31 | encoder_woe.fit(X,y) 32 | 33 | # dump the trained object 34 | # into an artifact [in this example - woe.pkl] 35 | # and save it into output_dir so that it can be used later to impute on new data 36 | output_dir_path = Path(output_dir) 37 | if output_dir_path.exists() and output_dir_path.is_dir(): 38 | with open("{}/woe.pkl".format(output_dir), "wb") as fp: 39 | pickle.dump(encoder_woe, fp) 40 | 41 | 42 | def transform(data, transformer): 43 | """ This hook defines how DataRobot will use the trained object from fit() to transform new data. 44 | DataRobot runs this hook when the task is used for scoring inside a blueprint. 
45 | As an output, this hook is expected to return the transformed data. 46 | The input parameters are passed by DataRobot based on dataset and blueprint configuration. 47 | 48 | Parameters 49 | ------- 50 | data: pd.DataFrame 51 | Data that DataRobot passes for transformation. 52 | transformer: Any 53 | Trained object, extracted by DataRobot from the artifact created inside fit(). 54 | In this example, it's a pickle file containing a pre-fit Weight of Evidence encoder. 55 | 56 | Returns 57 | ------- 58 | pd.DataFrame 59 | Returns a dataframe with transformed data. 60 | """ 61 | 62 | return transformer.transform(data).fillna(0) -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/binary_target/binary_enc_woe/requirements.txt: -------------------------------------------------------------------------------- 1 | category_encoders==2.2.2 -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/multiclass_target/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_tasks/preprocessing/categorical/python/encoding/multiclass_target/.gitignore -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/regression_target/regression_enc_glm/README.md: -------------------------------------------------------------------------------- 1 | # GLM Target Encoding 2 | 3 | ### Overview 4 | ``` 5 | [ ] accepts numeric inputs 6 | [x] accepts categorical inputs 7 | [ ] outputs missing values 8 | [ ] works for binary targets 9 | [ ] works for multiclass targets 10 | [x] works for numeric targets 11 | ``` 12 | ### Description 13 | 14 | **This implementation is hard-coded to handle continuous targets ONLY!** 15 | 16 | This is a supervised encoder similar to TargetEncoder or MEstimateEncoder, but it has some advantages: 17 | 18 | 1) Solid statistical theory behind the technique. Mixed effects models are a mature branch of statistics. 19 | 2) No hyper-parameters to tune. The amount of shrinkage is determined automatically through the estimation process. In short, the fewer observations a category has and/or the more the outcome varies for a category, the higher the regularization towards “the prior” or “grand mean”. 20 | 3) The technique is applicable to both continuous and binomial targets. If the target is continuous, the encoder returns the regularized difference of the observation’s category from the global mean. If the target is binomial, the encoder returns regularized log odds per category. 21 | 22 | In comparison to the JamesSteinEncoder, this encoder utilizes generalized linear mixed models from the statsmodels library.
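For illustration, here is a minimal, self-contained sketch of this encoder on a continuous target (toy data; the `GLMMEncoder` call mirrors the one used in `custom.py` below):

```
import pandas as pd
import category_encoders as ce

# Toy data: one categorical column and a continuous target (illustrative values only)
X = pd.DataFrame({"city": ["NY", "NY", "SF", "SF", "LA", "LA"]})
y = pd.Series([10.0, 12.0, 30.0, 28.0, 20.0, 22.0])

# binomial_target=False treats y as continuous, so each category is encoded as its
# regularized deviation from the global mean (the shrinkage is estimated automatically)
encoder = ce.GLMMEncoder(cols=X.columns, binomial_target=False, randomized=True)
encoder.fit(X, y)
print(encoder.transform(X))
```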
23 | 24 | ### References 25 | 26 | https://faculty.psau.edu.sa/filedownload/doc-12-pdf-a1997d0d31f84d13c1cdc44ac39a8f2c-original.pdf -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/regression_target/regression_enc_glm/custom.py: -------------------------------------------------------------------------------- 1 | import category_encoders as ce 2 | import pandas as pd 3 | from pathlib import Path 4 | import pickle 5 | 6 | 7 | def fit(X, y, output_dir, **kwargs): 8 | """ This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data 9 | DataRobot runs this hook when the task is being trained inside a blueprint. 10 | As an output, this hook is expected to create an artifact containg a trained object [in this example - median of each numeric column], that is then used to transform new data. 11 | The input parameters are passed by DataRobot based on project and blueprint configuration. 12 | 13 | Parameters 14 | ------- 15 | X: pd.DataFrame 16 | Training data that DataRobot passes when this task is being trained. 17 | y: pd.Series 18 | Project's target column (None is passed for unsupervised projects). 19 | output_dir: str 20 | A path to the output folder; the artifact [in this example - containing median of each numeric column] must be saved into this folder to be re-used in transform(). 21 | 22 | Returns 23 | ------- 24 | None 25 | fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir 26 | so that the trained object can be used during scoring inside transform() 27 | """ 28 | 29 | # Transform categorical columns into a numeric transformation using Weight of Evidence 30 | encoder_glm = ce.GLMMEncoder(cols=X.columns, binomial_target=False, randomized=True) 31 | encoder_glm.fit(X,y) 32 | 33 | # dump the trained object 34 | # into an artifact [in this example - woe.pkl] 35 | # and save it into output_dir so that it can be used later to impute on new data 36 | output_dir_path = Path(output_dir) 37 | if output_dir_path.exists() and output_dir_path.is_dir(): 38 | with open("{}/glm.pkl".format(output_dir), "wb") as fp: 39 | pickle.dump(encoder_glm, fp) 40 | 41 | 42 | def transform(data, transformer): 43 | """ This hook defines how DataRobot will use the trained object from fit() to transform new data. 44 | DataRobot runs this hook when the task is used for scoring inside a blueprint. 45 | As an output, this hook is expected to return the transformed data. 46 | The input parameters are passed by DataRobot based on dataset and blueprint configuration. 47 | 48 | Parameters 49 | ------- 50 | data: pd.DataFrame 51 | Data that DataRobot passes for transformation. 52 | transformer: Any 53 | Trained object, extracted by DataRobot from the artifact created inside fit(). 54 | In this example, it's a dictionary with medians per column extracted from artifact.pkl. 55 | 56 | Returns 57 | ------- 58 | pd.DataFrame 59 | Returns a dataframe with transformed data. 
60 | """ 61 | 62 | return transformer.transform(data).fillna(0) -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/encoding/regression_target/regression_enc_glm/requirements.txt: -------------------------------------------------------------------------------- 1 | category_encoders==2.2.2 -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/python/imputing/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_tasks/preprocessing/categorical/python/imputing/.gitignore -------------------------------------------------------------------------------- /custom_tasks/preprocessing/categorical/r/README.md: -------------------------------------------------------------------------------- 1 | README 2 | -------------------------------------------------------------------------------- /custom_tasks/preprocessing/images/r/README.md: -------------------------------------------------------------------------------- 1 | README 2 | -------------------------------------------------------------------------------- /custom_tasks/preprocessing/numeric/python/encoding/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/custom_tasks/preprocessing/numeric/python/encoding/.gitignore -------------------------------------------------------------------------------- /custom_tasks/preprocessing/numeric/python/imputing/knn_imputer_fixed_n/README.md: -------------------------------------------------------------------------------- 1 | # KNN (Nearest Neighbors) Missing Imputation 2 | ## For Numerics 3 | 4 | ### Overview 5 | ``` 6 | [x] accepts numeric inputs 7 | [ ] accepts categorical inputs 8 | [ ] outputs missing values 9 | [x] works for binary targets (tested in DataRobot and confirmed) 10 | [x] works for multiclass targets 11 | [x] works for numeric targets 12 | ``` 13 | ### Description 14 | 15 | KNN Imputer was first supported by Scikit-Learn in December 2019 when it released its version 0.22. This imputer utilizes the k-Nearest Neighbors method to replace the missing values in the datasets with the mean value from the parameter ‘n_neighbors’ nearest neighbors found in the training set. By default, it uses a Euclidean distance metric to impute the missing values. 16 | 17 | ### References 18 | 19 | https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3668100/ 20 | 21 | http://www.stat.columbia.edu/~gelman/arm/missing.pdf 22 | 23 | https://machinelearningmastery.com/knn-imputation-for-missing-values-in-machine-learning/ 24 | 25 | https://scikit-learn.org/stable/modules/generated/sklearn.impute.KNNImputer.html 26 | 27 | https://www.iriseekhout.com/missing-data/ -------------------------------------------------------------------------------- /custom_tasks/preprocessing/numeric/python/imputing/knn_imputer_fixed_n/custom.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from pathlib import Path 3 | import pickle 4 | from sklearn.impute import KNNImputer 5 | import numpy as np 6 | 7 | def fit(X, y, output_dir, **kwargs): 8 | """ This hook defines how DataRobot will train this task. 
Even transform tasks need to be trained to learn/store information from training data 9 | DataRobot runs this hook when the task is being trained inside a blueprint. 10 | As an output, this hook is expected to create an artifact containg a trained object [in this example - median of each numeric column], that is then used to transform new data. 11 | The input parameters are passed by DataRobot based on project and blueprint configuration. 12 | 13 | Parameters 14 | ------- 15 | X: pd.DataFrame 16 | Training data that DataRobot passes when this task is being trained. 17 | y: pd.Series 18 | Project's target column (None is passed for unsupervised projects). 19 | output_dir: str 20 | A path to the output folder; the artifact [in this example - containing median of each numeric column] must be saved into this folder to be re-used in transform(). 21 | 22 | Returns 23 | ------- 24 | None 25 | fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir 26 | so that the trained object can be used during scoring inside transform() 27 | """ 28 | 29 | # Transform categorical columns into a numeric transformation using Weight of Evidence 30 | knn = KNNImputer(n_neighbors=5, add_indicator=False) 31 | knn.fit(X.values) 32 | 33 | # dump the trained object 34 | # into an artifact [in this example - woe.pkl] 35 | # and save it into output_dir so that it can be used later to impute on new data 36 | output_dir_path = Path(output_dir) 37 | if output_dir_path.exists() and output_dir_path.is_dir(): 38 | with open("{}/knn.pkl".format(output_dir), "wb") as fp: 39 | pickle.dump(knn, fp) 40 | 41 | 42 | def transform(data, transformer): 43 | """ This hook defines how DataRobot will use the trained object from fit() to transform new data. 44 | DataRobot runs this hook when the task is used for scoring inside a blueprint. 45 | As an output, this hook is expected to return the transformed data. 46 | The input parameters are passed by DataRobot based on dataset and blueprint configuration. 47 | 48 | Parameters 49 | ------- 50 | data: pd.DataFrame 51 | Data that DataRobot passes for transformation. 52 | transformer: Any 53 | Trained object, extracted by DataRobot from the artifact created inside fit(). 54 | 55 | Returns 56 | ------- 57 | pd.DataFrame 58 | Returns a dataframe with transformed data. 59 | """ 60 | 61 | return pd.DataFrame(transformer.transform(data.values), columns=data.columns) -------------------------------------------------------------------------------- /custom_tasks/preprocessing/numeric/python/imputing/median_impute/custom.py: -------------------------------------------------------------------------------- 1 | # This custom transform task implements missing values imputation using a median 2 | 3 | import pickle 4 | import pandas as pd 5 | import numpy as np 6 | from pathlib import Path 7 | 8 | 9 | def fit(X, y, output_dir, **kwargs): 10 | """ This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data 11 | DataRobot runs this hook when the task is being trained inside a blueprint. 12 | As an output, this hook is expected to create an artifact containg a trained object [in this example - median of each numeric column], that is then used to transform new data. 13 | The input parameters are passed by DataRobot based on project and blueprint configuration. 
14 | 15 | Parameters 16 | ------- 17 | X: pd.DataFrame 18 | Training data that DataRobot passes when this task is being trained. 19 | y: pd.Series 20 | Project's target column (None is passed for unsupervised projects). 21 | output_dir: str 22 | A path to the output folder; the artifact [in this example - containing median of each numeric column] must be saved into this folder to be re-used in transform(). 23 | 24 | Returns 25 | ------- 26 | None 27 | fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir 28 | so that the trained object can be used during scoring inside transform() 29 | """ 30 | 31 | # compute medians for all numeric features on training data, store them in a dictionary 32 | median = X.median(axis = 0, numeric_only = True, skipna = True).to_dict() 33 | 34 | 35 | # dump the trained object [in this example - dictionary with medians per column] 36 | # into an artifact [in this example - artifact.pkl] 37 | # and save it into output_dir so that it can be used later to impute on new data 38 | output_dir_path = Path(output_dir) 39 | if output_dir_path.exists() and output_dir_path.is_dir(): 40 | with open("{}/artifact.pkl".format(output_dir), "wb") as fp: 41 | pickle.dump(median, fp) 42 | 43 | 44 | def transform(data, transformer): 45 | """ This hook defines how DataRobot will use the trained object from fit() to transform new data. 46 | DataRobot runs this hook when the task is used for scoring inside a blueprint. 47 | As an output, this hook is expected to return the transformed data. 48 | The input parameters are passed by DataRobot based on dataset and blueprint configuration. 49 | 50 | Parameters 51 | ------- 52 | data: pd.DataFrame 53 | Data that DataRobot passes for transformation. 54 | transformer: Any 55 | Trained object, extracted by DataRobot from the artifact created inside fit(). 56 | In this example, it's a dictionary with medians per column extracted from artifact.pkl. 57 | 58 | Returns 59 | ------- 60 | pd.DataFrame 61 | Returns a dataframe with transformed data. 62 | """ 63 | 64 | return data.fillna(transformer) -------------------------------------------------------------------------------- /custom_tasks/preprocessing/numeric/python/monotonic transforms/power_transformer/custom.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from pathlib import Path 3 | import pickle 4 | from sklearn.preprocessing import PowerTransformer 5 | import numpy as np 6 | 7 | def fit(X, y, output_dir, **kwargs): 8 | """ This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data 9 | DataRobot runs this hook when the task is being trained inside a blueprint. 10 | As an output, this hook is expected to create an artifact containg a trained object [in this example - power transform], that is then used to transform new data. 11 | The input parameters are passed by DataRobot based on project and blueprint configuration. 12 | 13 | Parameters 14 | ------- 15 | X: pd.DataFrame 16 | Training data that DataRobot passes when this task is being trained. 17 | y: pd.Series 18 | Project's target column (None is passed for unsupervised projects). 19 | output_dir: str 20 | A path to the output folder; the artifact [in this example - containing a power transformer] must be saved into this folder to be re-used in transform(). 
21 | 22 | Returns 23 | ------- 24 | None 25 | fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir 26 | so that the trained object can be used during scoring inside transform() 27 | """ 28 | 29 | # Transform categorical columns into a numeric transformation using Weight of Evidence 30 | pt = PowerTransformer() 31 | pt.fit(X.values) 32 | 33 | # dump the trained object 34 | # into an artifact [in this example - woe.pkl] 35 | # and save it into output_dir so that it can be used later to impute on new data 36 | output_dir_path = Path(output_dir) 37 | if output_dir_path.exists() and output_dir_path.is_dir(): 38 | with open("{}/pt.pkl".format(output_dir), "wb") as fp: 39 | pickle.dump(pt, fp) 40 | 41 | 42 | def transform(data, transformer): 43 | """ This hook defines how DataRobot will use the trained object from fit() to transform new data. 44 | DataRobot runs this hook when the task is used for scoring inside a blueprint. 45 | As an output, this hook is expected to return the transformed data. 46 | The input parameters are passed by DataRobot based on dataset and blueprint configuration. 47 | 48 | Parameters 49 | ------- 50 | data: pd.DataFrame 51 | Data that DataRobot passes for transformation. 52 | transformer: Any 53 | Trained object, extracted by DataRobot from the artifact created inside fit(). 54 | 55 | Returns 56 | ------- 57 | pd.DataFrame 58 | Returns a dataframe with transformed data. 59 | """ 60 | 61 | return pd.DataFrame(transformer.transform(data.values), columns=data.columns).fillna(np.nan) -------------------------------------------------------------------------------- /custom_tasks/preprocessing/numeric/python/scaling/minmaxscaler/custom.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from sklearn.preprocessing import MinMaxScaler 8 | 9 | 10 | def fit(X, y, output_dir, **kwargs): 11 | """This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data 12 | DataRobot runs this hook when the task is being trained inside a blueprint. 13 | As an output, this hook is expected to create an artifact containg a trained object, that is then used to transform new data. 14 | The input parameters are passed by DataRobot based on project and blueprint configuration. 15 | 16 | Parameters 17 | ------- 18 | X: pd.DataFrame 19 | Training data that DataRobot passes when this task is being trained. 20 | y: pd.Series 21 | Project's target column (None is passed for unsupervised projects). 22 | output_dir: str 23 | A path to the output folder; the artifact [in this example - containing a power transformer] must be saved into this folder to be re-used in transform(). 
24 | 25 | Returns 26 | ------- 27 | None 28 | fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir 29 | so that the trained object can be used during scoring inside transform() 30 | """ 31 | 32 | # Transform numeric values to [0,1] based on sklearn's MinMaxScaler() 33 | scaler = MinMaxScaler() 34 | scaler.fit(X.values) 35 | 36 | # Dump fit scaler to artifact to use for transforms 37 | with open(os.path.join(output_dir, "minmaxscaler.pkl"), "wb") as fp: 38 | pickle.dump(scaler, fp) 39 | 40 | # Save the transformed input df as an object to inspect and confirm the scaler is working 41 | transformed = pd.DataFrame(scaler.transform(X.values)) 42 | 43 | transformed.to_csv(os.path.join(output_dir, "transformed.csv"), index=False) 44 | 45 | 46 | def transform(data, transformer): 47 | """This hook defines how DataRobot will use the trained object from fit() to transform new data. 48 | DataRobot runs this hook when the task is used for scoring inside a blueprint. 49 | As an output, this hook is expected to return the transformed data. 50 | The input parameters are passed by DataRobot based on dataset and blueprint configuration. 51 | 52 | Parameters 53 | ------- 54 | data: pd.DataFrame 55 | Data that DataRobot passes for transformation. 56 | transformer: Any 57 | Trained object, extracted by DataRobot from the artifact created inside fit(). 58 | 59 | Returns 60 | ------- 61 | pd.DataFrame 62 | Returns a dataframe with transformed data. 63 | """ 64 | 65 | transformed = pd.DataFrame( 66 | transformer.transform(data.values), columns=data.columns 67 | ).fillna(np.nan) 68 | 69 | return transformed 70 | -------------------------------------------------------------------------------- /custom_tasks/preprocessing/numeric/python/signal/butter_10_15_hp_1000/custom.py: -------------------------------------------------------------------------------- 1 | # Project: Custom Transform - Butterworth Filter 2 | # Project Cerebro Hackathon Team 3 | 4 | # See: https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.cheby1.html 5 | 6 | from scipy import signal 7 | import pickle 8 | import pandas as pd 9 | import numpy as np 10 | from pathlib import Path 11 | 12 | 13 | def fit(X, y, output_dir, **kwargs): 14 | """ This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data 15 | DataRobot runs this hook when the task is being trained inside a blueprint. 16 | As an output, this hook is expected to create an artifact containg a trained object [in this example - median of each numeric column], that is then used to transform new data. 17 | The input parameters are passed by DataRobot based on project and blueprint configuration. 18 | Parameters 19 | ------- 20 | X: pd.DataFrame 21 | Training data that DataRobot passes when this task is being trained. 22 | y: pd.Series 23 | Project's target column (None is passed for unsupervised projects). 24 | output_dir: str 25 | A path to the output folder; the artifact must be saved into this folder to be re-used in transform(). 
26 | Returns 27 | ------- 28 | None 29 | fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir 30 | so that the trained object can be used during scoring inside transform() 31 | """ 32 | 33 | sos = signal.butter(10, 15, 'hp', fs=1000, output='sos') 34 | 35 | # dump the trained object [in this example - dictionary with medians per column] 36 | # into an artifact [in this example - artifact.pkl] 37 | # and save it into output_dir so that it can be used later to impute on new data 38 | output_dir_path = Path(output_dir) 39 | if output_dir_path.exists() and output_dir_path.is_dir(): 40 | with open("{}/artifact.pkl".format(output_dir), "wb") as fp: 41 | pickle.dump(sos, fp) 42 | 43 | 44 | def transform(data, transformer): 45 | """ This hook defines how DataRobot will use the trained object from fit() to transform new data. 46 | DataRobot runs this hook when the task is used for scoring inside a blueprint. 47 | As an output, this hook is expected to return the transformed data. 48 | The input parameters are passed by DataRobot based on dataset and blueprint configuration. 49 | Parameters 50 | ------- 51 | data: pd.DataFrame 52 | Data that DataRobot passes for transformation. 53 | transformer: Any 54 | Trained object, extracted by DataRobot from the artifact created inside fit(). 55 | In this example, it's a dictionary with medians per column extracted from artifact.pkl. 56 | 57 | Returns 58 | ------- 59 | pd.DataFrame 60 | Returns a dataframe with transformed data. 61 | """ 62 | array = signal.sosfilt(transformer, x=data) 63 | df = pd.DataFrame(array) 64 | return df -------------------------------------------------------------------------------- /custom_tasks/preprocessing/numeric/python/signal/butter_4_100_lowpass/custom.py: -------------------------------------------------------------------------------- 1 | # Project: Custom Transform - Butterworth Filter 2 | # Project Cerebro Hackathon Team 3 | 4 | # See: https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.cheby1.html 5 | 6 | from scipy import signal 7 | import pickle 8 | import pandas as pd 9 | import numpy as np 10 | from pathlib import Path 11 | 12 | 13 | def fit(X, y, output_dir, **kwargs): 14 | """ This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data 15 | DataRobot runs this hook when the task is being trained inside a blueprint. 16 | As an output, this hook is expected to create an artifact containg a trained object [in this example - median of each numeric column], that is then used to transform new data. 17 | The input parameters are passed by DataRobot based on project and blueprint configuration. 18 | Parameters 19 | ------- 20 | X: pd.DataFrame 21 | Training data that DataRobot passes when this task is being trained. 22 | y: pd.Series 23 | Project's target column (None is passed for unsupervised projects). 24 | output_dir: str 25 | A path to the output folder; the artifact must be saved into this folder to be re-used in transform(). 
26 | Returns 27 | ------- 28 | None 29 | fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir 30 | so that the trained object can be used during scoring inside transform() 31 | """ 32 | 33 | sos = signal.butter(4, 100, 'low', analog=True, output='sos') 34 | 35 | # dump the trained object [in this example - dictionary with medians per column] 36 | # into an artifact [in this example - artifact.pkl] 37 | # and save it into output_dir so that it can be used later to impute on new data 38 | output_dir_path = Path(output_dir) 39 | if output_dir_path.exists() and output_dir_path.is_dir(): 40 | with open("{}/artifact.pkl".format(output_dir), "wb") as fp: 41 | pickle.dump(sos, fp) 42 | 43 | 44 | def transform(data, transformer): 45 | """ This hook defines how DataRobot will use the trained object from fit() to transform new data. 46 | DataRobot runs this hook when the task is used for scoring inside a blueprint. 47 | As an output, this hook is expected to return the transformed data. 48 | The input parameters are passed by DataRobot based on dataset and blueprint configuration. 49 | Parameters 50 | ------- 51 | data: pd.DataFrame 52 | Data that DataRobot passes for transformation. 53 | transformer: Any 54 | Trained object, extracted by DataRobot from the artifact created inside fit(). 55 | In this example, it's a dictionary with medians per column extracted from artifact.pkl. 56 | 57 | Returns 58 | ------- 59 | pd.DataFrame 60 | Returns a dataframe with transformed data. 61 | """ 62 | array = signal.sosfilt(transformer, x=data) 63 | df = pd.DataFrame(array) 64 | return df -------------------------------------------------------------------------------- /custom_tasks/preprocessing/numeric/python/signal/cheby1_sos_10_1_15/custom.py: -------------------------------------------------------------------------------- 1 | # Project: Custom Transform - Chebychev Filter 2 | # Project Cerebro Hackathon Team 3 | 4 | # See: https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.cheby1.html 5 | 6 | from scipy import signal 7 | import pickle 8 | import pandas as pd 9 | import numpy as np 10 | from pathlib import Path 11 | 12 | 13 | def fit(X, y, output_dir, **kwargs): 14 | """ This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data 15 | DataRobot runs this hook when the task is being trained inside a blueprint. 16 | As an output, this hook is expected to create an artifact containg a trained object [in this example - median of each numeric column], that is then used to transform new data. 17 | The input parameters are passed by DataRobot based on project and blueprint configuration. 18 | Parameters 19 | ------- 20 | X: pd.DataFrame 21 | Training data that DataRobot passes when this task is being trained. 22 | y: pd.Series 23 | Project's target column (None is passed for unsupervised projects). 24 | output_dir: str 25 | A path to the output folder; the artifact must be saved into this folder to be re-used in transform(). 
26 | Returns 27 | ------- 28 | None 29 | fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir 30 | so that the trained object can be used during scoring inside transform() 31 | """ 32 | 33 | sos = signal.cheby1(10, 1, 15, 'hp', fs=1000, output='sos') 34 | #filtered = signal.sosfilt(sos, x=X) 35 | 36 | # dump the trained object [in this example - dictionary with medians per column] 37 | # into an artifact [in this example - artifact.pkl] 38 | # and save it into output_dir so that it can be used later to impute on new data 39 | output_dir_path = Path(output_dir) 40 | if output_dir_path.exists() and output_dir_path.is_dir(): 41 | with open("{}/artifact.pkl".format(output_dir), "wb") as fp: 42 | pickle.dump(sos, fp) 43 | 44 | 45 | def transform(data, transformer): 46 | """ This hook defines how DataRobot will use the trained object from fit() to transform new data. 47 | DataRobot runs this hook when the task is used for scoring inside a blueprint. 48 | As an output, this hook is expected to return the transformed data. 49 | The input parameters are passed by DataRobot based on dataset and blueprint configuration. 50 | Parameters 51 | ------- 52 | data: pd.DataFrame 53 | Data that DataRobot passes for transformation. 54 | transformer: Any 55 | Trained object, extracted by DataRobot from the artifact created inside fit(). 56 | In this example, it's a dictionary with medians per column extracted from artifact.pkl. 57 | 58 | Returns 59 | ------- 60 | pd.DataFrame 61 | Returns a dataframe with transformed data. 62 | """ 63 | array = signal.sosfilt(transformer, x=data) 64 | df = pd.DataFrame(array) 65 | return df -------------------------------------------------------------------------------- /custom_tasks/preprocessing/numeric/python/signal/cheby1_sos_5_1_15/custom.py: -------------------------------------------------------------------------------- 1 | # Project: Custom Transform - Chebychev Filter 2 | # Project Cerebro Hackathon Team 3 | 4 | # See: https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.cheby1.html 5 | 6 | from scipy import signal 7 | import pickle 8 | import pandas as pd 9 | import numpy as np 10 | from pathlib import Path 11 | 12 | 13 | def fit(X, y, output_dir, **kwargs): 14 | """ This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data 15 | DataRobot runs this hook when the task is being trained inside a blueprint. 16 | As an output, this hook is expected to create an artifact containg a trained object [in this example - median of each numeric column], that is then used to transform new data. 17 | The input parameters are passed by DataRobot based on project and blueprint configuration. 18 | Parameters 19 | ------- 20 | X: pd.DataFrame 21 | Training data that DataRobot passes when this task is being trained. 22 | y: pd.Series 23 | Project's target column (None is passed for unsupervised projects). 24 | output_dir: str 25 | A path to the output folder; the artifact must be saved into this folder to be re-used in transform(). 
26 | Returns 27 | ------- 28 | None 29 | fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir 30 | so that the trained object can be used during scoring inside transform() 31 | """ 32 | 33 | sos = signal.cheby1(5, 1, 15, 'hp', fs=1000, output='sos') 34 | #filtered = signal.sosfilt(sos, x=X) 35 | 36 | # dump the trained object [in this example - dictionary with medians per column] 37 | # into an artifact [in this example - artifact.pkl] 38 | # and save it into output_dir so that it can be used later to impute on new data 39 | output_dir_path = Path(output_dir) 40 | if output_dir_path.exists() and output_dir_path.is_dir(): 41 | with open("{}/artifact.pkl".format(output_dir), "wb") as fp: 42 | pickle.dump(sos, fp) 43 | 44 | 45 | def transform(data, transformer): 46 | """ This hook defines how DataRobot will use the trained object from fit() to transform new data. 47 | DataRobot runs this hook when the task is used for scoring inside a blueprint. 48 | As an output, this hook is expected to return the transformed data. 49 | The input parameters are passed by DataRobot based on dataset and blueprint configuration. 50 | Parameters 51 | ------- 52 | data: pd.DataFrame 53 | Data that DataRobot passes for transformation. 54 | transformer: Any 55 | Trained object, extracted by DataRobot from the artifact created inside fit(). 56 | In this example, it's a dictionary with medians per column extracted from artifact.pkl. 57 | 58 | Returns 59 | ------- 60 | pd.DataFrame 61 | Returns a dataframe with transformed data. 62 | """ 63 | array = signal.sosfilt(transformer, x=data) 64 | df = pd.DataFrame(array) 65 | return df -------------------------------------------------------------------------------- /custom_tasks/preprocessing/numeric/python/signal/cheby2_12_20_17/custom.py: -------------------------------------------------------------------------------- 1 | # Project: Custom Transform - Chebychev Filter 2 | # Project Cerebro Hackathon Team 3 | 4 | # See: https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.cheby1.html 5 | 6 | from scipy import signal 7 | import pickle 8 | import pandas as pd 9 | import numpy as np 10 | from pathlib import Path 11 | 12 | 13 | def fit(X, y, output_dir, **kwargs): 14 | """ This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data 15 | DataRobot runs this hook when the task is being trained inside a blueprint. 16 | As an output, this hook is expected to create an artifact containg a trained object [in this example - median of each numeric column], that is then used to transform new data. 17 | The input parameters are passed by DataRobot based on project and blueprint configuration. 18 | Parameters 19 | ------- 20 | X: pd.DataFrame 21 | Training data that DataRobot passes when this task is being trained. 22 | y: pd.Series 23 | Project's target column (None is passed for unsupervised projects). 24 | output_dir: str 25 | A path to the output folder; the artifact must be saved into this folder to be re-used in transform(). 
26 | Returns 27 | ------- 28 | None 29 | fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir 30 | so that the trained object can be used during scoring inside transform() 31 | """ 32 | 33 | sos = signal.cheby2(12, 20, 17, 'hp', fs=1000, output='sos') 34 | #filtered = signal.sosfilt(sos, x=X) 35 | 36 | # dump the trained object [in this example - dictionary with medians per column] 37 | # into an artifact [in this example - artifact.pkl] 38 | # and save it into output_dir so that it can be used later to impute on new data 39 | output_dir_path = Path(output_dir) 40 | if output_dir_path.exists() and output_dir_path.is_dir(): 41 | with open("{}/artifact.pkl".format(output_dir), "wb") as fp: 42 | pickle.dump(sos, fp) 43 | 44 | 45 | def transform(data, transformer): 46 | """ This hook defines how DataRobot will use the trained object from fit() to transform new data. 47 | DataRobot runs this hook when the task is used for scoring inside a blueprint. 48 | As an output, this hook is expected to return the transformed data. 49 | The input parameters are passed by DataRobot based on dataset and blueprint configuration. 50 | Parameters 51 | ------- 52 | data: pd.DataFrame 53 | Data that DataRobot passes for transformation. 54 | transformer: Any 55 | Trained object, extracted by DataRobot from the artifact created inside fit(). 56 | In this example, it's a dictionary with medians per column extracted from artifact.pkl. 57 | 58 | Returns 59 | ------- 60 | pd.DataFrame 61 | Returns a dataframe with transformed data. 62 | """ 63 | array = signal.sosfilt(transformer, x=data) 64 | df = pd.DataFrame(array) 65 | return df -------------------------------------------------------------------------------- /custom_tasks/preprocessing/numeric/python/signal/fft/custom.py: -------------------------------------------------------------------------------- 1 | # Project: Custom Transform - Chebychev Filter 2 | # Project Cerebro Hackathon Team 3 | 4 | # by: Marshall 5 | 6 | from scipy.fft import fft2 7 | from scipy.fft import fft 8 | import pickle 9 | import pandas as pd 10 | import numpy as np 11 | from pathlib import Path 12 | 13 | 14 | def fit(X, y, output_dir, **kwargs): 15 | """ This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data 16 | DataRobot runs this hook when the task is being trained inside a blueprint. 17 | As an output, this hook is expected to create an artifact containing a trained object [in this example - median of each numeric column], that is then used to transform new data. 18 | The input parameters are passed by DataRobot based on project and blueprint configuration. 19 | Parameters 20 | ------- 21 | X: pd.DataFrame 22 | Training data that DataRobot passes when this task is being trained. 23 | y: pd.Series 24 | Project's target column (None is passed for unsupervised projects). 25 | output_dir: str 26 | A path to the output folder; the artifact must be saved into this folder to be re-used in transform(). 27 | Returns 28 | ------- 29 | None 30 | fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir 31 | so that the trained object can be used during scoring inside transform() 32 | """ 33 | 34 | # Placeholder so thing works. 
Equation is just to test 35 | fourier = X.median(axis=0, numeric_only=True, skipna=True).to_dict() 36 | 37 | # dump the trained object [in this example - dictionary with medians per column] 38 | # into an artifact [in this example - artifact.pkl] 39 | # and save it into output_dir so that it can be used later to impute on new data 40 | output_dir_path = Path(output_dir) 41 | if output_dir_path.exists() and output_dir_path.is_dir(): 42 | with open("{}/artifact.pkl".format(output_dir), "wb") as fp: 43 | pickle.dump(fourier, fp) 44 | 45 | 46 | def transform(data, transformer): 47 | """ This hook defines how DataRobot will use the trained object from fit() to transform new data. 48 | DataRobot runs this hook when the task is used for scoring inside a blueprint. 49 | As an output, this hook is expected to return the transformed data. 50 | The input parameters are passed by DataRobot based on dataset and blueprint configuration. 51 | Parameters 52 | ------- 53 | data: pd.DataFrame 54 | Data that DataRobot passes for transformation. 55 | transformer: Any 56 | Trained object, extracted by DataRobot from the artifact created inside fit(). 57 | In this example, it's a dictionary with medians per column extracted from artifact.pkl. 58 | 59 | Returns 60 | ------- 61 | pd.DataFrame 62 | Returns a dataframe with transformed data. 63 | """ 64 | df = data.copy() 65 | for i in data.columns: 66 | df[i] = fft(np.array(df.loc[:, i])).astype(float) 67 | 68 | return df 69 | -------------------------------------------------------------------------------- /custom_tasks/preprocessing/numeric/r/README.md: -------------------------------------------------------------------------------- 1 | README 2 | -------------------------------------------------------------------------------- /custom_tasks/preprocessing/other/r/README.md: -------------------------------------------------------------------------------- 1 | README 2 | -------------------------------------------------------------------------------- /custom_tasks/preprocessing/text/r/README.md: -------------------------------------------------------------------------------- 1 | README 2 | -------------------------------------------------------------------------------- /drum_overview/custom_model_reg/custom.py: -------------------------------------------------------------------------------- 1 | 2 | import pandas as pd 3 | def transform(data, model): 4 | """ 5 | Note: This hook may not have to be implemented for your model. 6 | In this case implemented for the model used in the example. 7 | Modify this method to add data transformation before scoring calls. For example, this can be 8 | used to implement one-hot encoding for models that don't include it on their own. 
9 | Parameters 10 | ---------- 11 | data: pd.DataFrame 12 | model: object, the deserialized model 13 | Returns 14 | ------- 15 | pd.DataFrame 16 | """ 17 | # Execute any steps you need to do before scoring 18 | # Remove target columns if they're in the dataset 19 | if "concrete_compressive_strength" in data: 20 | data.pop("concrete_compressive_strength") 21 | if "Species" in data: 22 | data.pop("Species") 23 | data = data.fillna(0) 24 | return data -------------------------------------------------------------------------------- /drum_overview/custom_model_reg/reg_rf_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datarobot-community/custom-models/96e659c39ae24660a50c7a264dd2d1dd52a1e975/drum_overview/custom_model_reg/reg_rf_model.pkl -------------------------------------------------------------------------------- /drum_overview/readme.MD: -------------------------------------------------------------------------------- 1 | # Using MLOps DRUM to test your custom models 2 | 3 | This notebook provides an example of how you can use the MLOps DataRobot Model Runner (DRUM) library to test your custom models before uploading them to DataRobot and deploying them. 4 | 5 | ### Getting Started 6 | Open `Main_Script.ipynb` and follow the instructions. 7 | 8 | ### Requirements 9 | 10 | You can create the environment needed with the below commands (requires conda): 11 | 12 | `conda create --name my-env python=3.7.0` 13 | 14 | `conda activate my-env` 15 | 16 | `pip install -r requirements.txt` 17 | 18 | To learn about DRUM: 19 | 20 | Start by following the Quickstart instructions on https://github.com/datarobot/datarobot-user-models. 21 | 22 | Additional information can be found here: 23 | - https://github.com/datarobot/datarobot-user-models/tree/master/custom_model_runner 24 | - https://pypi.org/project/datarobot-drum/ 25 | 26 | 27 | 28 | ### Problem Type 29 | Regression, Binary Classification -------------------------------------------------------------------------------- /drum_overview/requirements.txt: -------------------------------------------------------------------------------- 1 | PyYAML==5.3.1 2 | xgboost==1.2.1 3 | datarobot-drum 4 | pandas==1.1.5 5 | scikit-learn==0.23.2 6 | tensorflow==2.5.0 -------------------------------------------------------------------------------- /tracking_agents/python/readme.MD: -------------------------------------------------------------------------------- 1 | # MLOps Agent - Python End to End 2 | 3 | This notebook provides an example of how you can use the MLOps Agents to monitor external deployments using DataRobot. 4 | 5 | ### Getting Started 6 | Open `Main_Script.ipynb` and follow the instructions. You can also execute this notebook through Google Colab.
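The snippet below is a rough sketch of what agent-based reporting typically looks like in Python. It assumes the `datarobot-mlops` package with a filesystem spooler; the deployment ID, model ID, spool directory, and the stand-in scoring data and predictions are placeholders, and `Main_Script.ipynb` contains the exact calls used in this example.

```python
import time

import pandas as pd
from datarobot.mlops.mlops import MLOps

# Placeholder identifiers; substitute the external deployment and model created in the notebook.
DEPLOYMENT_ID = "<DEPLOYMENT_ID>"
MODEL_ID = "<MODEL_ID>"

mlops = (
    MLOps()
    .set_deployment_id(DEPLOYMENT_ID)
    .set_model_id(MODEL_ID)
    .set_filesystem_spooler("/tmp/ta")  # directory the monitoring agent is configured to watch
    .init()
)

# Score with the external model, timing the call so execution stats can be reported.
scoring_data = pd.DataFrame({"feature_1": [1.0, 2.0, 3.0]})  # stand-in for real scoring data
start = time.time()
predictions = [0.1, 0.7, 0.4]                                # stand-in for real model output
elapsed_ms = (time.time() - start) * 1000

# Report service health and the prediction data; the tracking agent, started separately,
# picks up the spooled records from the spool directory and forwards them to DataRobot.
mlops.report_deployment_stats(num_predictions=len(predictions), execution_time_ms=elapsed_ms)
mlops.report_predictions_data(features_df=scoring_data, predictions=predictions)
mlops.shutdown()
```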
7 | 8 | ### Requirements 9 | You can create the environment needed with the below commands (requires conda): 10 | 11 | `conda create --name my-env python=3.7.0` 12 | 13 | `conda activate my-env` 14 | 15 | `pip install -r requirements.txt` -------------------------------------------------------------------------------- /tracking_agents/python/requirements.txt: -------------------------------------------------------------------------------- 1 | 2 | attrs==19.3.0 3 | backcall==0.2.0 4 | boto3==1.11.4 5 | botocore==1.14.4 6 | certifi==2020.12.5 7 | chardet==3.0.4 8 | contextlib2==0.6.0.post1 9 | datarobot==2.22.1 10 | decorator==4.4.2 11 | deprecation==2.1.0 12 | docutils==0.15.2 13 | future==0.18.2 14 | idna==2.10 15 | jmespath==0.10.0 16 | joblib==0.17.0 17 | numpy==1.19.4 18 | packaging==20.7 19 | pandas==1.1.5 20 | parso==0.7.0 21 | pika==0.13.1 22 | py4j==0.10.9 23 | pyparsing==2.4.7 24 | python-dateutil==2.8.1 25 | pytz==2020.4 26 | PyYAML==5.3.1 27 | pyzmq==20.0.0 28 | requests==2.25.0 29 | requests-toolbelt==0.9.1 30 | s3transfer==0.3.3 31 | scikit-learn==0.23.2 32 | scipy==1.5.4 33 | threadpoolctl==2.1.0 34 | trafaret==1.2.0 35 | urllib3==1.25.11 36 | --------------------------------------------------------------------------------