├── .gitignore ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── README.MD ├── docs ├── .readthedocs.yml ├── Makefile ├── README.MD ├── base_model.rst ├── conf.py ├── contribute.rst ├── datasets │ ├── download_criteo_uplift_prediction.rst │ ├── download_hillstrom_email_marketing.rst │ ├── download_lalonde_nsw.rst │ ├── index.rst │ ├── load_criteo_uplift_prediction.rst │ ├── load_hillstrom_email_marketing.rst │ ├── load_lalonde_nsw.rst │ └── make_linear_regression.rst ├── examples.rst ├── index.rst ├── installation.rst ├── make.bat ├── metrics │ ├── get_average_effect.rst │ └── index.rst ├── model_selection │ ├── index.rst │ ├── train_test_split.rst │ └── treatment_cross_val_score.rst ├── requirements.txt ├── transformation │ ├── index.rst │ ├── jaskowski.rst │ ├── kane.rst │ ├── lai.rst │ ├── pessimistic.rst │ ├── reflective.rst │ └── transformation_base_model.rst ├── utils │ ├── download_file.rst │ ├── index.rst │ └── retrieve_from_gz.rst └── variable_selection │ ├── cadit.rst │ ├── dummy.rst │ ├── econometric.rst │ ├── index.rst │ └── two_model.rst ├── examples └── README.MD ├── pyuplift ├── __init__.py ├── base.py ├── datasets │ ├── __init__.py │ ├── generators │ │ ├── __init__.py │ │ └── linear.py │ └── loaders │ │ ├── __init__.py │ │ ├── criteo_uplift_prediction.py │ │ ├── hillstrom_email_marketing.py │ │ └── lalonde_nsw.py ├── metrics │ ├── __init__.py │ └── average_effect.py ├── model_selection │ ├── __init__.py │ ├── model_validation │ │ ├── __init__.py │ │ └── treatment_cross_validation.py │ └── splitters │ │ ├── __init__.py │ │ └── train_test_split.py ├── transformation │ ├── __init__.py │ ├── base.py │ ├── jaskowski.py │ ├── kane.py │ ├── lai.py │ ├── pessimistic.py │ └── reflective.py ├── utils │ ├── __init__.py │ ├── downloader.py │ └── retriever.py └── variable_selection │ ├── __init__.py │ ├── cadit.py │ ├── dummy.py │ ├── econometric.py │ └── two_model.py ├── resources ├── logo.psd └── pyuplift-logo.png ├── setup.cfg ├── setup.py ├── tests ├── README.MD ├── __init__.py ├── datasets │ ├── __init__.py │ ├── generators │ │ ├── __init__.py │ │ └── test_linear.py │ └── loaders │ │ ├── __init__.py │ │ ├── test_criteo_uplift_prediction.py │ │ ├── test_hillstrom_email_marketing.py │ │ └── test_lalonde_nsw.py ├── metrics │ ├── __init__.py │ └── test_average_effect.py ├── model_selection │ ├── __init__.py │ ├── model_validation │ │ ├── __init__.py │ │ └── test_treatment_cross_validation.py │ └── splitters │ │ ├── __init__.py │ │ └── test_train_test_split.py ├── transformation │ ├── __init__.py │ ├── base.py │ ├── test_jaskowski.py │ ├── test_kane.py │ ├── test_lai.py │ ├── test_pessimistic.py │ └── test_reflective.py ├── utils │ ├── __init__.py │ ├── data │ │ └── test.test.gz │ ├── test_downloader.py │ └── test_retriever.py └── variable_selection │ ├── __init__.py │ ├── base.py │ ├── test_cadit.py │ ├── test_dummy.py │ ├── test_econometric.py │ └── test_two_model.py └── tutorials ├── EDA Hillstrom Email Marketing.ipynb ├── EDA Lalonde NSW.ipynb ├── Getting started.ipynb └── README.MD /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | .idea/ 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | db.sqlite3 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # Environments 86 | .env 87 | .venv 88 | env/ 89 | venv/ 90 | ENV/ 91 | env.bak/ 92 | venv.bak/ 93 | 94 | # Spyder project settings 95 | .spyderproject 96 | .spyproject 97 | 98 | # Rope project settings 99 | .ropeproject 100 | 101 | # mkdocs documentation 102 | /site 103 | 104 | # mypy 105 | .mypy_cache/ 106 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: xenial 2 | language: python 3 | python: 4 | - "3.5" 5 | - "3.6" 6 | - "3.7" 7 | install: 8 | - python setup.py install 9 | script: 10 | - pytest 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Artem Kuchumov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.MD 2 | -------------------------------------------------------------------------------- /README.MD: -------------------------------------------------------------------------------- 1 | ![](https://github.com/duketemon/pyuplift/raw/master/resources/pyuplift-logo.png) 2 | 3 | [![Documentation Status](https://readthedocs.org/projects/pyuplift/badge/?version=latest)](https://pyuplift.readthedocs.io/en/latest/?badge=latest) 4 | [![Build Status](https://travis-ci.org/duketemon/pyuplift.svg?branch=master)](https://travis-ci.org/duketemon/pyuplift) 5 | [![PyPI - Python Version](https://img.shields.io/badge/python-3.5%20%7C%203.6%20%7C%203.7-blue.svg)](https://github.com/duketemon/pyuplift) 6 | [![GitHub](https://img.shields.io/github/license/duketemon/pyuplift.svg)](https://github.com/duketemon/pyuplift/blob/master/LICENSE) 7 | 8 | [Documentation](https://pyuplift.readthedocs.io) • 9 | [License](https://github.com/duketemon/pyuplift/blob/master/LICENSE) • 10 | [How to contribute](#how-to-contribute) • 11 | [Uplift datasets](#uplift-datasets) • 12 | [Inspiration](#inspiration) 13 | 14 | ## Installation 15 | ### Install from PyPI 16 | ```bash 17 | pip install pyuplift 18 | ``` 19 | ### Install from source code 20 | ```bash 21 | git clone https://github.com/duketemon/pyuplift.git 22 | cd pyuplift 23 | python setup.py install 24 | ``` 25 | 26 | ## How to contribute 27 | Contributions are always welcomed. There is a lot of ways how you can help to the project. 28 | * Contribute to the [tests](https://github.com/duketemon/pyuplift/tree/master/tests) to make it more reliable. 29 | * Contribute to the [documentation](https://github.com/duketemon/pyuplift/tree/master/docs) to make it clearer for everyone. 30 | * Contribute to the [tutorials](https://github.com/duketemon/pyuplift/tree/master/tutorials) to share your experience with other users. 31 | * Look for [issues with tag "help wanted"](https://github.com/duketemon/pyuplift/issues?q=is%3Aissue+is%3Aopen+label%3A"help+wanted") and submit pull requests to address them. 32 | * [Open an issue](https://github.com/duketemon/pyuplift/issues) to report problems or recommend new features. 33 | 34 | ## Uplift datasets 35 | * [Criteo Uplift Prediction](http://ailab.criteo.com/criteo-uplift-prediction-dataset) 36 | * [Hillstrom Email Marketing](https://blog.minethatdata.com/2008/05/best-answer-e-mail-analytics-challenge.html) 37 | * [Lalonde NSW](https://users.nber.org/~rdehejia/nswdata.html) 38 | 39 | ## Compatible with 40 | * [NumPy](https://github.com/numpy/numpy) 41 | * [Scikit-learn](https://github.com/scikit-learn/scikit-learn) 42 | 43 | ## Inspiration 44 | * [Identifying Individuals Who Are Truly Impacted by Treatment](https://www.researchgate.net/profile/Victor_Lo3/publication/270217235_Identifying_Individuals_Who_Are_Truly_Impacted_by_Treatment_Introduction_to_Recent_Advances_in_Uplift_Modeling/links/54a2dbbf0cf257a63604da2a/Identifying-Individuals-Who-Are-Truly-Impacted-by-Treatment-Introduction-to-Recent-Advances-in-Uplift-Modeling.pdf) 45 | * [Pinpointing the Persuadables: Convincing the Right Voters to Support Barack Obama](https://www.predictiveanalyticsworld.com/patimes/video-dan-porter-clip/2957) 46 | * [Revenue Uplift Modeling](https://www.researchgate.net/publication/321729653_Revenue_Uplift_Modeling) 47 | 48 | ## References 49 | * Devriendt F, Moldovan D, Verbeke W. A literature survey and experimental evaluation of the state-of-the-art in uplift modeling: A stepping stone toward the development of prescriptive analytics. Big data. 2018 Mar 1;6(1):13-41. 50 | * Weisberg HI, Pontes VP. Post hoc subgroups in clinical trials: Anathema or analytics?. Clinical trials. 2015 Aug;12(4):357-64. 51 | * Lo VS. The true lift model: a novel data mining approach to response modeling in database marketing. ACM SIGKDD Explorations Newsletter. 2002 Dec 1;4(2):78-86. 52 | * Guelman L, Guillén M, Pérez-Marín AM. A decision support framework to implement optimal personalized marketing interventions. Decision Support Systems. 2015 Apr 1;72:24-32. 53 | * Tian L, Alizadeh AA, Gentles AJ, Tibshirani R. A simple method for estimating interactions between a treatment and a large number of covariates. Journal of the American Statistical Association. 2014 Oct 2;109(508):1517-32. 54 | 55 | ## Notes 56 | The library was prepared within the framework of the Academic Fund Program at the National Research University Higher School of Economics (HSE) in 2019-2019 (grant № 19-04-048) and by the Russian Academic Excellence Project "5-100" 57 | -------------------------------------------------------------------------------- /docs/.readthedocs.yml: -------------------------------------------------------------------------------- 1 | build: 2 | image: latest 3 | python: 4 | version: 3.7 5 | requirements_file: docs/requirements.txt -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = source 8 | BUILDDIR = build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/README.MD: -------------------------------------------------------------------------------- 1 | ## Documentation 2 | This directory contains the full manual and web site as displayed at https://pyuplift.readthedocs.io. Documentation for pyuplift is generated using [Sphinx](http://www.sphinx-doc.org/en/master/). 3 | -------------------------------------------------------------------------------- /docs/base_model.rst: -------------------------------------------------------------------------------- 1 | ########## 2 | Base Model 3 | ########## 4 | 5 | The base class for all uplift estimators. 6 | 7 | .. note:: 8 | This class should not be used directly. Use derived classes instead. 9 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import re 4 | 5 | from sphinx.locale import _ 6 | from sphinx_rtd_theme import __version__ 7 | 8 | project = u'pyuplift' 9 | slug = re.sub(r'\W+', '-', project.lower()) 10 | version = __version__ 11 | release = __version__ 12 | author = u'Artem Kuchumov & contributors' 13 | copyright = author 14 | language = 'en' 15 | 16 | extensions = [ 17 | 'sphinx.ext.intersphinx', 18 | 'sphinx.ext.autodoc', 19 | 'sphinx.ext.mathjax', 20 | 'sphinx.ext.viewcode', 21 | 'sphinxcontrib.httpdomain', 22 | ] 23 | 24 | templates_path = ['_templates'] 25 | source_suffix = '.rst' 26 | exclude_patterns = [] 27 | 28 | master_doc = 'index' 29 | suppress_warnings = ['image.nonlocal_uri'] 30 | pygments_style = 'default' 31 | 32 | intersphinx_mapping = { 33 | 'rtd': ('https://docs.readthedocs.io/en/latest/', None), 34 | 'sphinx': ('http://www.sphinx-doc.org/en/stable/', None), 35 | } 36 | 37 | html_theme = 'sphinx_rtd_theme' 38 | html_theme_options = { 39 | 'logo_only': True 40 | } 41 | html_theme_path = ["../.."] 42 | html_show_sourcelink = True 43 | htmlhelp_basename = slug 44 | 45 | latex_documents = [ 46 | ('index', '{0}.tex'.format(slug), project, author, 'manual'), 47 | ] 48 | 49 | man_pages = [ 50 | ('index', slug, project, [author], 1) 51 | ] 52 | 53 | texinfo_documents = [ 54 | ('index', slug, project, author, slug, project, 'Miscellaneous'), 55 | ] 56 | 57 | 58 | # Extensions to theme docs 59 | def setup(app): 60 | from sphinx.domains.python import PyField 61 | from sphinx.util.docfields import Field 62 | 63 | app.add_object_type( 64 | 'confval', 65 | 'confval', 66 | objname='configuration value', 67 | indextemplate='pair: %s; configuration value', 68 | doc_field_types=[ 69 | PyField( 70 | 'type', 71 | label=_('Type'), 72 | has_arg=False, 73 | names=('type',), 74 | bodyrolename='class' 75 | ), 76 | Field( 77 | 'default', 78 | label=_('Default'), 79 | has_arg=False, 80 | names=('default',), 81 | ), 82 | ] 83 | ) 84 | -------------------------------------------------------------------------------- /docs/contribute.rst: -------------------------------------------------------------------------------- 1 | ###################### 2 | Contribute to pyuplift 3 | ###################### 4 | Everyone is more than welcome to contribute. It is a way to make the project better and more accessible to more users. 5 | 6 | **Guidelines** 7 | 8 | * `Submit Pull Request`_ 9 | * `Git Workflow Howtos`_ 10 | 11 | - `How to resolve conflict with master`_ 12 | - `How to combine multiple commits into one`_ 13 | - `What is the consequence of force push`_ 14 | 15 | * `Documents`_ 16 | 17 | ******************* 18 | Submit Pull Request 19 | ******************* 20 | 21 | * Before submit, please rebase your code on the most recent version of master, you can do it by 22 | 23 | .. code-block:: bash 24 | 25 | git remote add upstream https://github.com/duketemon/pyuplift 26 | git fetch upstream 27 | git rebase upstream/master 28 | 29 | * If you have multiple small commits, 30 | it might be good to merge them together(use git rebase then squash) into more meaningful groups. 31 | * Send the pull request! 32 | 33 | - Fix the problems reported by automatic checks 34 | - If you are contributing a new module, consider add a testcase 35 | 36 | ******************* 37 | Git Workflow Howtos 38 | ******************* 39 | 40 | How to resolve conflict with master 41 | =================================== 42 | - First rebase to most recent master 43 | 44 | .. code-block:: bash 45 | 46 | # The first two steps can be skipped after you do it once. 47 | git remote add upstream https://github.com/duketemon/pyuplift 48 | git fetch upstream 49 | git rebase upstream/master 50 | 51 | - The git may show some conflicts it cannot merge, say ``conflicted.py``. 52 | 53 | - Manually modify the file to resolve the conflict. 54 | - After you resolved the conflict, mark it as resolved by 55 | 56 | .. code-block:: bash 57 | 58 | git add conflicted.py 59 | 60 | - Then you can continue rebase by 61 | 62 | .. code-block:: bash 63 | 64 | git rebase --continue 65 | 66 | - Finally push to your fork, you may need to force push here. 67 | 68 | .. code-block:: bash 69 | 70 | git push --force 71 | 72 | How to combine multiple commits into one 73 | ======================================== 74 | Sometimes we want to combine multiple commits, especially when later commits are only fixes to previous ones, 75 | to create a PR with set of meaningful commits. You can do it by following steps. 76 | 77 | - Before doing so, configure the default editor of git if you haven't done so before. 78 | 79 | .. code-block:: bash 80 | 81 | git config core.editor the-editor-you-like 82 | 83 | - Assume we want to merge last 3 commits, type the following commands 84 | 85 | .. code-block:: bash 86 | 87 | git rebase -i HEAD~3 88 | 89 | - It will pop up an text editor. Set the first commit as ``pick``, and change later ones to ``squash``. 90 | - After you saved the file, it will pop up another text editor to ask you modify the combined commit message. 91 | - Push the changes to your fork, you need to force push. 92 | 93 | .. code-block:: bash 94 | 95 | git push --force 96 | 97 | What is the consequence of force push 98 | ===================================== 99 | The previous two tips requires force push, this is because we altered the path of the commits. 100 | It is fine to force push to your own fork, as long as the commits changed are only yours. 101 | 102 | ********* 103 | Documents 104 | ********* 105 | * Documentation is built using sphinx. 106 | * Each document is written in `reStructuredText `_. 107 | * You can build document locally to see the effect. 108 | 109 | -------------------------------------------------------------------------------- /docs/datasets/download_criteo_uplift_prediction.rst: -------------------------------------------------------------------------------- 1 | ################################# 2 | download_criteo_uplift_prediction 3 | ################################# 4 | 5 | Downloading the Criteo Uplift Prediction dataset. 6 | 7 | **************** 8 | Data description 9 | **************** 10 | This dataset is constructed by assembling data resulting from several incrementality tests, a particular randomized trial procedure where a random part of the population is prevented from being targeted by advertising. 11 | It consists of 25M rows, each one representing a user with 11 features, a treatment indicator and 2 labels (visits and conversions). 12 | 13 | ******* 14 | Privacy 15 | ******* 16 | For privacy reasons the data has been sub-sampled non-uniformly so that the original incrementality level cannot be deduced from the dataset while preserving a realistic, challenging benchmark. 17 | Feature names have been anonymized and their values randomly projected so as to keep predictive power while making it practically impossible to recover the original features or user context. 18 | 19 | +--------------------------+------------+ 20 | | Features | 11 | 21 | +--------------------------+------------+ 22 | | Treatment | 2 | 23 | +--------------------------+------------+ 24 | | Samples total | 25,309,483 | 25 | +--------------------------+------------+ 26 | | Average visit rate | 0.04132 | 27 | +--------------------------+------------+ 28 | | Average conversion rate | 0.00229 | 29 | +--------------------------+------------+ 30 | 31 | More information about dataset you can find in 32 | the `official dataset description `_. 33 | 34 | +-----------------+---------------------------------------------------------------------------------------------------------------------+ 35 | | **Parameters:** | | **data_home**: str, default=None | 36 | | | | The URL to file with data. | 37 | | | | **url**: str, default=https://s3.us-east-2.amazonaws.com/criteo-uplift-dataset/criteo-uplift.csv.gz | 38 | | | | The URL to file with data. | 39 | +-----------------+---------------------------------------------------------------------------------------------------------------------+ 40 | | **Returns:** | | **dataset**: dict | 41 | | | | Dictionary object with the following attributes: | 42 | | | | **dataset.description** : str | 43 | | | | Description of the Criteo Uplift Prediction dataset. | 44 | | | | **dataset.data**: numpy ndarray of shape (25309483, 11) | 45 | | | | Each row corresponding to the 11 feature values in order. | 46 | | | | **dataset.feature_names**: list, size 11 | 47 | | | | List of feature names. | 48 | | | | **dataset.treatment**: numpy ndarray, shape (25309483,) | 49 | | | | Each value corresponds to the treatment. | 50 | | | | **dataset.target**: numpy array of shape (25309483,) | 51 | | | | Each value corresponds to one of the outcomes. By default, it's `visit` outcome (look at `target_visit` below). | 52 | | | | **dataset.target_visit**: numpy array of shape (25309483,) | 53 | | | | Each value corresponds to whether a visit occurred for this user (binary, label). | 54 | | | | **dataset.target_exposure**: numpy array of shape (25309483,) | 55 | | | | Each value corresponds to treatment effect, whether the user has been effectively exposed (binary). | 56 | | | | **dataset.target_conversion**: numpy array of shape (25309483,) | 57 | | | | Each value corresponds to whether a conversion occurred for this user (binary, label). | 58 | +-----------------+---------------------------------------------------------------------------------------------------------------------+ 59 | 60 | ******** 61 | Examples 62 | ******** 63 | 64 | .. code-block:: python3 65 | 66 | from pyuplift.datasets import download_criteo_uplift_prediction 67 | download_criteo_uplift_prediction() 68 | -------------------------------------------------------------------------------- /docs/datasets/download_hillstrom_email_marketing.rst: -------------------------------------------------------------------------------- 1 | ################################## 2 | download_hillstrom_email_marketing 3 | ################################## 4 | 5 | Downloading the Hillstrom Email Marketing dataset. 6 | 7 | **************** 8 | Data description 9 | **************** 10 | This dataset contains 64,000 customers who last purchased within twelve months. The customers were involved in an e-mail test. 11 | 12 | * 1/3 were randomly chosen to receive an e-mail campaign featuring Mens merchandise. 13 | * 1/3 were randomly chosen to receive an e-mail campaign featuring Womens merchandise. 14 | * 1/3 were randomly chosen to not receive an e-mail campaign. 15 | 16 | During a period of two weeks following the e-mail campaign, results were tracked. Your job is to tell the world if the Mens or Womens e-mail campaign was successful. 17 | 18 | +--------------------------+------------+ 19 | | Features | 8 | 20 | +--------------------------+------------+ 21 | | Treatment | 3 | 22 | +--------------------------+------------+ 23 | | Samples total | 64,000 | 24 | +--------------------------+------------+ 25 | | Average spend rate | 1.05091 | 26 | +--------------------------+------------+ 27 | | Average visit rate | 0.14678 | 28 | +--------------------------+------------+ 29 | | Average conversion rate | 0.00903 | 30 | +--------------------------+------------+ 31 | 32 | More information about dataset you can find in the `official paper `_. 33 | 34 | +-----------------+----------------------------------------------------------------------------------+ 35 | | **Parameters** | | **data_home: str** | 36 | | | | Specify another download and cache folder for the dataset. | 37 | | | | By default the dataset will be stored in the data folder in the same folder. | 38 | | | | **url: str** | 39 | | | | The URL to file with data. | 40 | +-----------------+----------------------------------------------------------------------------------+ 41 | | **Returns** | **None** | 42 | +-----------------+----------------------------------------------------------------------------------+ 43 | 44 | ******** 45 | Examples 46 | ******** 47 | 48 | .. code-block:: python3 49 | 50 | from pyuplift.datasets import download_hillstrom_email_marketing 51 | download_hillstrom_email_marketing() 52 | -------------------------------------------------------------------------------- /docs/datasets/download_lalonde_nsw.rst: -------------------------------------------------------------------------------- 1 | #################### 2 | download_lalonde_nsw 3 | #################### 4 | 5 | Downloading the Lalonde NSW dataset. 6 | 7 | **************** 8 | Data description 9 | **************** 10 | The dataset contains the treated and control units from the male sub-sample from the National Supported Work Demonstration as used by Lalonde in his paper. 11 | 12 | +--------------------------+------------+ 13 | | Features | 7 | 14 | +--------------------------+------------+ 15 | | Treatment | 2 | 16 | +--------------------------+------------+ 17 | | Samples total | 722 | 18 | +--------------------------+------------+ 19 | 20 | ******************** 21 | Features description 22 | ******************** 23 | * **treat** - an indicator variable for treatment status. 24 | * **age** - age in years. 25 | * **educ** - years of schooling. 26 | * **black** - indicator variable for blacks. 27 | * **hisp** - indicator variable for Hispanics. 28 | * **married** - indicator variable for martial status. 29 | * **nodegr** - indicator variable for high school diploma. 30 | * **re75** - real earnings in 1975. 31 | * **re78** - real earnings in 1978. 32 | 33 | More information about dataset you can find `here `_. 34 | 35 | +-----------------+----------------------------------------------------------------------------------+ 36 | | **Parameters** | | **data_home: str** | 37 | | | | Specify another download and cache folder for the dataset. | 38 | | | | By default the dataset will be stored in the data folder in the same folder. | 39 | | | | **control_data_url: str** | 40 | | | | The URL to file with data of the control group. | 41 | | | | **treated_data_url: str** | 42 | | | | The URL to file with data of the treated group. | 43 | | | | **separator: str** | 44 | | | | The separator which used in the data files. | 45 | | | | **column_names: list** | 46 | | | | List of column names of the dataset. | 47 | | | | **column_types: dict** | 48 | | | | List of types for columns of the dataset. | 49 | | | | **random_state: int** | 50 | | | | The random seed. | 51 | +-----------------+----------------------------------------------------------------------------------+ 52 | | **Returns** | **None** | 53 | +-----------------+----------------------------------------------------------------------------------+ 54 | 55 | 56 | ******** 57 | Examples 58 | ******** 59 | 60 | .. code-block:: python3 61 | 62 | from pyuplift.datasets import download_lalonde_nsw 63 | download_lalonde_nsw() 64 | -------------------------------------------------------------------------------- /docs/datasets/index.rst: -------------------------------------------------------------------------------- 1 | ######## 2 | Datasets 3 | ######## 4 | 5 | .. toctree:: 6 | :hidden: 7 | 8 | load_criteo_uplift_prediction 9 | download_criteo_uplift_prediction 10 | load_hillstrom_email_marketing 11 | download_hillstrom_email_marketing 12 | load_lalonde_nsw 13 | download_lalonde_nsw 14 | make_linear_regression 15 | 16 | The pyuplift.datasets module includes utilities to load datasets, including methods to download and return popular datasets. It also features some artificial data generators. 17 | 18 | ******* 19 | Loaders 20 | ******* 21 | +--------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------+ 22 | | `datasets.download_criteo_uplift_prediction([data_home, url]) `_ | Downloading the Criteo Uplift Prediction dataset. | 23 | +--------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------+ 24 | | `datasets.load_criteo_uplift_prediction([data_home, download_if_missing]) `_ | Loading the Criteo Uplift Prediction dataset from the local file. | 25 | +--------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------+ 26 | | `datasets.download_hillstrom_email_marketing([data_home, url]) `_ | Downloading the Hillstrom Email Marketing dataset. | 27 | +--------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------+ 28 | | `datasets.load_hillstrom_email_marketing([data_home, load_raw_data, download_if_missing]) `_ | Loading the Hillstrom Email Marketing dataset from the local file. | 29 | +--------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------+ 30 | | `datasets.download_lalonde_nsw([data_home, control_data_url, treated_data_url, separator, column_names, column_types, random_state]) `_ | Downloading the Lalonde NSW dataset. | 31 | +--------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------+ 32 | | `datasets.load_lalonde_nsw([data_home, load_raw_data, download_if_missing]) `_ | Loading the Lalonde NSW dataset from the local file. | 33 | +--------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------+ 34 | 35 | ********** 36 | Generators 37 | ********** 38 | +----------------------------------------------------------------------------------------------+--------------------------------------------+ 39 | | `datasets.make_linear_regression(size, [x1_params, x2_params, x3_params, t_params, e_params, | | Generate data by formula: Y' = X1+X2*T+E | 40 | | eps, seed]) `_ | | Y = Y', if Y' - int(Y') > eps, | 41 | | | | Y = 0, otherwise. | 42 | +----------------------------------------------------------------------------------------------+--------------------------------------------+ 43 | -------------------------------------------------------------------------------- /docs/datasets/load_criteo_uplift_prediction.rst: -------------------------------------------------------------------------------- 1 | ############################# 2 | load_criteo_uplift_prediction 3 | ############################# 4 | 5 | Loading the Criteo Uplift Prediction dataset from the local file. 6 | 7 | **************** 8 | Data description 9 | **************** 10 | This dataset is constructed by assembling data resulting from several incrementality tests, a particular randomized trial procedure where a random part of the population is prevented from being targeted by advertising. 11 | It consists of 25M rows, each one representing a user with 11 features, a treatment indicator and 2 labels (visits and conversions). 12 | 13 | ******* 14 | Privacy 15 | ******* 16 | For privacy reasons the data has been sub-sampled non-uniformly so that the original incrementality level cannot be deduced from the dataset while preserving a realistic, challenging benchmark. 17 | Feature names have been anonymized and their values randomly projected so as to keep predictive power while making it practically impossible to recover the original features or user context. 18 | 19 | +--------------------------+------------+ 20 | | Features | 11 | 21 | +--------------------------+------------+ 22 | | Treatment | 2 | 23 | +--------------------------+------------+ 24 | | Samples total | 25,309,483 | 25 | +--------------------------+------------+ 26 | | Average visit rate | 0.04132 | 27 | +--------------------------+------------+ 28 | | Average conversion rate | 0.00229 | 29 | +--------------------------+------------+ 30 | 31 | More information about dataset you can find in 32 | the `official dataset description `_. 33 | 34 | +-----------------+---------------------------------------------------------------------------------------------------------------------+ 35 | | **Parameters** | | **data_home: str** | 36 | | | | Specify another download and cache folder for the dataset. | 37 | | | | By default the dataset will be stored in the data folder in the same folder. | 38 | | | | **download_if_missing: bool, default=True** | 39 | | | | Download the dataset if it is not downloaded. | 40 | +-----------------+---------------------------------------------------------------------------------------------------------------------+ 41 | | **Returns:** | | **dataset**: dict | 42 | | | | Dictionary object with the following attributes: | 43 | | | | **dataset.description** : str | 44 | | | | Description of the Criteo Uplift Prediction dataset. | 45 | | | | **dataset.data**: numpy ndarray of shape (25309483, 11) | 46 | | | | Each row corresponding to the 11 feature values in order. | 47 | | | | **dataset.feature_names**: list, size 11 | 48 | | | | List of feature names. | 49 | | | | **dataset.treatment**: numpy ndarray, shape (25309483,) | 50 | | | | Each value corresponds to the treatment. | 51 | | | | **dataset.target**: numpy array of shape (25309483,) | 52 | | | | Each value corresponds to one of the outcomes. By default, it's `visit` outcome (look at `target_visit` below). | 53 | | | | **dataset.target_visit**: numpy array of shape (25309483,) | 54 | | | | Each value corresponds to whether a visit occurred for this user (binary, label). | 55 | | | | **dataset.target_exposure**: numpy array of shape (25309483,) | 56 | | | | Each value corresponds to treatment effect, whether the user has been effectively exposed (binary). | 57 | | | | **dataset.target_conversion**: numpy array of shape (25309483,) | 58 | | | | Each value corresponds to whether a conversion occurred for this user (binary, label). | 59 | +-----------------+---------------------------------------------------------------------------------------------------------------------+ 60 | 61 | ******** 62 | Examples 63 | ******** 64 | 65 | .. code-block:: python3 66 | 67 | from pyuplift.datasets import load_criteo_uplift_prediction 68 | df = load_criteo_uplift_prediction() 69 | print(df) 70 | -------------------------------------------------------------------------------- /docs/datasets/load_hillstrom_email_marketing.rst: -------------------------------------------------------------------------------- 1 | ############################## 2 | load_hillstrom_email_marketing 3 | ############################## 4 | 5 | Loading the Hillstrom Email Marketing dataset from the local file. 6 | 7 | **************** 8 | Data description 9 | **************** 10 | This dataset contains 64,000 customers who last purchased within twelve months. The customers were involved in an e-mail test. 11 | 12 | * 1/3 were randomly chosen to receive an e-mail campaign featuring Mens merchandise. 13 | * 1/3 were randomly chosen to receive an e-mail campaign featuring Womens merchandise. 14 | * 1/3 were randomly chosen to not receive an e-mail campaign. 15 | 16 | During a period of two weeks following the e-mail campaign, results were tracked. Your job is to tell the world if the Mens or Womens e-mail campaign was successful. 17 | 18 | +--------------------------+------------+ 19 | | Features | 8 | 20 | +--------------------------+------------+ 21 | | Treatment | 3 | 22 | +--------------------------+------------+ 23 | | Samples total | 64,000 | 24 | +--------------------------+------------+ 25 | | Average spend rate | 1.05091 | 26 | +--------------------------+------------+ 27 | | Average visit rate | 0.14678 | 28 | +--------------------------+------------+ 29 | | Average conversion rate | 0.00903 | 30 | +--------------------------+------------+ 31 | 32 | More information about dataset you can find in the `official paper `_. 33 | 34 | +-----------------+----------------------------------------------------------------------------------------------------------------------------------------+ 35 | | **Parameters:** | | **data_home**: str, default=None | 36 | | | | Specify another download and cache folder for the dataset. | 37 | | | | By default the dataset will be stored in the data folder in the same folder. | 38 | | | | **load_raw_data**: bool, default=False | 39 | | | | The loading of raw or preprocessed data? | 40 | | | | **download_if_missing**: bool, default=True | 41 | | | | Download the dataset if it is not downloaded. | 42 | +-----------------+----------------------------------------------------------------------------------------------------------------------------------------+ 43 | | **Returns:** | | **dataset**: dict | 44 | | | | Dictionary object with the following attributes: | 45 | | | | **dataset.description** : str | 46 | | | | Description of the Hillstrom email marketing dataset. | 47 | | | | **dataset.data**: numpy ndarray of shape (64000, 8) | 48 | | | | Each row corresponding to the 8 feature values in order. | 49 | | | | **dataset.feature_names**: list, size 8 | 50 | | | | List of feature names. | 51 | | | | **dataset.treatment**: numpy ndarray, shape (64000,) | 52 | | | | Each value corresponds to the treatment. | 53 | | | | **dataset.target**: numpy array of shape (64000,) | 54 | | | | Each value corresponds to one of the outcomes. By default, it's `spend` outcome (look at `target_spend` below). | 55 | | | | **dataset.target_spend**: numpy array of shape (64000,) | 56 | | | | Each value corresponds to how much customers spent during a two-week outcome period. | 57 | | | | **dataset.target_visit**: numpy array of shape (64000,) | 58 | | | | Each value corresponds to whether people visited the site during a two-week outcome period. | 59 | | | | **dataset.target_conversion**: numpy array of shape (64000,) | 60 | | | | Each value corresponds to whether they purchased at the site (“conversion”) during a two-week outcome period. | 61 | +-----------------+----------------------------------------------------------------------------------------------------------------------------------------+ 62 | 63 | ******** 64 | Examples 65 | ******** 66 | 67 | .. code-block:: python3 68 | 69 | from pyuplift.datasets import load_hillstrom_email_marketing 70 | df = load_hillstrom_email_marketing() 71 | print(df) 72 | -------------------------------------------------------------------------------- /docs/datasets/load_lalonde_nsw.rst: -------------------------------------------------------------------------------- 1 | ################ 2 | load_lalonde_nsw 3 | ################ 4 | 5 | Loading the Lalonde NSW dataset from the local file. 6 | 7 | **************** 8 | Data description 9 | **************** 10 | The dataset contains the treated and control units from the male sub-sample from the National Supported Work Demonstration as used by Lalonde in his paper. 11 | 12 | +--------------------------+------------+ 13 | | Features | 7 | 14 | +--------------------------+------------+ 15 | | Treatment | 2 | 16 | +--------------------------+------------+ 17 | | Samples total | 722 | 18 | +--------------------------+------------+ 19 | 20 | ******************** 21 | Features description 22 | ******************** 23 | * **treat** - an indicator variable for treatment status. 24 | * **age** - age in years. 25 | * **educ** - years of schooling. 26 | * **black** - indicator variable for blacks. 27 | * **hisp** - indicator variable for Hispanics. 28 | * **married** - indicator variable for martial status. 29 | * **nodegr** - indicator variable for high school diploma. 30 | * **re75** - real earnings in 1975. 31 | * **re78** - real earnings in 1978. 32 | 33 | More information about dataset you can find `here `_. 34 | 35 | +-----------------+----------------------------------------------------------------------------------------------------------------------------------------+ 36 | | **Parameters:** | | **data_home**: str, default=None | 37 | | | | Specify another download and cache folder for the dataset. | 38 | | | | By default the dataset will be stored in the data folder in the same folder. | 39 | | | | **download_if_missing**: bool, default=True | 40 | | | | Download the dataset if it is not downloaded. | 41 | +-----------------+----------------------------------------------------------------------------------------------------------------------------------------+ 42 | | **Returns:** | | **dataset**: dict | 43 | | | | Dictionary object with the following attributes: | 44 | | | | **dataset.description** : str | 45 | | | | Description of the Hillstrom email marketing dataset. | 46 | | | | **dataset.data**: numpy ndarray of shape (722, 7) | 47 | | | | Each row corresponding to the 7 feature values in order. | 48 | | | | **dataset.feature_names**: list, size 7 | 49 | | | | List of feature names. | 50 | | | | **dataset.treatment**: numpy ndarray, shape (722,) | 51 | | | | Each value corresponds to the treatment. | 52 | | | | **dataset.target**: numpy array of shape (722,) | 53 | | | | Each value corresponds to one of the outcomes. By default, it's `re78` outcome. | 54 | +-----------------+----------------------------------------------------------------------------------------------------------------------------------------+ 55 | 56 | ******** 57 | Examples 58 | ******** 59 | 60 | .. code-block:: python3 61 | 62 | from pyuplift.datasets import load_lalonde_nsw 63 | df = load_lalonde_nsw() 64 | print(df) 65 | -------------------------------------------------------------------------------- /docs/datasets/make_linear_regression.rst: -------------------------------------------------------------------------------- 1 | ###################### 2 | make_linear_regression 3 | ###################### 4 | 5 | Generate data by formula. 6 | 7 | **************** 8 | Data description 9 | **************** 10 | Synthetic data generated by Generate data by formula: 11 | 12 | | ``Y' = X1 + X2 * T + E`` 13 | | ``Y = Y', if Y' - int(Y') > eps,`` 14 | | ``Y = 0, otherwise.`` 15 | 16 | Statistics for default parameters and size equals 100,000: 17 | 18 | +--------------------------+-------------+ 19 | |Features | 3 | 20 | +--------------------------+-------------+ 21 | |Treatment | 2 | 22 | +--------------------------+-------------+ 23 | |Samples total | `size` | 24 | +--------------------------+-------------+ 25 | |Y not equals 0 | 0.49438 | 26 | +--------------------------+-------------+ 27 | |Y values | 0 to 555.93 | 28 | +--------------------------+-------------+ 29 | 30 | 31 | +-----------------+-----------------------------------------------------------------------------+ 32 | | **Parameters:** | | **size**: integer | 33 | | | | The number of observations. | 34 | | | | **x1_params** : tuple(mu, sigma), default: (0, 1) | 35 | | | | The feature with gaussian distribution and mean=mu, sd=sigma. | 36 | | | | X1 ~ N(mu, sigma) | 37 | | | | **x2_params** : tuple(mu, sigma), default: (0, 0.1) | 38 | | | | The feature with gaussian distribution and mean=mu, sd=sigma. | 39 | | | | X2 ~ N(mu, sigma) | 40 | | | | **x3_params** : tuple(mu, sigma), default: (0, 1) | 41 | | | | The feature with gaussian distribution and mean=mu, sd=sigma. | 42 | | | | X3 ~ N(mu, sigma) | 43 | | | | **t_params** : tuple(mu, sigma), default: (0, 1) | 44 | | | | The treatment with uniform distribution. Min value=min, Max value=max-1 | 45 | | | | T ~ R(min, max) | 46 | | | | **e_params** : tuple(mu, sigma), default: (0, 1) | 47 | | | | The error with gaussian distribution and mean=mu, sd=sigma. | 48 | | | | E ~ N(mu, sigma) | 49 | | | | **eps** : tuple(mu, sigma), default: (0, 1) | 50 | | | | The border value. | 51 | | | | **random_state** : integer, default=777 | 52 | | | | random_state is the seed used by the random number generator. | 53 | +-----------------+-----------------------------------------------------------------------------+ 54 | | **Returns:** | | **dataset**: pandas DataFrame | 55 | | | | Generated data. | 56 | +-----------------+-----------------------------------------------------------------------------+ 57 | 58 | ******** 59 | Examples 60 | ******** 61 | 62 | .. code-block:: python3 63 | 64 | from pyuplift.datasets import make_linear_regression 65 | df = make_linear_regression(10000) 66 | print(df) 67 | -------------------------------------------------------------------------------- /docs/examples.rst: -------------------------------------------------------------------------------- 1 | ################# 2 | Examples of Usage 3 | ################# 4 | 5 | This section contains official examples of usage pyuplift package. 6 | 7 | ******** 8 | Contents 9 | ******** 10 | - `Hillstrom Email Marketing dataset `_ 11 | - `Synthetic dataset `_ 12 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | ###################### 2 | pyuplift documentation 3 | ###################### 4 | 5 | **pyuplift** is a scientific uplift modeling library. It implements variable selection and transformation approaches. pyuplift provides API for work with such an uplift datasets as `Hillstrom Email Marketing `_ and `Criteo Uplift Prediction `_. 6 | 7 | ******** 8 | Contents 9 | ******** 10 | 11 | .. toctree:: 12 | :maxdepth: 2 13 | :titlesonly: 14 | 15 | installation 16 | examples 17 | contribute 18 | 19 | .. toctree:: 20 | :maxdepth: 2 21 | :titlesonly: 22 | :caption: API 23 | 24 | base_model 25 | variable_selection/index 26 | transformation/index 27 | datasets/index 28 | model_selection/index 29 | metrics/index 30 | utils/index 31 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | ################## 2 | Installation Guide 3 | ################## 4 | 5 | ***************** 6 | Install from PyPI 7 | ***************** 8 | 9 | .. code-block:: bash 10 | 11 | pip install pyuplift 12 | 13 | 14 | ************************ 15 | Install from source code 16 | ************************ 17 | 18 | .. code-block:: bash 19 | 20 | python setup.py install 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/metrics/get_average_effect.rst: -------------------------------------------------------------------------------- 1 | ################## 2 | get_average_effect 3 | ################## 4 | 5 | Estimating an average effect of the test set. 6 | 7 | +-----------------+----------------------------------------------------------------------------------+ 8 | | **Parameters:** | | **y_test**: numpy array | 9 | | | | Actual y values. | 10 | | | | **t_test**: numpy array | 11 | | | | Actual treatment values. | 12 | | | | **y_pred**: numpy array | 13 | | | | Predicted y values by uplift model. | 14 | | | | **test_share**: float | 15 | | | | Share of the test data which will be taken for estimating an average effect. | 16 | +-----------------+----------------------------------------------------------------------------------+ 17 | | **Returns:** | | **average effect**: float | 18 | | | | Average effect on the test set. | 19 | +-----------------+----------------------------------------------------------------------------------+ 20 | 21 | ******** 22 | Examples 23 | ******** 24 | 25 | .. code-block:: python3 26 | 27 | from pyuplift.metrics import get_average_effect 28 | ... 29 | model.fit(X_train, y_train, t_train) 30 | y_pred = model.predict(X_test) 31 | effect = get_average_effect(y_test, t_test, y_pred, test_share) 32 | print(effect) 33 | -------------------------------------------------------------------------------- /docs/metrics/index.rst: -------------------------------------------------------------------------------- 1 | ####### 2 | Metrics 3 | ####### 4 | 5 | .. toctree:: 6 | :hidden: 7 | 8 | get_average_effect 9 | 10 | The pyuplift.metrics module includes score functions, performance metrics and pairwise metrics and distance computations. 11 | 12 | +-----------------------------------------------------------------------------------------------+-----------------------------------------------+ 13 | | `metrics.get_average_effect(y_test, t_test, y_pred, [test_share]) `_ | Estimating an average effect of the test set. | 14 | +-----------------------------------------------------------------------------------------------+-----------------------------------------------+ 15 | -------------------------------------------------------------------------------- /docs/model_selection/index.rst: -------------------------------------------------------------------------------- 1 | ############### 2 | Model Selection 3 | ############### 4 | 5 | .. toctree:: 6 | :hidden: 7 | 8 | train_test_split 9 | treatment_cross_val_score 10 | 11 | The pyuplift.model_selection module includes model validation and splitter functions. 12 | 13 | ****************** 14 | Splitter Functions 15 | ****************** 16 | 17 | +--------------------------------------------------------------------------------------------------------+---------------------------------------------------+ 18 | | `model_selection.train_test_split(X, y, t, [train_share, random_state]) `_ | Split X, y, t into random train and test subsets. | 19 | +--------------------------------------------------------------------------------------------------------+---------------------------------------------------+ 20 | 21 | 22 | **************** 23 | Model validation 24 | **************** 25 | +-------------------------------------------------------------------------------------------------------------------------+----------------------------------------+ 26 | | `model_selection.treatment_cross_val_score(X, y, t, model, [cv, train_share, seeds]) `_ | Evaluate a scores by cross-validation. | 27 | +-------------------------------------------------------------------------------------------------------------------------+----------------------------------------+ 28 | -------------------------------------------------------------------------------- /docs/model_selection/train_test_split.rst: -------------------------------------------------------------------------------- 1 | ################ 2 | train_test_split 3 | ################ 4 | 5 | Split X, y, t into random train and test subsets. 6 | 7 | +------------------+-----------------------------------------------------------------------------------------+ 8 | | **Parameters** | | **X: numpy ndarray with shape = [n_samples, n_features]** | 9 | | | | Matrix of features. | 10 | | | | **y: numpy array with shape = [n_samples,]** | 11 | | | | Array of target of feature. | 12 | | | | **t: numpy array with shape = [n_samples,]** | 13 | | | | Array of treatments. | 14 | | | | **train_share: float, optional (default=0.7)** | 15 | | | | train_share represents the proportion of the dataset to include in the train split. | 16 | | | | **random_state: int, optional (default=None)** | 17 | | | | random_state is the seed used by the random number generator. | 18 | +------------------+-----------------------------------------------------------------------------------------+ 19 | | **Return** | | **X_train: numpy ndarray** | 20 | | | | Train matrix of features. | 21 | | | | **X_test: numpy ndarray** | 22 | | | | Test matrix of features. | 23 | | | | **y_train: numpy array** | 24 | | | | Train array of target of feature. | 25 | | | | **y_test: numpy array** | 26 | | | | Test array of target of feature. | 27 | | | | **t_train: numpy array** | 28 | | | | Train array of treatments. | 29 | | | | **t_test: numpy array** | 30 | | | | Test array of treatments. | 31 | +------------------+-----------------------------------------------------------------------------------------+ 32 | 33 | ******** 34 | Examples 35 | ******** 36 | 37 | .. code-block:: python3 38 | 39 | from pyuplift.model_selection import train_test_split 40 | ... 41 | for seed in seeds: 42 | X_train, X_test, y_train, y_test, t_train, t_test = train_test_split(X, y, t, train_share, seed) 43 | model.fit(X_train, y_train, t_train) 44 | score = get_average_effect(y_test, t_test, model.predict(X_test)) 45 | scores.append(score) 46 | -------------------------------------------------------------------------------- /docs/model_selection/treatment_cross_val_score.rst: -------------------------------------------------------------------------------- 1 | ######################### 2 | treatment_cross_val_score 3 | ######################### 4 | 5 | Evaluate a scores by cross-validation. 6 | 7 | +------------------+-----------------------------------------------------------------------------------------+ 8 | | **Parameters** | | **X: numpy ndarray with shape = [n_samples, n_features]** | 9 | | | | Matrix of features. | 10 | | | | **y: numpy array with shape = [n_samples,]** | 11 | | | | Array of target of feature. | 12 | | | | **t: numpy array with shape = [n_samples,]** | 13 | | | | Array of treatments. | 14 | | | | **train_share: float, optional (default=0.7)** | 15 | | | | train_share represents the proportion of the dataset to include in the train split. | 16 | | | | **random_state: int, optional (default=777)** | 17 | | | | random_state is the seed used by the random number generator. | 18 | +------------------+-----------------------------------------------------------------------------------------+ 19 | | **Return** | | **scores: numpy array of floats** | 20 | | | | Array of scores of the estimator for each run of the cross validation. | 21 | +------------------+-----------------------------------------------------------------------------------------+ 22 | 23 | ******** 24 | Examples 25 | ******** 26 | 27 | .. code-block:: python3 28 | 29 | from pyuplift.model_selection import treatment_cross_val_score 30 | ... 31 | for model_name in models: 32 | scores = treatment_cross_val_score(X, y, t, models[model_name], cv, seeds=seeds) 33 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinxcontrib-httpdomain 2 | sphinx 3 | -------------------------------------------------------------------------------- /docs/transformation/index.rst: -------------------------------------------------------------------------------- 1 | ############## 2 | Transformation 3 | ############## 4 | 5 | The pyuplift.transformation module includes classes which belongs to a transformation group of approaches. 6 | 7 | .. toctree:: 8 | :hidden: 9 | 10 | transformation_base_model 11 | lai 12 | kane 13 | jaskowski 14 | pessimistic 15 | reflective 16 | 17 | +------------------------------------------------------------------------------+---------------------------------------------------------------------------+ 18 | | `transformation.TransformationBaseModel() `_ | A base model of all classes which implements a transformation approaches. | 19 | +------------------------------------------------------------------------------+---------------------------------------------------------------------------+ 20 | | `transformation.Lai([model, use_weights]) `_ | A Lai's approach. | 21 | +------------------------------------------------------------------------------+---------------------------------------------------------------------------+ 22 | | `transformation.Kane([model, use_weights]) `_ | A Kane's approach. | 23 | +------------------------------------------------------------------------------+---------------------------------------------------------------------------+ 24 | | `transformation.Jaskowski([model]) `_ | A Jaskowski's approach. | 25 | +------------------------------------------------------------------------------+---------------------------------------------------------------------------+ 26 | | `transformation.Pessimistic([model]) `_ | A pessimistic approach. | 27 | +------------------------------------------------------------------------------+---------------------------------------------------------------------------+ 28 | | `transformation.Reflective([model]) `_ | A reflective approach. | 29 | +------------------------------------------------------------------------------+---------------------------------------------------------------------------+ 30 | -------------------------------------------------------------------------------- /docs/transformation/jaskowski.rst: -------------------------------------------------------------------------------- 1 | ######### 2 | Jaskowski 3 | ######### 4 | 5 | The class which implements the Jaskowski's approach [1]. 6 | 7 | +----------------+-----------------------------------------------------------------------------------+ 8 | | **Parameters** | | **model : object, optional (default=sklearn.linear_model.LogisticRegression)** | 9 | | | | The classification model which will be used for predict uplift. | 10 | +----------------+-----------------------------------------------------------------------------------+ 11 | 12 | 13 | ******* 14 | Methods 15 | ******* 16 | +-----------------------------------------------+----------------------------------------------------+ 17 | | :ref:`fit(self, X, y, t) ` | Build the model from the training set (X, y, t). | 18 | +-----------------------------------------------+----------------------------------------------------+ 19 | | :ref:`predict(self, X, t=None) `| Predict an uplift for X. | 20 | +-----------------------------------------------+----------------------------------------------------+ 21 | 22 | .. _jask_fit: 23 | 24 | fit(self, X, y, t) 25 | ------------------ 26 | Build the model from the training set (X, y, t). 27 | 28 | +------------------+---------------------------------------------------------------------------------+ 29 | | **Parameters** | | **X: numpy ndarray with shape = [n_samples, n_features]** | 30 | | | | Matrix of features. | 31 | | | | **y: numpy array with shape = [n_samples,]** | 32 | | | | Array of target of feature. | 33 | | | | **t: numpy array with shape = [n_samples,]** | 34 | | | | Array of treatments. | 35 | +------------------+---------------------------------------------------------------------------------+ 36 | | **Returns** | **self : object** | 37 | +------------------+---------------------------------------------------------------------------------+ 38 | 39 | .. _jask_predict: 40 | 41 | predict(self, X, t=None) 42 | ------------------------ 43 | Predict an uplift for X. 44 | 45 | +------------------+---------------------------------------------------------------------------------+ 46 | | **Parameters** | | **X: numpy ndarray with shape = [n_samples, n_features]** | 47 | | | | Matrix of features. | 48 | | | | **t: numpy array with shape = [n_samples,] or None** | 49 | | | | Array of treatments. | 50 | +------------------+---------------------------------------------------------------------------------+ 51 | | **Returns** | | **self : object** | 52 | | | | The predicted values. | 53 | +------------------+---------------------------------------------------------------------------------+ 54 | 55 | ********** 56 | References 57 | ********** 58 | 1. A Literature Survey and Experimental Evaluation of the State-of-the-Art in Uplift Modeling: A Stepping Stone Toward the Development of Prescriptive Analytics by Floris Devriendt, Darie Moldovan, and Wouter Verbeke 59 | 60 | 61 | .. code-block:: python3 62 | 63 | from pyuplift.transformation import Jaskowski 64 | ... 65 | model = Jaskowski() 66 | model.fit(X[train_indexes, :], y[train_indexes], t[train_indexes]) 67 | uplift = model.predict(X[test_indexes, :]) 68 | print(uplift) 69 | -------------------------------------------------------------------------------- /docs/transformation/kane.rst: -------------------------------------------------------------------------------- 1 | #### 2 | Kane 3 | #### 4 | 5 | The class which implements the Kane's approach [1]. 6 | 7 | +----------------+-----------------------------------------------------------------------------------+ 8 | | **Parameters** | | **model : object, optional (default=sklearn.linear_model.LogisticRegression)** | 9 | | | | The classification model which will be used for predict uplift. | 10 | | | | **use_weights : boolean, optional (default=False)** | 11 | | | | Use or not weights? | 12 | +----------------+-----------------------------------------------------------------------------------+ 13 | 14 | 15 | ******* 16 | Methods 17 | ******* 18 | +-----------------------------------------------+----------------------------------------------------+ 19 | | :ref:`fit(self, X, y, t) ` | Build the model from the training set (X, y, t). | 20 | +-----------------------------------------------+----------------------------------------------------+ 21 | | :ref:`predict(self, X, t=None) `| Predict an uplift for X. | 22 | +-----------------------------------------------+----------------------------------------------------+ 23 | 24 | .. _kane_fit: 25 | 26 | fit(self, X, y, t) 27 | ------------------ 28 | Build the model from the training set (X, y, t). 29 | 30 | +------------------+---------------------------------------------------------------------------------+ 31 | | **Parameters** | | **X: numpy ndarray with shape = [n_samples, n_features]** | 32 | | | | Matrix of features. | 33 | | | | **y: numpy array with shape = [n_samples,]** | 34 | | | | Array of target of feature. | 35 | | | | **t: numpy array with shape = [n_samples,]** | 36 | | | | Array of treatments. | 37 | +------------------+---------------------------------------------------------------------------------+ 38 | | **Returns** | **self : object** | 39 | +------------------+---------------------------------------------------------------------------------+ 40 | 41 | .. _kane_predict: 42 | 43 | predict(self, X, t=None) 44 | ------------------------ 45 | Predict an uplift for X. 46 | 47 | +------------------+---------------------------------------------------------------------------------+ 48 | | **Parameters** | | **X: numpy ndarray with shape = [n_samples, n_features]** | 49 | | | | Matrix of features. | 50 | | | | **t: numpy array with shape = [n_samples,] or None** | 51 | | | | Array of treatments. | 52 | +------------------+---------------------------------------------------------------------------------+ 53 | | **Returns** | | **self : object** | 54 | | | | The predicted values. | 55 | +------------------+---------------------------------------------------------------------------------+ 56 | 57 | ********** 58 | References 59 | ********** 60 | 1. A Literature Survey and Experimental Evaluation of the State-of-the-Art in Uplift Modeling: A Stepping Stone Toward the Development of Prescriptive Analytics by Floris Devriendt, Darie Moldovan, and Wouter Verbeke 61 | 62 | 63 | .. code-block:: python3 64 | 65 | from pyuplift.transformation import Kane 66 | ... 67 | model = Kane() 68 | model.fit(X[train_indexes, :], y[train_indexes], t[train_indexes]) 69 | uplift = model.predict(X[test_indexes, :]) 70 | print(uplift) 71 | -------------------------------------------------------------------------------- /docs/transformation/lai.rst: -------------------------------------------------------------------------------- 1 | ### 2 | Lai 3 | ### 4 | 5 | The class which implements the Lai's approach [1]. 6 | 7 | +----------------+-----------------------------------------------------------------------------------+ 8 | | **Parameters** | | **model : object, optional (default=sklearn.linear_model.LogisticRegression)** | 9 | | | | The classification model which will be used for predict uplift. | 10 | | | | **use_weights : boolean, optional (default=False)** | 11 | | | | Use or not weights? | 12 | +----------------+-----------------------------------------------------------------------------------+ 13 | 14 | 15 | ******* 16 | Methods 17 | ******* 18 | +-----------------------------------------------+----------------------------------------------------+ 19 | | :ref:`fit(self, X, y, t) ` | Build a the model from the training set (X, y, t). | 20 | +-----------------------------------------------+----------------------------------------------------+ 21 | | :ref:`predict(self, X, t=None) ` | Predict an uplift for X. | 22 | +-----------------------------------------------+----------------------------------------------------+ 23 | 24 | .. _lai_fit: 25 | 26 | fit(self, X, y, t) 27 | ------------------ 28 | Build a the model from the training set (X, y, t). 29 | 30 | +------------------+---------------------------------------------------------------------------------+ 31 | | **Parameters** | | **X: numpy ndarray with shape = [n_samples, n_features]** | 32 | | | | Matrix of features. | 33 | | | | **y: numpy array with shape = [n_samples,]** | 34 | | | | Array of target of feature. | 35 | | | | **t: numpy array with shape = [n_samples,]** | 36 | | | | Array of treatments. | 37 | +------------------+---------------------------------------------------------------------------------+ 38 | | **Returns** | **self : object** | 39 | +------------------+---------------------------------------------------------------------------------+ 40 | 41 | .. _lai_predict: 42 | 43 | predict(self, X, t=None) 44 | ------------------------ 45 | Predict an uplift for X. 46 | 47 | +------------------+---------------------------------------------------------------------------------+ 48 | | **Parameters** | | **X: numpy ndarray with shape = [n_samples, n_features]** | 49 | | | | Matrix of features. | 50 | | | | **t: numpy array with shape = [n_samples,] or None** | 51 | | | | Array of treatments. | 52 | +------------------+---------------------------------------------------------------------------------+ 53 | | **Returns** | | **self : object** | 54 | | | | The predicted values. | 55 | +------------------+---------------------------------------------------------------------------------+ 56 | 57 | ********** 58 | References 59 | ********** 60 | 1. A Literature Survey and Experimental Evaluation of the State-of-the-Art in Uplift Modeling: A Stepping Stone Toward the Development of Prescriptive Analytics by Floris Devriendt, Darie Moldovan, and Wouter Verbeke 61 | 62 | 63 | .. code-block:: python3 64 | 65 | from pyuplift.transformation import Lai 66 | ... 67 | model = Lai() 68 | model.fit(X[train_indexes, :], y[train_indexes], t[train_indexes]) 69 | uplift = model.predict(X[test_indexes, :]) 70 | print(uplift) 71 | -------------------------------------------------------------------------------- /docs/transformation/pessimistic.rst: -------------------------------------------------------------------------------- 1 | ########### 2 | Pessimistic 3 | ########### 4 | 5 | The class which implements the pessimistic approach [1]. 6 | 7 | +----------------+-----------------------------------------------------------------------------------+ 8 | | **Parameters** | | **model : object, optional (default=sklearn.linear_model.LogisticRegression)** | 9 | | | | The classification model which will be used for predict uplift. | 10 | +----------------+-----------------------------------------------------------------------------------+ 11 | 12 | 13 | ******* 14 | Methods 15 | ******* 16 | +-----------------------------------------------+----------------------------------------------------+ 17 | | :ref:`fit(self, X, y, t) ` | Build the model from the training set (X, y, t). | 18 | +-----------------------------------------------+----------------------------------------------------+ 19 | | :ref:`predict(self, X, t=None) ` | Predict an uplift for X. | 20 | +-----------------------------------------------+----------------------------------------------------+ 21 | 22 | .. _pes_fit: 23 | 24 | fit(self, X, y, t) 25 | ------------------ 26 | Build the model from the training set (X, y, t). 27 | 28 | +------------------+---------------------------------------------------------------------------------+ 29 | | **Parameters** | | **X: numpy ndarray with shape = [n_samples, n_features]** | 30 | | | | Matrix of features. | 31 | | | | **y: numpy array with shape = [n_samples,]** | 32 | | | | Array of target of feature. | 33 | | | | **t: numpy array with shape = [n_samples,]** | 34 | | | | Array of treatments. | 35 | +------------------+---------------------------------------------------------------------------------+ 36 | | **Returns** | **self : object** | 37 | +------------------+---------------------------------------------------------------------------------+ 38 | 39 | .. _pes_predict: 40 | 41 | predict(self, X, t=None) 42 | ------------------------ 43 | Predict an uplift for X. 44 | 45 | +------------------+---------------------------------------------------------------------------------+ 46 | | **Parameters** | | **X: numpy ndarray with shape = [n_samples, n_features]** | 47 | | | | Matrix of features. | 48 | | | | **t: numpy array with shape = [n_samples,] or None** | 49 | | | | Array of treatments. | 50 | +------------------+---------------------------------------------------------------------------------+ 51 | | **Returns** | | **self : object** | 52 | | | | The predicted values. | 53 | +------------------+---------------------------------------------------------------------------------+ 54 | 55 | ********** 56 | References 57 | ********** 58 | 1. A Literature Survey and Experimental Evaluation of the State-of-the-Art in Uplift Modeling: A Stepping Stone Toward the Development of Prescriptive Analytics by Floris Devriendt, Darie Moldovan, and Wouter Verbeke 59 | 60 | 61 | .. code-block:: python3 62 | 63 | from pyuplift.transformation import Pessimistic 64 | ... 65 | model = Pessimistic() 66 | model.fit(X[train_indexes, :], y[train_indexes], t[train_indexes]) 67 | uplift = model.predict(X[test_indexes, :]) 68 | print(uplift) 69 | -------------------------------------------------------------------------------- /docs/transformation/reflective.rst: -------------------------------------------------------------------------------- 1 | ########## 2 | Reflective 3 | ########## 4 | 5 | The class which implements the reflective approach [1]. 6 | 7 | +----------------+-----------------------------------------------------------------------------------+ 8 | | **Parameters** | | **model : object, optional (default=sklearn.linear_model.LogisticRegression)** | 9 | | | | The classification model which will be used for predict uplift. | 10 | +----------------+-----------------------------------------------------------------------------------+ 11 | 12 | 13 | ******* 14 | Methods 15 | ******* 16 | +-----------------------------------------------+----------------------------------------------------+ 17 | | :ref:`fit(self, X, y, t) ` | Build the model from the training set (X, y, t). | 18 | +-----------------------------------------------+----------------------------------------------------+ 19 | | :ref:`predict(self, X, t=None) ` | Predict an uplift for X. | 20 | +-----------------------------------------------+----------------------------------------------------+ 21 | 22 | .. _ref_fit: 23 | 24 | fit(self, X, y, t) 25 | ------------------ 26 | Build the model from the training set (X, y, t). 27 | 28 | +------------------+---------------------------------------------------------------------------------+ 29 | | **Parameters** | | **X: numpy ndarray with shape = [n_samples, n_features]** | 30 | | | | Matrix of features. | 31 | | | | **y: numpy array with shape = [n_samples,]** | 32 | | | | Array of target of feature. | 33 | | | | **t: numpy array with shape = [n_samples,]** | 34 | | | | Array of treatments. | 35 | +------------------+---------------------------------------------------------------------------------+ 36 | | **Returns** | **self : object** | 37 | +------------------+---------------------------------------------------------------------------------+ 38 | 39 | .. _ref_predict: 40 | 41 | predict(self, X, t=None) 42 | ------------------------ 43 | Predict an uplift for X. 44 | 45 | +------------------+---------------------------------------------------------------------------------+ 46 | | **Parameters** | | **X: numpy ndarray with shape = [n_samples, n_features]** | 47 | | | | Matrix of features. | 48 | | | | **t: numpy array with shape = [n_samples,] or None** | 49 | | | | Array of treatments. | 50 | +------------------+---------------------------------------------------------------------------------+ 51 | | **Returns** | | **self : object** | 52 | | | | The predicted values. | 53 | +------------------+---------------------------------------------------------------------------------+ 54 | 55 | ********** 56 | References 57 | ********** 58 | 1. A Literature Survey and Experimental Evaluation of the State-of-the-Art in Uplift Modeling: A Stepping Stone Toward the Development of Prescriptive Analytics by Floris Devriendt, Darie Moldovan, and Wouter Verbeke 59 | 60 | 61 | .. code-block:: python3 62 | 63 | from pyuplift.transformation import Reflective 64 | ... 65 | model = Reflective() 66 | model.fit(X[train_indexes, :], y[train_indexes], t[train_indexes]) 67 | uplift = model.predict(X[test_indexes, :]) 68 | print(uplift) 69 | -------------------------------------------------------------------------------- /docs/transformation/transformation_base_model.rst: -------------------------------------------------------------------------------- 1 | ######################### 2 | Transformation Base Model 3 | ######################### 4 | 5 | The base class for a transformation uplift estimators. 6 | 7 | .. note:: 8 | This class should not be used directly. Use derived classes instead. 9 | -------------------------------------------------------------------------------- /docs/utils/download_file.rst: -------------------------------------------------------------------------------- 1 | ############# 2 | download_file 3 | ############# 4 | 5 | Download file from `url` to `output_path`. 6 | 7 | +-----------------+--------------------------------------+ 8 | | **Parameters** | | **url: string** | 9 | | | | Data's URL. | 10 | | | | **output_path: string** | 11 | | | | Path where file will be saved. | 12 | +-----------------+--------------------------------------+ 13 | | **Returns** | **None** | 14 | +-----------------+--------------------------------------+ 15 | 16 | ******** 17 | Examples 18 | ******** 19 | 20 | .. code-block:: python3 21 | 22 | from pyuplift.utils import download_file 23 | ... 24 | if not os.path.exists(data_path): 25 | if not os.path.exists(archive_path): 26 | download_file(url, archive_path) 27 | -------------------------------------------------------------------------------- /docs/utils/index.rst: -------------------------------------------------------------------------------- 1 | ######### 2 | Utilities 3 | ######### 4 | 5 | .. toctree:: 6 | :hidden: 7 | 8 | download_file 9 | retrieve_from_gz 10 | 11 | The pyuplift.utils module includes various utilities. 12 | 13 | +------------------------------------------------------------------------------+----------------------------------------------------------------------+ 14 | | `utils.download_file(url, output_path) `_ | Download file from `url` to `output_path`. | 15 | +------------------------------------------------------------------------------+----------------------------------------------------------------------+ 16 | | `utils.retrieve_from_gz(archive_path, output_path) `_ | The retrieving gz-archived data from `archive_path` to `output_path` | 17 | +------------------------------------------------------------------------------+----------------------------------------------------------------------+ 18 | -------------------------------------------------------------------------------- /docs/utils/retrieve_from_gz.rst: -------------------------------------------------------------------------------- 1 | ################ 2 | retrieve_from_gz 3 | ################ 4 | 5 | The retrieving gz-archived data from `archive_path` to `output_path`. 6 | 7 | +-----------------+--------------------------------------+ 8 | | **Parameters** | | **archive_path: string** | 9 | | | | The archive path. | 10 | | | | **output_path: string** | 11 | | | | The retrieved data path. | 12 | +-----------------+--------------------------------------+ 13 | | **Returns** | **None** | 14 | +-----------------+--------------------------------------+ 15 | 16 | ******** 17 | Examples 18 | ******** 19 | 20 | .. code-block:: python3 21 | 22 | from pyuplift.utils import retrieve_from_gz 23 | ... 24 | if not os.path.exists(data_path): 25 | if not os.path.exists(archive_path): 26 | download_file(url, archive_path) 27 | retrieve_from_gz(archive_path, data_path) 28 | -------------------------------------------------------------------------------- /docs/variable_selection/cadit.rst: -------------------------------------------------------------------------------- 1 | ##### 2 | Cadit 3 | ##### 4 | 5 | The class which implements the cadit approach [1]. 6 | 7 | +----------------+-----------------------------------------------------------------------------------+ 8 | | **Parameters** | | **model : object, optional (default=sklearn.linear_model.LinearRegression)** | 9 | | | | The regression model which will be used for predict uplift. | 10 | +----------------+-----------------------------------------------------------------------------------+ 11 | 12 | 13 | ******* 14 | Methods 15 | ******* 16 | +-------------------------------------------------+--------------------------------------------------+ 17 | | :ref:`fit(self, X, y, t) ` | Build a model from the training set (X, y, t). | 18 | +-------------------------------------------------+--------------------------------------------------+ 19 | | :ref:`predict(self, X, t=None) ` | Predict an uplift for X. | 20 | +-------------------------------------------------+--------------------------------------------------+ 21 | 22 | .. _cadit_fit: 23 | 24 | fit(self, X, y, t) 25 | ------------------ 26 | Build a model from the training set (X, y, t). 27 | 28 | +------------------+---------------------------------------------------------------------------------+ 29 | | **Parameters** | | **X: numpy ndarray with shape = [n_samples, n_features]** | 30 | | | | Matrix of features. | 31 | | | | **y: numpy array with shape = [n_samples,]** | 32 | | | | Array of target of feature. | 33 | | | | **t: numpy array with shape = [n_samples,]** | 34 | | | | Array of treatments. | 35 | +------------------+---------------------------------------------------------------------------------+ 36 | | **Returns** | **self : object** | 37 | +------------------+---------------------------------------------------------------------------------+ 38 | 39 | .. _cadit_predict: 40 | 41 | predict(self, X, t=None) 42 | ------------------------ 43 | Predict an uplift for X. 44 | 45 | +------------------+---------------------------------------------------------------------------------+ 46 | | **Parameters** | | **X: numpy ndarray with shape = [n_samples, n_features]** | 47 | | | | Matrix of features. | 48 | | | | **t: numpy array with shape = [n_samples,] or None** | 49 | | | | Array of treatments. | 50 | +------------------+---------------------------------------------------------------------------------+ 51 | | **Returns** | | **self : object** | 52 | | | | The predicted values. | 53 | +------------------+---------------------------------------------------------------------------------+ 54 | 55 | ********** 56 | References 57 | ********** 58 | 1. Weisberg HI, Pontes VP. Post hoc subgroups in clinical trials: Anathema or analytics? // Clinical trials. 2015 Aug;12(4):357-64. 59 | 60 | .. code-block:: python3 61 | 62 | from pyuplift.variable_selection import Cadit 63 | ... 64 | model = Cadit() 65 | model.fit(X[train_indexes, :], y[train_indexes], t[train_indexes]) 66 | uplift = model.predict(X[test_indexes, :]) 67 | print(uplift) 68 | -------------------------------------------------------------------------------- /docs/variable_selection/dummy.rst: -------------------------------------------------------------------------------- 1 | ##### 2 | Dummy 3 | ##### 4 | 5 | The class which implements the dummy approach [1]. 6 | 7 | +----------------+-----------------------------------------------------------------------------------+ 8 | | **Parameters** | | **model : object, optional (default=sklearn.linear_model.LinearRegression)** | 9 | | | | The regression model which will be used for predict uplift. | 10 | +----------------+-----------------------------------------------------------------------------------+ 11 | 12 | 13 | ******* 14 | Methods 15 | ******* 16 | +-------------------------------------------------+-----------------------------------------------------+ 17 | | :ref:`fit(self, X, y, t) ` | Build a dummy model from the training set (X, y, t).| 18 | +-------------------------------------------------+-----------------------------------------------------+ 19 | | :ref:`predict(self, X, t=None) ` | Predict an uplift for X. | 20 | +-------------------------------------------------+-----------------------------------------------------+ 21 | 22 | .. _dummy_fit: 23 | 24 | fit(self, X, y, t) 25 | ------------------ 26 | Build a dummy model from the training set (X, y, t). 27 | 28 | +------------------+---------------------------------------------------------------------------------+ 29 | | **Parameters** | | **X: numpy ndarray with shape = [n_samples, n_features]** | 30 | | | | Matrix of features. | 31 | | | | **y: numpy array with shape = [n_samples,]** | 32 | | | | Array of target of feature. | 33 | | | | **t: numpy array with shape = [n_samples,]** | 34 | | | | Array of treatments. | 35 | +------------------+---------------------------------------------------------------------------------+ 36 | | **Returns** | **self : object** | 37 | +------------------+---------------------------------------------------------------------------------+ 38 | 39 | .. _dummy_predict: 40 | 41 | predict(self, X, t=None) 42 | ------------------------ 43 | Predict an uplift for X. 44 | 45 | +------------------+---------------------------------------------------------------------------------+ 46 | | **Parameters** | | **X: numpy ndarray with shape = [n_samples, n_features]** | 47 | | | | Matrix of features. | 48 | | | | **t: numpy array with shape = [n_samples,] or None** | 49 | | | | Array of treatments. | 50 | +------------------+---------------------------------------------------------------------------------+ 51 | | **Returns** | | **self : object** | 52 | | | | The predicted values. | 53 | +------------------+---------------------------------------------------------------------------------+ 54 | 55 | ********** 56 | References 57 | ********** 58 | 1. A Literature Survey and Experimental Evaluation of the State-of-the-Art in Uplift Modeling: A Stepping Stone Toward the Development of Prescriptive Analytics by Floris Devriendt, Darie Moldovan, and Wouter Verbeke 59 | 60 | 61 | .. code-block:: python3 62 | 63 | from pyuplift.variable_selection import Dummy 64 | ... 65 | model = Dummy() 66 | model.fit(X[train_indexes, :], y[train_indexes], t[train_indexes]) 67 | uplift = model.predict(X[test_indexes, :]) 68 | print(uplift) 69 | -------------------------------------------------------------------------------- /docs/variable_selection/econometric.rst: -------------------------------------------------------------------------------- 1 | ########### 2 | Econometric 3 | ########### 4 | 5 | The class which implements the econometric approach [1]. 6 | 7 | +----------------+-----------------------------------------------------------------------------------+ 8 | | **Parameters** | | **model : object, optional (default=sklearn.linear_model.LinearRegression)** | 9 | | | | The regression model which will be used for predict uplift. | 10 | +----------------+-----------------------------------------------------------------------------------+ 11 | 12 | 13 | ******* 14 | Methods 15 | ******* 16 | +-----------------------------------------------+------------------------------------------------------------+ 17 | | :ref:`fit(self, X, y, t) ` | Build an econometric model from the training set (X, y, t).| 18 | +-----------------------------------------------+------------------------------------------------------------+ 19 | | :ref:`predict(self, X, t=None) ` | Predict an uplift for X. | 20 | +-----------------------------------------------+------------------------------------------------------------+ 21 | 22 | .. _eco_fit: 23 | 24 | fit(self, X, y, t) 25 | ------------------ 26 | Build an econometric model from the training set (X, y, t). 27 | 28 | +------------------+---------------------------------------------------------------------------------+ 29 | | **Parameters** | | **X: numpy ndarray with shape = [n_samples, n_features]** | 30 | | | | Matrix of features. | 31 | | | | **y: numpy array with shape = [n_samples,]** | 32 | | | | Array of target of feature. | 33 | | | | **t: numpy array with shape = [n_samples,]** | 34 | | | | Array of treatments. | 35 | +------------------+---------------------------------------------------------------------------------+ 36 | | **Returns** | **self : object** | 37 | +------------------+---------------------------------------------------------------------------------+ 38 | 39 | .. _eco_predict: 40 | 41 | predict(self, X, t=None) 42 | ------------------------ 43 | Predict an uplift for X. 44 | 45 | +------------------+---------------------------------------------------------------------------------+ 46 | | **Parameters** | | **X: numpy ndarray with shape = [n_samples, n_features]** | 47 | | | | Matrix of features. | 48 | | | | **t: numpy array with shape = [n_samples,] or None** | 49 | | | | Array of treatments. | 50 | +------------------+---------------------------------------------------------------------------------+ 51 | | **Returns** | | **self : object** | 52 | | | | The predicted values. | 53 | +------------------+---------------------------------------------------------------------------------+ 54 | 55 | ********** 56 | References 57 | ********** 58 | 1. A Literature Survey and Experimental Evaluation of the State-of-the-Art in Uplift Modeling: A Stepping Stone Toward the Development of Prescriptive Analytics by Floris Devriendt, Darie Moldovan, and Wouter Verbeke 59 | 60 | 61 | .. code-block:: python3 62 | 63 | from pyuplift.variable_selection import Econometric 64 | ... 65 | model = Econometric() 66 | model.fit(X[train_indexes, :], y[train_indexes], t[train_indexes]) 67 | uplift = model.predict(X[test_indexes, :]) 68 | print(uplift) 69 | -------------------------------------------------------------------------------- /docs/variable_selection/index.rst: -------------------------------------------------------------------------------- 1 | ################## 2 | Variable Selection 3 | ################## 4 | 5 | The pyuplift.variable_selection module includes classes which belongs to variable selection group of approaches. 6 | 7 | .. toctree:: 8 | :hidden: 9 | 10 | two_model 11 | econometric 12 | dummy 13 | cadit 14 | 15 | +--------------------------------------------------------------------------------------------+--------------------------+ 16 | | `variable_selection.TwoModel([no_treatment_model, has_treatment_model]) `_ | A two model approach. | 17 | +--------------------------------------------------------------------------------------------+--------------------------+ 18 | | `variable_selection.Econometric([model]) `_ | An econometric approach. | 19 | +--------------------------------------------------------------------------------------------+--------------------------+ 20 | | `variable_selection.Dummy([model]) `_ | A dummy approach. | 21 | +--------------------------------------------------------------------------------------------+--------------------------+ 22 | | `variable_selection.Cadit([model]) `_ | A cadit approach. | 23 | +--------------------------------------------------------------------------------------------+--------------------------+ 24 | -------------------------------------------------------------------------------- /docs/variable_selection/two_model.rst: -------------------------------------------------------------------------------- 1 | ######### 2 | Two Model 3 | ######### 4 | 5 | The class which implements the two model approach [1]. 6 | 7 | +----------------+---------------------------------------------------------------------------------------------+ 8 | | **Parameters** | | **no_treatment_model : object, optional (default=sklearn.linear_model.LinearRegression)** | 9 | | | | The regression model which will be used for predict uplift. | 10 | | | | **has_treatment_model : object, optional (default=sklearn.linear_model.LinearRegression)**| 11 | | | | The regression model which will be used for predict uplift. | 12 | +----------------+---------------------------------------------------------------------------------------------+ 13 | 14 | ******* 15 | Methods 16 | ******* 17 | +-----------------------------------------------+--------------------------------------------------------------+ 18 | | :ref:`fit(self, X, y, t) ` | Build a two model model from the training set (X, y, t). | 19 | +-----------------------------------------------+--------------------------------------------------------------+ 20 | | :ref:`predict(self, X, t=None) ` | Predict an uplift for X. | 21 | +-----------------------------------------------+--------------------------------------------------------------+ 22 | 23 | .. _two_fit: 24 | 25 | fit(self, X, y, t) 26 | ------------------ 27 | Build a model model model from the training set (X, y, t). 28 | 29 | +------------------+---------------------------------------------------------------------------------+ 30 | | **Parameters** | | **X: numpy ndarray with shape = [n_samples, n_features]** | 31 | | | | Matrix of features. | 32 | | | | **y: numpy array with shape = [n_samples,]** | 33 | | | | Array of target of feature. | 34 | | | | **t: numpy array with shape = [n_samples,]** | 35 | | | | Array of treatments. | 36 | +------------------+---------------------------------------------------------------------------------+ 37 | | **Returns** | **self : object** | 38 | +------------------+---------------------------------------------------------------------------------+ 39 | 40 | .. _two_predict: 41 | 42 | predict(self, X, t=None) 43 | ------------------------ 44 | Predict an uplift for X. 45 | 46 | +------------------+---------------------------------------------------------------------------------+ 47 | | **Parameters** | | **X: numpy ndarray with shape = [n_samples, n_features]** | 48 | | | | Matrix of features. | 49 | | | | **t: numpy array with shape = [n_samples,] or None** | 50 | | | | Array of treatments. | 51 | +------------------+---------------------------------------------------------------------------------+ 52 | | **Returns** | | **self : object** | 53 | | | | The predicted values. | 54 | +------------------+---------------------------------------------------------------------------------+ 55 | 56 | ********** 57 | References 58 | ********** 59 | 1. A Literature Survey and Experimental Evaluation of the State-of-the-Art in Uplift Modeling: A Stepping Stone Toward the Development of Prescriptive Analytics by Floris Devriendt, Darie Moldovan, and Wouter Verbeke 60 | 61 | 62 | .. code-block:: python3 63 | 64 | from pyuplift.variable_selection import TwoModel 65 | ... 66 | model = TwoModel() 67 | model.fit(X[train_indexes, :], y[train_indexes], t[train_indexes]) 68 | uplift = model.predict(X[test_indexes, :]) 69 | print(uplift) 70 | -------------------------------------------------------------------------------- /examples/README.MD: -------------------------------------------------------------------------------- 1 | ## Examples 2 | This directory contains examples of code which related to the pyuplift library. 3 | -------------------------------------------------------------------------------- /pyuplift/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import BaseModel 2 | -------------------------------------------------------------------------------- /pyuplift/base.py: -------------------------------------------------------------------------------- 1 | class BaseModel: 2 | """Base class for uplift models. 3 | 4 | Note: This class should not be used directly. Use derived classes instead. 5 | """ 6 | 7 | def fit(self, X, y, t): 8 | """Build a TwoModel approach from the training set (x, y, t). 9 | 10 | Parameters 11 | ---------- 12 | X : numpy array of shape = [n_samples, n_features] 13 | The training input samples. 14 | y : numpy array of shape = [n_samples] or [n_samples, n_outputs] 15 | The target values (class labels in classification, real numbers in regression). 16 | t : numpy array of shape = [n_samples] or [n_samples, n_outputs] 17 | The treatments. 18 | Returns 19 | ------- 20 | self : object 21 | """ 22 | return self 23 | 24 | def predict(self, X, t): 25 | """Predict treatment effect for x. 26 | 27 | Parameters 28 | ---------- 29 | X : numpy array of shape = [n_samples, n_features] 30 | The input samples. 31 | t : numpy array of shape = [n_samples, n_features] 32 | The treatments. 33 | Returns 34 | ------- 35 | y : array of shape = [n_samples] or [n_samples, n_outputs] 36 | The predicted treatment effects. 37 | """ 38 | pass 39 | -------------------------------------------------------------------------------- /pyuplift/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .loaders.criteo_uplift_prediction import download_criteo_uplift_prediction, load_criteo_uplift_prediction 2 | from .loaders.hillstrom_email_marketing import download_hillstrom_email_marketing, load_hillstrom_email_marketing 3 | from .loaders.lalonde_nsw import download_lalonde_nsw, load_lalonde_nsw 4 | 5 | from .generators.linear import make_linear_regression 6 | -------------------------------------------------------------------------------- /pyuplift/datasets/generators/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duketemon/pyuplift/33daa0768ff333387cb8223ebfaedaffa57de335/pyuplift/datasets/generators/__init__.py -------------------------------------------------------------------------------- /pyuplift/datasets/generators/linear.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | def make_linear_regression( 6 | size: int, 7 | x1_params=(0, 100), 8 | x2_params=(0, 10), 9 | x3_params=(0, 100), 10 | t_params=(0, 2), 11 | e_params=(0, 100), 12 | eps=0.01, 13 | random_state=777 14 | ): 15 | """The data generates by formula: 16 | 17 | **************** 18 | Data description 19 | **************** 20 | Synthetic data generated by Generate data by formula: 21 | 22 | | ``Y' = X1 + X2 * T + E`` 23 | | ``Y = Y', if Y' - int(Y') > eps,`` 24 | | ``Y = 0, otherwise.`` 25 | 26 | Statistics for default parameters and size equals 100,000: 27 | 28 | +--------------------------+-------------+ 29 | |Features | 3 | 30 | +--------------------------+-------------+ 31 | |Treatment | 2 | 32 | +--------------------------+-------------+ 33 | |Samples total | `size` | 34 | +--------------------------+-------------+ 35 | |Y not equals 0 | 0.49438 | 36 | +--------------------------+-------------+ 37 | |Y values | 0 to 555.93 | 38 | +--------------------------+-------------+ 39 | 40 | Parameters 41 | ---------- 42 | size : int 43 | The number of observations. 44 | x1_params : tuple(mu, sigma), default: (0, 100) 45 | The feature with gaussian distribution and mean=mu, sd=sigma. 46 | X1 ~ N(mu, sigma) 47 | x2_params : tuple(mu, sigma), default: (0, 10) 48 | The feature with gaussian distribution and mean=mu, sd=sigma. 49 | X2 ~ N(mu, sigma) 50 | x3_params : tuple(mu, sigma), default: (0, 100) 51 | The feature with gaussian distribution and mean=mu, sd=sigma. 52 | X3 ~ N(mu, sigma) 53 | t_params : tuple(min, max), default: (0, 2) 54 | The treatment with uniform distribution. Min value=min, Max value=max-1 55 | T ~ R(min, max) 56 | e_params : tuple(mu, sigma), default: (0, 100) 57 | The error with gaussian distribution and mean=mu, sd=sigma. 58 | E ~ N(mu, sigma) 59 | eps : float, default: 0.01 60 | The border value. 61 | random_state : int, default: 777 62 | The random seed. 63 | Returns 64 | ------- 65 | dataset : pandas DataFrame 66 | """ 67 | 68 | if size <= 0: 69 | raise ValueError('Size of the dataset should be non negative.') 70 | 71 | np.random.seed(random_state) 72 | x1 = np.random.normal(*x1_params, size) 73 | x2 = np.random.normal(*x2_params, size) 74 | x3 = np.random.normal(*x3_params, size) 75 | t = np.random.randint(*t_params, size) 76 | e = np.random.normal(*e_params, size) 77 | y_ = x1 + x2 * t + e 78 | y = [] 79 | for value in y_: 80 | if value - int(value) > eps: 81 | y.append(value) 82 | else: 83 | y.append(0) 84 | y = np.array(y) 85 | return pd.DataFrame(data={ 86 | 'x1': x1, 87 | 'x2': x2, 88 | 'x3': x3, 89 | 't': t, 90 | 'y': y 91 | }) 92 | -------------------------------------------------------------------------------- /pyuplift/datasets/loaders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duketemon/pyuplift/33daa0768ff333387cb8223ebfaedaffa57de335/pyuplift/datasets/loaders/__init__.py -------------------------------------------------------------------------------- /pyuplift/datasets/loaders/criteo_uplift_prediction.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import pandas as pd 4 | from pyuplift.utils import download_file, retrieve_from_gz 5 | 6 | 7 | def download_criteo_uplift_prediction( 8 | data_home=None, 9 | url='https://s3.us-east-2.amazonaws.com/criteo-uplift-dataset/criteo-uplift.csv.gz' 10 | ): 11 | """Downloading the Criteo Uplift Prediction dataset. 12 | 13 | **************** 14 | Data description 15 | **************** 16 | This dataset is constructed by assembling data resulting from several incrementality tests, 17 | a particular randomized trial procedure 18 | where a random part of the population is prevented from being targeted by advertising. 19 | It consists of 25M rows, each one representing a user with 11 features, 20 | a treatment indicator and 2 labels (visits and conversions). 21 | 22 | ******* 23 | Privacy 24 | ******* 25 | For privacy reasons the data has been sub-sampled non-uniformly so that the original incrementality level 26 | cannot be deduced from the dataset while preserving a realistic, challenging benchmark. 27 | Feature names have been anonymized and their values randomly projected so as to keep predictive power 28 | while making it practically impossible to recover the original features or user context. 29 | 30 | +--------------------------+------------+ 31 | |Features | 11 | 32 | +--------------------------+------------+ 33 | |Treatment | 2 | 34 | +--------------------------+------------+ 35 | |Samples total | 25,309,483 | 36 | +--------------------------+------------+ 37 | |Average visit rate | 0.04132 | 38 | +--------------------------+------------+ 39 | |Average conversion rate | 0.00229 | 40 | +--------------------------+------------+ 41 | 42 | More information about dataset you can find in 43 | the `official dataset description `_. 44 | 45 | +-----------------+----------------------------------------------------------------------------------+ 46 | | **Parameters** | | **data_home: string** | 47 | | | | Specify another download and cache folder for the dataset. | 48 | | | | By default the dataset will be stored in the data folder in the same folder. | 49 | | | | **url: string** | 50 | | | | The URL to file with data. | 51 | +-----------------+----------------------------------------------------------------------------------+ 52 | | **Returns** | **None** | 53 | +-----------------+----------------------------------------------------------------------------------+ 54 | """ 55 | 56 | data_home, dataset_path = __get_data_home_dataset_file_paths(data_home) 57 | if not os.path.isdir(data_home): 58 | os.makedirs(data_home) 59 | 60 | archive_path = dataset_path.replace('.csv', '.gz') 61 | if not os.path.exists(dataset_path): 62 | if not os.path.exists(archive_path): 63 | download_file(url, archive_path) 64 | retrieve_from_gz(archive_path, dataset_path) 65 | 66 | 67 | def load_criteo_uplift_prediction( 68 | data_home=None, 69 | download_if_missing=True 70 | ): 71 | """Loading the Criteo Uplift Prediction dataset from the local file. 72 | 73 | **************** 74 | Data description 75 | **************** 76 | This dataset is constructed by assembling data resulting from several incrementality tests, 77 | a particular randomized trial procedure 78 | where a random part of the population is prevented from being targeted by advertising. 79 | It consists of 25M rows, each one representing a user with 11 features, 80 | a treatment indicator and 2 labels (visits and conversions). 81 | 82 | ******* 83 | Privacy 84 | ******* 85 | For privacy reasons the data has been sub-sampled non-uniformly so that the original incrementality level 86 | cannot be deduced from the dataset while preserving a realistic, challenging benchmark. 87 | Feature names have been anonymized and their values randomly projected so as to keep predictive power 88 | while making it practically impossible to recover the original features or user context. 89 | 90 | +--------------------------+------------+ 91 | |Features | 11 | 92 | +--------------------------+------------+ 93 | |Treatment | 2 | 94 | +--------------------------+------------+ 95 | |Samples total | 25,309,483 | 96 | +--------------------------+------------+ 97 | |Average visit rate | 0.04132 | 98 | +--------------------------+------------+ 99 | |Average conversion rate | 0.00229 | 100 | +--------------------------+------------+ 101 | 102 | More information about dataset you can find in 103 | the `official dataset description `_. 104 | 105 | Parameters 106 | ---------- 107 | data_home : str, optional (default=None) 108 | Specify another download and cache folder for the dataset. 109 | By default the dataset will be stored in the data folder in the same folder. 110 | download_if_missing : bool, optional (default=True) 111 | Download the dataset if it is not downloaded. 112 | 113 | Returns 114 | ------- 115 | dataset : dict object with the following attributes: 116 | 117 | dataset.description : str 118 | Description of the Criteo Uplift Prediction dataset. 119 | 120 | dataset.data : ndarray, shape (25309483, 11) 121 | Each row corresponding to the 11 feature values in order. 122 | 123 | dataset.feature_names : list, size 11 124 | List of feature names. 125 | 126 | dataset.treatment : ndarray, shape (25309483,) 127 | Each value corresponds to the treatment. 128 | 129 | dataset.target : numpy array of shape (25309483,) 130 | Each value corresponds to one of the outcomes. By default, it's `visit` outcome (look at `target_visit` below). 131 | 132 | dataset.target_visit : numpy array of shape (25309483,) 133 | Each value corresponds to whether a visit occurred for this user (binary, label). 134 | 135 | dataset.target_exposure : numpy array of shape (25309483,) 136 | Each value corresponds to treatment effect, whether the user has been effectively exposed (binary). 137 | 138 | dataset.target_conversion : numpy array of shape (25309483,) 139 | Each value corresponds to whether a conversion occurred for this user (binary, label). 140 | """ 141 | 142 | data_home, dataset_path = __get_data_home_dataset_file_paths(data_home) 143 | if not os.path.exists(dataset_path): 144 | if download_if_missing: 145 | download_criteo_uplift_prediction(data_home) 146 | else: 147 | raise FileNotFoundError( 148 | 'The dataset does not exist. ' 149 | 'Use `download_criteo_uplift_prediction` function to download the dataset.' 150 | ) 151 | 152 | df = pd.read_csv(dataset_path) 153 | description = 'This dataset is constructed by assembling data resulting from several incrementality tests, ' \ 154 | 'a particular randomized trial procedure where a random part of the population' \ 155 | 'is prevented from being targeted by advertising. It consists of 25M rows, ' \ 156 | 'each one representing a user with 11 features, a treatment indicator and ' \ 157 | '2 labels (visits and conversions).' 158 | 159 | drop_names = ['exposure', 'visit', 'conversion', 'treatment'] 160 | dataset = { 161 | 'description': description, 162 | 'data': df.drop(drop_names, axis=1).values, 163 | 'feature_names': np.array([name for name in df.columns if name not in drop_names]), 164 | 'treatment': df['treatment'].values, 165 | 'target': df['visit'].values, 166 | 'target_visit': df['visit'].values, 167 | 'target_exposure': df['exposure'].values, 168 | 'target_conversion': df['conversion'].values, 169 | } 170 | return dataset 171 | 172 | 173 | def __get_data_home_dataset_file_paths(data_home_path): 174 | if data_home_path is None: 175 | data_home_path = os.path.join(os.sep.join(__file__.split(os.sep)[:-1]), 'data') 176 | dataset_path = os.path.join(data_home_path, 'criteo_uplift_prediction.csv') 177 | return data_home_path, dataset_path 178 | 179 | 180 | -------------------------------------------------------------------------------- /pyuplift/datasets/loaders/lalonde_nsw.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import pandas as pd 4 | from sklearn.utils import shuffle 5 | 6 | 7 | column_names = ['treat', 'age', 'educ', 'black', 'hisp', 'married', 'nodegr', 're75', 're78'] 8 | column_types = { 9 | 'treat': 'int32', 10 | 'age': 'int32', 11 | 'educ': 'int32', 12 | 'black': 'int32', 13 | 'hisp': 'int32', 14 | 'married': 'int32', 15 | 'nodegr': 'int32', 16 | } 17 | 18 | 19 | def download_lalonde_nsw( 20 | data_home=None, 21 | control_data_url='https://users.nber.org/~rdehejia/data/nsw_control.txt', 22 | treated_data_url='https://users.nber.org/~rdehejia/data/nsw_treated.txt', 23 | separator=r'\s+', 24 | column_names=column_names, 25 | column_types=column_types, 26 | random_state=123 27 | ): 28 | """Downloading the Lalonde NSW dataset. 29 | 30 | **************** 31 | Data description 32 | **************** 33 | The dataset contains the treated and control units from the male sub-sample 34 | from the National Supported Work Demonstration as used by Lalonde in his paper. 35 | 36 | +--------------------------+------------+ 37 | | Features | 7 | 38 | +--------------------------+------------+ 39 | | Treatment | 2 | 40 | +--------------------------+------------+ 41 | | Samples total | 722 | 42 | +--------------------------+------------+ 43 | 44 | More information about dataset you can find `here `_. 45 | 46 | +-----------------+----------------------------------------------------------------------------------+ 47 | | **Parameters** | | **data_home: str** | 48 | | | | Specify another download and cache folder for the dataset. | 49 | | | | By default the dataset will be stored in the data folder in the same folder. | 50 | | | | **control_data_url: str** | 51 | | | | The URL to file with data of the control group. | 52 | | | | **treated_data_url: str** | 53 | | | | The URL to file with data of the treated group. | 54 | | | | **separator: str** | 55 | | | | The separator which used in the data files. | 56 | | | | **column_names: list** | 57 | | | | List of column names of the dataset. | 58 | | | | **column_types: dict** | 59 | | | | List of types for columns of the dataset. | 60 | | | | **random_state: int** | 61 | | | | The random seed. | 62 | +-----------------+----------------------------------------------------------------------------------+ 63 | | **Returns** | **None** | 64 | +-----------------+----------------------------------------------------------------------------------+ 65 | """ 66 | 67 | data_home, dataset_path = __get_data_home_dataset_file_paths(data_home) 68 | if not os.path.isdir(data_home): 69 | os.makedirs(data_home) 70 | 71 | if not os.path.exists(dataset_path): 72 | try: 73 | control_df = pd.read_csv( 74 | control_data_url, 75 | sep=separator, 76 | header=None, 77 | names=column_names, 78 | dtype=column_types 79 | ) 80 | except: 81 | raise Exception( 82 | 'The file with data of the control group not found. ' 83 | 'Check `control_data_url` value.' 84 | ) 85 | 86 | try: 87 | treated_df = pd.read_csv( 88 | treated_data_url, 89 | sep=separator, 90 | header=None, 91 | names=column_names, 92 | dtype=column_types 93 | ) 94 | except: 95 | raise Exception( 96 | 'The file with data of the treated group not found. ' 97 | 'Check `treated_data_url` value.' 98 | ) 99 | 100 | df = control_df.append(treated_df, ignore_index=True) 101 | df = shuffle(df, random_state=random_state) 102 | df.to_csv(dataset_path, index=False) 103 | 104 | 105 | def load_lalonde_nsw( 106 | data_home=None, 107 | download_if_missing=True 108 | ): 109 | """Loading the Lalonde NSW dataset from the local file. 110 | 111 | **************** 112 | Data description 113 | **************** 114 | The dataset contains the treated and control units from the male sub-sample 115 | from the National Supported Work Demonstration as used by Lalonde in his paper. 116 | 117 | +--------------------------+------------+ 118 | | Features | 7 | 119 | +--------------------------+------------+ 120 | | Treatment | 2 | 121 | +--------------------------+------------+ 122 | | Samples total | 722 | 123 | +--------------------------+------------+ 124 | 125 | More information about dataset you can find `here `_. 126 | 127 | Parameters 128 | ---------- 129 | data_home : str, optional (default=None) 130 | Specify another download and cache folder for the dataset. 131 | By default the dataset will be stored in the data folder in the same folder. 132 | download_if_missing : bool, optional (default=True) 133 | Download the dataset if it is not downloaded. 134 | 135 | Returns 136 | ------- 137 | dataset : dict object with the following attributes: 138 | 139 | dataset.description : str 140 | Description of the dataset. 141 | 142 | dataset.data : ndarray, shape (722, 7) 143 | Each row corresponding to the 7 feature values in order. 144 | 145 | dataset.feature_names : list, size 7 146 | List of feature names. 147 | 148 | dataset.treatment : ndarray, shape (722,) 149 | Each value corresponds to the treatment. 150 | 151 | dataset.target : numpy array of shape (722,) 152 | Each value corresponds to one of the outcomes. By default, it's `re78` outcome. 153 | """ 154 | 155 | data_home, dataset_path = __get_data_home_dataset_file_paths(data_home) 156 | if not os.path.exists(dataset_path): 157 | if download_if_missing: 158 | download_lalonde_nsw(data_home) 159 | else: 160 | raise FileNotFoundError( 161 | 'The dataset does not exist. ' 162 | 'Use `download_lalonde_nsw` function to download the dataset.' 163 | ) 164 | 165 | df = pd.read_csv(dataset_path) 166 | description = 'The dataset contains the treated and control units from the male sub-sample ' \ 167 | 'from the National Supported Work Demonstration as used by Lalonde in his paper.' 168 | 169 | drop_names = ['treat', 're78'] 170 | dataset = { 171 | 'description': description, 172 | 'data': df.drop(drop_names, axis=1).values, 173 | 'feature_names': np.array([name for name in df.columns if name not in drop_names]), 174 | 'treatment': df['treat'].values, 175 | 'target': df['re78'].values, 176 | } 177 | return dataset 178 | 179 | 180 | def __get_data_home_dataset_file_paths(data_home_path): 181 | if data_home_path is None: 182 | data_home_path = os.path.join(os.sep.join(__file__.split(os.sep)[:-1]), 'data') 183 | dataset_path = os.path.join(data_home_path, 'lalonde_nsw.csv') 184 | return data_home_path, dataset_path 185 | -------------------------------------------------------------------------------- /pyuplift/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | from .average_effect import get_average_effect 2 | -------------------------------------------------------------------------------- /pyuplift/metrics/average_effect.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from statistics import mean 3 | 4 | 5 | def get_average_effect(y_test, t_test, y_pred, test_share=0.3): 6 | """Estimating an average effect of the test set. 7 | 8 | +-----------------+----------------------------------------------------------------------------------+ 9 | | **Parameters:** | | **y_test**: numpy array | 10 | | | | Actual y values. | 11 | | | | **t_test**: numpy array | 12 | | | | Actual treatment values. | 13 | | | | **y_pred**: numpy array | 14 | | | | Predicted y values by uplift model. | 15 | | | | **test_share**: float | 16 | | | | Share of the test data which will be taken for estimating an average effect. | 17 | +-----------------+----------------------------------------------------------------------------------+ 18 | | **Returns:** | | **average effect**: float | 19 | | | | Average effect on the test set. | 20 | +-----------------+----------------------------------------------------------------------------------+ 21 | """ 22 | 23 | df = pd.DataFrame(data={ 24 | 'effect': y_pred, 25 | 'y': y_test, 26 | 't': t_test 27 | }) 28 | df = df.sort_values(by='effect', ascending=False) 29 | test_size = int(test_share * df.shape[0]) 30 | idx, s1, s0 = 0, [], [] 31 | for _, row in df.iterrows(): 32 | idx += 1 33 | if idx > test_size: 34 | break 35 | if row['t'] == 1: 36 | s1.append(row['y']) 37 | else: 38 | s0.append(row['y']) 39 | if len(s0) == 0: 40 | s0.append(0) 41 | if len(s1) == 0: 42 | s1.append(0) 43 | return mean(s1) - mean(s0) 44 | -------------------------------------------------------------------------------- /pyuplift/model_selection/__init__.py: -------------------------------------------------------------------------------- 1 | from .splitters.train_test_split import train_test_split 2 | 3 | from .model_validation.treatment_cross_validation import treatment_cross_val_score 4 | -------------------------------------------------------------------------------- /pyuplift/model_selection/model_validation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duketemon/pyuplift/33daa0768ff333387cb8223ebfaedaffa57de335/pyuplift/model_selection/model_validation/__init__.py -------------------------------------------------------------------------------- /pyuplift/model_selection/model_validation/treatment_cross_validation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from pyuplift.metrics import get_average_effect 4 | from pyuplift.model_selection import train_test_split 5 | 6 | 7 | def treatment_cross_val_score(X, y, t, model, cv=5, train_share=0.7, seeds=None): 8 | """Evaluate a scores by cross-validation. 9 | 10 | +------------------+-----------------------------------------------------------------------------------------+ 11 | | **Parameters** | | **X: numpy ndarray with shape = [n_samples, n_features]** | 12 | | | | Matrix of features. | 13 | | | | **y: numpy array with shape = [n_samples,]** | 14 | | | | Array of target of feature. | 15 | | | | **t: numpy array with shape = [n_samples,]** | 16 | | | | Array of treatments. | 17 | | | | **train_share: float, optional (default=0.7)** | 18 | | | | train_share represents the proportion of the dataset to include in the train split. | 19 | | | | **random_state: int, optional (default=777)** | 20 | | | | random_state is the seed used by the random number generator. | 21 | +------------------+-----------------------------------------------------------------------------------------+ 22 | | **Return** | | **scores: numpy array of floats** | 23 | | | | Array of scores of the estimator for each run of the cross validation. | 24 | +------------------+-----------------------------------------------------------------------------------------+ 25 | """ 26 | 27 | if seeds is None: 28 | seeds = [None for _ in range(cv)] 29 | 30 | if cv < 1: 31 | raise ValueError('Count of validations should be positive integer number.') 32 | elif cv != len(seeds): 33 | raise ValueError("The length of seed's array should be equals to cv.") 34 | elif not (0 < train_share <= 1): 35 | raise ValueError('Train share should be float number between 0 and 1.') 36 | 37 | scores = [] 38 | for seed in seeds: 39 | X_train, X_test, y_train, y_test, t_train, t_test = train_test_split(X, y, t, train_share, seed) 40 | model.fit(X_train, y_train, t_train) 41 | score = get_average_effect(y_test, t_test, model.predict(X_test)) 42 | scores.append(score) 43 | return np.array(scores) 44 | -------------------------------------------------------------------------------- /pyuplift/model_selection/splitters/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duketemon/pyuplift/33daa0768ff333387cb8223ebfaedaffa57de335/pyuplift/model_selection/splitters/__init__.py -------------------------------------------------------------------------------- /pyuplift/model_selection/splitters/train_test_split.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def train_test_split(X, y, t, train_share=0.7, random_state=None): 5 | """Split X, y, t into random train and test subsets. 6 | 7 | +------------------+-----------------------------------------------------------------------------------------+ 8 | | **Parameters** | | **X: numpy ndarray with shape = [n_samples, n_features]** | 9 | | | | Matrix of features. | 10 | | | | **y: numpy array with shape = [n_samples,]** | 11 | | | | Array of target of feature. | 12 | | | | **t: numpy array with shape = [n_samples,]** | 13 | | | | Array of treatments. | 14 | | | | **train_share: float, optional (default=0.7)** | 15 | | | | train_share represents the proportion of the dataset to include in the train split. | 16 | | | | **random_state: int, optional (default=None)** | 17 | | | | random_state is the seed used by the random number generator. | 18 | +------------------+-----------------------------------------------------------------------------------------+ 19 | | **Return** | | **X_train: numpy ndarray** | 20 | | | | Train matrix of features. | 21 | | | | **X_test: numpy ndarray** | 22 | | | | Test matrix of features. | 23 | | | | **y_train: numpy array** | 24 | | | | Train array of target of feature. | 25 | | | | **y_test: numpy array** | 26 | | | | Test array of target of feature. | 27 | | | | **t_train: numpy array** | 28 | | | | Train array of treatments. | 29 | | | | **t_test: numpy array** | 30 | | | | Test array of treatments. | 31 | +------------------+-----------------------------------------------------------------------------------------+ 32 | """ 33 | 34 | if not (0 < train_share <= 1): 35 | raise ValueError('Train share should be float number between 0 and 1.') 36 | 37 | random.seed(random_state) 38 | size = len(y) 39 | train_part_size = int(train_share * size) 40 | train_index = random.sample([i for i in range(size)], train_part_size) 41 | test_index = [i for i in range(size) if i not in train_index] 42 | 43 | X_train = X[train_index, :] 44 | X_test = X[test_index, :] 45 | 46 | y_train = y[train_index] 47 | y_test = y[test_index] 48 | 49 | t_train = t[train_index] 50 | t_test = t[test_index] 51 | return X_train, X_test, y_train, y_test, t_train, t_test 52 | -------------------------------------------------------------------------------- /pyuplift/transformation/__init__.py: -------------------------------------------------------------------------------- 1 | from .lai import Lai 2 | from .kane import Kane 3 | from .pessimistic import Pessimistic 4 | from .reflective import Reflective 5 | from .jaskowski import Jaskowski 6 | -------------------------------------------------------------------------------- /pyuplift/transformation/base.py: -------------------------------------------------------------------------------- 1 | from pyuplift import BaseModel 2 | 3 | 4 | class TransformationBaseModel(BaseModel): 5 | """Base class for a transformation uplift models. 6 | 7 | Note: This class should not be used directly. Use derived classes instead. 8 | """ 9 | 10 | def is_tr(self, y, t): 11 | """Is pair (y,t) is TR? 12 | Treatment responders (TR) are customers who were treated and responded 13 | 14 | Parameters 15 | ---------- 16 | y : float 17 | The target value. 18 | t : float 19 | The treatment value. 20 | Returns 21 | ------- 22 | is_tr : bool 23 | """ 24 | return t != 0 and y != 0 25 | 26 | def is_cn(self, y, t): 27 | """Is pair (y,t) is CN? 28 | Control nonresponders (CN) are the customers who did not receive a treatment and did not respond. 29 | 30 | Parameters 31 | ---------- 32 | y : float 33 | The target value. 34 | t : float 35 | The treatment value. 36 | Returns 37 | ------- 38 | is_tr : bool 39 | """ 40 | return t == 0 and y == 0 41 | 42 | def is_tn(self, y, t): 43 | """Is pair (y,t) is TN? 44 | Treatment nonresponders (TN) are customers who received a treatment but did not respond. 45 | 46 | Parameters 47 | ---------- 48 | y : float 49 | The target value. 50 | t : float 51 | The treatment value. 52 | Returns 53 | ------- 54 | is_tr : bool 55 | """ 56 | return t != 0 and y == 0 57 | 58 | def is_cr(self, y, t): 59 | """Is pair (y,t) is CR? 60 | Control responders (CR) are the customers who responded without having received a treatment. 61 | 62 | Parameters 63 | ---------- 64 | y : float 65 | The target value. 66 | t : float 67 | The treatment value. 68 | Returns 69 | ------- 70 | is_tr : bool 71 | """ 72 | return t == 0 and y != 0 73 | -------------------------------------------------------------------------------- /pyuplift/transformation/jaskowski.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.linear_model import LogisticRegression 3 | 4 | from .base import TransformationBaseModel 5 | 6 | 7 | class Jaskowski(TransformationBaseModel): 8 | """The class which implements the Jaskowski's approach. 9 | 10 | +----------------+-----------------------------------------------------------------------------------+ 11 | | **Parameters** | | **model : object, optional (default=sklearn.linear_model.LogisticRegression)** | 12 | | | | The classification model which will be used for predict uplift. | 13 | +----------------+-----------------------------------------------------------------------------------+ 14 | 15 | 16 | ******* 17 | Methods 18 | ******* 19 | +-----------------------------------------------+----------------------------------------------------+ 20 | | :ref:`fit(self, X, y, t) ` | Build the model from the training set (X, y, t). | 21 | +-----------------------------------------------+----------------------------------------------------+ 22 | | :ref:`predict(self, X, t=None) `| Predict an uplift for X. | 23 | +-----------------------------------------------+----------------------------------------------------+ 24 | """ 25 | 26 | def __init__(self, model=LogisticRegression(n_jobs=-1)): 27 | try: 28 | model.__getattribute__('fit') 29 | model.__getattribute__('predict') 30 | except AttributeError: 31 | raise ValueError('Model should contains two methods: fit and predict.') 32 | self.model = model 33 | 34 | def fit(self, X, y, t): 35 | """Build the model from the training set (X, y, t). 36 | 37 | +------------------+---------------------------------------------------------------------------------+ 38 | | **Parameters** | | **X: numpy ndarray with shape = [n_samples, n_features]** | 39 | | | | Matrix of features. | 40 | | | | **y: numpy array with shape = [n_samples,]** | 41 | | | | Array of target of feature. | 42 | | | | **t: numpy array with shape = [n_samples,]** | 43 | | | | Array of treatments. | 44 | +------------------+---------------------------------------------------------------------------------+ 45 | | **Returns** | **self : object** | 46 | +------------------+---------------------------------------------------------------------------------+ 47 | """ 48 | 49 | y_encoded = self.__encode_data(y, t) 50 | self.model.fit(X, y_encoded) 51 | return self 52 | 53 | def predict(self, X, t=None): 54 | """Predict an uplift for X. 55 | 56 | +------------------+---------------------------------------------------------------------------------+ 57 | | **Parameters** | | **X: numpy ndarray with shape = [n_samples, n_features]** | 58 | | | | Matrix of features. | 59 | | | | **t: numpy array with shape = [n_samples,] or None** | 60 | | | | Array of treatments. | 61 | +------------------+---------------------------------------------------------------------------------+ 62 | | **Returns** | | **self : object** | 63 | | | | The predicted values. | 64 | +------------------+---------------------------------------------------------------------------------+ 65 | """ 66 | 67 | p = self.model.predict_proba(X)[:, 1] 68 | return 2 * p - 1 69 | 70 | def __encode_data(self, y, t): 71 | y_values = [] 72 | for i in range(y.shape[0]): 73 | if self.is_tr(y[i], t[i]) or self.is_cn(y[i], t[i]): 74 | y_values.append(1) 75 | else: 76 | y_values.append(0) 77 | return np.array(y_values) 78 | -------------------------------------------------------------------------------- /pyuplift/transformation/kane.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.linear_model import LogisticRegression 3 | 4 | from .base import TransformationBaseModel 5 | 6 | 7 | class Kane(TransformationBaseModel): 8 | """The class which implements the Kane's approach. 9 | 10 | +----------------+-----------------------------------------------------------------------------------+ 11 | | **Parameters** | | **model : object, optional (default=sklearn.linear_model.LogisticRegression)** | 12 | | | | The classification model which will be used for predict uplift. | 13 | | | | **use_weights : boolean, optional (default=False)** | 14 | | | | Use or not weights? | 15 | +----------------+-----------------------------------------------------------------------------------+ 16 | 17 | 18 | ******* 19 | Methods 20 | ******* 21 | +-----------------------------------------------+----------------------------------------------------+ 22 | | :ref:`fit(self, X, y, t) ` | Build the model from the training set (X, y, t). | 23 | +-----------------------------------------------+----------------------------------------------------+ 24 | | :ref:`predict(self, X, t=None) ` | Predict an uplift for X. | 25 | +-----------------------------------------------+----------------------------------------------------+ 26 | """ 27 | 28 | def __init__(self, model=LogisticRegression(n_jobs=-1), use_weights=False): 29 | try: 30 | model.__getattribute__('fit') 31 | model.__getattribute__('predict') 32 | except AttributeError: 33 | raise ValueError('Model should contains two methods: fit and predict.') 34 | self.model = model 35 | self.use_weights = use_weights 36 | 37 | def fit(self, X, y, t): 38 | """Build the model from the training set (X, y, t). 39 | 40 | +------------------+---------------------------------------------------------------------------------+ 41 | | **Parameters** | | **X: numpy ndarray with shape = [n_samples, n_features]** | 42 | | | | Matrix of features. | 43 | | | | **y: numpy array with shape = [n_samples,]** | 44 | | | | Array of target of feature. | 45 | | | | **t: numpy array with shape = [n_samples,]** | 46 | | | | Array of treatments. | 47 | +------------------+---------------------------------------------------------------------------------+ 48 | | **Returns** | **self : object** | 49 | +------------------+---------------------------------------------------------------------------------+ 50 | """ 51 | 52 | y_encoded = self.__encode_data(y, t) 53 | self.model.fit(X, y_encoded) 54 | if self.use_weights: 55 | self.__init_weights(t) 56 | return self 57 | 58 | def predict(self, X, t=None): 59 | """Predict an uplift for X. 60 | 61 | +------------------+---------------------------------------------------------------------------------+ 62 | | **Parameters** | | **X: numpy ndarray with shape = [n_samples, n_features]** | 63 | | | | Matrix of features. | 64 | | | | **t: numpy array with shape = [n_samples,] or None** | 65 | | | | Array of treatments. | 66 | +------------------+---------------------------------------------------------------------------------+ 67 | | **Returns** | | **self : object** | 68 | | | | The predicted values. | 69 | +------------------+---------------------------------------------------------------------------------+ 70 | """ 71 | 72 | p_tr = self.model.predict_proba(X)[:, 0] 73 | p_cn = self.model.predict_proba(X)[:, 1] 74 | p_tn = self.model.predict_proba(X)[:, 2] 75 | p_cr = self.model.predict_proba(X)[:, 3] 76 | if self.use_weights: 77 | return (p_tr / self.treatment_count + p_cn / self.control_count) - \ 78 | (p_tn / self.treatment_count + p_cr / self.control_count) 79 | else: 80 | return (p_tr + p_cn) - (p_tn + p_cr) 81 | 82 | def __encode_data(self, y, t): 83 | y_values = [] 84 | for i in range(y.shape[0]): 85 | if self.is_tr(y[i], t[i]): 86 | y_values.append(0) 87 | elif self.is_cn(y[i], t[i]): 88 | y_values.append(1) 89 | elif self.is_tn(y[i], t[i]): 90 | y_values.append(2) 91 | elif self.is_cr(y[i], t[i]): 92 | y_values.append(3) 93 | return np.array(y_values) 94 | 95 | def __init_weights(self, t): 96 | control_count, treatment_count = 0, 0 97 | for el in t: 98 | if el == 0.0: 99 | control_count += 1 100 | else: 101 | treatment_count += 1 102 | self.control_count = control_count 103 | self.treatment_count = treatment_count 104 | -------------------------------------------------------------------------------- /pyuplift/transformation/lai.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.linear_model import LogisticRegression 3 | 4 | from .base import TransformationBaseModel 5 | 6 | 7 | class Lai(TransformationBaseModel): 8 | """The class which implements the Lai's approach. 9 | 10 | +----------------+-----------------------------------------------------------------------------------+ 11 | | **Parameters** | | **model : object, optional (default=sklearn.linear_model.LogisticRegression)** | 12 | | | | The classification model which will be used for predict uplift. | 13 | | | | **use_weights : boolean, optional (default=False)** | 14 | | | | Use or not weights? | 15 | +----------------+-----------------------------------------------------------------------------------+ 16 | 17 | 18 | ******* 19 | Methods 20 | ******* 21 | +-----------------------------------------------+----------------------------------------------------+ 22 | | :ref:`fit(self, X, y, t) ` | Build a Lai model from the training set (X, y, t). | 23 | +-----------------------------------------------+----------------------------------------------------+ 24 | | :ref:`predict(self, X, t=None) ` | Predict an uplift for X. | 25 | +-----------------------------------------------+----------------------------------------------------+ 26 | """ 27 | 28 | def __init__(self, model=LogisticRegression(n_jobs=-1), use_weights=False): 29 | try: 30 | model.__getattribute__('fit') 31 | model.__getattribute__('predict') 32 | except AttributeError: 33 | raise ValueError('Model should contains two methods: fit and predict.') 34 | self.model = model 35 | self.use_weights = use_weights 36 | 37 | def fit(self, X, y, t): 38 | """Build a Lai model from the training set (X, y, t). 39 | 40 | +------------------+---------------------------------------------------------------------------------+ 41 | | **Parameters** | | **X: numpy ndarray with shape = [n_samples, n_features]** | 42 | | | | Matrix of features. | 43 | | | | **y: numpy array with shape = [n_samples,]** | 44 | | | | Array of target of feature. | 45 | | | | **t: numpy array with shape = [n_samples,]** | 46 | | | | Array of treatments. | 47 | +------------------+---------------------------------------------------------------------------------+ 48 | | **Returns** | **self : object** | 49 | +------------------+---------------------------------------------------------------------------------+ 50 | """ 51 | 52 | y_encoded = self.__encode_data(y, t) 53 | if self.use_weights: 54 | self.__init_weights(y, t) 55 | self.model.fit(X, y_encoded) 56 | return self 57 | 58 | def predict(self, X, t=None): 59 | """Predict an uplift for X. 60 | 61 | +------------------+---------------------------------------------------------------------------------+ 62 | | **Parameters** | | **X: numpy ndarray with shape = [n_samples, n_features]** | 63 | | | | Matrix of features. | 64 | | | | **t: numpy array with shape = [n_samples,] or None** | 65 | | | | Array of treatments. | 66 | +------------------+---------------------------------------------------------------------------------+ 67 | | **Returns** | | **self : object** | 68 | | | | The predicted values. | 69 | +------------------+---------------------------------------------------------------------------------+ 70 | """ 71 | 72 | p_tr_cn = self.model.predict_proba(X)[:, 1] 73 | if self.use_weights: 74 | p_tn_cr = self.model.predict_proba(X)[:, 0] 75 | return p_tr_cn * self.p_tr_or_cn - p_tn_cr * self.p_tn_or_cr 76 | else: 77 | return 2 * p_tr_cn - 1 78 | 79 | def __encode_data(self, y, t): 80 | y_values = [] 81 | for i in range(y.shape[0]): 82 | if self.is_tr(y[i], t[i]) or self.is_cn(y[i], t[i]): 83 | y_values.append(1) 84 | elif self.is_tn(y[i], t[i]) or self.is_cr(y[i], t[i]): 85 | y_values.append(0) 86 | return np.array(y_values) 87 | 88 | def __init_weights(self, y, t): 89 | pos_count, neg_count = 0, 0 90 | for i in range(y.shape[0]): 91 | if self.is_tr(y[i], t[i]) or self.is_cn(y[i], t[i]): 92 | pos_count += 1 93 | elif self.is_tn(y[i], t[i]) or self.is_cr(y[i], t[i]): 94 | neg_count += 1 95 | 96 | self.p_tr_or_cn = pos_count / (pos_count + neg_count) 97 | self.p_tn_or_cr = neg_count / (pos_count + neg_count) 98 | -------------------------------------------------------------------------------- /pyuplift/transformation/pessimistic.py: -------------------------------------------------------------------------------- 1 | from sklearn.linear_model import LogisticRegression 2 | 3 | from .base import TransformationBaseModel 4 | from .lai import Lai 5 | from .reflective import Reflective 6 | 7 | 8 | class Pessimistic(TransformationBaseModel): 9 | """The class which implements the pessimistic approach. 10 | 11 | +----------------+-----------------------------------------------------------------------------------+ 12 | | **Parameters** | | **model : object, optional (default=sklearn.linear_model.LogisticRegression)** | 13 | | | | The classification model which will be used for predict uplift. | 14 | +----------------+-----------------------------------------------------------------------------------+ 15 | 16 | 17 | ******* 18 | Methods 19 | ******* 20 | +-----------------------------------------------+----------------------------------------------------+ 21 | | :ref:`fit(self, X, y, t) ` | Build the model from the training set (X, y, t). | 22 | +-----------------------------------------------+----------------------------------------------------+ 23 | | :ref:`predict(self, X, t=None) ` | Predict an uplift for X. | 24 | +-----------------------------------------------+----------------------------------------------------+ 25 | """ 26 | 27 | def __init__(self, model=LogisticRegression(n_jobs=-1)): 28 | try: 29 | model.__getattribute__('fit') 30 | model.__getattribute__('predict') 31 | except AttributeError: 32 | raise ValueError('Model should contains two methods: fit and predict.') 33 | self.w_lai_model = Lai(model, use_weights=True) 34 | self.reflective_model = Reflective(model) 35 | 36 | def fit(self, X, y, t): 37 | """Build the model from the training set (X, y, t). 38 | 39 | +------------------+---------------------------------------------------------------------------------+ 40 | | **Parameters** | | **X: numpy ndarray with shape = [n_samples, n_features]** | 41 | | | | Matrix of features. | 42 | | | | **y: numpy array with shape = [n_samples,]** | 43 | | | | Array of target of feature. | 44 | | | | **t: numpy array with shape = [n_samples,]** | 45 | | | | Array of treatments. | 46 | +------------------+---------------------------------------------------------------------------------+ 47 | | **Returns** | **self : object** | 48 | +------------------+---------------------------------------------------------------------------------+ 49 | """ 50 | 51 | self.w_lai_model.fit(X, y, t) 52 | self.reflective_model.fit(X, y, t) 53 | return self 54 | 55 | def predict(self, X, t=None): 56 | """Predict an uplift for X. 57 | 58 | +------------------+---------------------------------------------------------------------------------+ 59 | | **Parameters** | | **X: numpy ndarray with shape = [n_samples, n_features]** | 60 | | | | Matrix of features. | 61 | | | | **t: numpy array with shape = [n_samples,] or None** | 62 | | | | Array of treatments. | 63 | +------------------+---------------------------------------------------------------------------------+ 64 | | **Returns** | | **self : object** | 65 | | | | The predicted values. | 66 | +------------------+---------------------------------------------------------------------------------+ 67 | """ 68 | 69 | w_lai_uplift = self.w_lai_model.predict(X) 70 | reflective_uplift = self.reflective_model.predict(X) 71 | return (w_lai_uplift + reflective_uplift) / 2 72 | -------------------------------------------------------------------------------- /pyuplift/transformation/reflective.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.linear_model import LogisticRegression 3 | 4 | from .base import TransformationBaseModel 5 | 6 | 7 | class Reflective(TransformationBaseModel): 8 | """The class which implements the reflective approach. 9 | 10 | +----------------+-----------------------------------------------------------------------------------+ 11 | | **Parameters** | | **model : object, optional (default=sklearn.linear_model.LogisticRegression)** | 12 | | | | The classification model which will be used for predict uplift. | 13 | +----------------+-----------------------------------------------------------------------------------+ 14 | 15 | 16 | ******* 17 | Methods 18 | ******* 19 | +-----------------------------------------------+----------------------------------------------------+ 20 | | :ref:`fit(self, X, y, t) ` | Build the model from the training set (X, y, t). | 21 | +-----------------------------------------------+----------------------------------------------------+ 22 | | :ref:`predict(self, X, t=None) ` | Predict an uplift for X. | 23 | +-----------------------------------------------+----------------------------------------------------+ 24 | """ 25 | 26 | def __init__(self, model=LogisticRegression(n_jobs=-1)): 27 | try: 28 | model.__getattribute__('fit') 29 | model.__getattribute__('predict') 30 | except AttributeError: 31 | raise ValueError('Model should contains two methods: fit and predict.') 32 | self.model = model 33 | 34 | def fit(self, X, y, t): 35 | """Build the model from the training set (X, y, t). 36 | 37 | +------------------+---------------------------------------------------------------------------------+ 38 | | **Parameters** | | **X: numpy ndarray with shape = [n_samples, n_features]** | 39 | | | | Matrix of features. | 40 | | | | **y: numpy array with shape = [n_samples,]** | 41 | | | | Array of target of feature. | 42 | | | | **t: numpy array with shape = [n_samples,]** | 43 | | | | Array of treatments. | 44 | +------------------+---------------------------------------------------------------------------------+ 45 | | **Returns** | **self : object** | 46 | +------------------+---------------------------------------------------------------------------------+ 47 | """ 48 | 49 | y_encoded = self.__encode_data(y, t) 50 | self.model.fit(X, y_encoded) 51 | self.__init_weights(y, t) 52 | return self 53 | 54 | def predict(self, X, t=None): 55 | """Predict an uplift for X. 56 | 57 | +------------------+---------------------------------------------------------------------------------+ 58 | | **Parameters** | | **X: numpy ndarray with shape = [n_samples, n_features]** | 59 | | | | Matrix of features. | 60 | | | | **t: numpy array with shape = [n_samples,] or None** | 61 | | | | Array of treatments. | 62 | +------------------+---------------------------------------------------------------------------------+ 63 | | **Returns** | | **self : object** | 64 | | | | The predicted values. | 65 | +------------------+---------------------------------------------------------------------------------+ 66 | """ 67 | 68 | p_tr = self.model.predict_proba(X)[:, 0] 69 | p_cn = self.model.predict_proba(X)[:, 1] 70 | p_tn = self.model.predict_proba(X)[:, 2] 71 | p_cr = self.model.predict_proba(X)[:, 3] 72 | 73 | p_pos = self.p_tlr * p_tr + self.p_cln * p_cn 74 | p_neg = self.p_tln * p_tn + self.p_clr * p_cr 75 | return p_pos - p_neg 76 | 77 | def __encode_data(self, y, t): 78 | y_values = [] 79 | for i in range(y.shape[0]): 80 | if self.is_tr(y[i], t[i]): 81 | y_values.append(0) 82 | elif self.is_cn(y[i], t[i]): 83 | y_values.append(1) 84 | elif self.is_tn(y[i], t[i]): 85 | y_values.append(2) 86 | elif self.is_cr(y[i], t[i]): 87 | y_values.append(3) 88 | return np.array(y_values) 89 | 90 | def __init_weights(self, y, t): 91 | t_r, c_r, t_n, c_n = 0, 0, 0, 0 92 | r_count, n_count = 0, 0 93 | size = y.shape[0] 94 | for i in range(size): 95 | if y[i] != 0: 96 | r_count += 1 97 | if t[i] != 0: 98 | # T|R 99 | t_r += 1 100 | else: 101 | # C|R 102 | c_r += 1 103 | else: 104 | n_count += 1 105 | if t[i] != 0: 106 | # T|N 107 | t_n += 1 108 | else: 109 | # C|N 110 | c_n += 1 111 | 112 | self.p_tlr = t_r / r_count 113 | self.p_clr = c_r / r_count 114 | self.p_cln = c_n / n_count 115 | self.p_tln = t_n / n_count 116 | -------------------------------------------------------------------------------- /pyuplift/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .downloader import download_file 2 | from .retriever import retrieve_from_gz 3 | -------------------------------------------------------------------------------- /pyuplift/utils/downloader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | 4 | 5 | def download_file(url: str, output_path: str): 6 | """Download file from `url` to `output_path`. 7 | 8 | +-----------------+--------------------------------------+ 9 | | **Parameters** | | **url: string** | 10 | | | | Data's URL. | 11 | | | | **output_path: string** | 12 | | | | Path where file will be saved. | 13 | +-----------------+--------------------------------------+ 14 | | **Returns** | **None** | 15 | +-----------------+--------------------------------------+ 16 | """ 17 | 18 | if os.path.isfile(output_path): 19 | os.remove(output_path) 20 | 21 | print("Downloading file to '{}'...".format(output_path)) 22 | response = requests.get(url) 23 | # Check if the response is ok (200) 24 | status_code = int(response.status_code) 25 | if status_code == 200: 26 | with open(output_path, 'wb') as file: 27 | # A chunk of 128 bytes 28 | for chunk in response: 29 | file.write(chunk) 30 | elif status_code == 404: 31 | raise Exception('Wrong URL (' + url + ').') 32 | -------------------------------------------------------------------------------- /pyuplift/utils/retriever.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import shutil 3 | 4 | 5 | def retrieve_from_gz(archive_path: str, output_path: str): 6 | """The retrieving gz-archived data from `archive_path` to `output_path`. 7 | 8 | +-----------------+--------------------------------------+ 9 | | **Parameters** | | **archive_path: string** | 10 | | | | The archive path. | 11 | | | | **output_path: string** | 12 | | | | The retrieved data path. | 13 | +-----------------+--------------------------------------+ 14 | | **Returns** | **None** | 15 | +-----------------+--------------------------------------+ 16 | """ 17 | 18 | with gzip.open(archive_path, 'rb') as f_in: 19 | with open(output_path, 'wb') as f_out: 20 | shutil.copyfileobj(f_in, f_out) 21 | -------------------------------------------------------------------------------- /pyuplift/variable_selection/__init__.py: -------------------------------------------------------------------------------- 1 | from .cadit import Cadit 2 | from .dummy import Dummy 3 | from .two_model import TwoModel 4 | from .econometric import Econometric 5 | -------------------------------------------------------------------------------- /pyuplift/variable_selection/cadit.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.linear_model import LinearRegression 3 | from pyuplift import BaseModel 4 | 5 | 6 | class Cadit(BaseModel): 7 | """The class which implements the cadit approach [1]. 8 | 9 | +----------------+-----------------------------------------------------------------------------------+ 10 | | **Parameters** | | **model : object, optional (default=sklearn.linear_model.LinearRegression)** | 11 | | | | The regression model which will be used for predict uplift. | 12 | +----------------+-----------------------------------------------------------------------------------+ 13 | 14 | 15 | ******* 16 | Methods 17 | ******* 18 | +-------------------------------------------------+--------------------------------------------------+ 19 | | :ref:`fit(self, X, y, t) ` | Build a model from the training set (X, y, t). | 20 | +-------------------------------------------------+--------------------------------------------------+ 21 | | :ref:`predict(self, X, t=None) ` | Predict an uplift for X. | 22 | +-------------------------------------------------+--------------------------------------------------+ 23 | """ 24 | 25 | def __init__(self, model=LinearRegression()): 26 | try: 27 | model.__getattribute__('fit') 28 | model.__getattribute__('predict') 29 | except AttributeError: 30 | raise ValueError('Model should contains two methods: fit and predict.') 31 | self.model = model 32 | 33 | def fit(self, X, y, t): 34 | """Build a model from the training set (X, y, t). 35 | 36 | +------------------+---------------------------------------------------------------------------------+ 37 | | **Parameters** | | **X: numpy ndarray with shape = [n_samples, n_features]** | 38 | | | | Matrix of features. | 39 | | | | **y: numpy array with shape = [n_samples,]** | 40 | | | | Array of target of feature. | 41 | | | | **t: numpy array with shape = [n_samples,]** | 42 | | | | Array of treatments. | 43 | +------------------+---------------------------------------------------------------------------------+ 44 | | **Returns** | **self : object** | 45 | +------------------+---------------------------------------------------------------------------------+ 46 | """ 47 | 48 | z = self.__get_z_values(y, t) 49 | self.model.fit(X, z) 50 | return self 51 | 52 | def predict(self, X, t=None): 53 | """Predict an uplift for X. 54 | 55 | +------------------+---------------------------------------------------------------------------------+ 56 | | **Parameters** | | **X: numpy ndarray with shape = [n_samples, n_features]** | 57 | | | | Matrix of features. | 58 | | | | **t: numpy array with shape = [n_samples,] or None** | 59 | | | | Array of treatments. | 60 | +------------------+---------------------------------------------------------------------------------+ 61 | | **Returns** | | **self : object** | 62 | | | | The predicted values. | 63 | +------------------+---------------------------------------------------------------------------------+ 64 | """ 65 | 66 | return self.model.predict(X) 67 | 68 | def __get_z_values(self, y, t): 69 | p_t0 = t[t == 0].shape[0] / t.shape[0] 70 | p_t1 = 1 - p_t0 71 | y_mean = y.mean() 72 | z = [] 73 | for i in range(y.shape[0]): 74 | if t[i] == 0: 75 | val = (1/p_t1) * (y[i] - y_mean) 76 | else: 77 | val = - (1/p_t0) * (y[i] - y_mean) 78 | z.append(val) 79 | return np.array(z) 80 | -------------------------------------------------------------------------------- /pyuplift/variable_selection/dummy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.linear_model import LinearRegression 3 | from pyuplift import BaseModel 4 | 5 | 6 | class Dummy(BaseModel): 7 | """The class which implements the dummy approach. 8 | 9 | +----------------+-----------------------------------------------------------------------------------+ 10 | | **Parameters** | | **model : object, optional (default=sklearn.linear_model.LinearRegression)** | 11 | | | | The regression model which will be used for predict uplift. | 12 | +----------------+-----------------------------------------------------------------------------------+ 13 | 14 | 15 | ******* 16 | Methods 17 | ******* 18 | +-------------------------------------------------+-----------------------------------------------------+ 19 | | :ref:`fit(self, X, y, t) ` | Build a dummy model from the training set (X, y, t).| 20 | +-------------------------------------------------+-----------------------------------------------------+ 21 | | :ref:`predict(self, X, t=None) ` | Predict an uplift for X. | 22 | +-------------------------------------------------+-----------------------------------------------------+ 23 | """ 24 | 25 | def __init__(self, model=LinearRegression()): 26 | try: 27 | model.__getattribute__('fit') 28 | model.__getattribute__('predict') 29 | except AttributeError: 30 | raise ValueError('Model should contains two methods: fit and predict.') 31 | self.model = model 32 | 33 | def fit(self, X, y, t): 34 | """Build a dummy model from the training set (X, y, t). 35 | 36 | +------------------+---------------------------------------------------------------------------------+ 37 | | **Parameters** | | **X: numpy ndarray with shape = [n_samples, n_features]** | 38 | | | | Matrix of features. | 39 | | | | **y: numpy array with shape = [n_samples,]** | 40 | | | | Array of target of feature. | 41 | | | | **t: numpy array with shape = [n_samples,]** | 42 | | | | Array of treatments. | 43 | +------------------+---------------------------------------------------------------------------------+ 44 | | **Returns** | **self : object** | 45 | +------------------+---------------------------------------------------------------------------------+ 46 | """ 47 | 48 | x_train = np.append(X, t.reshape((-1, 1)), axis=1) 49 | self.model.fit(x_train, y) 50 | return self 51 | 52 | def predict(self, X, t=None): 53 | """Predict an uplift for X. 54 | 55 | +------------------+---------------------------------------------------------------------------------+ 56 | | **Parameters** | | **X: numpy ndarray with shape = [n_samples, n_features]** | 57 | | | | Matrix of features. | 58 | | | | **t: numpy array with shape = [n_samples,] or None** | 59 | | | | Array of treatments. | 60 | +------------------+---------------------------------------------------------------------------------+ 61 | | **Returns** | | **self : object** | 62 | | | | The predicted values. | 63 | +------------------+---------------------------------------------------------------------------------+ 64 | """ 65 | 66 | col = np.array(X.shape[0] * [0]) 67 | x_test = np.append(X, col.reshape((-1, 1)), axis=1) 68 | # All treatment values == 0 69 | s0 = self.model.predict(x_test) 70 | x_test[:, -1] = 1 71 | # All treatment values == 1 72 | s1 = self.model.predict(x_test) 73 | return s1 - s0 74 | -------------------------------------------------------------------------------- /pyuplift/variable_selection/econometric.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.linear_model import LinearRegression 3 | from pyuplift import BaseModel 4 | 5 | 6 | class Econometric(BaseModel): 7 | """The class which implements the econometric approach. 8 | 9 | +----------------+-----------------------------------------------------------------------------------+ 10 | | **Parameters** | | **model : object, optional (default=sklearn.linear_model.LinearRegression)** | 11 | | | | The regression model which will be used for predict uplift. | 12 | +----------------+-----------------------------------------------------------------------------------+ 13 | 14 | 15 | ******* 16 | Methods 17 | ******* 18 | +-----------------------------------------------+------------------------------------------------------------+ 19 | | :ref:`fit(self, X, y, t) ` | Build an econometric model from the training set (X, y, t).| 20 | +-----------------------------------------------+------------------------------------------------------------+ 21 | | :ref:`predict(self, X, t=None) ` | Predict an uplift for X. | 22 | +-----------------------------------------------+------------------------------------------------------------+ 23 | """ 24 | 25 | def __init__(self, model=LinearRegression()): 26 | try: 27 | model.__getattribute__('fit') 28 | model.__getattribute__('predict') 29 | except AttributeError: 30 | raise ValueError('Model should contains two methods: fit and predict.') 31 | self.model = model 32 | 33 | def fit(self, X, y, t): 34 | """Build an econometric model from the training set (X, y, t). 35 | 36 | +------------------+---------------------------------------------------------------------------------+ 37 | | **Parameters** | | **X: numpy ndarray with shape = [n_samples, n_features]** | 38 | | | | Matrix of features. | 39 | | | | **y: numpy array with shape = [n_samples,]** | 40 | | | | Array of target of feature. | 41 | | | | **t: numpy array with shape = [n_samples,]** | 42 | | | | Array of treatments. | 43 | +------------------+---------------------------------------------------------------------------------+ 44 | | **Returns** | **self : object** | 45 | +------------------+---------------------------------------------------------------------------------+ 46 | """ 47 | 48 | x_train = self.__get_matrix(X, t) 49 | self.model.fit(x_train, y) 50 | return self 51 | 52 | def predict(self, X, t=None): 53 | """Predict an uplift for X. 54 | 55 | +------------------+---------------------------------------------------------------------------------+ 56 | | **Parameters** | | **X: numpy ndarray with shape = [n_samples, n_features]** | 57 | | | | Matrix of features. | 58 | | | | **t: numpy array with shape = [n_samples,] or None** | 59 | | | | Array of treatments. | 60 | +------------------+---------------------------------------------------------------------------------+ 61 | | **Returns** | | **self : object** | 62 | | | | The predicted values. | 63 | +------------------+---------------------------------------------------------------------------------+ 64 | """ 65 | 66 | x_test = self.__get_matrix(X, np.array(X.shape[0] * [0])) 67 | v0 = self.model.predict(x_test) 68 | x_test = self.__get_matrix(X, np.array(X.shape[0] * [1])) 69 | v1 = self.model.predict(x_test) 70 | return v1 - v0 71 | 72 | def __get_matrix(self, X, t): 73 | """Create X|T|X*T matrix""" 74 | 75 | x_t = np.append(X, t.reshape((-1, 1)), axis=1) 76 | xt = X * t.reshape((-1, 1)) 77 | return np.append(x_t, xt, axis=1) 78 | -------------------------------------------------------------------------------- /pyuplift/variable_selection/two_model.py: -------------------------------------------------------------------------------- 1 | from sklearn.linear_model import LinearRegression 2 | from pyuplift import BaseModel 3 | 4 | 5 | class TwoModel(BaseModel): 6 | """The class which implements the two model approach. 7 | 8 | +----------------+---------------------------------------------------------------------------------------------+ 9 | | **Parameters** | | **no_treatment_model : object, optional (default=sklearn.linear_model.LinearRegression)** | 10 | | | | The regression model which will be used for predict uplift. | 11 | | | | **has_treatment_model : object, optional (default=sklearn.linear_model.LinearRegression)**| 12 | | | | The regression model which will be used for predict uplift. | 13 | +----------------+---------------------------------------------------------------------------------------------+ 14 | 15 | ******* 16 | Methods 17 | ******* 18 | +-----------------------------------------------+--------------------------------------------------------------+ 19 | | :ref:`fit(self, X, y, t) ` | Build a two model model from the training set (X, y, t). | 20 | +-----------------------------------------------+--------------------------------------------------------------+ 21 | | :ref:`predict(self, X, t=None) ` | Predict an uplift for X. | 22 | +-----------------------------------------------+--------------------------------------------------------------+ 23 | """ 24 | 25 | def __init__(self, no_treatment_model=LinearRegression(), has_treatment_model=LinearRegression()): 26 | try: 27 | no_treatment_model.__getattribute__('fit') 28 | no_treatment_model.__getattribute__('predict') 29 | except AttributeError: 30 | raise ValueError('No treatment model should contains two methods: fit and predict.') 31 | 32 | try: 33 | has_treatment_model.__getattribute__('fit') 34 | has_treatment_model.__getattribute__('predict') 35 | except AttributeError: 36 | raise ValueError('Has treatment model should contains two methods: fit and predict.') 37 | 38 | self.no_treatment_model = no_treatment_model 39 | self.has_treatment_model = has_treatment_model 40 | 41 | def fit(self, X, y, t): 42 | """Build a model model model from the training set (X, y, t). 43 | 44 | +------------------+---------------------------------------------------------------------------------+ 45 | | **Parameters** | | **X: numpy ndarray with shape = [n_samples, n_features]** | 46 | | | | Matrix of features. | 47 | | | | **y: numpy array with shape = [n_samples,]** | 48 | | | | Array of target of feature. | 49 | | | | **t: numpy array with shape = [n_samples,]** | 50 | | | | Array of treatments. | 51 | +------------------+---------------------------------------------------------------------------------+ 52 | | **Returns** | **self : object** | 53 | +------------------+---------------------------------------------------------------------------------+ 54 | """ 55 | 56 | no_treatment_x, no_treatment_y = [], [] 57 | has_treatment_x, has_treatment_y = [], [] 58 | for idx, el in enumerate(t): 59 | if el: 60 | has_treatment_x.append(X[idx]) 61 | has_treatment_y.append(y[idx]) 62 | else: 63 | no_treatment_x.append(X[idx]) 64 | no_treatment_y.append(y[idx]) 65 | self.no_treatment_model.fit(no_treatment_x, no_treatment_y) 66 | self.has_treatment_model.fit(has_treatment_x, has_treatment_y) 67 | return self 68 | 69 | def predict(self, X, t=None): 70 | """Predict an uplift for X. 71 | 72 | +------------------+---------------------------------------------------------------------------------+ 73 | | **Parameters** | | **X: numpy ndarray with shape = [n_samples, n_features]** | 74 | | | | Matrix of features. | 75 | | | | **t: numpy array with shape = [n_samples,] or None** | 76 | | | | Array of treatments. | 77 | +------------------+---------------------------------------------------------------------------------+ 78 | | **Returns** | | **self : object** | 79 | | | | The predicted values. | 80 | +------------------+---------------------------------------------------------------------------------+ 81 | """ 82 | 83 | s1 = self.has_treatment_model.predict(X) 84 | s0 = self.no_treatment_model.predict(X) 85 | return s1 - s0 86 | -------------------------------------------------------------------------------- /resources/logo.psd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duketemon/pyuplift/33daa0768ff333387cb8223ebfaedaffa57de335/resources/logo.psd -------------------------------------------------------------------------------- /resources/pyuplift-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duketemon/pyuplift/33daa0768ff333387cb8223ebfaedaffa57de335/resources/pyuplift-logo.png -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import codecs 3 | from setuptools import setup, find_packages 4 | 5 | path = os.path.abspath(os.path.dirname(__file__)) 6 | 7 | 8 | def read(filename): 9 | return codecs.open(os.path.join(path, filename), 'r').read() 10 | 11 | 12 | setup( 13 | name='pyuplift', 14 | version='0.0.4.0', 15 | license='MIT License', 16 | author='Artem Kuchumov', 17 | author_email='kuchumov7@gmail.com', 18 | url='https://github.com/duketemon/pyuplift', 19 | description='Uplift modeling implementation', 20 | long_description=read('README.MD'), 21 | long_description_content_type='text/markdown', 22 | include_package_data=True, 23 | zip_safe=False, 24 | packages=find_packages(), 25 | keywords=['uplift modeling', 'machine learning', 'true response modeling', 'incremental value marketing'], 26 | install_requires=[ 27 | 'pandas>=0.23.4', 28 | 'scikit-learn>=0.20.0', 29 | 'requests>=2.19.1', 30 | ], 31 | extras_require={ 32 | 'tests': [ 33 | 'pytest>=4.5.0' 34 | ] 35 | }, 36 | classifiers=[ 37 | 'Intended Audience :: Science/Research', 38 | 'Intended Audience :: Education', 39 | 'Programming Language :: Python :: 3', 40 | 'Programming Language :: Python :: 3.5', 41 | 'Programming Language :: Python :: 3.6', 42 | 'Programming Language :: Python :: 3.7', 43 | 'License :: OSI Approved :: MIT License', 44 | 'Operating System :: OS Independent', 45 | 'Topic :: Software Development :: Libraries', 46 | 'Topic :: Software Development :: Libraries :: Python Modules' 47 | ] 48 | ) 49 | -------------------------------------------------------------------------------- /tests/README.MD: -------------------------------------------------------------------------------- 1 | ## Tests 2 | This directory contains tests of the pyuplift library. 3 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duketemon/pyuplift/33daa0768ff333387cb8223ebfaedaffa57de335/tests/__init__.py -------------------------------------------------------------------------------- /tests/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duketemon/pyuplift/33daa0768ff333387cb8223ebfaedaffa57de335/tests/datasets/__init__.py -------------------------------------------------------------------------------- /tests/datasets/generators/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duketemon/pyuplift/33daa0768ff333387cb8223ebfaedaffa57de335/tests/datasets/generators/__init__.py -------------------------------------------------------------------------------- /tests/datasets/generators/test_linear.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | from pyuplift.datasets import make_linear_regression 4 | 5 | 6 | def test_make_linear_regression__repeated_random_state(): 7 | random_state, size = 101, 1000 8 | df1 = make_linear_regression(size, random_state=random_state) 9 | df2 = make_linear_regression(size, random_state=random_state) 10 | 11 | assert np.array_equal(df1['x1'].values, df2['x1'].values) 12 | assert np.array_equal(df1['x2'].values, df2['x2'].values) 13 | assert np.array_equal(df1['x3'].values, df2['x3'].values) 14 | assert np.array_equal(df1['t'].values, df2['t'].values) 15 | assert np.array_equal(df1['y'].values, df2['y'].values) 16 | 17 | 18 | def test_make_linear_regression__none_random_state(): 19 | size = 1000 20 | df1 = make_linear_regression(size, random_state=None) 21 | df2 = make_linear_regression(size, random_state=None) 22 | 23 | assert not np.array_equal(df1['x1'].values, df2['x1'].values) 24 | 25 | 26 | def test_make_linear_regression__zero_size(): 27 | with pytest.raises(ValueError): 28 | make_linear_regression(0) 29 | 30 | 31 | def test_make_linear_regression__negative_size(): 32 | with pytest.raises(ValueError): 33 | make_linear_regression(-10) 34 | -------------------------------------------------------------------------------- /tests/datasets/loaders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duketemon/pyuplift/33daa0768ff333387cb8223ebfaedaffa57de335/tests/datasets/loaders/__init__.py -------------------------------------------------------------------------------- /tests/datasets/loaders/test_criteo_uplift_prediction.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import pytest 4 | from pyuplift.datasets import load_criteo_uplift_prediction 5 | from pyuplift.datasets import download_criteo_uplift_prediction 6 | 7 | 8 | data_home = os.path.join(os.sep.join(__file__.split(os.sep)[:-1]), 'data') 9 | 10 | 11 | def test_load_criteo_uplift_prediction__do_not_download_if_missing(): 12 | with pytest.raises(FileNotFoundError): 13 | load_criteo_uplift_prediction(data_home=data_home, download_if_missing=False) 14 | 15 | 16 | def test_download_criteo_uplift_prediction__wrong_url(): 17 | with pytest.raises(Exception): 18 | download_criteo_uplift_prediction(url='https://s3.us-east-2.amazonaws.com/criteo-uplift/criteo-uplift.csv.gz') 19 | 20 | 21 | def test_download_criteo_uplift_prediction(): 22 | download_criteo_uplift_prediction(data_home=data_home) 23 | # shutil.rmtree(data_home) 24 | 25 | 26 | def test_load_criteo_uplift_prediction(): 27 | df = load_criteo_uplift_prediction(data_home=data_home) 28 | assert len(df['feature_names']) != 11 29 | shutil.rmtree(data_home) 30 | -------------------------------------------------------------------------------- /tests/datasets/loaders/test_hillstrom_email_marketing.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import pytest 4 | from pyuplift.datasets import download_hillstrom_email_marketing 5 | from pyuplift.datasets import load_hillstrom_email_marketing 6 | 7 | 8 | data_home = os.path.join(os.sep.join(__file__.split(os.sep)[:-1]), 'data') 9 | 10 | 11 | def test_download_hillstrom_email_marketing(): 12 | download_hillstrom_email_marketing(data_home=data_home) 13 | shutil.rmtree(data_home) 14 | 15 | 16 | def test_download_hillstrom_email_marketing__twice(): 17 | download_hillstrom_email_marketing(data_home=data_home) 18 | download_hillstrom_email_marketing(data_home=data_home) 19 | shutil.rmtree(data_home) 20 | 21 | 22 | def test_download_hillstrom_email_marketing__wrong_url(): 23 | with pytest.raises(Exception): 24 | download_hillstrom_email_marketing(url='http://www.minethatdata.com/Kevin_Hillstrom_Min') 25 | 26 | 27 | def test_load_hillstrom_email_marketing__using_encoded_data(): 28 | df = load_hillstrom_email_marketing(data_home=data_home, load_raw_data=False) 29 | assert len(df['feature_names']) == 18 30 | shutil.rmtree(data_home) 31 | 32 | 33 | def test_load_hillstrom_email_marketing__using_raw_data(): 34 | df = load_hillstrom_email_marketing(data_home=data_home, load_raw_data=True) 35 | assert len(df['feature_names']) != 18 36 | shutil.rmtree(data_home) 37 | 38 | 39 | def test_load_hillstrom_email_marketing__do_not_download_if_missing(): 40 | with pytest.raises(FileNotFoundError): 41 | load_hillstrom_email_marketing(data_home=data_home, download_if_missing=False) 42 | -------------------------------------------------------------------------------- /tests/datasets/loaders/test_lalonde_nsw.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import pytest 4 | from pyuplift.datasets import download_lalonde_nsw, load_lalonde_nsw 5 | 6 | 7 | data_home = os.path.join(os.sep.join(__file__.split(os.sep)[:-1]), 'data') 8 | 9 | 10 | def test_download_lalonde_nsw(): 11 | download_lalonde_nsw(data_home=data_home) 12 | shutil.rmtree(data_home) 13 | 14 | 15 | def test_download_lalonde_nsw__twice(): 16 | download_lalonde_nsw(data_home=data_home) 17 | download_lalonde_nsw(data_home=data_home) 18 | shutil.rmtree(data_home) 19 | 20 | 21 | def test_download_lalonde_nsw__wrong_control_data_url(): 22 | with pytest.raises(Exception, match=r'.*control_data_url.*'): 23 | download_lalonde_nsw(control_data_url='https://users.nber.org/~rdehejia/data/nsw_control_fake.txt') 24 | 25 | 26 | def test_download_lalonde_nsw__wrong_treated_data_url(): 27 | with pytest.raises(Exception, match=r'.*treated_data_url.*'): 28 | download_lalonde_nsw(treated_data_url='https://users.nber.org/~rdehejia/data/nsw_control_fake.txt') 29 | 30 | 31 | def test_load_lalonde_nsw__do_not_download_if_missing(): 32 | with pytest.raises(FileNotFoundError): 33 | load_lalonde_nsw(data_home=data_home, download_if_missing=False) 34 | 35 | 36 | def test_load_lalonde_nsw(): 37 | df = load_lalonde_nsw(data_home=data_home) 38 | assert len(df['feature_names']) == 7 39 | shutil.rmtree(data_home) 40 | -------------------------------------------------------------------------------- /tests/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duketemon/pyuplift/33daa0768ff333387cb8223ebfaedaffa57de335/tests/metrics/__init__.py -------------------------------------------------------------------------------- /tests/metrics/test_average_effect.py: -------------------------------------------------------------------------------- 1 | from pyuplift.variable_selection import Dummy 2 | from pyuplift.model_selection import train_test_split 3 | from pyuplift.datasets import make_linear_regression 4 | from pyuplift.metrics import get_average_effect 5 | 6 | 7 | random_state = 123 8 | df = make_linear_regression(10000) 9 | X, y, t = df.drop(['y', 't'], axis=1).values, df['y'].values, df['t'].values 10 | X_train, X_test, y_train, y_test, t_train, t_test = train_test_split(X, y, t, random_state=random_state) 11 | 12 | model = Dummy() 13 | model.fit(X_train, y_train, t_train) 14 | 15 | 16 | def test_get_average_effect__zero_test_share(): 17 | y_pred = model.predict(X_test) 18 | test_share = 0 19 | effect = get_average_effect(y_test, t_test, y_pred, test_share) 20 | assert effect == 0 21 | -------------------------------------------------------------------------------- /tests/model_selection/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duketemon/pyuplift/33daa0768ff333387cb8223ebfaedaffa57de335/tests/model_selection/__init__.py -------------------------------------------------------------------------------- /tests/model_selection/model_validation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duketemon/pyuplift/33daa0768ff333387cb8223ebfaedaffa57de335/tests/model_selection/model_validation/__init__.py -------------------------------------------------------------------------------- /tests/model_selection/model_validation/test_treatment_cross_validation.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pyuplift.variable_selection import Dummy 3 | from pyuplift.datasets import make_linear_regression 4 | from pyuplift.model_selection import treatment_cross_val_score 5 | 6 | 7 | model = Dummy() 8 | random_state = 101 9 | size = 1000 10 | train_share = 0.7 11 | df = make_linear_regression(size, random_state=random_state) 12 | X, y, t = df.drop(['y', 't'], axis=1).values, df['y'].values, df['t'].values 13 | 14 | 15 | def test_treatment_cross_val_score__seeds_are_none(): 16 | cv, seeds = 5, None 17 | scores = treatment_cross_val_score(X, y, t, model, cv, train_share, seeds) 18 | assert len(scores) == cv 19 | 20 | 21 | def test_treatment_cross_val_score__cv_not_equals_len_of_seeds(): 22 | cv, seeds = 5, list(range(3)) 23 | with pytest.raises(ValueError): 24 | treatment_cross_val_score(X, y, t, model, cv, train_share, seeds) 25 | 26 | 27 | def test_treatment_cross_val_score__negative_cv(): 28 | cv, seeds = -5, list(range(3)) 29 | with pytest.raises(ValueError): 30 | treatment_cross_val_score(X, y, t, model, cv, train_share, seeds) 31 | 32 | 33 | def test_treatment_cross_val_score__zero_cv(): 34 | cv, seeds = 0, list(range(3)) 35 | with pytest.raises(ValueError): 36 | treatment_cross_val_score(X, y, t, model, cv, train_share, seeds) 37 | 38 | 39 | def test_treatment_cross_val_score__negative_train_share(): 40 | train_share = -0.7 41 | cv, seeds = 3, list(range(3)) 42 | with pytest.raises(ValueError): 43 | treatment_cross_val_score(X, y, t, model, cv, train_share, seeds) 44 | 45 | 46 | def test_treatment_cross_val_score__zero_train_share(): 47 | train_share = 0 48 | cv, seeds = 3, list(range(3)) 49 | with pytest.raises(ValueError): 50 | treatment_cross_val_score(X, y, t, model, cv, train_share, seeds) 51 | -------------------------------------------------------------------------------- /tests/model_selection/splitters/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duketemon/pyuplift/33daa0768ff333387cb8223ebfaedaffa57de335/tests/model_selection/splitters/__init__.py -------------------------------------------------------------------------------- /tests/model_selection/splitters/test_train_test_split.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | from pyuplift.datasets import make_linear_regression 4 | from pyuplift.model_selection import train_test_split 5 | 6 | 7 | def test_train_test_split__default(): 8 | size = 1000 9 | df = make_linear_regression(size) 10 | X, y, t = df.drop(['y', 't'], axis=1).values, df['y'].values, df['t'].values 11 | X_train, X_test, y_train, y_test, t_train, t_test = train_test_split(X, y, t, random_state=10) 12 | 13 | assert X_train.shape[0] == 700 14 | assert X_test.shape[0] == 300 15 | assert y_train.shape[0] == 700 16 | assert y_test.shape[0] == 300 17 | assert t_train.shape[0] == 700 18 | assert t_test.shape[0] == 300 19 | 20 | 21 | def test_train_test_split__repeated_random_state(): 22 | random_state, size = 101, 1000 23 | 24 | df = make_linear_regression(size, random_state=random_state) 25 | X, y, t = df.drop(['y', 't'], axis=1).values, df['y'].values, df['t'].values 26 | X_train1, X_test1, y_train1, y_test1, t_train1, t_test1 = train_test_split(X, y, t, random_state=random_state) 27 | 28 | df = make_linear_regression(size, random_state=random_state) 29 | X, y, t = df.drop(['y', 't'], axis=1).values, df['y'].values, df['t'].values 30 | X_train2, X_test2, y_train2, y_test2, t_train2, t_test2 = train_test_split(X, y, t, random_state=random_state) 31 | 32 | assert np.array_equal(X_train1, X_train2) 33 | assert np.array_equal(X_test1, X_test2) 34 | assert np.array_equal(y_train1, y_train2) 35 | assert np.array_equal(y_test1, y_test2) 36 | assert np.array_equal(t_train1, t_train2) 37 | assert np.array_equal(t_test1, t_test2) 38 | 39 | 40 | def test_train_test_split__none_random_state(): 41 | random_state, size = 101, 1000 42 | 43 | df = make_linear_regression(size, random_state=random_state) 44 | X, y, t = df.drop(['y', 't'], axis=1).values, df['y'].values, df['t'].values 45 | X_train1, X_test1, y_train1, y_test1, t_train1, t_test1 = train_test_split(X, y, t, random_state=None) 46 | 47 | df = make_linear_regression(size, random_state=random_state) 48 | X, y, t = df.drop(['y', 't'], axis=1).values, df['y'].values, df['t'].values 49 | X_train2, X_test2, y_train2, y_test2, t_train2, t_test2 = train_test_split(X, y, t, random_state=None) 50 | 51 | assert not np.array_equal(X_train1, X_train2) 52 | 53 | 54 | def test_train_test_split__negative_train_share(): 55 | random_state, size = 101, 1000 56 | df = make_linear_regression(size, random_state=random_state) 57 | X, y, t = df.drop(['y', 't'], axis=1).values, df['y'].values, df['t'].values 58 | 59 | with pytest.raises(ValueError): 60 | train_test_split(X, y, t, train_share=-0.5) 61 | 62 | 63 | def test_train_test_split__zero_train_share(): 64 | random_state, size = 101, 1000 65 | df = make_linear_regression(size, random_state=random_state) 66 | X, y, t = df.drop(['y', 't'], axis=1).values, df['y'].values, df['t'].values 67 | 68 | with pytest.raises(ValueError): 69 | train_test_split(X, y, t, train_share=0) 70 | -------------------------------------------------------------------------------- /tests/transformation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duketemon/pyuplift/33daa0768ff333387cb8223ebfaedaffa57de335/tests/transformation/__init__.py -------------------------------------------------------------------------------- /tests/transformation/base.py: -------------------------------------------------------------------------------- 1 | class EmptyClass: 2 | pass 3 | 4 | 5 | class NoFitClass: 6 | def predict(self): 7 | pass 8 | 9 | 10 | class NoPredictClass: 11 | def fit(self): 12 | pass 13 | -------------------------------------------------------------------------------- /tests/transformation/test_jaskowski.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from sklearn.ensemble import RandomForestClassifier 3 | from pyuplift.transformation import Jaskowski 4 | from .base import * 5 | 6 | 7 | def test_jaskowski__right_class(): 8 | model = RandomForestClassifier() 9 | Jaskowski(model) 10 | 11 | 12 | def test_jaskowski__empty_class(): 13 | model = EmptyClass() 14 | with pytest.raises(ValueError): 15 | Jaskowski(model) 16 | 17 | 18 | def test_jaskowski__non_fit_class(): 19 | model = NoFitClass() 20 | with pytest.raises(ValueError): 21 | Jaskowski(model) 22 | 23 | 24 | def test_jaskowski__non_predict_class(): 25 | model = NoPredictClass() 26 | with pytest.raises(ValueError): 27 | Jaskowski(model) 28 | -------------------------------------------------------------------------------- /tests/transformation/test_kane.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from sklearn.ensemble import RandomForestClassifier 3 | from pyuplift.transformation import Kane 4 | from .base import * 5 | 6 | 7 | def test_kane__right_class(): 8 | model = RandomForestClassifier() 9 | Kane(model) 10 | 11 | 12 | def test_kane__empty_class(): 13 | model = EmptyClass() 14 | with pytest.raises(ValueError): 15 | Kane(model) 16 | 17 | 18 | def test_kane__non_fit_class(): 19 | model = NoFitClass() 20 | with pytest.raises(ValueError): 21 | Kane(model) 22 | 23 | 24 | def test_kane__non_predict_class(): 25 | model = NoPredictClass() 26 | with pytest.raises(ValueError): 27 | Kane(model) 28 | -------------------------------------------------------------------------------- /tests/transformation/test_lai.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from sklearn.ensemble import RandomForestClassifier 3 | from pyuplift.transformation import Lai 4 | from .base import * 5 | 6 | 7 | def test_lai__right_class(): 8 | model = RandomForestClassifier() 9 | Lai(model) 10 | 11 | 12 | def test_lai__empty_class(): 13 | model = EmptyClass() 14 | with pytest.raises(ValueError): 15 | Lai(model) 16 | 17 | 18 | def test_lai__non_fit_class(): 19 | model = NoFitClass() 20 | with pytest.raises(ValueError): 21 | Lai(model) 22 | 23 | 24 | def test_lai__non_predict_class(): 25 | model = NoPredictClass() 26 | with pytest.raises(ValueError): 27 | Lai(model) 28 | -------------------------------------------------------------------------------- /tests/transformation/test_pessimistic.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from sklearn.ensemble import RandomForestClassifier 3 | from pyuplift.transformation import Pessimistic 4 | from .base import * 5 | 6 | 7 | def test_pessimistic__right_class(): 8 | model = RandomForestClassifier() 9 | Pessimistic(model) 10 | 11 | 12 | def test_pessimistic__empty_class(): 13 | model = EmptyClass() 14 | with pytest.raises(ValueError): 15 | Pessimistic(model) 16 | 17 | 18 | def test_pessimistic__non_fit_class(): 19 | model = NoFitClass() 20 | with pytest.raises(ValueError): 21 | Pessimistic(model) 22 | 23 | 24 | def test_pessimistic__non_predict_class(): 25 | model = NoPredictClass() 26 | with pytest.raises(ValueError): 27 | Pessimistic(model) 28 | -------------------------------------------------------------------------------- /tests/transformation/test_reflective.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from sklearn.ensemble import RandomForestClassifier 3 | from pyuplift.transformation import Reflective 4 | from .base import * 5 | 6 | 7 | def test_reflective__right_class(): 8 | model = RandomForestClassifier() 9 | Reflective(model) 10 | 11 | 12 | def test_reflective__empty_class(): 13 | model = EmptyClass() 14 | with pytest.raises(ValueError): 15 | Reflective(model) 16 | 17 | 18 | def test_reflective__non_fit_class(): 19 | model = NoFitClass() 20 | with pytest.raises(ValueError): 21 | Reflective(model) 22 | 23 | 24 | def test_reflective__non_predict_class(): 25 | model = NoPredictClass() 26 | with pytest.raises(ValueError): 27 | Reflective(model) 28 | -------------------------------------------------------------------------------- /tests/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duketemon/pyuplift/33daa0768ff333387cb8223ebfaedaffa57de335/tests/utils/__init__.py -------------------------------------------------------------------------------- /tests/utils/data/test.test.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duketemon/pyuplift/33daa0768ff333387cb8223ebfaedaffa57de335/tests/utils/data/test.test.gz -------------------------------------------------------------------------------- /tests/utils/test_downloader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | from pyuplift.utils import download_file 4 | 5 | 6 | def test_download_file__success(): 7 | url = 'https://github.com/duketemon/pyuplift/blob/master/LICENSE' 8 | output = 'LICENSE' 9 | download_file(url, output) 10 | os.remove(output) 11 | 12 | 13 | def test_download_file__exist_file(): 14 | output = 'exist_file_test.test' 15 | with open(output, 'w') as f: 16 | f.write('test') 17 | url = 'https://github.com/duketemon/pyuplift/blob/master/LICENSE' 18 | download_file(url, output) 19 | os.remove(output) 20 | 21 | 22 | def test_download_file__wrong_url(): 23 | output = 'LICENSE12' 24 | url = 'https://githu404b.com/duketemon/pyuplift/blob/master/LICENSE' 25 | with pytest.raises(Exception): 26 | download_file(url, output) 27 | 28 | 29 | def test_download_file__wrong_output_path(): 30 | output = '/data23/LICENSE' 31 | url = 'https://github.com/duketemon/pyuplift/blob/master/LICENSE' 32 | with pytest.raises(FileNotFoundError): 33 | download_file(url, output) 34 | -------------------------------------------------------------------------------- /tests/utils/test_retriever.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import pytest 4 | from pyuplift.utils import retrieve_from_gz 5 | 6 | 7 | data_home = os.path.join(os.sep.join(__file__.split(os.sep)[:-1]), 'data') 8 | 9 | 10 | def test_retrieve_from_gz(): 11 | output_path = os.path.join(data_home, 'test.test') 12 | archive_path = output_path + '.gz' 13 | retrieve_from_gz(archive_path, output_path) 14 | with open(output_path, 'r') as f: 15 | text = f.read() 16 | os.remove(output_path) 17 | assert text == 'good' 18 | -------------------------------------------------------------------------------- /tests/variable_selection/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duketemon/pyuplift/33daa0768ff333387cb8223ebfaedaffa57de335/tests/variable_selection/__init__.py -------------------------------------------------------------------------------- /tests/variable_selection/base.py: -------------------------------------------------------------------------------- 1 | class EmptyClass: 2 | pass 3 | 4 | 5 | class NoFitClass: 6 | def predict(self): 7 | pass 8 | 9 | 10 | class NoPredictClass: 11 | def fit(self): 12 | pass 13 | -------------------------------------------------------------------------------- /tests/variable_selection/test_cadit.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from sklearn.ensemble import RandomForestRegressor 3 | from pyuplift.variable_selection import Cadit 4 | from .base import * 5 | 6 | 7 | def test_dummy__right_class(): 8 | reg_model = RandomForestRegressor() 9 | Cadit(reg_model) 10 | 11 | 12 | def test_dummy__empty_class(): 13 | reg_model = EmptyClass() 14 | with pytest.raises(ValueError): 15 | Cadit(reg_model) 16 | 17 | 18 | def test_dummy__non_fit_class(): 19 | reg_model = NoFitClass() 20 | with pytest.raises(ValueError): 21 | Cadit(reg_model) 22 | 23 | 24 | def test_dummy__non_predict_class(): 25 | reg_model = NoPredictClass() 26 | with pytest.raises(ValueError): 27 | Cadit(reg_model) 28 | -------------------------------------------------------------------------------- /tests/variable_selection/test_dummy.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from sklearn.ensemble import RandomForestRegressor 3 | from pyuplift.variable_selection import Dummy 4 | from .base import * 5 | 6 | 7 | def test_dummy__right_class(): 8 | reg_model = RandomForestRegressor() 9 | Dummy(reg_model) 10 | 11 | 12 | def test_dummy__empty_class(): 13 | reg_model = EmptyClass() 14 | with pytest.raises(ValueError): 15 | Dummy(reg_model) 16 | 17 | 18 | def test_dummy__non_fit_class(): 19 | reg_model = NoFitClass() 20 | with pytest.raises(ValueError): 21 | Dummy(reg_model) 22 | 23 | 24 | def test_dummy__non_predict_class(): 25 | reg_model = NoPredictClass() 26 | with pytest.raises(ValueError): 27 | Dummy(reg_model) 28 | -------------------------------------------------------------------------------- /tests/variable_selection/test_econometric.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from sklearn.ensemble import RandomForestRegressor 3 | from pyuplift.variable_selection import Econometric 4 | from .base import * 5 | 6 | 7 | def test_econometric__right_class(): 8 | reg_model = RandomForestRegressor() 9 | Econometric(reg_model) 10 | 11 | 12 | def test_econometric__empty_class(): 13 | reg_model = EmptyClass() 14 | with pytest.raises(ValueError): 15 | Econometric(reg_model) 16 | 17 | 18 | def test_econometric__non_fit_class(): 19 | reg_model = NoFitClass() 20 | with pytest.raises(ValueError): 21 | Econometric(reg_model) 22 | 23 | 24 | def test_econometric__non_predict_class(): 25 | reg_model = NoPredictClass() 26 | with pytest.raises(ValueError): 27 | Econometric(reg_model) 28 | -------------------------------------------------------------------------------- /tests/variable_selection/test_two_model.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from sklearn.ensemble import RandomForestRegressor 3 | from pyuplift.variable_selection import TwoModel 4 | from .base import * 5 | 6 | 7 | def test_two_model__right_class(): 8 | reg_model = RandomForestRegressor() 9 | TwoModel(reg_model, reg_model) 10 | 11 | 12 | def test_two_model__empty_class(): 13 | reg_model = EmptyClass() 14 | with pytest.raises(ValueError): 15 | TwoModel(reg_model, reg_model) 16 | 17 | 18 | def test_two_model__non_fit_class(): 19 | reg_model = NoFitClass() 20 | with pytest.raises(ValueError): 21 | TwoModel(reg_model, reg_model) 22 | 23 | 24 | def test_two_model__non_predict_class(): 25 | reg_model = NoPredictClass() 26 | with pytest.raises(ValueError): 27 | TwoModel(reg_model, reg_model) 28 | -------------------------------------------------------------------------------- /tutorials/Getting started.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Getting started tutorial" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### ! This tutorial uses the Hillstrom Email Marketing dataset. More information about the dataset you can find on the [official site](http://minethatdata.com/Stochastic_Solutions_E-Mail_Challenge_2008.04.30.pdf)" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 1, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "from pyuplift.variable_selection import Econometric\n", 31 | "from pyuplift.datasets import load_hillstrom_email_marketing\n", 32 | "from pyuplift.model_selection import train_test_split" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "### Load data from the Hillstrom Email Marketing dataset\n", 47 | "Parameter `load_raw_data` allowed you to load raw data (original dataset) or preprocessed data (ready to go)." 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 2, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "data = load_hillstrom_email_marketing(load_raw_data=False)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 3, 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "data": { 66 | "text/plain": [ 67 | "{'description': 'This dataset contains 64,000 customers who last purchased within twelve months. The customers were involved in an e-mail test. 1/3 were randomly chosen to receive an e-mail campaign featuring Mens merchandise. 1/3 were randomly chosen to receive an e-mail campaign featuring Womens merchandise. 1/3 were randomly chosen to not receive an e-mail campaign. During a period of two weeks following the e-mail campaign, results were tracked. Your job is to tell the world if the Mens or Womens e-mail campaign was successful.',\n", 68 | " 'data': array([[ 10. , 142.44, 1. , ..., 0. , 1. , 0. ],\n", 69 | " [ 6. , 329.08, 1. , ..., 0. , 0. , 1. ],\n", 70 | " [ 7. , 180.65, 0. , ..., 0. , 0. , 1. ],\n", 71 | " ...,\n", 72 | " [ 6. , 29.99, 1. , ..., 0. , 1. , 0. ],\n", 73 | " [ 1. , 552.94, 1. , ..., 1. , 0. , 0. ],\n", 74 | " [ 1. , 472.82, 0. , ..., 0. , 0. , 1. ]]),\n", 75 | " 'feature_names': array(['recency', 'history', 'mens', 'womens', 'newbie', 'zip_code_Rural',\n", 76 | " 'zip_code_Surburban', 'zip_code_Urban',\n", 77 | " 'history_segment_$0 - $100', 'history_segment_$1,000 +',\n", 78 | " 'history_segment_$100 - $200', 'history_segment_$200 - $350',\n", 79 | " 'history_segment_$350 - $500', 'history_segment_$500 - $750',\n", 80 | " 'history_segment_$750 - $1,000', 'channel_Multichannel',\n", 81 | " 'channel_Phone', 'channel_Web'], dtype='" 199 | ] 200 | }, 201 | "execution_count": 8, 202 | "metadata": {}, 203 | "output_type": "execute_result" 204 | } 205 | ], 206 | "source": [ 207 | "model.fit(X_train, y_train, t_train)" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "### Predict uplift for the test dataset" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 9, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "uplift = model.predict(X_test)" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 10, 229 | "metadata": {}, 230 | "outputs": [ 231 | { 232 | "data": { 233 | "text/plain": [ 234 | "array([1.0615553 , 0.41391224, 0.26028002, 2.09681851, 0.42625385,\n", 235 | " 1.94064929, 2.50369232, 0.52225684, 0.17712341, 0.91999936,\n", 236 | " 0.54780214, 0.27353447, 0.74778451, 0.77815588, 0.89413281,\n", 237 | " 0.50344916, 0.5541491 , 1.19713328, 1.62508446, 2.72094539])" 238 | ] 239 | }, 240 | "execution_count": 10, 241 | "metadata": {}, 242 | "output_type": "execute_result" 243 | } 244 | ], 245 | "source": [ 246 | "uplift[:20]" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [] 255 | } 256 | ], 257 | "metadata": { 258 | "kernelspec": { 259 | "display_name": "Python 3", 260 | "language": "python", 261 | "name": "python3" 262 | }, 263 | "language_info": { 264 | "codemirror_mode": { 265 | "name": "ipython", 266 | "version": 3 267 | }, 268 | "file_extension": ".py", 269 | "mimetype": "text/x-python", 270 | "name": "python", 271 | "nbconvert_exporter": "python", 272 | "pygments_lexer": "ipython3", 273 | "version": "3.6.4" 274 | } 275 | }, 276 | "nbformat": 4, 277 | "nbformat_minor": 2 278 | } 279 | -------------------------------------------------------------------------------- /tutorials/README.MD: -------------------------------------------------------------------------------- 1 | ## Tutorials 2 | This directory contains tutorials which related to the pyuplift library. 3 | 4 | * [Getting started](https://github.com/duketemon/pyuplift/blob/master/tutorials/Getting%20started.ipynb) 5 | * [EDA of the Lalonde NSW dataset](https://github.com/duketemon/pyuplift/blob/master/tutorials/EDA%20Lalonde%20NSW.ipynb) 6 | * [EDA of the Hillstrom Email Marketing dataset](https://github.com/duketemon/pyuplift/blob/master/tutorials/EDA%20Hillstrom%20Email%20Marketing.ipynb) 7 | --------------------------------------------------------------------------------