├── .github ├── CODE_OF_CONDUCT.md └── workflows │ └── pythonpublish.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .prospector.yaml ├── .travis.yml ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE.txt ├── Makefile ├── README.md ├── dev_requirements.txt ├── docs ├── Changelog.rst ├── High Level Overview.md ├── Makefile ├── More examples and recipes.md ├── Quickstart.md ├── Saving and loading model.md ├── conf.py ├── docs_requirements.txt ├── index.rst ├── intro.rst ├── lifetimes.datasets.rst ├── lifetimes.fitters.rst ├── lifetimes.rst ├── make.bat └── modules.rst ├── lifetimes ├── __init__.py ├── datasets │ ├── CDNOW_master.txt │ ├── CDNOW_sample.txt │ ├── __init__.py │ ├── cdnow_customers_summary.csv │ ├── cdnow_customers_summary_with_transactions.csv │ ├── donations.csv │ └── example_transactions.csv ├── fitters │ ├── __init__.py │ ├── beta_geo_beta_binom_fitter.py │ ├── beta_geo_fitter.py │ ├── gamma_gamma_fitter.py │ ├── modified_beta_geo_fitter.py │ └── pareto_nbd_fitter.py ├── generate_data.py ├── plotting.py ├── utils.py └── version.py ├── requirements.txt ├── setup.cfg ├── setup.py └── tests ├── __init__.py ├── __main__.py ├── conftest.py ├── test_estimation.py ├── test_generate_data.py ├── test_plotting.py └── test_utils.py /.github/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Code of Conduct 2 | 3 | As contributors and maintainers of this project, and in the interest of 4 | fostering an open and welcoming community, we pledge to respect all people who 5 | contribute through reporting issues, posting feature requests, updating 6 | documentation, submitting pull requests or patches, and other activities. 7 | 8 | We are committed to making participation in this project a harassment-free 9 | experience for everyone, regardless of level of experience, gender, gender 10 | identity and expression, sexual orientation, disability, personal appearance, 11 | body size, race, ethnicity, age, religion, or nationality. 12 | 13 | Examples of unacceptable behavior by participants include: 14 | 15 | * The use of sexualized language or imagery 16 | * Personal attacks 17 | * Trolling or insulting/derogatory comments 18 | * Public or private harassment 19 | * Publishing other's private information, such as physical or electronic 20 | addresses, without explicit permission 21 | * Other unethical or unprofessional conduct 22 | 23 | Project maintainers have the right and responsibility to remove, edit, or 24 | reject comments, commits, code, wiki edits, issues, and other contributions 25 | that are not aligned to this Code of Conduct, or to ban temporarily or 26 | permanently any contributor for other behaviors that they deem inappropriate, 27 | threatening, offensive, or harmful. 28 | 29 | By adopting this Code of Conduct, project maintainers commit themselves to 30 | fairly and consistently applying these principles to every aspect of managing 31 | this project. Project maintainers who do not follow or enforce the Code of 32 | Conduct may be permanently removed from the project team. 33 | 34 | This Code of Conduct applies both within project spaces and in public spaces 35 | when an individual is representing the project or its community. 36 | 37 | A working group of community members is committed to promptly addressing any 38 | reported issues. The working group is made up of pandas contributors and users. 
39 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 40 | reported by contacting the working group by e-mail (lifelines-coc@googlegroups.com). 41 | Messages sent to this e-mail address will not be publicly visible but only to 42 | the working group members. The working group currently includes 43 | 44 | - Cameron Davidson-Pilon 45 | - Stefanie Gibson 46 | - Paul Zivich 47 | 48 | All complaints will be reviewed and investigated and will result in a response 49 | that is deemed necessary and appropriate to the circumstances. Maintainers are 50 | obligated to maintain confidentiality with regard to the reporter of an 51 | incident. 52 | 53 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 54 | version 1.3.0, available at 55 | [http://contributor-covenant.org/version/1/3/0/][version], 56 | and the [Swift Code of Conduct][swift]. 57 | 58 | [homepage]: http://contributor-covenant.org 59 | [version]: http://contributor-covenant.org/version/1/3/0/ 60 | [swift]: https://swift.org/community/#code-of-conduct 61 | -------------------------------------------------------------------------------- /.github/workflows/pythonpublish.yml: -------------------------------------------------------------------------------- 1 | # This workflows will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload Python Package 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | jobs: 11 | deploy: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Python 18 | uses: actions/setup-python@v1 19 | with: 20 | python-version: '3.x' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install setuptools wheel twine 25 | - name: Build and publish 26 | env: 27 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 28 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 29 | run: | 30 | python setup.py sdist bdist_wheel 31 | twine upload dist/* 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | .pytest_cache 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .cache 41 | nosetests.xml 42 | coverage.xml 43 | 44 | # Translations 45 | *.mo 46 | *.pot 47 | 48 | # Django stuff: 49 | *.log 50 | 51 | # Sphinx documentation 52 | docs/_build/ 53 | 54 | # PyBuilder 55 | target/ 56 | 57 | # OS generated files 58 | .DS_Store 59 | .DS_Store? 
60 | 61 | # Pyenv 62 | .python-version 63 | 64 | # PyCharm 65 | .idea/* 66 | 67 | # VS Code 68 | .vscode/ 69 | 70 | # Benchmarks Images 71 | benchmarks/images -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v2.0.0 4 | hooks: 5 | - id: trailing-whitespace 6 | - id: check-ast 7 | - id: check-yaml 8 | - id: end-of-file-fixer 9 | - id: fix-encoding-pragma 10 | - id: mixed-line-ending 11 | - id: trailing-whitespace 12 | - repo: https://github.com/ambv/black 13 | rev: stable 14 | hooks: 15 | - id: black 16 | args: ["--line-length", "120"] 17 | -------------------------------------------------------------------------------- /.prospector.yaml: -------------------------------------------------------------------------------- 1 | strictness: medium 2 | 3 | pylint: 4 | options: 5 | bad-names: foo,baz,toto,tutu,tata 6 | # max-args default = 5 7 | max-args: 15 8 | # max-locals default = 15 9 | max-locals: 50 10 | # max-branches default = 15 11 | max-branches: 15 12 | disable: 13 | - line-too-long 14 | - protected-access 15 | - no-value-for-parameter 16 | - assignment-from-no-return 17 | - invalid-unary-operand-type 18 | # remove if python2.7 support is dropped 19 | - useless-object-inheritance 20 | - old-style-class 21 | 22 | pyflakes: 23 | disable: 24 | - F401 25 | - F841 26 | # let pylint used-before-assignment handle this 27 | - F821 28 | 29 | pep8: 30 | options: 31 | max-line-length: 120 32 | disable: 33 | - E501 34 | - E241 35 | 36 | mccabe: 37 | options: 38 | # max-complexity default = 10 39 | max-complexity: 23 40 | 41 | pyroma: 42 | run: true 43 | 44 | pep257: 45 | run: false 46 | 47 | ignore-paths: 48 | - build 49 | - benchmarks 50 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | cache: pip 3 | dist: trusty 4 | python: 5 | - "2.7" 6 | - "3.5" 7 | - "3.6" 8 | env: 9 | - PANDAS_VERSION=0.24.1 10 | # Enable newer 3.7 without globally enabling sudo and dist: xenial for other build jobs 11 | matrix: 12 | include: 13 | - python: 3.7 14 | dist: xenial 15 | sudo: true 16 | env: PANDAS_VERSION=0.24.1 17 | before_install: 18 | - ls 19 | install: 20 | - "pip install -r dev_requirements.txt --upgrade" 21 | - "pip install pandas==$PANDAS_VERSION" 22 | - "pip freeze --local" 23 | # command to run tests 24 | script: 25 | - py.test --cov lifetimes 26 | after_success: 27 | coveralls 28 | # Don't want notifications 29 | notifications: 30 | email: false 31 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ### 0.11.3 4 | - a version bump for conda packaging 5 | 6 | ### 0.11.2 7 | - some convergence improvements 8 | 9 | ### 0.11.1 10 | - bump the Pandas requirements to >= 0.24.0. This should have been done in 0.11.0 11 | - suppress some warnings from autograd. 12 | 13 | ### 0.11.0 14 | - Move most models (all but Pareto) to autograd for automatic differentiation of their likelihood. This results in faster (at least 3x) and more successful convergence, plus allows for some really exciting extensions (coming soon). 
15 | - `GammaGammaFitter`, `BetaGeoFitter`, `ModifiedBetaGeoFitter` and `BetaGeoBetaBinomFitter` have three new attributes: `confidence_interval_`, `variance_matrix_` and `standard_errors_` 16 | - `params_` on fitted models is no longer an OrderedDict, but a Pandas Series 17 | - `GammaGammaFitter` can accept a `weights` argument now. 18 | - `customer_lifetime_value` in `GammaGamma` now accepts a frequency argument. 19 | - fixed a bug that was causing `ParetoNBDFitter` to generate data incorrectly. 20 | 21 | ### 0.10.1 22 | - performance improvements to `generate_data.py` for large datasets #195 23 | - performance improvements to `summary_data_from_transaction_data`, thanks @MichaelSchreier 24 | - Previously, `GammaGammaFitter` would have an infinite mean when its `q` parameter was less than 1. This was possible for some datasets. In 0.10.1, a new argument is added to `GammaGammaFitter` to constrain that `q` is greater than 1. This can be done with `q_constraint=True` in the call to `GammaGammaFitter.fit`. See issue #146. Thanks @vruvora 25 | - Stop support of scipy < 1.0. 26 | - Stop support of Python < 3.5. 27 | 28 | ### 0.10.0 29 | - `BetaGeoBetaBinomFitter.fit` has replaced `n_custs` with the more appropriately named `weights` (to align with other statistical libraries). By default and if unspecified, `weights` is equal to an array of 1s. 30 | - The `conditional_` methods on `BetaGeoBetaBinomFitter` have been updated to handle exogenously provided recency, frequency and periods. 31 | - Performance improvements in `BetaGeoBetaBinomFitter`. `fit` takes about 50% less time than previously. 32 | - `BetaGeoFitter`, `ParetoNBDFitter`, and `ModifiedBetaGeoFitter` all have a new `weights` argument in their `fit`. This can be used to reduce the size of the data (collapsing subjects with the same recency, frequency, T). 33 | 34 | ### 0.9.1 35 | - Added a data generation method, `generate_new_data` to `BetaGeoBetaBinomFitter`. @zscore 36 | - Fixed a bug in `summary_data_from_transaction_data` that was casting values to `int` prematurely. This was solved by including a new param `freq_multiplier` to be used to scale the resulting durations. See #100 for the original issue. @aprotopopov 37 | - Performance and bug fixes in `utils.expected_cumulative_transactions`. @aprotopopov 38 | - Fixed a bug in `utils.calculate_alive_path` that was causing a difference in values compared to `summary_from_transaction_data`. @DaniGate 39 | 40 | ### 0.9.0 41 | - fixed many of the numpy warnings as the result of fitting 42 | - added optional `initial_params` to all models 43 | - Added `conditional_probability_of_n_purchases_up_to_time` to `ParetoNBDFitter` 44 | - Fixed a bug in `expected_cumulative_transactions` and `plot_cumulative_transactions` 45 | 46 | ### 0.8.1 47 | - adding new `save_model` and `load_model` functions to all fitters. This will save the model locally as a pickle file. 48 | - `observation_period_end` in `summary_data_from_transaction_data` and `calibration_and_holdout_data` now defaults to the max date in the dataset, instead of current time. 49 | - improved stability of estimators. 50 | - improved Runtime warnings. 51 | - All fitters are now in a local file. This doesn't change the API however.
52 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | ## The General Flow of a Contribution 4 | 5 | A "good" contribution would follow this flow: 6 | 7 | 1. (Sometimes Optional) Create an Issue. 8 | 1. Prove that what you've created is better than what already exists. 9 | 1. Create/modify an automated test to guarantee that what you did is not going to break some other thing inside the library. 10 | 1. Make sure your code follows this libraries editing conventions. 11 | 1. Create a *Pull Request* to an appropriate branch. 12 | 13 | ## First Things First: Create an Issue 14 | 15 | Most topics on this library are far from trivial, newcomers might misunderstand some concepts and, thus, if they blindly try to create *Pull Request*, their efforts might be for naught. 16 | 17 | Therefore, post your question on the [Issues Section](https://github.com/CamDavidsonPilon/lifetimes/issues) of the library first. It will be quickly (hopefully) labeled and other people's collaboration will provide enough feedback for you to know what to do next. 18 | 19 | ## Prove that What You've Created is Better than What Already Exists (or not) 20 | 21 | It is paramount you prove that what you have is better than what the library looks like right now. This will not only have the functionality of being a source of *metadocumentation* but also a huge help for the eventual *reviewer(s)* of your *Pull Request*. 22 | 23 | ### But how exactly do you do that? 24 | 25 | My suggestion is for you to create a script where you compare the existing approach to what you've come up with. This script will go into a `benchmarks` folder on the top level of the library. The `benchmarks` folder might not be merged into the `master` branch, however, it might play an important role in the `dev` branch. 26 | 27 | This is very similar to what (Data) Scientists do when they create `Jupyter Notebooks`. In those, they expose their reasoning towards a solution, which is not intended for production, only to explain their thoughts. 28 | 29 | ## Tests 30 | 31 | There are already quite a lot of tests in this library. However, nothing guarantees that what you're creating won't break an existing feature. It is recommended that you thus: 32 | 33 | 1. Go through all the existing tests. 34 | - Travis CI will do that automatically. 35 | 1. Examine the existing tests to see if they already guarantee that what you're doing is enough. 36 | - This can be difficult because you will probably not know all of the tests. Nevertheless, using `Ctrl + F` is always your friend. Anyway, try your best. 37 | 1. Write new tests *if* necessary. 38 | 39 | Additionally, if it were me, even if there already exists a test covering my code, I might end up writing a custom one — or mentioning the name of the existing one — in my `benchmarks` file anyway, just for the sake of documentation. 40 | 41 | ## Editing Conventions 42 | 43 | For the most part, this library follows [`PEP 8`](https://www.python.org/dev/peps/pep-0008/#a-foolish-consistency-is-the-hobgoblin-of-little-minds) conventions. Try to follow them when contributing. If you find code inside this library that does not respect those conventionse, please do create an issue and we will try to fix it. It's usually straight forward to fix it and it avoids a lot of pain in the long-term. 
44 | 45 | It is also crucial that you follow [`Numpy's Docstrings`](https://docs.scipy.org/doc/numpy/docs/howto_document.html) conventions when creating or editing `docstrings`. They are a subset of [`PEP 257`](https://www.python.org/dev/peps/pep-0257/#multi-line-docstrings). 46 | 47 | ## Version Control 48 | 49 | Except in some cases — like better Documentation — this library uses [Git Flow](https://www.atlassian.com/git/tutorials/comparing-workflows/gitflow-workflow), i.e., most content will first be buffered inside a `dev` or `feature` branch before being merged into the `master` branch. 50 | 51 | However, I would advise you not to use `feature` indiscriminately, but rather choose a more appropriate name for your branch. For example, if you're contributing to a bug fix, I would suggest you use the format `bug_fix/`. In the end, the major contribution branches should look like: 52 | 53 | - `feature` 54 | - `code_improvement` 55 | - `bug_fix` 56 | - `docs` 57 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015, Cameron Davidson-Pilon 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | 1. The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | init: 2 | ifeq ($(TRAVIS), true) 3 | pip install -r reqs/travis-requirements.txt 4 | pip install pandas==${PANDAS_VERSION} 5 | pip list --local 6 | else 7 | pip install -r dev_requirements.txt 8 | pre-commit install 9 | endif 10 | 11 | test: 12 | py.test -rfs --cov=lifetimes --block=False --cov-report term-missing 13 | 14 | lint: 15 | ifeq ($(TRAVIS_PYTHON_VERSION), 2.7) 16 | echo "Skip linting for Python2.7" 17 | else 18 | black lifetimes/ -l 120 --fast 19 | black tests/ -l 120 --fast 20 | prospector --output-format grouped 21 | endif 22 | 23 | format: 24 | black . --line-length 120 25 | 26 | check_format: 27 | ifeq ($(TRAVIS_PYTHON_VERSION), 3.6) 28 | black . 
--check --line-length 120 29 | else 30 | echo "Only check format on Python3.6" 31 | endif 32 | 33 | pre: 34 | pre-commit run --all-files 35 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![](http://i.imgur.com/7s3jqZM.png) 2 | 3 | 4 | ## Read me first: Latest on the lifetimes project 5 | 6 | 👋 This codebase has moved to "archived-mode". We won't be adding new features, improvements, or even answering issues in this codebase. 7 | 8 | **A project has emerged as a successor to lifetimes, [PyMC-Lab/PyMC-Marketing](https://github.com/pymc-labs/pymc-marketing)**, please check it out! 9 | 10 | 11 | #### Measuring users is hard. Lifetimes makes it easy. 12 | [![Inactively Maintained](https://img.shields.io/badge/Maintenance%20Level-Inactively%20Maintained-yellowgreen.svg)](https://gist.github.com/cheerfulstoic/d107229326a01ff0f333a1d3476e068d) 13 | [![PyPI version](https://badge.fury.io/py/Lifetimes.svg)](https://badge.fury.io/py/Lifetimes) 14 | [![Documentation Status](https://readthedocs.org/projects/lifetimes/badge/?version=latest)](http://lifetimes.readthedocs.io/en/latest/?badge=latest) 15 | [![Build Status](https://travis-ci.org/CamDavidsonPilon/lifetimes.svg?branch=master)](https://travis-ci.org/CamDavidsonPilon/lifetimes) 16 | [![Coverage Status](https://coveralls.io/repos/CamDavidsonPilon/lifetimes/badge.svg?branch=master)](https://coveralls.io/r/CamDavidsonPilon/lifetimes?branch=master) 17 | 18 | 19 | 20 | ## Introduction 21 | 22 | Lifetimes can be used to analyze your users based on a few assumption: 23 | 24 | 1. Users interact with you when they are "alive". 25 | 2. Users under study may "die" after some period of time. 26 | 27 | I've quoted "alive" and "die" as these are the most abstract terms: feel free to use your own definition of "alive" and "die" (they are used similarly to "birth" and "death" in survival analysis). Whenever we have individuals repeating occurrences, we can use Lifetimes to help understand user behaviour. 28 | 29 | ### Applications 30 | 31 | If this is too abstract, consider these applications: 32 | 33 | - Predicting how often a visitor will return to your website. (Alive = visiting. Die = decided the website wasn't for them) 34 | - Understanding how frequently a patient may return to a hospital. (Alive = visiting. Die = maybe the patient moved to a new city, or became deceased.) 35 | - Predicting individuals who have churned from an app using only their usage history. (Alive = logins. Die = removed the app) 36 | - Predicting repeat purchases from a customer. (Alive = actively purchasing. Die = became disinterested with your product) 37 | - Predicting the lifetime value of your customers 38 | 39 | ### Specific Application: Customer Lifetime Value 40 | 41 | As emphasized by P. Fader and B. Hardie, understanding and acting on customer lifetime value (CLV) is the most important part of your business's sales efforts. [And (apparently) everyone is doing it wrong (Prof. Fader's Video Lecture)](https://www.youtube.com/watch?v=guj2gVEEx4s). *Lifetimes* is a Python library to calculate CLV for you. 42 | 43 | ## Installation 44 | 45 | ```bash 46 | pip install lifetimes 47 | ``` 48 | 49 | ## Contributing 50 | 51 | Please refer to the [Contributing Guide](https://github.com/CamDavidsonPilon/lifetimes/blob/master/CONTRIBUTING.md) before creating any *Pull Requests*. It will make life easier for everyone. 
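## Quick example

A minimal sketch of a typical session (see the Quickstart in the documentation linked below for the full walkthrough); it uses the CDNOW summary dataset bundled with the package:

```python
from lifetimes import BetaGeoFitter
from lifetimes.datasets import load_cdnow_summary

# RFM summary data shipped with the package: frequency, recency, T per customer
data = load_cdnow_summary(index_col=[0])

# fit the BG/NBD model
bgf = BetaGeoFitter(penalizer_coef=0.0)
bgf.fit(data['frequency'], data['recency'], data['T'])

# expected number of purchases for each customer over the next period
data['predicted_purchases'] = bgf.conditional_expected_number_of_purchases_up_to_time(
    1, data['frequency'], data['recency'], data['T']
)
```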
52 | 53 | ## Documentation and tutorials 54 | [Official documentation](http://lifetimes.readthedocs.io/en/latest/) 55 | 56 | 57 | ## Questions? Comments? Requests? 58 | 59 | Please create an issue in the [lifetimes repository](https://github.com/CamDavidsonPilon/lifetimes). 60 | 61 | ## Main Articles 62 | 63 | 1. Probably, the seminal article of Non-Contractual CLV is [*Counting Your Customers: Who Are They and What Will They Do Next?*](https://www.jstor.org/stable/2631608?seq=1#page_scan_tab_contents), by David C. Schmittlein, Donald G. Morrison and Richard Colombo. Despite it being paid, it is worth the read. The relevant information will eventually end up in this library's documentation though. 64 | 1. The other (more recent) paper is [*“Counting Your Customers” the Easy Way: 65 | An Alternative to the Pareto/NBD Model*](http://brucehardie.com/papers/018/fader_et_al_mksc_05.pdf), by Peter Fader, Bruce Hardie and Ka Lok Lee. 66 | 67 | ## More Information 68 | 69 | 1. [Roberto Medri](http://cdn.oreillystatic.com/en/assets/1/event/85/Case%20Study_%20What_s%20a%20Customer%20Worth_%20Presentation.pdf) did a nice presentation on CLV at Etsy. 70 | 1. [Papers](http://mktg.uni-svishtov.bg/ivm/resources/Counting_Your_Customers.pdf), lots of [papers](http://brucehardie.com/notes/009/pareto_nbd_derivations_2005-11-05.pdf). 71 | 1. R implementation is called [BTYD](http://cran.r-project.org/web/packages/BTYD/vignettes/BTYD-walkthrough.pdf) (*Buy 'Til You Die*). 72 | 1. [Bruce Hardie's Website](http://brucehardie.com/), especially his notes, is full of useful and essential explanations, many of which are featured in this library. 73 | -------------------------------------------------------------------------------- /dev_requirements.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | flake8 3 | autopep8 4 | pytest 5 | matplotlib 6 | pytest-cov==2.5.1 7 | pytest-mpl 8 | coveralls 9 | pydocstyle 10 | pycodestyle 11 | -------------------------------------------------------------------------------- /docs/Changelog.rst: -------------------------------------------------------------------------------- 1 | Changelog 2 | ========= 3 | 4 | 0.11.1 5 | ~~~~~~ 6 | 7 | - bump the Pandas requirements to >= 0.24.0. This should have been done 8 | in 0.11.0 9 | - suppress some warnings from autograd. 10 | 11 | .. _section-1: 12 | 13 | 0.11.0 14 | ~~~~~~ 15 | 16 | - Move most models (all but Pareto) to autograd for automatic 17 | differentiation of their likelihood. This results in faster (at least 18 | 3x) and more successful convergence, plus allows for some really 19 | exciting extensions (coming soon). 20 | - ``GammaGammaFitter``, ``BetaGeoFitter``, ``ModifiedBetaGeoFitter`` 21 | and ``BetaGeoBetaBinomFitter`` have three new attributes: 22 | ``confidence_interval_``, ``variance_matrix_`` and 23 | ``standard_errors_`` 24 | - ``params_`` on fitted models is not longer an OrderedDict, but a 25 | Pandas Series 26 | - ``GammaGammaFitter`` can accept a ``weights`` argument now. 27 | - ``customer_lifelime_value`` in ``GammaGamma`` now accepts a frequency 28 | argument. 29 | - fixed a bug that was causing ``ParetoNBDFitter`` to generate data 30 | incorrectly. 31 | 32 | .. 
_section-2: 33 | 34 | 0.10.1 35 | ~~~~~~ 36 | 37 | - performance improvements to ``generate_data.py`` for large datasets 38 | #195 39 | - performance improvements to ``summary_data_from_transaction_data``, 40 | thanks @MichaelSchreier 41 | - Previously, ``GammaGammaFitter`` would have an infinite mean when its 42 | ``q`` parameter was less than 1. This was possible for some datasets. 43 | In 0.10.1, a new argument is added to ``GammaGammaFitter`` to 44 | constrain that ``q`` is greater than 1. This can be done with 45 | ``q_constraint=True`` in the call to ``GammaGammaFitter.fit``. See 46 | issue #146. Thanks @vruvora 47 | - Stop support of scipy < 1.0. 48 | - Stop support of < Python 3.5. 49 | 50 | .. _section-3: 51 | 52 | 0.10.0 53 | ~~~~~~ 54 | 55 | - ``BetaGeoBetaBinomFitter.fit`` has replaced ``n_custs`` with the more 56 | appropriately named ``weights`` (to align with other statisical 57 | libraries). By default and if unspecified, ``weights`` is equal to an 58 | array of 1s. 59 | - The ``conditional_`` methods on ``BetaGeoBetaBinomFitter`` have been 60 | updated to handle exogenously provided recency, frequency and 61 | periods. 62 | - Performance improvements in ``BetaGeoBetaBinomFitter``. ``fit`` takes 63 | about 50% less time than previously. 64 | - ``BetaGeoFitter``, ``ParetoNBDFitter``, and ``ModifiedBetaGeoFitter`` 65 | both have a new ``weights`` argument in their ``fit``. This can be 66 | used to reduce the size of the data (collapsing subjects with the 67 | same recency, frequency, T). 68 | 69 | .. _section-4: 70 | 71 | 0.9.1 72 | ~~~~~ 73 | 74 | - Added a data generation method, ``generate_new_data`` to 75 | ``BetaGeoBetaBinomFitter``. @zscore 76 | - Fixed a bug in ``summary_data_from_transaction_data`` that was 77 | casting values to ``int`` prematurely. This was solved by including a 78 | new param ``freq_multiplier`` to be used to scale the resulting 79 | durations. See #100 for the original issue. @aprotopopov 80 | - Performance and bug fixes in 81 | ``utils.expected_cumulative_transactions``. @aprotopopov 82 | - Fixed a bug in ``utils.calculate_alive_path`` that was causing a 83 | difference in values compared to ``summary_from_transaction_data``. 84 | @DaniGate 85 | 86 | .. _section-5: 87 | 88 | 0.9.0 89 | ~~~~~ 90 | 91 | - fixed many of the numpy warnings as the result of fitting 92 | - added optional ``initial_params`` to all models 93 | - Added ``conditional_probability_of_n_purchases_up_to_time`` to 94 | ``ParetoNBDFitter`` 95 | - Fixed a bug in ``expected_cumulative_transactions`` and 96 | ``plot_cumulative_transactions`` 97 | 98 | .. _section-6: 99 | 100 | 0.8.1 101 | ~~~~~ 102 | 103 | - adding new ``save_model`` and ``load_model`` functions to all 104 | fitters. This will save the model locally as a pickle file. 105 | - ``observation_period_end`` in ``summary_data_from_transaction_data`` 106 | and ``calibration_and_holdout_data`` now defaults to the max date in 107 | the dataset, instead of current time. 108 | - improved stability of estimators. 109 | - improve Runtime warnings. 110 | - All fitters are now in a local file. This doesn’t change the API 111 | however. 112 | -------------------------------------------------------------------------------- /docs/High Level Overview.md: -------------------------------------------------------------------------------- 1 | # High Level Overview 2 | 3 | This is intended to be a high-level documentation of how the code is structured. 
Whenever possible, [UML](https://en.wikipedia.org/wiki/Unified_Modeling_Language) is used. Some of the standards applied in this documentation can be found [here](https://www.lucidchart.com/pages/uml-class-diagram). 4 | 5 | ## Workflow 6 | 7 | The usual workflow of using the `Lifetimes` library is exemplified in the [Quickstart](Quickstart.md) page. It can also be represented through the following flowchart: 8 | 9 | ![Basic Workflow](https://i.imgur.com/oV2KpQG.png) 10 | 11 | Notice that the right-most branch of the flowchart actually refers to *monetary value* modeling. 12 | 13 | ## Fitters 14 | 15 | The core fitter is the `BaseFitter` class, located in `__init__.py`, which serves as a *superclass* for most of the other fitters. So far, only the `ModifiedBetaGeoFitter` sits one layer higher, inheriting from the `BetaGeoFitter`. The following image shows the simplified interaction of the main fitter classes. 16 | 17 | ![Simplified Fitters Flowchart](https://i.imgur.com/RRF6ezC.png) 18 | 19 | Below is a more detailed class diagram. The arrows with the empty arrowheads symbolize inheritance. If the image is too small, you can find the source [here](https://i.imgur.com/ZPHg36q.png). 20 | 21 | ![Complete UML Class Diagram](https://i.imgur.com/ZPHg36q.png) 22 | 23 | ## Graphs 24 | 25 | Graphs are plotted with functions from the `plotting.py` file. The main functions are listed below, alongside a brief description of how each plot is created. 26 | 27 | ![plotting.py functions](https://i.imgur.com/hbmsPQk.png) 28 | 29 | - `plot_period_transactions` : aggregation of how many purchases each customer has made in the calibration period. 30 | - `plot_calibration_purchases_vs_holdout_purchases` : aggregation over the conditional expected number of purchases. 31 | - `plot_frequency_recency_matrix` : conditional expected number of purchases. 32 | - `plot_probability_alive_matrix` : conditional probability of the customer being alive. 33 | - `plot_expected_repeat_purchases` : expected number of purchases. 34 | - `plot_history_alive` : resampling from the model with the customer's specific parameters, using `calculate_alive_path` from the `utils.py` file. 35 | - `plot_cumulative_transactions` : plot coming from the `expected_cumulative_transactions` function. 36 | - `plot_incremental_transactions` : difference (de-cumulation) of the `expected_cumulative_transactions` output. 37 | - `plot_transaction_rate_heterogeneity` : Gamma distribution histogram. 38 | - `plot_dropout_rate_heterogeneity` : Beta distribution histogram. 39 | 40 | ## The `utils.py` File 41 | 42 | In the `utils.py` file we can find some useful functions that are used inside the library and/or can be accessed by the end user. Some of them are listed below: 43 | 44 | - `calibration_and_holdout_data` : RFM data separated into calibration and holdout periods. 45 | - `_find_first_transactions` : DataFrame with the first transactions. 46 | - `summary_data_from_transaction_data` : RFM summary for each customer, computed from the transactional data. 47 | - `calculate_alive_path` : alive path (history) of a specified customer based on the fitted model. 48 | - `expected_cumulative_transactions` : expected and actual repeated cumulative transactions. -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line.
5 | SPHINXOPTS = 6 | SPHINXBUILD = python -msphinx 7 | SPHINXPROJ = lifetimes 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | 22 | api: 23 | sphinx-apidoc ../lifetimes -o API -f -------------------------------------------------------------------------------- /docs/More examples and recipes.md: -------------------------------------------------------------------------------- 1 | ## More Examples and recipes 2 | 3 | ### Example SQL statement to transform transactional data into RFM data 4 | 5 | Let's review what our variables mean: 6 | 7 | - `frequency` represents the number of *repeat* purchases the customer has made, i.e., one less than the total number of purchases. More precisely, it's the count of distinct time periods in which the customer made a purchase; so if using days as units, it's the count of distinct days on which the customer made a purchase. 8 | - `T` represents the age of the customer in whatever time units are chosen. This is equal to the duration between a customer's first purchase and the end of the period under study. 9 | - `recency` represents the age of the customer when they made their most recent purchase. This is equal to the duration between a customer's first purchase and their latest purchase. (Thus if they have made only 1 purchase, the recency is 0.) 10 | 11 | Thus, executing a query against a transactional dataset, called `orders`, in a SQL store may look like: 12 | 13 | ```sql 14 | SELECT 15 | customer_id, 16 | COUNT(distinct date(transaction_at)) - 1 as frequency, 17 | datediff('day', MIN(transaction_at), MAX(transaction_at)) as recency, 18 | AVG(total_price) as monetary_value, 19 | datediff('day', MIN(transaction_at), CURRENT_DATE) as T -- customer age: first purchase to end of observation period 20 | FROM orders 21 | GROUP BY customer_id 22 | ``` 23 | 24 | ### Create table with RFM summary matrix with holdout 25 | 26 | Variables `frequency`, `T` and `recency` have the same meaning as in the previous section. 27 | 28 | Two variables to set before executing: 29 | 30 | - `duration_holdout` - holdout duration in days. 31 | - `CURRENT_DATE` - current date, could be changed to the final date of the transactional data.
32 | 33 | ```sql 34 | select 35 | a.*, 36 | COALESCE(b.frequency_holdout, 0) as frequency_holdout, 37 | duration_holdout as duration_holdout 38 | from ( 39 | select 40 | customer_id, 41 | datediff(max(event_date), min(event_date)) as recency, 42 | count(*) - 1 as frequency, 43 | datediff(date_sub(CURRENT_DATE, duration_holdout), min(event_date)) as T 44 | from orders 45 | where event_date < date_sub(CURRENT_DATE, duration_holdout) 46 | group by customer_id 47 | ) a 48 | left join ( 49 | select 50 | customer_id, 51 | count(*) as frequency_holdout 52 | from orders 53 | where event_date >= date_sub(CURRENT_DATE, duration_holdout) 54 | and event_date < CURRENT_DATE 55 | group by customer_id 56 | ) b 57 | on a.customer_id = b.customer_id 58 | ``` 59 | -------------------------------------------------------------------------------- /docs/Quickstart.md: -------------------------------------------------------------------------------- 1 | ## Quickstart 2 | 3 | For the following examples, we'll use a dataset from an ecommerce provider to analyze their customers' repeat purchases. The examples below are using the `cdnow_customers.csv` located in the `datasets/` directory. 4 | 5 | ```python 6 | from lifetimes.datasets import load_cdnow_summary 7 | data = load_cdnow_summary(index_col=[0]) 8 | 9 | print(data.head()) 10 | """ 11 | frequency recency T 12 | ID 13 | 1 2 30.43 38.86 14 | 2 1 1.71 38.86 15 | 3 0 0.00 38.86 16 | 4 0 0.00 38.86 17 | 5 0 0.00 38.86 18 | """ 19 | ``` 20 | 21 | #### The shape of your data 22 | For all models, the following nomenclature is used: 23 | 24 | - `frequency` represents the number of *repeat* purchases the customer has made. This means that it's one less than the total number of purchases. This is actually slightly wrong. It's the count of time periods the customer had a purchase in. So if using days as units, then it's the count of days the customer had a purchase on. 25 | - `T` represents the age of the customer in whatever time units chosen (weekly, in the above dataset). This is equal to the duration between a customer's first purchase and the end of the period under study. 26 | - `recency` represents the age of the customer when they made their most recent purchases. This is equal to the duration between a customer's first purchase and their latest purchase. (Thus if they have made only 1 purchase, the recency is 0.) 27 | - `monetary_value` represents the average value of a given customer's purchases. This is equal to the sum of all a customer's purchases divided by the total number of purchases. Note that the denominator here is different than the `frequency` described above. 28 | 29 | If your data is not in the format (very common), there are [utility functions](#example-using-transactional-datasets) in lifetimes to transform your data to look like this. 30 | 31 | #### Basic Frequency/Recency analysis using the BG/NBD model 32 | 33 | We'll use the **BG/NBD model** first. There are other models which we will explore in these docs, but this is the simplest to start with. 34 | 35 | ```python 36 | from lifetimes import BetaGeoFitter 37 | 38 | # similar API to scikit-learn and lifelines. 
39 | bgf = BetaGeoFitter(penalizer_coef=0.0) 40 | bgf.fit(data['frequency'], data['recency'], data['T']) 41 | print(bgf) 42 | """ 43 | 44 | """ 45 | 46 | bgf.summary 47 | """ 48 | coef se(coef) lower 95% bound upper 95% bound 49 | r 0.242593 0.012557 0.217981 0.267205 50 | alpha 4.413532 0.378221 3.672218 5.154846 51 | a 0.792886 0.185719 0.428877 1.156895 52 | b 2.425752 0.705345 1.043276 3.808229 53 | """ 54 | ``` 55 | 56 | After fitting, we have lots of nice methods and properties attached to the fitter object, like ``params_`` and ``summary``. 57 | 58 | For small sample sizes, the parameters can get implausibly large, so by adding an L2 penalty to the likelihood we can control how large these parameters can be. This is implemented by setting a positive `penalizer_coef` in the initialization of the model. In typical applications, penalizers on the order of 0.001 to 0.1 are effective. 59 | 60 | ##### Visualizing our Frequency/Recency Matrix 61 | 62 | Consider: a customer bought from you every day for three weeks straight, and we haven't heard from them in months. What are the chances they are still "alive"? Pretty small. On the other hand, a customer who historically buys from you once a quarter, and bought last quarter, is likely still alive. We can visualize this relationship using the **Frequency/Recency matrix**, which computes the expected number of transactions an artificial customer will make in the next time period, given his or her recency (age at last purchase) and frequency (the number of repeat transactions he or she has made). 63 | 64 | ```python 65 | from lifetimes.plotting import plot_frequency_recency_matrix 66 | 67 | plot_frequency_recency_matrix(bgf) 68 | ``` 69 | 70 | ![fr_matrix](http://imgur.com/Rw8PGcq.png) 71 | 72 | 73 | We can see that if a customer has bought 25 times from you, and their latest purchase was when they were 35 weeks old (and the individual is 35 weeks old now, so the purchase was very recent), then they are your best customer (bottom-right). Your coldest customers are those in the top-right corner: they bought a lot quickly, and we haven't seen them in weeks. 74 | 75 | There's also that beautiful "tail" around (5,25). That represents the customer who buys infrequently, but we've seen him or her recently, so they *might* buy again - we're not sure if they are dead or just between purchases. 76 | 77 | Another interesting matrix to look at is the probability of still being *alive*: 78 | 79 | ```python 80 | from lifetimes.plotting import plot_probability_alive_matrix 81 | 82 | plot_probability_alive_matrix(bgf) 83 | ``` 84 | 85 | ![prob](http://imgur.com/di6MTic.png) 86 | 87 | ##### Ranking customers from best to worst 88 | 89 | Let's return to our customers and rank them from "highest expected purchases in the next period" to lowest. Models expose a method that will predict a customer's expected purchases in the next period using their history. 90 | 91 | ```python 92 | t = 1 93 | data['predicted_purchases'] = bgf.conditional_expected_number_of_purchases_up_to_time(t, data['frequency'], data['recency'], data['T']) 94 | data.sort_values(by='predicted_purchases').tail(5) 95 | """ 96 | frequency recency T predicted_purchases 97 | ID 98 | 509 18 35.14 35.86 0.424877 99 | 841 19 34.00 34.14 0.474738 100 | 1981 17 28.43 28.86 0.486526 101 | 157 29 37.71 38.00 0.662396 102 | 1516 26 30.86 31.00 0.710623 103 | """ 104 | ``` 105 | 106 | Great, we can see that the customer who has made 26 purchases, and bought very recently from us, is probably going to buy again in the next period.
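The same quantity can be computed for a single customer profile via `predict`, which is an alias for `conditional_expected_number_of_purchases_up_to_time`. A quick sketch, reusing the frequency, recency and T of the last customer in the table above:

```python
# expected purchases in the next t=1 period for a customer with
# frequency=26, recency=30.86 and T=31.00 (the last row of the table above)
bgf.predict(1, 26, 30.86, 31.00)
# approximately 0.71, matching the predicted_purchases value above
```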
107 | 108 | ##### Assessing model fit 109 | 110 | Ok, we can predict and we can visualize our customers' behaviour, but is our model correct? There are a few ways to assess the model's correctness. The first is to compare your data versus artificial data simulated with your fitted model's parameters. 111 | 112 | ```python 113 | from lifetimes.plotting import plot_period_transactions 114 | plot_period_transactions(bgf) 115 | ``` 116 | 117 | ![model_fit_1](http://imgur.com/qlE4LDU.png) 118 | 119 | We can see that our actual data and our simulated data line up well. This proves that our model doesn't suck. 120 | 121 | ##### Example using transactional datasets 122 | 123 | Most often, the dataset you have at hand will be at the transaction level. Lifetimes has some utility functions to transform that transactional data (one row per purchase) into summary data (a frequency, recency and age dataset). 124 | 125 | ```python 126 | from lifetimes.datasets import load_transaction_data 127 | from lifetimes.utils import summary_data_from_transaction_data 128 | 129 | transaction_data = load_transaction_data() 130 | print(transaction_data.head()) 131 | """ 132 | date id 133 | 0 2014-03-08 00:00:00 0 134 | 1 2014-05-21 00:00:00 1 135 | 2 2014-03-14 00:00:00 2 136 | 3 2014-04-09 00:00:00 2 137 | 4 2014-05-21 00:00:00 2 138 | """ 139 | 140 | summary = summary_data_from_transaction_data(transaction_data, 'id', 'date', observation_period_end='2014-12-31') 141 | 142 | print(summary.head()) 143 | """ 144 | frequency recency T 145 | id 146 | 0 0.0 0.0 298.0 147 | 1 0.0 0.0 224.0 148 | 2 6.0 142.0 292.0 149 | 3 0.0 0.0 147.0 150 | 4 2.0 9.0 183.0 151 | """ 152 | 153 | bgf.fit(summary['frequency'], summary['recency'], summary['T']) 154 | # 155 | ``` 156 | 157 | ##### More model fitting 158 | 159 | With transactional data, we can partition the dataset into a calibration period dataset and a holdout dataset. This is important as we want to test how our model performs on data not yet seen (think cross-validation in standard machine learning literature). 
Lifetimes has a function to partition our dataset like this: 160 | 161 | ```python 162 | from lifetimes.utils import calibration_and_holdout_data 163 | 164 | summary_cal_holdout = calibration_and_holdout_data(transaction_data, 'id', 'date', 165 | calibration_period_end='2014-09-01', 166 | observation_period_end='2014-12-31' ) 167 | print(summary_cal_holdout.head()) 168 | """ 169 | frequency_cal recency_cal T_cal frequency_holdout duration_holdout 170 | id 171 | 0 0.0 0.0 177.0 0.0 121 172 | 1 0.0 0.0 103.0 0.0 121 173 | 2 6.0 142.0 171.0 0.0 121 174 | 3 0.0 0.0 26.0 0.0 121 175 | 4 2.0 9.0 62.0 0.0 121 176 | """ 177 | ``` 178 | 179 | With this dataset, we can perform fitting on the `_cal` columns, and test on the `_holdout` columns: 180 | 181 | ```python 182 | from lifetimes.plotting import plot_calibration_purchases_vs_holdout_purchases 183 | 184 | bgf.fit(summary_cal_holdout['frequency_cal'], summary_cal_holdout['recency_cal'], summary_cal_holdout['T_cal']) 185 | plot_calibration_purchases_vs_holdout_purchases(bgf, summary_cal_holdout) 186 | ``` 187 | 188 | ![holdout](http://imgur.com/LdSEYUwl.png) 189 | 190 | ##### Customer Predictions 191 | 192 | Based on customer history, we can predict what an individual's future purchases might look like: 193 | 194 | ```python 195 | t = 10 # predict purchases in 10 periods 196 | individual = summary.iloc[20] 197 | # The below function is an alias for `bgf.conditional_expected_number_of_purchases_up_to_time` 198 | bgf.predict(t, individual['frequency'], individual['recency'], individual['T']) 199 | # 0.0576511 200 | ``` 201 | 202 | ##### Customer Probability Histories 203 | 204 | Given a customer transaction history, we can calculate their historical probability of being alive, according to 205 | our trained model. For example: 206 | 207 | ```python 208 | from lifetimes.plotting import plot_history_alive 209 | 210 | id = 35 211 | days_since_birth = 200 212 | sp_trans = transaction_data.loc[transaction_data['id'] == id] 213 | plot_history_alive(bgf, days_since_birth, sp_trans, 'date') 214 | ``` 215 | 216 | ![history](http://i.imgur.com/y45tum4.png) 217 | 218 | ### Estimating customer lifetime value using the Gamma-Gamma model 219 | 220 | So far we have not taken into account the economic value of each transaction, focusing mainly on 221 | when transactions occur. To estimate this value we can use the Gamma-Gamma submodel. But first we need to create summary data 222 | from the transactional data that also contains an economic value for each transaction (i.e. profits or revenues). 223 | 224 | ```python 225 | from lifetimes.datasets import load_cdnow_summary_data_with_monetary_value 226 | 227 | summary_with_money_value = load_cdnow_summary_data_with_monetary_value() 228 | summary_with_money_value.head() 229 | returning_customers_summary = summary_with_money_value[summary_with_money_value['frequency']>0] 230 | 231 | print(returning_customers_summary.head()) 232 | """ 233 | frequency recency T monetary_value 234 | customer_id 235 | 1 2 30.43 38.86 22.35 236 | 2 1 1.71 38.86 11.77 237 | 6 7 29.43 38.86 73.74 238 | 7 1 5.00 38.86 11.77 239 | 9 2 35.71 38.86 25.55 240 | """ 241 | ``` 242 | 243 | If computing the monetary value from your own data, note that it is the __mean__ of a given customer's value, not the __sum__. 244 | `monetary_value` can be used to represent profit, or revenue, or any value as long as it is consistently calculated for each customer.
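If you are building this summary from your own transaction log rather than the bundled dataset, `summary_data_from_transaction_data` can compute the `monetary_value` column for you. A small sketch, assuming your transactional DataFrame has an additional per-purchase value column (the column name `'amount'` here is hypothetical):

```python
from lifetimes.utils import summary_data_from_transaction_data

# 'amount' is a placeholder name for the per-transaction profit/revenue column
summary_with_value = summary_data_from_transaction_data(
    transaction_data, 'id', 'date',
    monetary_value_col='amount',
    observation_period_end='2014-12-31'
)
# the resulting monetary_value column holds each customer's average transaction value
```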
245 | 246 | #### The Gamma-Gamma model and the independence assumption 247 | The model we are going to use to estimate the CLV for our userbase is called the Gamma-Gamma submodel, 248 | which relies upon an important assumption. The Gamma-Gamma submodel, in fact, assumes that there is no 249 | relationship between the monetary value and the purchase frequency. In practice we need to check whether 250 | the Pearson correlation between the two vectors is close to 0 in order to use this model. 251 | 252 | ```python 253 | returning_customers_summary[['monetary_value', 'frequency']].corr() 254 | """ 255 | monetary_value frequency 256 | monetary_value 1.000000 0.113884 257 | frequency 0.113884 1.000000 258 | """ 259 | ``` 260 | 261 | At this point we can train our Gamma-Gamma submodel and predict the conditional, expected average lifetime value of our customers. 262 | 263 | ```python 264 | from lifetimes import GammaGammaFitter 265 | 266 | ggf = GammaGammaFitter(penalizer_coef = 0) 267 | ggf.fit(returning_customers_summary['frequency'], 268 | returning_customers_summary['monetary_value']) 269 | print(ggf) 270 | """ 271 | 272 | """ 273 | ``` 274 | We can now estimate the average transaction value: 275 | 276 | ```python 277 | print(ggf.conditional_expected_average_profit( 278 | summary_with_money_value['frequency'], 279 | summary_with_money_value['monetary_value'] 280 | ).head(10)) 281 | """ 282 | customer_id 283 | 1 24.658619 284 | 2 18.911489 285 | 3 35.170981 286 | 4 35.170981 287 | 5 35.170981 288 | 6 71.462843 289 | 7 18.911489 290 | 8 35.170981 291 | 9 27.282408 292 | 10 35.170981 293 | dtype: float64 294 | """ 295 | 296 | print("Expected conditional average profit: %s, Average profit: %s" % ( 297 | ggf.conditional_expected_average_profit( 298 | summary_with_money_value['frequency'], 299 | summary_with_money_value['monetary_value'] 300 | ).mean(), 301 | summary_with_money_value[summary_with_money_value['frequency']>0]['monetary_value'].mean() 302 | )) 303 | """ 304 | Expected conditional average profit: 35.2529588256, Average profit: 35.078551797 305 | """ 306 | ``` 307 | 308 | While for computing the total CLV using the DCF method (https://en.wikipedia.org/wiki/Discounted_cash_flow) adjusting for cost of capital: 309 | 310 | ```python 311 | # refit the BG model to the summary_with_money_value dataset 312 | bgf.fit(summary_with_money_value['frequency'], summary_with_money_value['recency'], summary_with_money_value['T']) 313 | 314 | print(ggf.customer_lifetime_value( 315 | bgf, #the model to use to predict the number of future transactions 316 | summary_with_money_value['frequency'], 317 | summary_with_money_value['recency'], 318 | summary_with_money_value['T'], 319 | summary_with_money_value['monetary_value'], 320 | time=12, # months 321 | discount_rate=0.01 # monthly discount rate ~ 12.7% annually 322 | ).head(10)) 323 | """ 324 | customer_id 325 | 1 140.096211 326 | 2 18.943467 327 | 3 38.180574 328 | 4 38.180574 329 | 5 38.180574 330 | 6 1003.868107 331 | 7 28.109683 332 | 8 38.180574 333 | 9 167.418216 334 | 10 38.180574 335 | Name: clv, dtype: float64 336 | """ 337 | ``` 338 | -------------------------------------------------------------------------------- /docs/Saving and loading model.md: -------------------------------------------------------------------------------- 1 | ## Saving and loading model 2 | 3 | When you have lots of data and training takes a lot of time option with saving and loading model could be useful. First you need to fit the model, then save it and load. 
4 | 5 | ### Fit model 6 | 7 | ```python 8 | from lifetimes import BetaGeoFitter 9 | from lifetimes.datasets import load_cdnow_summary 10 | 11 | data = load_cdnow_summary(index_col=[0]) 12 | bgf = BetaGeoFitter() 13 | bgf.fit(data['frequency'], data['recency'], data['T']) 14 | bgf 15 | """""" 16 | ``` 17 | 18 | ### Saving model 19 | 20 | Model will be saved with [dill](https://github.com/uqfoundation/dill) to pickle object. Optional parameters `save_data` and `save_generate_data_method` are present to reduce final pickle object size for big dataframes. 21 | Optional parameters: 22 | - `save_data` is used for saving data from model or not (default: `True`). 23 | - `save_generate_data_method` is used for saving `generate_new_data` method from model or not (default: `True`) 24 | 25 | ```python 26 | bgf.save_model('bgf.pkl') 27 | ``` 28 | 29 | or to save only model with minumum size without `data` and `generate_new_data`: 30 | ```python 31 | bgf.save_model('bgf_small_size.pkl', save_data=False, save_generate_data_method=False) 32 | ``` 33 | 34 | ### Loading model 35 | 36 | Before loading you should initialize the model first and then use method `load_model` 37 | 38 | ```python 39 | bgf_loaded = BetaGeoFitter() 40 | bgf_loaded.load_model('bgf.pkl') 41 | bgf_loaded 42 | """""" 43 | ``` 44 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # lifetimes documentation build configuration file, created by 5 | # sphinx-quickstart on Fri Jul 7 14:10:36 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | # import os 21 | # import sys 22 | # sys.path.insert(0, os.path.abspath('.')) 23 | 24 | import sphinx_rtd_theme 25 | 26 | html_theme = "sphinx_rtd_theme" 27 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 28 | 29 | # Convert package README.md to intro.rst to include in index.rst for docs 30 | try: 31 | import pypandoc 32 | 33 | long_description = pypandoc.convert_file("../README.md", "rst", outputfile="intro.rst") 34 | except (ImportError): 35 | print("Install pypandoc to convert README.md to intro.rst") 36 | 37 | 38 | # -- General configuration ------------------------------------------------ 39 | 40 | # If your documentation needs a minimal Sphinx version, state it here. 41 | # 42 | # needs_sphinx = '1.0' 43 | 44 | # Add any Sphinx extension module names here, as strings. They can be 45 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 46 | # ones. 47 | extensions = [ 48 | "sphinx.ext.autodoc", 49 | "sphinx.ext.todo", 50 | "sphinx.ext.coverage", 51 | "sphinx.ext.mathjax", 52 | "sphinx.ext.napoleon", 53 | "recommonmark", 54 | ] 55 | 56 | # Add any paths that contain templates here, relative to this directory. 
57 | templates_path = ["_templates"] 58 | 59 | # for parsing markdown files 60 | source_suffix = {".rst": "restructuredtext", ".txt": "markdown", ".md": "markdown"} 61 | 62 | 63 | # The master toctree document. 64 | master_doc = "index" 65 | 66 | # General information about the project. 67 | project = "lifetimes" 68 | copyright = "2015, Cameron Davidson-Pilon" 69 | author = "Cameron Davidson-Pilon" 70 | 71 | # The version info for the project you're documenting, acts as replacement for 72 | # |version| and |release|, also used in various other places throughout the 73 | # built documents. 74 | # 75 | # The short X.Y version. 76 | version = "0.11.2" 77 | # The full version, including alpha/beta/rc tags. 78 | release = version 79 | 80 | # The language for content autogenerated by Sphinx. Refer to documentation 81 | # for a list of supported languages. 82 | # 83 | # This is also used if you do content translation via gettext catalogs. 84 | # Usually you set "language" from the command line for these cases. 85 | language = None 86 | 87 | # List of patterns, relative to source directory, that match files and 88 | # directories to ignore when looking for source files. 89 | # This patterns also effect to html_static_path and html_extra_path 90 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 91 | 92 | # The name of the Pygments (syntax highlighting) style to use. 93 | pygments_style = "sphinx" 94 | 95 | # If true, `todo` and `todoList` produce output, else they produce nothing. 96 | todo_include_todos = True 97 | 98 | 99 | # -- Options for HTML output ---------------------------------------------- 100 | 101 | # The theme to use for HTML and HTML Help pages. See the documentation for 102 | # a list of builtin themes. 103 | # 104 | # html_theme = 'alabaster' 105 | html_theme = "sphinx_rtd_theme" 106 | 107 | # Theme options are theme-specific and customize the look and feel of a theme 108 | # further. For a list of options available for each theme, see the 109 | # documentation. 110 | # 111 | # html_theme_options = {} 112 | 113 | # Add any paths that contain custom static files (such as style sheets) here, 114 | # relative to this directory. They are copied after the builtin static files, 115 | # so a file named "default.css" will overwrite the builtin "default.css". 116 | html_static_path = ["_static"] 117 | 118 | # -- Napoleon settings ---------------------------------------------------- 119 | napoleon_google_docstring = True 120 | napoleon_numpy_docstring = True 121 | napoleon_include_init_with_doc = False 122 | napoleon_include_private_with_doc = False 123 | napoleon_include_special_with_doc = False 124 | napoleon_use_admonition_for_examples = False 125 | napoleon_use_admonition_for_notes = False 126 | napoleon_use_admonition_for_references = False 127 | napoleon_use_ivar = False 128 | napoleon_use_param = True 129 | napoleon_use_rtype = False 130 | napoleon_use_keyword = True 131 | 132 | 133 | # -- Options for HTMLHelp output ------------------------------------------ 134 | 135 | # Output file base name for HTML help builder. 136 | htmlhelp_basename = "lifetimesdoc" 137 | 138 | 139 | # -- Options for LaTeX output --------------------------------------------- 140 | 141 | latex_elements = { 142 | # The paper size ('letterpaper' or 'a4paper'). 143 | # 144 | # 'papersize': 'letterpaper', 145 | # The font size ('10pt', '11pt' or '12pt'). 146 | # 147 | # 'pointsize': '10pt', 148 | # Additional stuff for the LaTeX preamble. 
149 | # 150 | # 'preamble': '', 151 | # Latex figure (float) alignment 152 | # 153 | # 'figure_align': 'htbp', 154 | } 155 | 156 | # Grouping the document tree into LaTeX files. List of tuples 157 | # (source start file, target name, title, 158 | # author, documentclass [howto, manual, or own class]). 159 | latex_documents = [(master_doc, "lifetimes.tex", "lifetimes Documentation", "Cameron Davidson-Pilon", "manual")] 160 | 161 | 162 | # -- Options for manual page output --------------------------------------- 163 | 164 | # One entry per manual page. List of tuples 165 | # (source start file, name, description, authors, manual section). 166 | man_pages = [(master_doc, "lifetimes", "lifetimes Documentation", [author], 1)] 167 | 168 | 169 | # -- Options for Texinfo output ------------------------------------------- 170 | 171 | # Grouping the document tree into Texinfo files. List of tuples 172 | # (source start file, target name, title, author, 173 | # dir menu entry, description, category) 174 | texinfo_documents = [ 175 | ( 176 | master_doc, 177 | "lifetimes", 178 | "lifetimes Documentation", 179 | author, 180 | "lifetimes", 181 | "One line description of project.", 182 | "Miscellaneous", 183 | ) 184 | ] 185 | -------------------------------------------------------------------------------- /docs/docs_requirements.txt: -------------------------------------------------------------------------------- 1 | -r ../dev_requirements.txt 2 | recommonmark 3 | sphinxcontrib-napoleon 4 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. lifetimes documentation master file, created by 2 | sphinx-quickstart on Fri Jul 7 14:10:36 2017. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | .. include:: intro.rst 7 | 8 | .. toctree:: 9 | :maxdepth: 2 10 | :caption: Contents: 11 | 12 | Quickstart 13 | High Level Overview 14 | Saving and loading model 15 | More examples and recipes 16 | lifetimes 17 | Changelog 18 | 19 | 20 | Indices and tables 21 | ------------------ 22 | 23 | * :ref:`genindex` 24 | * :ref:`modindex` 25 | * :ref:`search` 26 | -------------------------------------------------------------------------------- /docs/intro.rst: -------------------------------------------------------------------------------- 1 | |image0| 2 | 3 | Measuring users is hard. Lifetimes makes it easy. 4 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 5 | 6 | |PyPI version| |Documentation Status| |Build Status| |Coverage Status| 7 | 8 | 9 | Read me first: Latest on the lifetimes project 10 | ----------------------------------------------- 11 | 12 | 👋 This project has moved to "maintenance-mode". I won't be adding new features, improvements, or even answering issues on this project (but perhaps the occasional bug fix). Why? I don't use lifetimes anymore, nor do I keep up with the literature around RFM. I would love to see a successor library that elevates RFM even further (and please include covariates!) 13 | 14 | 15 | Introduction 16 | ------------ 17 | 18 | Lifetimes can be used to analyze your users based on a few assumption: 19 | 20 | 1. Users interact with you when they are “alive”. 21 | 2. Users under study may “die” after some period of time. 
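To make these two assumptions concrete, here is a minimal sketch that fits the BG/NBD model on the bundled CDNOW summary and asks how likely each customer is to still be "alive"; it only uses `BetaGeoFitter`, `load_cdnow_summary`, and `conditional_probability_alive`, all of which appear elsewhere in this package.

```python
from lifetimes import BetaGeoFitter
from lifetimes.datasets import load_cdnow_summary

data = load_cdnow_summary(index_col=[0])

bgf = BetaGeoFitter()
bgf.fit(data["frequency"], data["recency"], data["T"])

# Probability that each customer is still "alive" given their purchase history.
data["p_alive"] = bgf.conditional_probability_alive(
    data["frequency"], data["recency"], data["T"]
)
data.sort_values("p_alive").head()
```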
22 | 23 | I’ve quoted “alive” and “die” as these are the most abstract terms: feel 24 | free to use your own definition of “alive” and “die” (they are used 25 | similarly to “birth” and “death” in survival analysis). Whenever we have 26 | individuals repeating occurrences, we can use Lifetimes to help 27 | understand user behaviour. 28 | 29 | Applications 30 | ~~~~~~~~~~~~ 31 | 32 | If this is too abstract, consider these applications: 33 | 34 | - Predicting how often a visitor will return to your website. (Alive = 35 | visiting. Die = decided the website wasn’t for them) 36 | - Understanding how frequently a patient may return to a hospital. 37 | (Alive = visiting. Die = maybe the patient moved to a new city, or 38 | became deceased.) 39 | - Predicting individuals who have churned from an app using only their 40 | usage history. (Alive = logins. Die = removed the app) 41 | - Predicting repeat purchases from a customer. (Alive = actively 42 | purchasing. Die = became disinterested with your product) 43 | - Predicting the lifetime value of your customers 44 | 45 | Specific Application: Customer Lifetime Value 46 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 47 | 48 | As emphasized by P. Fader and B. Hardie, understanding and acting on 49 | customer lifetime value (CLV) is the most important part of your 50 | business’s sales efforts. `And (apparently) everyone is doing it 51 | wrong `__. *Lifetimes* is a 52 | Python library to calculate CLV for you. 53 | 54 | Installation 55 | ------------ 56 | 57 | :: 58 | 59 | pip install lifetimes 60 | 61 | Documentation and tutorials 62 | --------------------------- 63 | 64 | `Official documentation `__ 65 | 66 | Questions? Comments? Requests? 67 | ------------------------------ 68 | 69 | Please create an issue in the `lifetimes 70 | repository `__. 71 | 72 | More Information 73 | ---------------- 74 | 75 | 1. `Roberto 76 | Medri `__ 77 | did a nice presentation on CLV at Etsy. 78 | 2. `Papers `__, 79 | lots of 80 | `papers `__. 81 | 3. R implementation is called 82 | `BTYD `__ 83 | (for, *Buy ’Til You Die*). 84 | 85 | .. |image0| image:: http://i.imgur.com/7s3jqZM.png 86 | .. |PyPI version| image:: https://badge.fury.io/py/Lifetimes.svg 87 | :target: https://badge.fury.io/py/Lifetimes 88 | .. |Documentation Status| image:: https://readthedocs.org/projects/lifetimes/badge/?version=latest 89 | :target: http://lifetimes.readthedocs.io/en/latest/?badge=latest 90 | .. |Build Status| image:: https://travis-ci.org/CamDavidsonPilon/lifetimes.svg?branch=master 91 | :target: https://travis-ci.org/CamDavidsonPilon/lifetimes 92 | .. |Coverage Status| image:: https://coveralls.io/repos/CamDavidsonPilon/lifetimes/badge.svg?branch=master 93 | :target: https://coveralls.io/r/CamDavidsonPilon/lifetimes?branch=master 94 | -------------------------------------------------------------------------------- /docs/lifetimes.datasets.rst: -------------------------------------------------------------------------------- 1 | lifetimes.datasets 2 | ========================== 3 | 4 | .. automodule:: lifetimes.datasets 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/lifetimes.fitters.rst: -------------------------------------------------------------------------------- 1 | lifetimes.fitters 2 | ========================= 3 | 4 | 5 | lifetimes.fitters.beta\_geo\_beta\_binom\_fitter module 6 | ------------------------------------------------------- 7 | 8 | .. 
automodule:: lifetimes.fitters.beta_geo_beta_binom_fitter 9 | :members: 10 | :undoc-members: 11 | :show-inheritance: 12 | 13 | lifetimes.fitters.beta\_geo\_fitter module 14 | ------------------------------------------ 15 | 16 | .. automodule:: lifetimes.fitters.beta_geo_fitter 17 | :members: 18 | :undoc-members: 19 | :show-inheritance: 20 | 21 | lifetimes.fitters.gamma\_gamma\_fitter module 22 | --------------------------------------------- 23 | 24 | .. automodule:: lifetimes.fitters.gamma_gamma_fitter 25 | :members: 26 | :undoc-members: 27 | :show-inheritance: 28 | 29 | lifetimes.fitters.modified\_beta\_geo\_fitter module 30 | ---------------------------------------------------- 31 | 32 | .. automodule:: lifetimes.fitters.modified_beta_geo_fitter 33 | :members: 34 | :undoc-members: 35 | :show-inheritance: 36 | 37 | lifetimes.fitters.pareto\_nbd\_fitter module 38 | -------------------------------------------- 39 | 40 | .. automodule:: lifetimes.fitters.pareto_nbd_fitter 41 | :members: 42 | :undoc-members: 43 | :show-inheritance: 44 | 45 | 46 | .. automodule:: lifetimes.fitters 47 | :members: 48 | :undoc-members: 49 | :show-inheritance: 50 | -------------------------------------------------------------------------------- /docs/lifetimes.rst: -------------------------------------------------------------------------------- 1 | lifetimes package 2 | ================= 3 | 4 | .. toctree:: 5 | 6 | lifetimes.fitters 7 | lifetimes.utils 8 | lifetimes.datasets 9 | 10 | 11 | lifetimes.generate\_data module 12 | ------------------------------- 13 | 14 | .. automodule:: lifetimes.generate_data 15 | :members: 16 | :undoc-members: 17 | :show-inheritance: 18 | 19 | lifetimes.plotting module 20 | ------------------------- 21 | 22 | .. automodule:: lifetimes.plotting 23 | :members: 24 | :undoc-members: 25 | :show-inheritance: 26 | 27 | lifetimes.utils module 28 | ---------------------- 29 | 30 | .. automodule:: lifetimes.utils 31 | :members: 32 | :undoc-members: 33 | :show-inheritance: 34 | 35 | .. automodule:: lifetimes 36 | :members: 37 | :undoc-members: 38 | :show-inheritance: 39 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=python -msphinx 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=lifetimes 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The Sphinx module was not found. Make sure you have Sphinx installed, 20 | echo.then set the SPHINXBUILD environment variable to point to the full 21 | echo.path of the 'sphinx-build' executable. Alternatively you may add the 22 | echo.Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/modules.rst: -------------------------------------------------------------------------------- 1 | lifetimes 2 | ========= 3 | 4 | .. 
toctree:: 5 | :maxdepth: 4 6 | 7 | lifetimes 8 | -------------------------------------------------------------------------------- /lifetimes/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """All fitters from fitters directory.""" 3 | from .version import __version__ 4 | from .fitters import BaseFitter 5 | from .fitters.beta_geo_fitter import BetaGeoFitter 6 | from .fitters.beta_geo_beta_binom_fitter import BetaGeoBetaBinomFitter 7 | from .fitters.modified_beta_geo_fitter import ModifiedBetaGeoFitter 8 | from .fitters.pareto_nbd_fitter import ParetoNBDFitter 9 | from .fitters.gamma_gamma_fitter import GammaGammaFitter 10 | 11 | __all__ = ( 12 | "__version__", 13 | "BetaGeoFitter", 14 | "ParetoNBDFitter", 15 | "GammaGammaFitter", 16 | "ModifiedBetaGeoFitter", 17 | "BetaGeoBetaBinomFitter", 18 | ) 19 | -------------------------------------------------------------------------------- /lifetimes/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # modified from https://github.com/CamDavidsonPilon/lifelines/ 3 | 4 | import pandas as pd 5 | from .. import utils 6 | from pkg_resources import resource_filename 7 | 8 | __all__ = [ 9 | "load_cdnow_summary", 10 | "load_transaction_data", 11 | "load_cdnow_summary_data_with_monetary_value", 12 | "load_donations", 13 | ] 14 | 15 | 16 | def load_dataset(filename, **kwargs): 17 | """ 18 | Load a dataset from lifetimes.datasets. 19 | 20 | Parameters 21 | ---------- 22 | filename: str 23 | for example "larynx.csv" 24 | usecols: list 25 | Passed to **kwargs, list of columns in file to use. 26 | **kwargs 27 | Passed to pandas.read_csv function. 28 | 29 | Returns 30 | ------- 31 | DataFrame 32 | 33 | """ 34 | return pd.read_csv(resource_filename("lifetimes", "datasets/" + filename), **kwargs) 35 | 36 | 37 | def load_donations(**kwargs): 38 | """Load donations dataset as pandas DataFrame.""" 39 | return load_dataset("donations.csv", **kwargs) 40 | 41 | 42 | def load_cdnow_summary(**kwargs): 43 | """Load cdnow customers summary pandas DataFrame.""" 44 | return load_dataset("cdnow_customers_summary.csv", **kwargs) 45 | 46 | 47 | def load_transaction_data(**kwargs): 48 | """ 49 | Return a Pandas dataframe of transactional data. 50 | 51 | Looks like: 52 | 53 | date id 54 | 0 2014-03-08 00:00:00 0 55 | 1 2014-05-21 00:00:00 1 56 | 2 2014-03-14 00:00:00 2 57 | 3 2014-04-09 00:00:00 2 58 | 4 2014-05-21 00:00:00 2 59 | 60 | The data was artificially created using Lifetimes data generation routines. Data was generated 61 | between 2014-01-01 to 2014-12-31. 
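The loaders in this module are thin wrappers around `pandas.read_csv`. A small sketch of using them; the column names in the comments come from the docstrings and CSV headers in this package:

```python
from lifetimes.datasets import (
    load_cdnow_summary,
    load_donations,
    load_transaction_data,
)

summary = load_cdnow_summary(index_col=[0])  # frequency, recency, T per customer
donations = load_donations()                 # frequency, recency, periods, weights
transactions = load_transaction_data()       # raw (date, id) transaction log

summary.head(), donations.head(), transactions.head()
```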
62 | 63 | """ 64 | return load_dataset("example_transactions.csv", **kwargs) 65 | 66 | 67 | def load_cdnow_summary_data_with_monetary_value(**kwargs): 68 | """Load cdnow customers summary with monetary value as pandas DataFrame.""" 69 | df = load_dataset("cdnow_customers_summary_with_transactions.csv", **kwargs) 70 | df.columns = ["customer_id", "frequency", "recency", "T", "monetary_value"] 71 | df = df.set_index("customer_id") 72 | return df 73 | -------------------------------------------------------------------------------- /lifetimes/datasets/donations.csv: -------------------------------------------------------------------------------- 1 | frequency,recency,periods,weights 2 | 0,0,6,3464 3 | 1,1,6,1091 4 | 1,2,6,277 5 | 1,3,6,129 6 | 1,4,6,78 7 | 1,5,6,119 8 | 1,6,6,129 9 | 2,2,6,613 10 | 2,3,6,255 11 | 2,4,6,155 12 | 2,5,6,173 13 | 2,6,6,234 14 | 3,3,6,322 15 | 3,4,6,181 16 | 3,5,6,225 17 | 3,6,6,357 18 | 4,4,6,240 19 | 4,5,6,284 20 | 4,6,6,512 21 | 5,5,6,335 22 | 5,6,6,728 23 | 6,6,6,1203 24 | -------------------------------------------------------------------------------- /lifetimes/fitters/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Base fitter for other classes.""" 3 | import warnings 4 | 5 | warnings.simplefilter(action="ignore", category=FutureWarning) 6 | import dill 7 | import numpy as np 8 | import pandas as pd 9 | from textwrap import dedent 10 | from scipy.optimize import minimize 11 | from autograd import value_and_grad, hessian 12 | from ..utils import _save_obj_without_attr, ConvergenceError 13 | 14 | 15 | class BaseFitter(object): 16 | """Base class for fitters.""" 17 | 18 | def __repr__(self): 19 | """Representation of fitter.""" 20 | classname = self.__class__.__name__ 21 | try: 22 | subj_str = " fitted with {:d} subjects,".format(self.data.shape[0]) 23 | except AttributeError: 24 | subj_str = "" 25 | 26 | try: 27 | param_str = ", ".join("{}: {:.2f}".format(par, val) for par, val in sorted(self.params_.items())) 28 | return "".format( 29 | classname=classname, subj_str=subj_str, param_str=param_str 30 | ) 31 | except AttributeError: 32 | return "".format(classname=classname) 33 | 34 | def _unload_params(self, *args): 35 | if not hasattr(self, "params_"): 36 | raise ValueError("Model has not been fit yet. Please call the .fit" " method first.") 37 | return [self.params_[x] for x in args] 38 | 39 | def save_model(self, path, save_data=True, save_generate_data_method=True, values_to_save=None): 40 | """ 41 | Save model with dill package. 42 | 43 | Parameters 44 | ---------- 45 | path: str 46 | Path where to save model. 47 | save_data: bool, optional 48 | Whether to save data from fitter.data to pickle object 49 | save_generate_data_method: bool, optional 50 | Whether to save generate_new_data method (if it exists) from 51 | fitter.generate_new_data to pickle object. 52 | values_to_save: list, optional 53 | Placeholders for original attributes for saving object. If None 54 | will be extended to attr_list length like [None] * len(attr_list) 55 | 56 | """ 57 | attr_list = ["data" * (not save_data), "generate_new_data" * (not save_generate_data_method)] 58 | _save_obj_without_attr(self, attr_list, path, values_to_save=values_to_save) 59 | 60 | def load_model(self, path): 61 | """ 62 | Load model with dill package. 63 | 64 | Parameters 65 | ---------- 66 | path: str 67 | From what path load model. 
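Beyond saving and loading, `BaseFitter` also exposes the fit diagnostics computed a little further down (`summary`, `standard_errors_`, `confidence_intervals_`). A short sketch of inspecting them, assuming a `BetaGeoFitter` fitted on the CDNOW summary as in the Quickstart:

```python
from lifetimes import BetaGeoFitter
from lifetimes.datasets import load_cdnow_summary

data = load_cdnow_summary(index_col=[0])
bgf = BetaGeoFitter().fit(data["frequency"], data["recency"], data["T"])

print(bgf.summary)                # coef, se(coef) and 95% bounds per parameter
print(bgf.standard_errors_)       # derived from the inverse Hessian of the fit
print(bgf.confidence_intervals_)  # normal-approximation 95% intervals
```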
68 | 69 | """ 70 | with open(path, "rb") as in_file: 71 | self.__dict__.update(dill.load(in_file).__dict__) 72 | 73 | def _compute_variance_matrix(self): 74 | params_ = self.params_ 75 | return pd.DataFrame( 76 | (params_ ** 2).values * np.linalg.inv(self._hessian_) / self.data["weights"].sum(), 77 | columns=params_.index, 78 | index=params_.index, 79 | ) 80 | 81 | def _compute_standard_errors(self): 82 | return np.sqrt(pd.Series(np.diag(self.variance_matrix_.values), index=self.params_.index)) 83 | 84 | def _compute_confidence_intervals(self): 85 | inv_cdf_at_5_confidence = 1.96 86 | return pd.DataFrame( 87 | { 88 | "lower 95% bound": self.params_ - inv_cdf_at_5_confidence * self.standard_errors_, 89 | "upper 95% bound": self.params_ + inv_cdf_at_5_confidence * self.standard_errors_, 90 | }, 91 | index=self.params_.index, 92 | ) 93 | 94 | def _fit(self, minimizing_function_args, initial_params, params_size, disp, tol=1e-7, bounds=None, **kwargs): 95 | # set options for minimize, if specified in kwargs will be overwritten 96 | minimize_options = {} 97 | minimize_options["disp"] = disp 98 | minimize_options.update(kwargs) 99 | 100 | current_init_params = 0.1 * np.ones(params_size) if initial_params is None else initial_params 101 | output = minimize( 102 | value_and_grad(self._negative_log_likelihood), 103 | jac=True, 104 | method=None, 105 | tol=tol, 106 | x0=current_init_params, 107 | args=minimizing_function_args, 108 | options=minimize_options, 109 | bounds=bounds, 110 | ) 111 | if output.success: 112 | hessian_ = hessian(self._negative_log_likelihood)(output.x, *minimizing_function_args) 113 | return output.x, output.fun, hessian_ 114 | print(output) 115 | raise ConvergenceError( 116 | dedent( 117 | """ 118 | The model did not converge. Try adding a larger penalizer to see if that helps convergence. 119 | """ 120 | ) 121 | ) 122 | 123 | @property 124 | def summary(self): 125 | """ 126 | Summary statistics describing the fit. 127 | 128 | Returns 129 | ------- 130 | df : pd.DataFrame 131 | Contains columns coef, se(coef), lower, upper 132 | 133 | See Also 134 | -------- 135 | ``print_summary`` 136 | """ 137 | df = pd.DataFrame(index=self.params_.index) 138 | df["coef"] = self.params_ 139 | df["se(coef)"] = self.standard_errors_ 140 | df["lower 95% bound"] = self.confidence_intervals_["lower 95% bound"] 141 | df["upper 95% bound"] = self.confidence_intervals_["upper 95% bound"] 142 | return df 143 | -------------------------------------------------------------------------------- /lifetimes/fitters/beta_geo_beta_binom_fitter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Beta Geo Beta BinomFitter.""" 3 | from __future__ import division 4 | from __future__ import print_function 5 | import warnings 6 | 7 | warnings.simplefilter(action="ignore", category=FutureWarning) 8 | 9 | import numpy as np 10 | import pandas as pd 11 | from autograd.numpy import log, exp, logaddexp 12 | from pandas import DataFrame 13 | from autograd.scipy.special import gammaln, betaln, beta as betaf 14 | from scipy.special import binom 15 | 16 | from ..utils import _check_inputs 17 | from . import BaseFitter 18 | from ..generate_data import beta_geometric_beta_binom_model 19 | 20 | 21 | class BetaGeoBetaBinomFitter(BaseFitter): 22 | """ 23 | Also known as the Beta-Geometric/Beta-Binomial Model [1]_. 24 | 25 | Future purchases opportunities are treated as discrete points in time. 
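A sketch of fitting this model on the bundled discrete-time donations data (columns `frequency`, `recency`, `periods`, `weights`, as in `lifetimes/datasets/donations.csv`):

```python
from lifetimes import BetaGeoBetaBinomFitter
from lifetimes.datasets import load_donations

donations = load_donations()

bgbb = BetaGeoBetaBinomFitter()
bgbb.fit(
    donations["frequency"],        # transactions observed across the opportunities
    donations["recency"],          # opportunity of the most recent transaction
    donations["periods"],          # number of transaction opportunities (n_periods)
    weights=donations["weights"],  # customers sharing each frequency/recency pattern
)
bgbb.params_  # alpha, beta, gamma, delta
```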
26 | In the literature, the model provides a better fit than the Pareto/NBD 27 | model for a nonprofit organization with regular giving patterns. 28 | 29 | The model is estimated with a recency-frequency matrix with n transaction 30 | opportunities. 31 | 32 | Parameters 33 | ---------- 34 | penalizer_coef: float 35 | The coefficient applied to an l2 norm on the parameters 36 | 37 | Attributes 38 | ---------- 39 | penalizer_coef: float 40 | The coefficient applied to an l2 norm on the parameters 41 | params_: :obj: Series 42 | The fitted parameters of the model 43 | data: :obj: DataFrame 44 | A DataFrame with the values given in the call to `fit` 45 | variance_matrix_: :obj: DataFrame 46 | A DataFrame with the variance matrix of the parameters. 47 | confidence_intervals_: :obj: DataFrame 48 | A DataFrame 95% confidence intervals of the parameters 49 | standard_errors_: :obj: Series 50 | A Series with the standard errors of the parameters 51 | summary: :obj: DataFrame 52 | A DataFrame containing information about the fitted parameters 53 | 54 | References 55 | ---------- 56 | .. [1] Fader, Peter S., Bruce G.S. Hardie, and Jen Shang (2010), 57 | "Customer-Base Analysis in a Discrete-Time Noncontractual Setting," 58 | Marketing Science, 29 (6), 1086-1108. 59 | 60 | """ 61 | 62 | def __init__(self, penalizer_coef=0.0): 63 | """Initialization, set penalizer_coef.""" 64 | self.penalizer_coef = penalizer_coef 65 | 66 | @staticmethod 67 | def _loglikelihood(params, x, tx, T): 68 | warnings.simplefilter(action="ignore", category=FutureWarning) 69 | 70 | """Log likelihood for optimizer.""" 71 | alpha, beta, gamma, delta = params 72 | 73 | betaln_ab = betaln(alpha, beta) 74 | betaln_gd = betaln(gamma, delta) 75 | 76 | A = betaln(alpha + x, beta + T - x) - betaln_ab + betaln(gamma, delta + T) - betaln_gd 77 | 78 | B = 1e-15 * np.ones_like(T) 79 | recency_T = T - tx - 1 80 | 81 | for j in np.arange(recency_T.max() + 1): 82 | ix = recency_T >= j 83 | B = B + ix * betaf(alpha + x, beta + tx - x + j) * betaf(gamma + 1, delta + tx + j) 84 | 85 | B = log(B) - betaln_gd - betaln_ab 86 | return logaddexp(A, B) 87 | 88 | @staticmethod 89 | def _negative_log_likelihood(log_params, frequency, recency, n_periods, weights, penalizer_coef=0): 90 | params = exp(log_params) 91 | penalizer_term = penalizer_coef * sum(params ** 2) 92 | return ( 93 | -(BetaGeoBetaBinomFitter._loglikelihood(params, frequency, recency, n_periods) * weights).sum() 94 | / weights.sum() 95 | + penalizer_term 96 | ) 97 | 98 | def fit( 99 | self, 100 | frequency, 101 | recency, 102 | n_periods, 103 | weights=None, 104 | initial_params=None, 105 | verbose=False, 106 | tol=1e-7, 107 | index=None, 108 | **kwargs 109 | ): 110 | """ 111 | Fit the BG/BB model. 112 | 113 | Parameters 114 | ---------- 115 | frequency: array_like 116 | Total periods with observed transactions 117 | recency: array_like 118 | Period of most recent transaction 119 | n_periods: array_like 120 | Number of transaction opportunities. Previously called `n`. 121 | weights: None or array_like 122 | Number of customers with given frequency/recency/T, 123 | defaults to 1 if not specified. Fader and 124 | Hardie condense the individual RFM matrix into all 125 | observed combinations of frequency/recency/T. This 126 | parameter represents the count of customers with a given 127 | purchase pattern. Instead of calculating individual 128 | log-likelihood, the log-likelihood is calculated for each 129 | pattern and multiplied by the number of customers with 130 | that pattern. 
Previously called `n_custs`. 131 | verbose: boolean, optional 132 | Set to true to print out convergence diagnostics. 133 | tol: float, optional 134 | Tolerance for termination of the function minimization process. 135 | index: array_like, optional 136 | Index for resulted DataFrame which is accessible via self.data 137 | kwargs: 138 | Key word arguments to pass to the scipy.optimize.minimize 139 | function as options dict 140 | 141 | Returns 142 | ------- 143 | BetaGeoBetaBinomFitter 144 | fitted and with parameters estimated 145 | 146 | """ 147 | frequency = np.asarray(frequency).astype(int) 148 | recency = np.asarray(recency).astype(int) 149 | n_periods = np.asarray(n_periods).astype(int) 150 | 151 | if weights is None: 152 | weights = np.ones_like(recency) 153 | else: 154 | weights = np.asarray(weights) 155 | 156 | _check_inputs(frequency, recency, n_periods) 157 | 158 | log_params_, self._negative_log_likelihood_, self._hessian_ = self._fit( 159 | (frequency, recency, n_periods, weights, self.penalizer_coef), initial_params, 4, verbose, tol, **kwargs 160 | ) 161 | self.params_ = pd.Series(np.exp(log_params_), index=["alpha", "beta", "gamma", "delta"]) 162 | 163 | self.data = DataFrame( 164 | {"frequency": frequency, "recency": recency, "n_periods": n_periods, "weights": weights}, index=index 165 | ) 166 | 167 | self.generate_new_data = lambda size=1: beta_geometric_beta_binom_model( 168 | # Making a large array replicating n by n_custs having n. 169 | np.array(sum([n_] * n_cust for (n_, n_cust) in zip(n_periods, weights))), 170 | *self._unload_params("alpha", "beta", "gamma", "delta"), 171 | size=size 172 | ) 173 | 174 | self.variance_matrix_ = self._compute_variance_matrix() 175 | self.standard_errors_ = self._compute_standard_errors() 176 | self.confidence_intervals_ = self._compute_confidence_intervals() 177 | return self 178 | 179 | def conditional_expected_number_of_purchases_up_to_time(self, m_periods_in_future, frequency, recency, n_periods): 180 | r""" 181 | Conditional expected purchases in future time period. 182 | 183 | The expected number of future transactions across the next m_periods_in_future 184 | transaction opportunities by a customer with purchase history 185 | (x, tx, n). 186 | 187 | .. math:: E(X(n_{periods}, n_{periods}+m_{periods_in_future})| \alpha, \beta, \gamma, \delta, frequency, recency, n_{periods}) 188 | 189 | See (13) in Fader & Hardie 2010. 190 | 191 | Parameters 192 | ---------- 193 | t: array_like 194 | time n_periods (n+t) 195 | 196 | Returns 197 | ------- 198 | array_like 199 | predicted transactions 200 | 201 | """ 202 | x = frequency 203 | tx = recency 204 | n = n_periods 205 | 206 | params = self._unload_params("alpha", "beta", "gamma", "delta") 207 | alpha, beta, gamma, delta = params 208 | 209 | p1 = 1 / exp(self._loglikelihood(params, x, tx, n)) 210 | p2 = exp(betaln(alpha + x + 1, beta + n - x) - betaln(alpha, beta)) 211 | p3 = delta / (gamma - 1) * exp(gammaln(gamma + delta) - gammaln(1 + delta)) 212 | p4 = exp(gammaln(1 + delta + n) - gammaln(gamma + delta + n)) 213 | p5 = exp(gammaln(1 + delta + n + m_periods_in_future) - gammaln(gamma + delta + n + m_periods_in_future)) 214 | 215 | return p1 * p2 * p3 * (p4 - p5) 216 | 217 | def conditional_probability_alive(self, m_periods_in_future, frequency, recency, n_periods): 218 | """ 219 | Conditional probability alive. 220 | 221 | Conditional probability customer is alive at transaction opportunity 222 | n_periods + m_periods_in_future. 223 | 224 | .. 
math:: P(alive at n_periods + m_periods_in_future|alpha, beta, gamma, delta, frequency, recency, n_periods) 225 | 226 | See (A10) in Fader and Hardie 2010. 227 | 228 | Parameters 229 | ---------- 230 | m: array_like 231 | transaction opportunities 232 | 233 | Returns 234 | ------- 235 | array_like 236 | alive probabilities 237 | 238 | """ 239 | params = self._unload_params("alpha", "beta", "gamma", "delta") 240 | alpha, beta, gamma, delta = params 241 | 242 | p1 = betaln(alpha + frequency, beta + n_periods - frequency) - betaln(alpha, beta) 243 | p2 = betaln(gamma, delta + n_periods + m_periods_in_future) - betaln(gamma, delta) 244 | p3 = self._loglikelihood(params, frequency, recency, n_periods) 245 | 246 | return exp(p1 + p2) / exp(p3) 247 | 248 | def expected_number_of_transactions_in_first_n_periods(self, n): 249 | r""" 250 | Return expected number of transactions in first n n_periods. 251 | 252 | Expected number of transactions occurring across first n transaction 253 | opportunities. 254 | Used by Fader and Hardie to assess in-sample fit. 255 | 256 | .. math:: Pr(X(n) = x| \alpha, \beta, \gamma, \delta) 257 | 258 | See (7) in Fader & Hardie 2010. 259 | 260 | Parameters 261 | ---------- 262 | n: float 263 | number of transaction opportunities 264 | 265 | Returns 266 | ------- 267 | DataFrame: 268 | Predicted values, indexed by x 269 | 270 | """ 271 | params = self._unload_params("alpha", "beta", "gamma", "delta") 272 | alpha, beta, gamma, delta = params 273 | 274 | x_counts = self.data.groupby("frequency")["weights"].sum() 275 | x = np.asarray(x_counts.index) 276 | 277 | p1 = binom(n, x) * exp( 278 | betaln(alpha + x, beta + n - x) - betaln(alpha, beta) + betaln(gamma, delta + n) - betaln(gamma, delta) 279 | ) 280 | 281 | I = np.arange(x.min(), n) 282 | 283 | @np.vectorize 284 | def p2(j, x): 285 | i = I[int(j) :] 286 | return np.sum( 287 | binom(i, x) 288 | * exp( 289 | betaln(alpha + x, beta + i - x) 290 | - betaln(alpha, beta) 291 | + betaln(gamma + 1, delta + i) 292 | - betaln(gamma, delta) 293 | ) 294 | ) 295 | 296 | p1 += np.fromfunction(p2, (x.shape[0],), x=x) 297 | 298 | idx = pd.Index(x, name="frequency") 299 | return DataFrame(p1 * x_counts.sum(), index=idx, columns=["model"]) 300 | -------------------------------------------------------------------------------- /lifetimes/fitters/beta_geo_fitter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Beta Geo Fitter, also known as BG/NBD model.""" 3 | 4 | from __future__ import print_function 5 | from __future__ import division 6 | import warnings 7 | 8 | import pandas as pd 9 | import autograd.numpy as np 10 | from autograd.scipy.special import gammaln, beta, gamma 11 | from scipy.special import hyp2f1 12 | from scipy.special import expit 13 | from . import BaseFitter 14 | from ..utils import _scale_time, _check_inputs 15 | from ..generate_data import beta_geometric_nbd_model 16 | 17 | 18 | class BetaGeoFitter(BaseFitter): 19 | """ 20 | Also known as the BG/NBD model. 21 | 22 | Based on [2]_, this model has the following assumptions: 23 | 24 | 1) Each individual, i, has a hidden lambda_i and p_i parameter 25 | 2) These come from a population wide Gamma and a Beta distribution 26 | respectively. 27 | 3) Individuals purchases follow a Poisson process with rate lambda_i*t . 28 | 4) After each purchase, an individual has a p_i probability of dieing 29 | (never buying again). 
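Assumptions 1-4 can be read directly as a simulation recipe. Below is a simplified, illustrative sketch in plain NumPy; the parameter values are made up, and the library's own `beta_geometric_nbd_model` in `generate_data.py` is the real implementation:

```python
import numpy as np

rng = np.random.default_rng(42)
r, alpha, a, b = 0.25, 4.5, 0.8, 2.4  # illustrative population parameters, not fitted estimates

def simulate_customer(T):
    lam = rng.gamma(shape=r, scale=1.0 / alpha)  # (1)+(2): individual purchase rate ~ Gamma(r, alpha)
    p = rng.beta(a, b)                           # (1)+(2): individual dropout probability ~ Beta(a, b)
    t, purchase_times = 0.0, []
    while True:
        t += rng.exponential(1.0 / lam)          # (3): Poisson purchasing -> exponential waiting times
        if t > T:
            break
        purchase_times.append(t)
        if rng.random() < p:                     # (4): may "die" right after any purchase
            break
    frequency = len(purchase_times)              # repeat purchases observed within the window
    recency = purchase_times[-1] if purchase_times else 0.0
    return frequency, recency, T

simulate_customer(T=52.0)
```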
30 | 31 | Parameters 32 | ---------- 33 | penalizer_coef: float 34 | The coefficient applied to an l2 norm on the parameters 35 | 36 | Attributes 37 | ---------- 38 | penalizer_coef: float 39 | The coefficient applied to an l2 norm on the parameters 40 | params_: :obj: Series 41 | The fitted parameters of the model 42 | data: :obj: DataFrame 43 | A DataFrame with the values given in the call to `fit` 44 | variance_matrix_: :obj: DataFrame 45 | A DataFrame with the variance matrix of the parameters. 46 | confidence_intervals_: :obj: DataFrame 47 | A DataFrame 95% confidence intervals of the parameters 48 | standard_errors_: :obj: Series 49 | A Series with the standard errors of the parameters 50 | summary: :obj: DataFrame 51 | A DataFrame containing information about the fitted parameters 52 | 53 | References 54 | ---------- 55 | .. [2] Fader, Peter S., Bruce G.S. Hardie, and Ka Lok Lee (2005a), 56 | "Counting Your Customers the Easy Way: An Alternative to the 57 | Pareto/NBD Model," Marketing Science, 24 (2), 275-84. 58 | """ 59 | 60 | def __init__( 61 | self, 62 | penalizer_coef=0.0 63 | ): 64 | """ 65 | Initialization, set penalizer_coef. 66 | """ 67 | 68 | self.penalizer_coef = penalizer_coef 69 | 70 | def fit( 71 | self, 72 | frequency, 73 | recency, 74 | T, 75 | weights=None, 76 | initial_params=None, 77 | verbose=False, 78 | tol=1e-7, 79 | index=None, 80 | **kwargs 81 | ): 82 | """ 83 | Fit a dataset to the BG/NBD model. 84 | 85 | Parameters 86 | ---------- 87 | frequency: array_like 88 | the frequency vector of customers' purchases 89 | (denoted x in literature). 90 | recency: array_like 91 | the recency vector of customers' purchases 92 | (denoted t_x in literature). 93 | T: array_like 94 | customers' age (time units since first purchase) 95 | weights: None or array_like 96 | Number of customers with given frequency/recency/T, 97 | defaults to 1 if not specified. Fader and 98 | Hardie condense the individual RFM matrix into all 99 | observed combinations of frequency/recency/T. This 100 | parameter represents the count of customers with a given 101 | purchase pattern. Instead of calculating individual 102 | loglikelihood, the loglikelihood is calculated for each 103 | pattern and multiplied by the number of customers with 104 | that pattern. 105 | initial_params: array_like, optional 106 | set the initial parameters for the fitter. 107 | verbose : bool, optional 108 | set to true to print out convergence diagnostics. 109 | tol : float, optional 110 | tolerance for termination of the function minimization process. 
111 | index: array_like, optional 112 | index for resulted DataFrame which is accessible via self.data 113 | kwargs: 114 | key word arguments to pass to the scipy.optimize.minimize 115 | function as options dict 116 | 117 | Returns 118 | ------- 119 | BetaGeoFitter 120 | with additional properties like ``params_`` and methods like ``predict`` 121 | """ 122 | 123 | frequency = np.asarray(frequency).astype(int) 124 | recency = np.asarray(recency) 125 | T = np.asarray(T) 126 | _check_inputs(frequency, recency, T) 127 | 128 | if weights is None: 129 | weights = np.ones_like(recency, dtype=int) 130 | else: 131 | weights = np.asarray(weights) 132 | 133 | self._scale = _scale_time(T) 134 | scaled_recency = recency * self._scale 135 | scaled_T = T * self._scale 136 | 137 | log_params_, self._negative_log_likelihood_, self._hessian_ = self._fit( 138 | (frequency, scaled_recency, scaled_T, weights, self.penalizer_coef), 139 | initial_params, 140 | 4, 141 | verbose, 142 | tol, 143 | **kwargs 144 | ) 145 | 146 | self.params_ = pd.Series(np.exp(log_params_), index=["r", "alpha", "a", "b"]) 147 | self.params_["alpha"] /= self._scale 148 | 149 | self.data = pd.DataFrame({"frequency": frequency, "recency": recency, "T": T, "weights": weights}, index=index) 150 | 151 | self.generate_new_data = lambda size=1: beta_geometric_nbd_model( 152 | T, *self._unload_params("r", "alpha", "a", "b"), size=size 153 | ) 154 | 155 | self.predict = self.conditional_expected_number_of_purchases_up_to_time 156 | 157 | self.variance_matrix_ = self._compute_variance_matrix() 158 | self.standard_errors_ = self._compute_standard_errors() 159 | self.confidence_intervals_ = self._compute_confidence_intervals() 160 | 161 | return self 162 | 163 | @staticmethod 164 | def _negative_log_likelihood( 165 | log_params, 166 | freq, 167 | rec, 168 | T, 169 | weights, 170 | penalizer_coef 171 | ): 172 | """ 173 | The following method for calculatating the *log-likelihood* uses the method 174 | specified in section 7 of [2]_. More information can also be found in [3]_. 175 | 176 | References 177 | ---------- 178 | .. [2] Fader, Peter S., Bruce G.S. Hardie, and Ka Lok Lee (2005a), 179 | "Counting Your Customers the Easy Way: An Alternative to the 180 | Pareto/NBD Model," Marketing Science, 24 (2), 275-84. 181 | .. [3] http://brucehardie.com/notes/004/ 182 | """ 183 | 184 | warnings.simplefilter(action="ignore", category=FutureWarning) 185 | 186 | params = np.exp(log_params) 187 | r, alpha, a, b = params 188 | 189 | A_1 = gammaln(r + freq) - gammaln(r) + r * np.log(alpha) 190 | A_2 = gammaln(a + b) + gammaln(b + freq) - gammaln(b) - gammaln(a + b + freq) 191 | A_3 = -(r + freq) * np.log(alpha + T) 192 | A_4 = np.log(a) - np.log(b + np.maximum(freq, 1) - 1) - (r + freq) * np.log(rec + alpha) 193 | 194 | max_A_3_A_4 = np.maximum(A_3, A_4) 195 | 196 | penalizer_term = penalizer_coef * sum(params ** 2) 197 | ll = weights * (A_1 + A_2 + np.log(np.exp(A_3 - max_A_3_A_4) + np.exp(A_4 - max_A_3_A_4) * (freq > 0)) + max_A_3_A_4) 198 | 199 | return -ll.sum() / weights.sum() + penalizer_term 200 | 201 | def conditional_expected_number_of_purchases_up_to_time( 202 | self, 203 | t, 204 | frequency, 205 | recency, 206 | T 207 | ): 208 | """ 209 | Conditional expected number of purchases up to time. 210 | 211 | Calculate the expected number of repeat purchases up to time t for a 212 | randomly chosen individual from the population, given they have 213 | purchase history (frequency, recency, T). 214 | 215 | This function uses equation (10) from [2]_. 
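In practice this is the method (also exposed as `predict` after fitting) used to rank customers by expected future activity. A sketch on the CDNOW summary, with an arbitrary illustrative horizon:

```python
from lifetimes import BetaGeoFitter
from lifetimes.datasets import load_cdnow_summary

data = load_cdnow_summary(index_col=[0])
bgf = BetaGeoFitter().fit(data["frequency"], data["recency"], data["T"])

t = 10  # horizon, in the same time units as recency and T
data["predicted_purchases"] = bgf.conditional_expected_number_of_purchases_up_to_time(
    t, data["frequency"], data["recency"], data["T"]
)
# bgf.predict(...) is an alias bound to this method inside fit().
data.sort_values("predicted_purchases", ascending=False).head()
```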
216 | 217 | Parameters 218 | ---------- 219 | t: array_like 220 | times to calculate the expectation for. 221 | frequency: array_like 222 | historical frequency of customer. 223 | recency: array_like 224 | historical recency of customer. 225 | T: array_like 226 | age of the customer. 227 | 228 | Returns 229 | ------- 230 | array_like 231 | 232 | References 233 | ---------- 234 | .. [2] Fader, Peter S., Bruce G.S. Hardie, and Ka Lok Lee (2005a), 235 | "Counting Your Customers the Easy Way: An Alternative to the 236 | Pareto/NBD Model," Marketing Science, 24 (2), 275-84. 237 | """ 238 | 239 | x = frequency 240 | r, alpha, a, b = self._unload_params("r", "alpha", "a", "b") 241 | 242 | _a = r + x 243 | _b = b + x 244 | _c = a + b + x - 1 245 | _z = t / (alpha + T + t) 246 | ln_hyp_term = np.log(hyp2f1(_a, _b, _c, _z)) 247 | 248 | # if the value is inf, we are using a different but equivalent 249 | # formula to compute the function evaluation. 250 | ln_hyp_term_alt = np.log(hyp2f1(_c - _a, _c - _b, _c, _z)) + (_c - _a - _b) * np.log(1 - _z) 251 | ln_hyp_term = np.where(np.isinf(ln_hyp_term), ln_hyp_term_alt, ln_hyp_term) 252 | first_term = (a + b + x - 1) / (a - 1) 253 | second_term = 1 - np.exp(ln_hyp_term + (r + x) * np.log((alpha + T) / (alpha + t + T))) 254 | 255 | numerator = first_term * second_term 256 | denominator = 1 + (x > 0) * (a / (b + x - 1)) * ((alpha + T) / (alpha + recency)) ** (r + x) 257 | 258 | return numerator / denominator 259 | 260 | def conditional_probability_alive( 261 | self, 262 | frequency, 263 | recency, 264 | T 265 | ): 266 | """ 267 | Compute conditional probability alive. 268 | 269 | Compute the probability that a customer with history 270 | (frequency, recency, T) is currently alive. 271 | 272 | From http://www.brucehardie.com/notes/021/palive_for_BGNBD.pdf 273 | 274 | Parameters 275 | ---------- 276 | frequency: array or scalar 277 | historical frequency of customer. 278 | recency: array or scalar 279 | historical recency of customer. 280 | T: array or scalar 281 | age of the customer. 282 | 283 | Returns 284 | ------- 285 | array 286 | value representing a probability 287 | """ 288 | 289 | r, alpha, a, b = self._unload_params("r", "alpha", "a", "b") 290 | 291 | log_div = (r + frequency) * np.log((alpha + T) / (alpha + recency)) + np.log( 292 | a / (b + np.maximum(frequency, 1) - 1) 293 | ) 294 | 295 | return np.atleast_1d(np.where(frequency == 0, 1.0, expit(-log_div))) 296 | 297 | def conditional_probability_alive_matrix( 298 | self, 299 | max_frequency=None, 300 | max_recency=None 301 | ): 302 | """ 303 | Compute the probability alive matrix. 304 | 305 | Uses the ``conditional_probability_alive()`` method to get calculate the matrix. 306 | 307 | Parameters 308 | ---------- 309 | max_frequency: float, optional 310 | the maximum frequency to plot. Default is max observed frequency. 311 | max_recency: float, optional 312 | the maximum recency to plot. This also determines the age of the 313 | customer. Default to max observed age. 
314 | 315 | Returns 316 | ------- 317 | matrix: 318 | A matrix of the form [t_x: historical recency, x: historical frequency] 319 | """ 320 | 321 | max_frequency = max_frequency or int(self.data["frequency"].max()) 322 | max_recency = max_recency or int(self.data["T"].max()) 323 | 324 | return np.fromfunction( 325 | self.conditional_probability_alive, (max_frequency + 1, max_recency + 1), T=max_recency 326 | ).T 327 | 328 | def expected_number_of_purchases_up_to_time( 329 | self, 330 | t 331 | ): 332 | """ 333 | Calculate the expected number of repeat purchases up to time t. 334 | 335 | Calculate repeat purchases for a randomly chosen individual from the 336 | population. 337 | 338 | Equivalent to equation (9) of [2]_. 339 | 340 | Parameters 341 | ---------- 342 | t: array_like 343 | times to calculate the expection for 344 | 345 | Returns 346 | ------- 347 | array_like 348 | 349 | References 350 | ---------- 351 | .. [2] Fader, Peter S., Bruce G.S. Hardie, and Ka Lok Lee (2005a), 352 | "Counting Your Customers the Easy Way: An Alternative to the 353 | Pareto/NBD Model," Marketing Science, 24 (2), 275-84. 354 | """ 355 | 356 | r, alpha, a, b = self._unload_params("r", "alpha", "a", "b") 357 | hyp = hyp2f1(r, b, a + b - 1, t / (alpha + t)) 358 | 359 | return (a + b - 1) / (a - 1) * (1 - hyp * (alpha / (alpha + t)) ** r) 360 | 361 | def probability_of_n_purchases_up_to_time( 362 | self, 363 | t, 364 | n 365 | ): 366 | r""" 367 | Compute the probability of n purchases. 368 | 369 | .. math:: P( N(t) = n | \text{model} ) 370 | 371 | where N(t) is the number of repeat purchases a customer makes in t 372 | units of time. 373 | 374 | Comes from equation (8) of [2]_. 375 | 376 | Parameters 377 | ---------- 378 | t: float 379 | number units of time 380 | n: int 381 | number of purchases 382 | 383 | Returns 384 | ------- 385 | float: 386 | Probability to have n purchases up to t units of time 387 | 388 | References 389 | ---------- 390 | .. [2] Fader, Peter S., Bruce G.S. Hardie, and Ka Lok Lee (2005a), 391 | "Counting Your Customers the Easy Way: An Alternative to the 392 | Pareto/NBD Model," Marketing Science, 24 (2), 275-84. 393 | """ 394 | 395 | r, alpha, a, b = self._unload_params("r", "alpha", "a", "b") 396 | 397 | first_term = ( 398 | beta(a, b + n) 399 | / beta(a, b) 400 | * gamma(r + n) 401 | / gamma(r) 402 | / gamma(n + 1) 403 | * (alpha / (alpha + t)) ** r 404 | * (t / (alpha + t)) ** n 405 | ) 406 | 407 | if n > 0: 408 | j = np.arange(0, n) 409 | finite_sum = (gamma(r + j) / gamma(r) / gamma(j + 1) * (t / (alpha + t)) ** j).sum() 410 | second_term = beta(a + 1, b + n - 1) / beta(a, b) * (1 - (alpha / (alpha + t)) ** r * finite_sum) 411 | else: 412 | second_term = 0 413 | 414 | return first_term + second_term -------------------------------------------------------------------------------- /lifetimes/fitters/gamma_gamma_fitter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Gamma-Gamma Model.""" 3 | 4 | from __future__ import print_function 5 | from __future__ import division 6 | import warnings 7 | 8 | import pandas as pd 9 | from autograd import numpy as np 10 | from pandas import DataFrame 11 | from autograd.scipy.special import gammaln 12 | 13 | 14 | from . import BaseFitter 15 | from ..utils import _check_inputs, _customer_lifetime_value 16 | 17 | 18 | class GammaGammaFitter(BaseFitter): 19 | """ 20 | Fitter for the gamma-gamma model. 
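A sketch of the typical Gamma-Gamma workflow on the bundled CDNOW data with monetary values. The model is usually fit only on returning customers (frequency > 0), since an average transaction value is not observed for the rest; the `freq="W"` argument below is an assumption about the data's time unit and should be matched to your own data (`"D"` is the default):

```python
from lifetimes import BetaGeoFitter, GammaGammaFitter
from lifetimes.datasets import load_cdnow_summary_data_with_monetary_value

summary = load_cdnow_summary_data_with_monetary_value()
returning = summary[summary["frequency"] > 0]

ggf = GammaGammaFitter()
ggf.fit(returning["frequency"], returning["monetary_value"])

# Expected average transaction value for the returning customers.
expected_profit = ggf.conditional_expected_average_profit(
    returning["frequency"], returning["monetary_value"]
)
print(expected_profit.head())

# Combine with a transaction model to estimate customer lifetime value.
bgf = BetaGeoFitter().fit(summary["frequency"], summary["recency"], summary["T"])
clv = ggf.customer_lifetime_value(
    bgf,                          # the transaction prediction model
    returning["frequency"],
    returning["recency"],
    returning["T"],
    returning["monetary_value"],
    time=12,                      # months of expected lifetime
    discount_rate=0.01,           # monthly discount rate
    freq="W",                     # assumption: recency/T measured in weeks
)
print(clv.head())
```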
21 | 22 | It is used to estimate the average monetary value of customer transactions. 23 | 24 | This implementation is based on the Excel spreadsheet found in [3]_. 25 | More details on the derivation and evaluation can be found in [4]_. 26 | 27 | Parameters 28 | ---------- 29 | penalizer_coef: float 30 | The coefficient applied to an l2 norm on the parameters 31 | 32 | Attributes 33 | ---------- 34 | penalizer_coef: float 35 | The coefficient applied to an l2 norm on the parameters 36 | params_: :obj: OrderedDict 37 | The fitted parameters of the model 38 | data: :obj: DataFrame 39 | A DataFrame with the columns given in the call to `fit` 40 | 41 | References 42 | ---------- 43 | .. [3] http://www.brucehardie.com/notes/025/ 44 | The Gamma-Gamma Model of Monetary Value. 45 | .. [4] Peter S. Fader, Bruce G. S. Hardie, and Ka Lok Lee (2005), 46 | "RFM and CLV: Using iso-value curves for customer base analysis", 47 | Journal of Marketing Research, 42 (November), 415-430. 48 | 49 | Attributes 50 | ----------- 51 | penalizer_coef: float 52 | The coefficient applied to an l2 norm on the parameters 53 | params_: :obj: Series 54 | The fitted parameters of the model 55 | data: :obj: DataFrame 56 | A DataFrame with the values given in the call to `fit` 57 | variance_matrix_: :obj: DataFrame 58 | A DataFrame with the variance matrix of the parameters. 59 | confidence_intervals_: :obj: DataFrame 60 | A DataFrame 95% confidence intervals of the parameters 61 | standard_errors_: :obj: Series 62 | A Series with the standard errors of the parameters 63 | summary: :obj: DataFrame 64 | A DataFrame containing information about the fitted parameters 65 | """ 66 | 67 | def __init__( 68 | self, 69 | penalizer_coef=0.0 70 | ): 71 | """ 72 | Initialization, set penalizer_coef. 73 | """ 74 | 75 | self.penalizer_coef = penalizer_coef 76 | 77 | @staticmethod 78 | def _negative_log_likelihood( 79 | log_params, 80 | frequency, 81 | avg_monetary_value, 82 | weights, 83 | penalizer_coef 84 | ): 85 | """ 86 | Computes the Negative Log-Likelihood for the Gamma-Gamma Model as in: 87 | http://www.brucehardie.com/notes/025/ 88 | 89 | This also applies a penalizer to the log-likelihood. 90 | 91 | Equivalent to equation (1a). 92 | 93 | Hardie's implementation of this method can be seen on page 8. 94 | """ 95 | 96 | warnings.simplefilter(action="ignore", category=FutureWarning) 97 | 98 | params = np.exp(log_params) 99 | p, q, v = params 100 | 101 | x = frequency 102 | m = avg_monetary_value 103 | 104 | negative_log_likelihood_values = ( 105 | gammaln(p * x + q) 106 | - gammaln(p * x) 107 | - gammaln(q) 108 | + q * np.log(v) 109 | + (p * x - 1) * np.log(m) 110 | + (p * x) * np.log(x) 111 | - (p * x + q) * np.log(x * m + v) 112 | ) * weights 113 | penalizer_term = penalizer_coef * sum(params ** 2) 114 | 115 | return -negative_log_likelihood_values.sum() / weights.sum() + penalizer_term 116 | 117 | def conditional_expected_average_profit( 118 | self, 119 | frequency=None, 120 | monetary_value=None 121 | ): 122 | """ 123 | Conditional expectation of the average profit. 124 | 125 | This method computes the conditional expectation of the average profit 126 | per transaction for a group of one or more customers. 127 | 128 | Equation (5) from: 129 | http://www.brucehardie.com/notes/025/ 130 | 131 | Parameters 132 | ---------- 133 | frequency: array_like, optional 134 | a vector containing the customers' frequencies. 135 | Defaults to the whole set of frequencies used for fitting the model. 
136 | monetary_value: array_like, optional 137 | a vector containing the customers' monetary values. 138 | Defaults to the whole set of monetary values used for 139 | fitting the model. 140 | 141 | Returns 142 | ------- 143 | array_like: 144 | The conditional expectation of the average profit per transaction 145 | """ 146 | 147 | if monetary_value is None: 148 | monetary_value = self.data["monetary_value"] 149 | if frequency is None: 150 | frequency = self.data["frequency"] 151 | p, q, v = self._unload_params("p", "q", "v") 152 | 153 | # The expected average profit is a weighted average of individual 154 | # monetary value and the population mean. 155 | individual_weight = p * frequency / (p * frequency + q - 1) 156 | population_mean = v * p / (q - 1) 157 | 158 | return (1 - individual_weight) * population_mean + individual_weight * monetary_value 159 | 160 | def fit( 161 | self, 162 | frequency, 163 | monetary_value, 164 | weights=None, 165 | initial_params=None, 166 | verbose=False, 167 | tol=1e-7, 168 | index=None, 169 | q_constraint=False, 170 | **kwargs 171 | ): 172 | """ 173 | Fit the data to the Gamma/Gamma model. 174 | 175 | Parameters 176 | ---------- 177 | frequency: array_like 178 | the frequency vector of customers' purchases 179 | (denoted x in literature). 180 | monetary_value: array_like 181 | the monetary value vector of customer's purchases 182 | (denoted m in literature). 183 | weights: None or array_like 184 | Number of customers with given frequency/monetary_value, 185 | defaults to 1 if not specified. Fader and 186 | Hardie condense the individual RFM matrix into all 187 | observed combinations of frequency/monetary_value. This 188 | parameter represents the count of customers with a given 189 | purchase pattern. Instead of calculating individual 190 | loglikelihood, the loglikelihood is calculated for each 191 | pattern and multiplied by the number of customers with 192 | that pattern. 193 | initial_params: array_like, optional 194 | set the initial parameters for the fitter. 195 | verbose : bool, optional 196 | set to true to print out convergence diagnostics. 197 | tol : float, optional 198 | tolerance for termination of the function minimization process. 199 | index: array_like, optional 200 | index for resulted DataFrame which is accessible via self.data 201 | q_constraint: bool, optional 202 | when q < 1, population mean will result in a negative value 203 | leading to negative CLV outputs. If True, we penalize negative values of q to avoid this issue. 
204 | kwargs: 205 | key word arguments to pass to the scipy.optimize.minimize 206 | function as options dict 207 | 208 | Returns 209 | ------- 210 | GammaGammaFitter 211 | fitted and with parameters estimated 212 | """ 213 | 214 | _check_inputs(frequency, monetary_value=monetary_value) 215 | 216 | frequency = np.asarray(frequency).astype(float) 217 | monetary_value = np.asarray(monetary_value).astype(float) 218 | 219 | if weights is None: 220 | weights = np.ones_like(frequency, dtype=int) 221 | else: 222 | weights = np.asarray(weights) 223 | 224 | log_params, self._negative_log_likelihood_, self._hessian_ = self._fit( 225 | (frequency, monetary_value, weights, self.penalizer_coef), 226 | initial_params, 227 | 3, 228 | verbose, 229 | tol=tol, 230 | bounds=((None, None), (0, None), (None, None)) if q_constraint else None, 231 | **kwargs 232 | ) 233 | 234 | self.data = DataFrame( 235 | {"monetary_value": monetary_value, "frequency": frequency, "weights": weights}, index=index 236 | ) 237 | 238 | self.params_ = pd.Series(np.exp(log_params), index=["p", "q", "v"]) 239 | 240 | self.variance_matrix_ = self._compute_variance_matrix() 241 | self.standard_errors_ = self._compute_standard_errors() 242 | self.confidence_intervals_ = self._compute_confidence_intervals() 243 | 244 | return self 245 | 246 | def customer_lifetime_value( 247 | self, 248 | transaction_prediction_model, 249 | frequency, 250 | recency, 251 | T, 252 | monetary_value, 253 | time=12, 254 | discount_rate=0.01, 255 | freq="D" 256 | ): 257 | """ 258 | Return customer lifetime value. 259 | 260 | This method computes the average lifetime value for a group of one 261 | or more customers. 262 | 263 | Parameters 264 | ---------- 265 | transaction_prediction_model: model 266 | the model to predict future transactions, literature uses 267 | pareto/ndb models but we can also use a different model like beta-geo models 268 | frequency: array_like 269 | the frequency vector of customers' purchases 270 | (denoted x in literature). 271 | recency: the recency vector of customers' purchases 272 | (denoted t_x in literature). 273 | T: array_like 274 | customers' age (time units since first purchase) 275 | monetary_value: array_like 276 | the monetary value vector of customer's purchases 277 | (denoted m in literature). 278 | time: float, optional 279 | the lifetime expected for the user in months. Default: 12 280 | discount_rate: float, optional 281 | the monthly adjusted discount rate. Default: 0.01 282 | freq: string, optional 283 | {"D", "H", "M", "W"} for day, hour, month, week. This represents what unit of time your T is measure in. 
284 | 285 | Returns 286 | ------- 287 | Series: 288 | Series object with customer ids as index and the estimated customer 289 | lifetime values as values 290 | """ 291 | 292 | frequency, recency, T, monetary_value = np.asarray(frequency), np.asarray(recency), np.asarray(T), np.asarray(monetary_value) 293 | 294 | # use the Gamma-Gamma estimates for the monetary_values 295 | adjusted_monetary_value = self.conditional_expected_average_profit(frequency, monetary_value) 296 | 297 | return _customer_lifetime_value( 298 | transaction_prediction_model, frequency, recency, T, adjusted_monetary_value, time, discount_rate, freq=freq 299 | ) -------------------------------------------------------------------------------- /lifetimes/fitters/modified_beta_geo_fitter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import print_function 3 | from __future__ import division 4 | import warnings 5 | 6 | import autograd.numpy as np 7 | from autograd.numpy import log, logaddexp 8 | from autograd.scipy.special import gammaln, beta, gamma 9 | from scipy.special import hyp2f1 10 | 11 | from lifetimes import BetaGeoFitter 12 | from lifetimes.generate_data import modified_beta_geometric_nbd_model 13 | 14 | 15 | class ModifiedBetaGeoFitter(BetaGeoFitter): 16 | r""" 17 | Also known as the MBG/NBD model. 18 | 19 | Based on [5]_, [6]_, this model has the following assumptions: 20 | 1) Each individual, ``i``, has a hidden ``lambda_i`` and ``p_i`` parameter 21 | 2) These come from a population wide Gamma and a Beta distribution 22 | respectively. 23 | 3) Individuals purchases follow a Poisson process with rate :math:`\lambda_i*t` . 24 | 4) At the beginning of their lifetime and after each purchase, an 25 | individual has a p_i probability of dieing (never buying again). 26 | 27 | References 28 | ---------- 29 | .. [5] Batislam, E.P., M. Denizel, A. Filiztekin (2007), 30 | "Empirical validation and comparison of models for customer base 31 | analysis," 32 | International Journal of Research in Marketing, 24 (3), 201-209. 33 | .. [6] Wagner, U. and Hoppe D. (2008), "Erratum on the MBG/NBD Model," 34 | International Journal of Research in Marketing, 25 (3), 225-226. 35 | 36 | Attributes 37 | ----------- 38 | penalizer_coef: float 39 | The coefficient applied to an l2 norm on the parameters 40 | params_: :obj: Series 41 | The fitted parameters of the model 42 | data: :obj: DataFrame 43 | A DataFrame with the values given in the call to `fit` 44 | variance_matrix_: :obj: DataFrame 45 | A DataFrame with the variance matrix of the parameters. 46 | confidence_intervals_: :obj: DataFrame 47 | A DataFrame 95% confidence intervals of the parameters 48 | standard_errors_: :obj: Series 49 | A Series with the standard errors of the parameters 50 | summary: :obj: DataFrame 51 | A DataFrame containing information about the fitted parameters 52 | """ 53 | 54 | def __init__(self, penalizer_coef=0.0): 55 | """Initialization, set penalizer_coef.""" 56 | super(ModifiedBetaGeoFitter, self).__init__(penalizer_coef) 57 | 58 | def fit( 59 | self, frequency, recency, T, weights=None, initial_params=None, verbose=False, tol=1e-7, index=None, **kwargs 60 | ): 61 | """ 62 | Fit the data to the MBG/NBD model. 63 | 64 | Parameters 65 | ---------- 66 | frequency: array_like 67 | the frequency vector of customers' purchases 68 | (denoted x in literature). 69 | recency: array_like 70 | the recency vector of customers' purchases 71 | (denoted t_x in literature). 
72 | T: array_like 73 | customers' age (time units since first purchase) 74 | weights: None or array_like 75 | Number of customers with given frequency/recency/T, 76 | defaults to 1 if not specified. Fader and 77 | Hardie condense the individual RFM matrix into all 78 | observed combinations of frequency/recency/T. This 79 | parameter represents the count of customers with a given 80 | purchase pattern. Instead of calculating individual 81 | log-likelihood, the log-likelihood is calculated for each 82 | pattern and multiplied by the number of customers with 83 | that pattern. 84 | verbose : bool, optional 85 | set to true to print out convergence diagnostics. 86 | tol : float, optional 87 | tolerance for termination of the function minimization process. 88 | index: array_like, optional 89 | index for resulted DataFrame which is accessible via self.data 90 | kwargs: 91 | key word arguments to pass to the scipy.optimize.minimize 92 | function as options dict 93 | 94 | Returns 95 | ------- 96 | ModifiedBetaGeoFitter: 97 | With additional properties and methods like ``params_`` and ``predict`` 98 | 99 | """ 100 | # although the parent method is called, this class's 101 | # _negative_log_likelihood is referenced 102 | super(ModifiedBetaGeoFitter, self).fit( 103 | frequency, recency, T, weights, initial_params, verbose, tol, index=index, **kwargs 104 | ) 105 | # this needs to be reassigned from the parent method 106 | self.generate_new_data = lambda size=1: modified_beta_geometric_nbd_model( 107 | T, *self._unload_params("r", "alpha", "a", "b"), size=size 108 | ) 109 | 110 | self.variance_matrix_ = self._compute_variance_matrix() 111 | self.standard_errors_ = self._compute_standard_errors() 112 | self.confidence_intervals_ = self._compute_confidence_intervals() 113 | return self 114 | 115 | @staticmethod 116 | def _negative_log_likelihood(log_params, freq, rec, T, weights, penalizer_coef): 117 | warnings.simplefilter(action="ignore", category=FutureWarning) 118 | 119 | params = np.exp(log_params) 120 | r, alpha, a, b = params 121 | 122 | A_1 = gammaln(r + freq) - gammaln(r) + r * log(alpha) 123 | A_2 = gammaln(a + b) + gammaln(b + freq + 1) - gammaln(b) - gammaln(a + b + freq + 1) 124 | A_3 = -(r + freq) * log(alpha + T) 125 | A_4 = log(a) - log(b + freq) + (r + freq) * (log(alpha + T) - log(alpha + rec)) 126 | 127 | penalizer_term = penalizer_coef * sum(params ** 2) 128 | return -(weights * (A_1 + A_2 + A_3 + logaddexp(A_4, 0))).sum() / weights.sum() + penalizer_term 129 | 130 | def expected_number_of_purchases_up_to_time(self, t): 131 | """ 132 | Return expected number of repeat purchases up to time t. 133 | 134 | Calculate the expected number of repeat purchases up to time t for a 135 | randomly choose individual from the population. 136 | 137 | Parameters 138 | ---------- 139 | t: array_like 140 | times to calculate the expectation for 141 | 142 | Returns 143 | ------- 144 | array_like 145 | 146 | """ 147 | r, alpha, a, b = self._unload_params("r", "alpha", "a", "b") 148 | hyp = hyp2f1(r, b + 1, a + b, t / (alpha + t)) 149 | return b / (a - 1) * (1 - hyp * (alpha / (alpha + t)) ** r) 150 | 151 | def conditional_expected_number_of_purchases_up_to_time(self, t, frequency, recency, T): 152 | """ 153 | Conditional expected number of repeat purchases up to time t. 154 | 155 | Calculate the expected number of repeat purchases up to time t for a 156 | randomly choose individual from the population, given they have 157 | purchase history (frequency, recency, T) 158 | See Wagner, U. and Hoppe D. 
(2008). 159 | 160 | Parameters 161 | ---------- 162 | t: array_like 163 | times to calculate the expectation for. 164 | frequency: array_like 165 | historical frequency of customer. 166 | recency: array_like 167 | historical recency of customer. 168 | T: array_like 169 | age of the customer. 170 | 171 | Returns 172 | ------- 173 | array_like 174 | 175 | """ 176 | x = frequency 177 | r, alpha, a, b = self._unload_params("r", "alpha", "a", "b") 178 | 179 | hyp_term = hyp2f1(r + x, b + x + 1, a + b + x, t / (alpha + T + t)) 180 | first_term = (a + b + x) / (a - 1) 181 | second_term = 1 - hyp_term * ((alpha + T) / (alpha + t + T)) ** (r + x) 182 | numerator = first_term * second_term 183 | 184 | denominator = 1 + (a / (b + x)) * ((alpha + T) / (alpha + recency)) ** (r + x) 185 | 186 | return numerator / denominator 187 | 188 | def conditional_probability_alive(self, frequency, recency, T): 189 | """ 190 | Conditional probability alive. 191 | 192 | Compute the probability that a customer with history (frequency, 193 | recency, T) is currently alive. 194 | From https://www.researchgate.net/publication/247219660_Empirical_validation_and_comparison_of_models_for_customer_base_analysis 195 | Appendix A, eq. (5) 196 | 197 | Parameters 198 | ---------- 199 | frequency: array or float 200 | historical frequency of customer. 201 | recency: array or float 202 | historical recency of customer. 203 | T: array or float 204 | age of the customer. 205 | 206 | Returns 207 | ------- 208 | array: 209 | value representing probability of being alive 210 | 211 | """ 212 | r, alpha, a, b = self._unload_params("r", "alpha", "a", "b") 213 | return np.atleast_1d(1.0 / (1 + (a / (b + frequency)) * ((alpha + T) / (alpha + recency)) ** (r + frequency))) 214 | 215 | def probability_of_n_purchases_up_to_time(self, t, n): 216 | r""" 217 | Compute the probability of n purchases up to time t. 218 | 219 | .. math:: P( N(t) = n | \text{model} ) 220 | 221 | where N(t) is the number of repeat purchases a customer makes in t 222 | units of time. 
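# Illustrative usage sketch (not part of this module): the typical MBG/NBD workflow
# with the methods above is to fit on (frequency, recency, T) summary data and then
# query expected repeat purchases and P(alive). This sketch uses the package's own
# simulator; the parameter values and the customer profile are arbitrary.
import numpy as np
from lifetimes import ModifiedBetaGeoFitter
from lifetimes.generate_data import modified_beta_geometric_nbd_model

np.random.seed(42)
simulated = modified_beta_geometric_nbd_model(T=100, r=0.525, alpha=6.183, a=0.891, b=1.614, size=1000)

mbgf = ModifiedBetaGeoFitter(penalizer_coef=0.0)
mbgf.fit(simulated["frequency"], simulated["recency"], simulated["T"])

# expected repeat purchases over the next 30 time units for one customer profile
expected = mbgf.conditional_expected_number_of_purchases_up_to_time(30, frequency=5, recency=60, T=90)
p_alive = mbgf.conditional_probability_alive(frequency=5, recency=60, T=90)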
223 | 224 | Parameters 225 | ---------- 226 | t: float 227 | number units of time 228 | n: int 229 | number of purchases 230 | 231 | Returns 232 | ------- 233 | float: 234 | Probability to have n purchases up to t units of time 235 | 236 | """ 237 | r, alpha, a, b = self._unload_params("r", "alpha", "a", "b") 238 | _j = np.arange(0, n) 239 | 240 | first_term = ( 241 | beta(a, b + n + 1) 242 | / beta(a, b) 243 | * gamma(r + n) 244 | / gamma(r) 245 | / gamma(n + 1) 246 | * (alpha / (alpha + t)) ** r 247 | * (t / (alpha + t)) ** n 248 | ) 249 | finite_sum = (gamma(r + _j) / gamma(r) / gamma(_j + 1) * (t / (alpha + t)) ** _j).sum() 250 | second_term = beta(a + 1, b + n) / beta(a, b) * (1 - (alpha / (alpha + t)) ** r * finite_sum) 251 | 252 | return first_term + second_term 253 | -------------------------------------------------------------------------------- /lifetimes/fitters/pareto_nbd_fitter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Pareto/NBD model.""" 3 | 4 | from __future__ import print_function 5 | from __future__ import division 6 | 7 | import pandas as pd 8 | import numpy as np 9 | from numpy import log, exp, logaddexp, asarray, any as npany 10 | from pandas import DataFrame 11 | from scipy.special import gammaln, hyp2f1, betaln 12 | from scipy.special import logsumexp 13 | from scipy.optimize import minimize 14 | 15 | from lifetimes.fitters import BaseFitter 16 | from lifetimes.utils import _check_inputs, _scale_time 17 | from lifetimes.generate_data import pareto_nbd_model 18 | 19 | 20 | class ParetoNBDFitter(BaseFitter): 21 | """ 22 | Pareto NBD fitter [7]_. 23 | 24 | Parameters 25 | ---------- 26 | penalizer_coef: float 27 | The coefficient applied to an l2 norm on the parameters 28 | 29 | Attributes 30 | ---------- 31 | penalizer_coef: float 32 | The coefficient applied to an l2 norm on the parameters 33 | params_: :obj: OrderedDict 34 | The fitted parameters of the model 35 | data: :obj: DataFrame 36 | A DataFrame with the columns given in the call to `fit` 37 | 38 | References 39 | ---------- 40 | .. [7] David C. Schmittlein, Donald G. Morrison and Richard Colombo 41 | Management Science,Vol. 33, No. 1 (Jan., 1987), pp. 1-24 42 | "Counting Your Customers: Who Are They and What Will They Do Next," 43 | """ 44 | 45 | def __init__( 46 | self, 47 | penalizer_coef=0.0 48 | ): 49 | """ 50 | Initialization, set penalizer_coef. 51 | """ 52 | 53 | self.penalizer_coef = penalizer_coef 54 | 55 | def fit( 56 | self, 57 | frequency, 58 | recency, 59 | T, 60 | weights=None, 61 | iterative_fitting=1, 62 | initial_params=None, 63 | verbose=False, 64 | tol=1e-4, 65 | index=None, 66 | fit_method="Nelder-Mead", 67 | maxiter=2000, 68 | **kwargs 69 | ): 70 | """ 71 | Pareto/NBD model fitter. 72 | 73 | Parameters 74 | ---------- 75 | frequency: array_like 76 | the frequency vector of customers' purchases 77 | (denoted x in literature). 78 | recency: array_like 79 | the recency vector of customers' purchases 80 | (denoted t_x in literature). 81 | T: array_like 82 | customers' age (time units since first purchase) 83 | weights: None or array_like 84 | Number of customers with given frequency/recency/T, 85 | defaults to 1 if not specified. Fader and 86 | Hardie condense the individual RFM matrix into all 87 | observed combinations of frequency/recency/T. This 88 | parameter represents the count of customers with a given 89 | purchase pattern. 
Instead of calculating individual 90 | log-likelihood, the log-likelihood is calculated for each 91 | pattern and multiplied by the number of customers with 92 | that pattern. 93 | iterative_fitting: int, optional 94 | perform iterative_fitting fits over random/warm-started initial params 95 | initial_params: array_like, optional 96 | set the initial parameters for the fitter. 97 | verbose : bool, optional 98 | set to true to print out convergence diagnostics. 99 | tol : float, optional 100 | tolerance for termination of the function minimization process. 101 | index: array_like, optional 102 | index for resulted DataFrame which is accessible via self.data 103 | fit_method : string, optional 104 | fit_method to passing to scipy.optimize.minimize 105 | maxiter : int, optional 106 | max iterations for optimizer in scipy.optimize.minimize will be 107 | overwritten if set in kwargs. 108 | kwargs: 109 | key word arguments to pass to the scipy.optimize.minimize 110 | function as options dict 111 | 112 | Returns 113 | ------- 114 | ParetoNBDFitter 115 | with additional properties like ``params_`` and methods like ``predict`` 116 | """ 117 | 118 | frequency = asarray(frequency).astype(int) 119 | recency = asarray(recency) 120 | T = asarray(T) 121 | 122 | if weights is None: 123 | weights = np.ones(recency.shape[0], dtype=np.int64) 124 | else: 125 | weights = asarray(weights) 126 | 127 | _check_inputs(frequency, recency, T) 128 | 129 | self._scale = _scale_time(T) 130 | scaled_recency = recency * self._scale 131 | scaled_T = T * self._scale 132 | 133 | params, self._negative_log_likelihood_ = self._fit( 134 | (frequency, scaled_recency, scaled_T, weights, self.penalizer_coef), 135 | iterative_fitting, 136 | initial_params, 137 | 4, 138 | verbose, 139 | tol, 140 | fit_method, 141 | maxiter, 142 | **kwargs 143 | ) 144 | self._hessian_ = None 145 | self.params_ = pd.Series(*(params, ["r", "alpha", "s", "beta"])) 146 | self.params_["alpha"] /= self._scale 147 | self.params_["beta"] /= self._scale 148 | 149 | self.data = DataFrame({"frequency": frequency, "recency": recency, "T": T, "weights": weights}, index=index) 150 | self.generate_new_data = lambda size=1: pareto_nbd_model( 151 | T, *self._unload_params("r", "alpha", "s", "beta"), size=size 152 | ) 153 | 154 | self.predict = self.conditional_expected_number_of_purchases_up_to_time 155 | 156 | return self 157 | 158 | @staticmethod 159 | def _log_A_0( 160 | params, 161 | freq, 162 | recency, 163 | age 164 | ): 165 | """ 166 | log_A_0. 
167 | 168 | Equation (19) and (20) from paper: 169 | http://brucehardie.com/notes/009/pareto_nbd_derivations_2005-11-05.pdf 170 | """ 171 | 172 | r, alpha, s, beta = params 173 | 174 | if alpha < beta: 175 | min_of_alpha_beta, max_of_alpha_beta, t = (alpha, beta, r + freq) 176 | else: 177 | min_of_alpha_beta, max_of_alpha_beta, t = (beta, alpha, s + 1) 178 | abs_alpha_beta = max_of_alpha_beta - min_of_alpha_beta 179 | 180 | rsf = r + s + freq 181 | p_1 = hyp2f1(rsf, t, rsf + 1.0, abs_alpha_beta / (max_of_alpha_beta + recency)) 182 | q_1 = max_of_alpha_beta + recency 183 | p_2 = hyp2f1(rsf, t, rsf + 1.0, abs_alpha_beta / (max_of_alpha_beta + age)) 184 | q_2 = max_of_alpha_beta + age 185 | 186 | try: 187 | size = len(freq) 188 | sign = np.ones(size) 189 | except TypeError: 190 | sign = 1 191 | 192 | return logsumexp([log(p_1) + rsf * log(q_2), log(p_2) + rsf * log(q_1)], axis=0, b=[sign, -sign]) - rsf * log( 193 | q_1 * q_2 194 | ) 195 | 196 | @staticmethod 197 | def _conditional_log_likelihood( 198 | params, 199 | freq, 200 | rec, 201 | T 202 | ): 203 | """ 204 | Implements equation (18) from: 205 | http://brucehardie.com/notes/009/pareto_nbd_derivations_2005-11-05.pdf 206 | """ 207 | 208 | r, alpha, s, beta = params 209 | x = freq 210 | 211 | r_s_x = r + s + x 212 | 213 | A_1 = gammaln(r + x) - gammaln(r) + r * log(alpha) + s * log(beta) 214 | log_A_0 = ParetoNBDFitter._log_A_0(params, x, rec, T) 215 | 216 | A_2 = logaddexp(-(r + x) * log(alpha + T) - s * log(beta + T), log(s) + log_A_0 - log(r_s_x)) 217 | 218 | return A_1 + A_2 219 | 220 | @staticmethod 221 | def _negative_log_likelihood( 222 | params, 223 | freq, 224 | rec, 225 | T, 226 | weights, 227 | penalizer_coef 228 | ): 229 | """ 230 | Sums the conditional log-likelihood from the ``_conditional_log_likelihood`` function 231 | and applies a ``penalizer_coef``. 232 | """ 233 | 234 | if npany(asarray(params) <= 0.0): 235 | return np.inf 236 | 237 | conditional_log_likelihood = ParetoNBDFitter._conditional_log_likelihood(params, freq, rec, T) 238 | penalizer_term = penalizer_coef * sum(np.asarray(params) ** 2) 239 | 240 | return -(weights * conditional_log_likelihood).sum() / weights.mean() + penalizer_term 241 | 242 | def conditional_expected_number_of_purchases_up_to_time( 243 | self, 244 | t, 245 | frequency, 246 | recency, 247 | T 248 | ): 249 | """ 250 | Conditional expected number of purchases up to time. 251 | 252 | Calculate the expected number of repeat purchases up to time t for a 253 | randomly choose individual from the population, given they have 254 | purchase history (frequency, recency, T). 255 | 256 | This is equation (41) from: 257 | http://brucehardie.com/notes/009/pareto_nbd_derivations_2005-11-05.pdf 258 | 259 | Parameters 260 | ---------- 261 | t: array_like 262 | times to calculate the expectation for. 263 | frequency: array_like 264 | historical frequency of customer. 265 | recency: array_like 266 | historical recency of customer. 267 | T: array_like 268 | age of the customer. 
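# Illustrative sketch (not part of this module): ``_log_A_0`` above evaluates
# log(p_1 / q_1**rsf - p_2 / q_2**rsf) without forming the large powers directly,
# by using a weighted logsumexp (b=[1, -1]) to take the log of a difference of
# exponentials. A tiny self-contained check of that identity with made-up numbers:
import numpy as np
from scipy.special import logsumexp

p_1, q_1, p_2, q_2, rsf = 0.9, 55.0, 0.8, 60.0, 12.0

direct = np.log(p_1 / q_1 ** rsf - p_2 / q_2 ** rsf)
stable = logsumexp(
    [np.log(p_1) + rsf * np.log(q_2), np.log(p_2) + rsf * np.log(q_1)], b=[1, -1]
) - rsf * np.log(q_1 * q_2)

assert np.isclose(direct, stable)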
269 | 270 | Returns 271 | ------- 272 | array_like 273 | """ 274 | 275 | x, t_x = frequency, recency 276 | params = self._unload_params("r", "alpha", "s", "beta") 277 | r, alpha, s, beta = params 278 | 279 | likelihood = self._conditional_log_likelihood(params, x, t_x, T) 280 | first_term = ( 281 | gammaln(r + x) - gammaln(r) + r * log(alpha) + s * log(beta) - (r + x) * log(alpha + T) - s * log(beta + T) 282 | ) 283 | second_term = log(r + x) + log(beta + T) - log(alpha + T) 284 | third_term = log((1 - ((beta + T) / (beta + T + t)) ** (s - 1)) / (s - 1)) 285 | 286 | return exp(first_term + second_term + third_term - likelihood) 287 | 288 | def conditional_probability_alive( 289 | self, 290 | frequency, 291 | recency, 292 | T 293 | ): 294 | """ 295 | Conditional probability alive. 296 | 297 | Compute the probability that a customer with history 298 | (frequency, recency, T) is currently alive. 299 | 300 | Section 5.1 from (equations (36) and (37)): 301 | http://brucehardie.com/notes/009/pareto_nbd_derivations_2005-11-05.pdf 302 | 303 | Parameters 304 | ---------- 305 | frequency: float 306 | historical frequency of customer. 307 | recency: float 308 | historical recency of customer. 309 | T: float 310 | age of the customer. 311 | 312 | Returns 313 | ------- 314 | float 315 | value representing a probability 316 | 317 | """ 318 | 319 | x, t_x = frequency, recency 320 | r, alpha, s, beta = self._unload_params("r", "alpha", "s", "beta") 321 | A_0 = self._log_A_0([r, alpha, s, beta], x, t_x, T) 322 | 323 | return 1.0 / (1.0 + exp(log(s) - log(r + s + x) + (r + x) * log(alpha + T) + s * log(beta + T) + A_0)) 324 | 325 | def conditional_probability_alive_matrix( 326 | self, 327 | max_frequency=None, 328 | max_recency=None 329 | ): 330 | """ 331 | Compute the probability alive matrix. 332 | 333 | Builds on the ``conditional_probability_alive()`` method. 334 | 335 | Parameters 336 | ---------- 337 | max_frequency: float, optional 338 | the maximum frequency to plot. Default is max observed frequency. 339 | max_recency: float, optional 340 | the maximum recency to plot. This also determines the age of the 341 | customer. Default to max observed age. 342 | 343 | Returns 344 | ------- 345 | matrix: 346 | A matrix of the form [t_x: historical recency, x: historical frequency] 347 | """ 348 | 349 | max_frequency = max_frequency or int(self.data["frequency"].max()) 350 | max_recency = max_recency or int(self.data["T"].max()) 351 | 352 | Z = np.zeros((max_recency + 1, max_frequency + 1)) 353 | for i, recency in enumerate(np.arange(max_recency + 1)): 354 | for j, frequency in enumerate(np.arange(max_frequency + 1)): 355 | Z[i, j] = self.conditional_probability_alive(frequency, recency, max_recency) 356 | 357 | return Z 358 | 359 | def expected_number_of_purchases_up_to_time( 360 | self, 361 | t 362 | ): 363 | """ 364 | Return expected number of repeat purchases up to time t. 365 | 366 | Calculate the expected number of repeat purchases up to time t for a 367 | randomly choose individual from the population. 368 | 369 | Equation (27) from: 370 | http://brucehardie.com/notes/009/pareto_nbd_derivations_2005-11-05.pdf 371 | 372 | Parameters 373 | ---------- 374 | t: array_like 375 | times to calculate the expectation for. 
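# Illustrative usage sketch (not part of this module): a typical Pareto/NBD workflow
# with the methods above, assuming the bundled CDNOW summary loader (whose time unit
# is weeks); the 39-week horizon is an arbitrary choice.
from lifetimes import ParetoNBDFitter
from lifetimes.datasets import load_cdnow_summary

cdnow = load_cdnow_summary(index_col=[0])

pnbd = ParetoNBDFitter(penalizer_coef=0.0)
pnbd.fit(cdnow["frequency"], cdnow["recency"], cdnow["T"])

t = 39  # weeks ahead
cdnow["predicted_purchases"] = pnbd.conditional_expected_number_of_purchases_up_to_time(
    t, cdnow["frequency"], cdnow["recency"], cdnow["T"]
)
cdnow["p_alive"] = pnbd.conditional_probability_alive(cdnow["frequency"], cdnow["recency"], cdnow["T"])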
376 | 377 | Returns 378 | ------- 379 | array_like 380 | """ 381 | 382 | r, alpha, s, beta = self._unload_params("r", "alpha", "s", "beta") 383 | first_term = r * beta / alpha / (s - 1) 384 | second_term = 1 - (beta / (beta + t)) ** (s - 1) 385 | 386 | return first_term * second_term 387 | 388 | def conditional_probability_of_n_purchases_up_to_time( 389 | self, 390 | n, 391 | t, 392 | frequency, 393 | recency, 394 | T 395 | ): 396 | """ 397 | Return conditional probability of n purchases up to time t. 398 | 399 | Calculate the probability of n purchases up to time t for an individual 400 | with history frequency, recency and T (age). 401 | 402 | The main equation being implemented is (16) from: 403 | http://www.brucehardie.com/notes/028/pareto_nbd_conditional_pmf.pdf 404 | 405 | Parameters 406 | ---------- 407 | n: int 408 | number of purchases. 409 | t: a scalar 410 | time up to which probability should be calculated. 411 | frequency: float 412 | historical frequency of customer. 413 | recency: float 414 | historical recency of customer. 415 | T: float 416 | age of the customer. 417 | 418 | Returns 419 | ------- 420 | array_like 421 | """ 422 | 423 | if t <= 0: 424 | return 0 425 | 426 | x, t_x = frequency, recency 427 | params = self._unload_params("r", "alpha", "s", "beta") 428 | r, alpha, s, beta = params 429 | 430 | if alpha < beta: 431 | min_of_alpha_beta, max_of_alpha_beta, p, _, _ = (alpha, beta, r + x + n, r + x, r + x + 1) 432 | else: 433 | min_of_alpha_beta, max_of_alpha_beta, p, _, _ = (beta, alpha, s + 1, s + 1, s) 434 | abs_alpha_beta = max_of_alpha_beta - min_of_alpha_beta 435 | 436 | log_l = self._conditional_log_likelihood(params, x, t_x, T) 437 | log_p_zero = ( 438 | gammaln(r + x) 439 | + r * log(alpha) 440 | + s * log(beta) 441 | - (gammaln(r) + (r + x) * log(alpha + T) + s * log(beta + T) + log_l) 442 | ) 443 | log_B_one = ( 444 | gammaln(r + x + n) 445 | + r * log(alpha) 446 | + s * log(beta) 447 | - (gammaln(r) + (r + x + n) * log(alpha + T + t) + s * log(beta + T + t)) 448 | ) 449 | log_B_two = ( 450 | r * log(alpha) 451 | + s * log(beta) 452 | + gammaln(r + s + x) 453 | + betaln(r + x + n, s + 1) 454 | + log(hyp2f1(r + s + x, p, r + s + x + n + 1, abs_alpha_beta / (max_of_alpha_beta + T))) 455 | - (gammaln(r) + gammaln(s) + (r + s + x) * log(max_of_alpha_beta + T)) 456 | ) 457 | 458 | def _log_B_three(i): 459 | return ( 460 | r * log(alpha) 461 | + s * log(beta) 462 | + gammaln(r + s + x + i) 463 | + betaln(r + x + n, s + 1) 464 | + log(hyp2f1(r + s + x + i, p, r + s + x + n + 1, abs_alpha_beta / (max_of_alpha_beta + T + t))) 465 | - (gammaln(r) + gammaln(s) + (r + s + x + i) * log(max_of_alpha_beta + T + t)) 466 | ) 467 | 468 | zeroth_term = (n == 0) * (1 - exp(log_p_zero)) 469 | first_term = n * log(t) - gammaln(n + 1) + log_B_one - log_l 470 | second_term = log_B_two - log_l 471 | third_term = logsumexp([i * log(t) - gammaln(i + 1) + _log_B_three(i) - log_l for i in range(n + 1)], axis=0) 472 | 473 | try: 474 | size = len(x) 475 | sign = np.ones(size) 476 | except TypeError: 477 | sign = 1 478 | 479 | # In some scenarios (e.g. large n) tiny numerical errors in the calculation of second_term and third_term 480 | # cause sumexp to be ever so slightly negative and logsumexp throws an error. Hence we ignore the sign here. 
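        # Concretely: with return_sign=True, logsumexp(..., b=[sign, sign, -sign])
        # returns the log of |exp(first_term) + exp(second_term) - exp(third_term)|
        # together with its sign; only the magnitude ([0]) is kept, so a sign that
        # flips purely from rounding (when the true difference is ~0) cannot raise.
        # ``sign`` itself is just an array of ones matching the input shape.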
481 | return zeroth_term + exp( 482 | logsumexp([first_term, second_term, third_term], b=[sign, sign, -sign], axis=0, return_sign=True)[0] 483 | ) 484 | 485 | def _fit( 486 | self, 487 | minimizing_function_args, 488 | iterative_fitting, 489 | initial_params, 490 | params_size, 491 | disp, 492 | tol=1e-6, 493 | fit_method="Nelder-Mead", 494 | maxiter=2000, 495 | **kwargs 496 | ): 497 | """ 498 | Fit function for fitters. 499 | 500 | Minimizer Callback for this fitters class. 501 | """ 502 | 503 | ll = [] 504 | sols = [] 505 | 506 | if iterative_fitting <= 0: 507 | raise ValueError("iterative_fitting parameter should be greater than 0 as of lifetimes v0.2.1") 508 | 509 | if iterative_fitting > 1 and initial_params is not None: 510 | raise ValueError( 511 | "iterative_fitting and initial_params should not be both set, as no improvement could be made." 512 | ) 513 | 514 | # set options for minimize, if specified in kwargs will be overwritten 515 | minimize_options = {} 516 | minimize_options["disp"] = disp 517 | minimize_options["maxiter"] = maxiter 518 | minimize_options.update(kwargs) 519 | 520 | total_count = 0 521 | while total_count < iterative_fitting: 522 | current_init_params = ( 523 | np.random.normal(1.0, scale=0.05, size=params_size) if initial_params is None else initial_params 524 | ) 525 | if minimize_options["disp"]: 526 | print("Optimize function with {}".format(fit_method)) 527 | 528 | output = minimize( 529 | self._negative_log_likelihood, 530 | method=fit_method, 531 | tol=tol, 532 | x0=current_init_params, 533 | args=minimizing_function_args, 534 | options=minimize_options, 535 | ) 536 | sols.append(output.x) 537 | ll.append(output.fun) 538 | 539 | total_count += 1 540 | argmin_ll, min_ll = min(enumerate(ll), key=lambda x: x[1]) 541 | minimizing_params = sols[argmin_ll] 542 | 543 | return minimizing_params, min_ll -------------------------------------------------------------------------------- /lifetimes/generate_data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import warnings 3 | 4 | warnings.simplefilter(action="ignore", category=FutureWarning) 5 | import numpy as np 6 | from numpy import random 7 | import pandas as pd 8 | 9 | 10 | def beta_geometric_nbd_model(T, r, alpha, a, b, size=1): 11 | """ 12 | Generate artificial data according to the BG/NBD model. 13 | 14 | See [1] for model details 15 | 16 | Parameters 17 | ---------- 18 | T: array_like 19 | The length of time observing new customers. 20 | r, alpha, a, b: float 21 | Parameters in the model. See [1]_ 22 | size: int, optional 23 | The number of customers to generate 24 | 25 | Returns 26 | ------- 27 | DataFrame 28 | With index as customer_ids and the following columns: 29 | 'frequency', 'recency', 'T', 'lambda', 'p', 'alive', 'customer_id' 30 | 31 | References 32 | ---------- 33 | .. 
[1]: '"Counting Your Customers" the Easy Way: An Alternative to the Pareto/NBD Model' 34 | (http://brucehardie.com/papers/bgnbd_2004-04-20.pdf) 35 | 36 | """ 37 | if type(T) in [float, int]: 38 | T = T * np.ones(size) 39 | else: 40 | T = np.asarray(T) 41 | 42 | probability_of_post_purchase_death = random.beta(a, b, size=size) 43 | lambda_ = random.gamma(r, scale=1.0 / alpha, size=size) 44 | 45 | columns = ["frequency", "recency", "T", "lambda", "p", "alive", "customer_id"] 46 | df = pd.DataFrame(np.zeros((size, len(columns))), columns=columns) 47 | 48 | for i in range(size): 49 | p = probability_of_post_purchase_death[i] 50 | l = lambda_[i] 51 | 52 | # hacky until I can find something better 53 | times = [] 54 | next_purchase_in = random.exponential(scale=1.0 / l) 55 | alive = True 56 | while (np.sum(times) + next_purchase_in < T[i]) and alive: 57 | times.append(next_purchase_in) 58 | next_purchase_in = random.exponential(scale=1.0 / l) 59 | alive = random.random() > p 60 | 61 | times = np.array(times).cumsum() 62 | df.iloc[i] = ( 63 | np.unique(np.array(times).astype(int)).shape[0], 64 | np.max(times if times.shape[0] > 0 else 0), 65 | T[i], 66 | l, 67 | p, 68 | alive, 69 | i, 70 | ) 71 | 72 | return df.set_index("customer_id") 73 | 74 | 75 | def beta_geometric_nbd_model_transactional_data(T, r, alpha, a, b, observation_period_end="2019-1-1", freq="D", size=1): 76 | """ 77 | Generate artificial transactional data according to the BG/NBD model. 78 | 79 | See [1] for model details 80 | 81 | Parameters 82 | ---------- 83 | T: int, float or array_like 84 | The length of time observing new customers. 85 | r, alpha, a, b: float 86 | Parameters in the model. See [1]_ 87 | observation_period_end: date_like 88 | The date observation ends 89 | freq: string, optional 90 | Default 'D' for days, 'W' for weeks, 'h' for hours 91 | size: int, optional 92 | The number of customers to generate 93 | 94 | Returns 95 | ------- 96 | DataFrame 97 | The following columns: 98 | 'customer_id', 'date' 99 | 100 | References 101 | ---------- 102 | .. 
[1]: '"Counting Your Customers" the Easy Way: An Alternative to the Pareto/NBD Model' 103 | (http://brucehardie.com/papers/bgnbd_2004-04-20.pdf) 104 | 105 | """ 106 | observation_period_end = pd.to_datetime(observation_period_end) 107 | 108 | if type(T) in [float, int]: 109 | start_date = [observation_period_end - pd.Timedelta(T - 1, unit=freq)] * size 110 | T = T * np.ones(size) 111 | else: 112 | start_date = [observation_period_end - pd.Timedelta(T[i] - 1, unit=freq) for i in range(size)] 113 | T = np.asarray(T) 114 | 115 | probability_of_post_purchase_death = random.beta(a, b, size=size) 116 | lambda_ = random.gamma(r, scale=1.0 / alpha, size=size) 117 | 118 | columns = ["customer_id", "date"] 119 | df = pd.DataFrame(columns=columns) 120 | 121 | for i in range(size): 122 | s = start_date[i] 123 | p = probability_of_post_purchase_death[i] 124 | l = lambda_[i] 125 | age = T[i] 126 | 127 | purchases = [[i, s - pd.Timedelta(1, unit=freq)]] 128 | next_purchase_in = random.exponential(scale=1.0 / l) 129 | alive = True 130 | 131 | while next_purchase_in < age and alive: 132 | purchases.append([i, s + pd.Timedelta(next_purchase_in, unit=freq)]) 133 | next_purchase_in += random.exponential(scale=1.0 / l) 134 | alive = random.random() > p 135 | 136 | df = df.append(pd.DataFrame(purchases, columns=columns)) 137 | 138 | return df.reset_index(drop=True) 139 | 140 | 141 | def pareto_nbd_model(T, r, alpha, s, beta, size=1): 142 | """ 143 | Generate artificial data according to the Pareto/NBD model. 144 | 145 | See [2]_ for model details. 146 | 147 | Parameters 148 | ---------- 149 | T: array_like 150 | The length of time observing new customers. 151 | r, alpha, s, beta: float 152 | Parameters in the model. See [1]_ 153 | size: int, optional 154 | The number of customers to generate 155 | 156 | Returns 157 | ------- 158 | :obj: DataFrame 159 | with index as customer_ids and the following columns: 160 | 'frequency', 'recency', 'T', 'lambda', 'mu', 'alive', 'customer_id' 161 | 162 | References 163 | ---------- 164 | .. [2]: Fader, Peter S. and Bruce G. S. Hardie (2005), "A Note on Deriving the Pareto/NBD Model 165 | and Related Expressions," . 166 | 167 | """ 168 | if type(T) in [float, int]: 169 | T = T * np.ones(size) 170 | else: 171 | T = np.asarray(T) 172 | 173 | lambda_ = random.gamma(r, scale=1.0 / alpha, size=size) 174 | mus = random.gamma(s, scale=1.0 / beta, size=size) 175 | 176 | columns = ["frequency", "recency", "T", "lambda", "mu", "alive", "customer_id"] 177 | df = pd.DataFrame(np.zeros((size, len(columns))), columns=columns) 178 | 179 | for i in range(size): 180 | l = lambda_[i] 181 | mu = mus[i] 182 | time_of_death = random.exponential(scale=1.0 / mu) 183 | 184 | # hacky until I can find something better 185 | times = [] 186 | next_purchase_in = random.exponential(scale=1.0 / l) 187 | while np.sum(times) + next_purchase_in < min(time_of_death, T[i]): 188 | times.append(next_purchase_in) 189 | next_purchase_in = random.exponential(scale=1.0 / l) 190 | 191 | times = np.array(times).cumsum() 192 | df.iloc[i] = ( 193 | np.unique(np.array(times).astype(int)).shape[0], 194 | np.max(times if times.shape[0] > 0 else 0), 195 | T[i], 196 | l, 197 | mu, 198 | time_of_death > T[i], 199 | i, 200 | ) 201 | 202 | return df.set_index("customer_id") 203 | 204 | 205 | def modified_beta_geometric_nbd_model(T, r, alpha, a, b, size=1): 206 | """ 207 | Generate artificial data according to the MBG/NBD model. 
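# Illustrative sketch (not part of this module): the simulators in this file make it
# easy to check that a fitter recovers known parameters. A minimal round trip with
# the BG/NBD simulator defined above; the parameter values and sample size are
# arbitrary.
import numpy as np
from lifetimes import BetaGeoFitter
from lifetimes.generate_data import beta_geometric_nbd_model

np.random.seed(0)
true_params = dict(r=0.24, alpha=4.41, a=0.79, b=2.43)
simulated = beta_geometric_nbd_model(T=100, size=5000, **true_params)

bgf = BetaGeoFitter(penalizer_coef=0.0)
bgf.fit(simulated["frequency"], simulated["recency"], simulated["T"])
print(bgf.params_)  # should land reasonably close to true_params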
208 | 209 | See [3]_, [4]_ for model details 210 | 211 | Parameters 212 | ---------- 213 | T: array_like 214 | The length of time observing new customers. 215 | r, alpha, a, b: float 216 | Parameters in the model. See [1]_ 217 | size: int, optional 218 | The number of customers to generate 219 | 220 | Returns 221 | ------- 222 | DataFrame 223 | with index as customer_ids and the following columns: 224 | 'frequency', 'recency', 'T', 'lambda', 'p', 'alive', 'customer_id' 225 | 226 | References 227 | ---------- 228 | .. [1]: '"Counting Your Customers" the Easy Way: An Alternative to the Pareto/NBD Model' 229 | (http://brucehardie.com/papers/bgnbd_2004-04-20.pdf) 230 | .. [2] Batislam, E.P., M. Denizel, A. Filiztekin (2007), 231 | "Empirical validation and comparison of models for customer base analysis," 232 | International Journal of Research in Marketing, 24 (3), 201-209. 233 | 234 | """ 235 | if type(T) in [float, int]: 236 | T = T * np.ones(size) 237 | else: 238 | T = np.asarray(T) 239 | 240 | probability_of_post_purchase_death = random.beta(a, b, size=size) 241 | lambda_ = random.gamma(r, scale=1.0 / alpha, size=size) 242 | 243 | columns = ["frequency", "recency", "T", "lambda", "p", "alive", "customer_id"] 244 | df = pd.DataFrame(np.zeros((size, len(columns))), columns=columns) 245 | 246 | for i in range(size): 247 | p = probability_of_post_purchase_death[i] 248 | l = lambda_[i] 249 | 250 | # hacky until I can find something better 251 | times = [] 252 | next_purchase_in = random.exponential(scale=1.0 / l) 253 | alive = random.random() > p # essentially the difference between this model and BG/NBD 254 | while (np.sum(times) + next_purchase_in < T[i]) and alive: 255 | times.append(next_purchase_in) 256 | next_purchase_in = random.exponential(scale=1.0 / l) 257 | alive = random.random() > p 258 | 259 | times = np.array(times).cumsum() 260 | df.iloc[i] = ( 261 | np.unique(np.array(times).astype(int)).shape[0], 262 | np.max(times if times.shape[0] > 0 else 0), 263 | T[i], 264 | l, 265 | p, 266 | alive, 267 | i, 268 | ) 269 | 270 | return df.set_index("customer_id") 271 | 272 | 273 | def beta_geometric_beta_binom_model(N, alpha, beta, gamma, delta, size=1): 274 | """ 275 | Generate artificial data according to the Beta-Geometric/Beta-Binomial 276 | Model. 277 | 278 | You may wonder why we can have frequency = n_periods, when frequency excludes their 279 | first order. When a customer purchases something, they are born, _and in the next 280 | period_ we start asking questions about their alive-ness. So really they customer has 281 | bought frequency + 1, and been observed for n_periods + 1 282 | 283 | Parameters 284 | ---------- 285 | N: array_like 286 | Number of transaction opportunities for new customers. 287 | alpha, beta, gamma, delta: float 288 | Parameters in the model. See [1]_ 289 | size: int, optional 290 | The number of customers to generate 291 | 292 | Returns 293 | ------- 294 | DataFrame 295 | with index as customer_ids and the following columns: 296 | 'frequency', 'recency', 'n_periods', 'lambda', 'p', 'alive', 'customer_id' 297 | 298 | References 299 | ---------- 300 | .. [1] Fader, Peter S., Bruce G.S. Hardie, and Jen Shang (2010), 301 | "Customer-Base Analysis in a Discrete-Time Noncontractual Setting," 302 | Marketing Science, 29 (6), 1086-1108. 
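# Illustrative sketch (not part of this module): the note above (frequency can equal
# n_periods because the first "birth" purchase is not counted and the alive/dead
# questions start one period later) is easy to see by simulation; the parameter
# values are arbitrary.
import numpy as np
from lifetimes.generate_data import beta_geometric_beta_binom_model

np.random.seed(1)
sim = beta_geometric_beta_binom_model(N=6, alpha=1.2, beta=0.75, gamma=0.66, delta=2.78, size=5000)
print(sim[["frequency", "n_periods"]].max())  # both columns can reach 6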
303 | 304 | """ 305 | 306 | if type(N) in [float, int, np.int64]: 307 | N = N * np.ones(size) 308 | else: 309 | N = np.asarray(N) 310 | 311 | probability_of_post_purchase_death = random.beta(a=alpha, b=beta, size=size) 312 | thetas = random.beta(a=gamma, b=delta, size=size) 313 | 314 | columns = ["frequency", "recency", "n_periods", "p", "theta", "alive", "customer_id"] 315 | df = pd.DataFrame(np.zeros((size, len(columns))), columns=columns) 316 | for i in range(size): 317 | p = probability_of_post_purchase_death[i] 318 | theta = thetas[i] 319 | 320 | # hacky until I can find something better 321 | current_t = 0 322 | alive = True 323 | times = [] 324 | while current_t < N[i] and alive: 325 | alive = random.binomial(1, theta) == 0 326 | if alive and random.binomial(1, p) == 1: 327 | times.append(current_t) 328 | current_t += 1 329 | # adding in final death opportunity to agree with [1] 330 | if alive: 331 | alive = random.binomial(1, theta) == 0 332 | df.iloc[i] = len(times), times[-1] + 1 if len(times) != 0 else 0, N[i], p, theta, alive, i 333 | return df 334 | -------------------------------------------------------------------------------- /lifetimes/plotting.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import pandas as pd 4 | from lifetimes.utils import calculate_alive_path, expected_cumulative_transactions 5 | from scipy import stats 6 | 7 | __all__ = [ 8 | "plot_period_transactions", 9 | "plot_calibration_purchases_vs_holdout_purchases", 10 | "plot_frequency_recency_matrix", 11 | "plot_probability_alive_matrix", 12 | "plot_expected_repeat_purchases", 13 | "plot_history_alive", 14 | "plot_cumulative_transactions", 15 | "plot_incremental_transactions", 16 | "plot_transaction_rate_heterogeneity", 17 | "plot_dropout_rate_heterogeneity", 18 | ] 19 | 20 | 21 | def coalesce(*args): 22 | return next(s for s in args if s is not None) 23 | 24 | 25 | def plot_period_transactions( 26 | model, 27 | max_frequency=7, 28 | title="Frequency of Repeat Transactions", 29 | xlabel="Number of Calibration Period Transactions", 30 | ylabel="Customers", 31 | **kwargs 32 | ): 33 | """ 34 | Plot a figure with period actual and predicted transactions. 35 | 36 | Parameters 37 | ---------- 38 | model: lifetimes model 39 | A fitted lifetimes model. 40 | max_frequency: int, optional 41 | The maximum frequency to plot. 42 | title: str, optional 43 | Figure title 44 | xlabel: str, optional 45 | Figure xlabel 46 | ylabel: str, optional 47 | Figure ylabel 48 | kwargs 49 | Passed into the matplotlib.pyplot.plot command. 
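# Illustrative usage sketch (not part of this module): ``plot_period_transactions``
# compares the observed frequency histogram with one simulated from the fitted
# model, which makes it a quick visual goodness-of-fit check. The bundled CDNOW
# summary loader is assumed; any fitted lifetimes model could be passed instead.
from matplotlib import pyplot as plt
from lifetimes import BetaGeoFitter
from lifetimes.datasets import load_cdnow_summary
from lifetimes.plotting import plot_period_transactions

cdnow = load_cdnow_summary(index_col=[0])
bgf = BetaGeoFitter().fit(cdnow["frequency"], cdnow["recency"], cdnow["T"])

plot_period_transactions(bgf, max_frequency=7)
plt.show()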
50 | 51 | Returns 52 | ------- 53 | axes: matplotlib.AxesSubplot 54 | 55 | """ 56 | from matplotlib import pyplot as plt 57 | 58 | labels = kwargs.pop("label", ["Actual", "Model"]) 59 | 60 | n = model.data.shape[0] 61 | simulated_data = model.generate_new_data(size=n) 62 | 63 | model_counts = pd.DataFrame(model.data["frequency"].value_counts().sort_index().iloc[:max_frequency]) 64 | simulated_counts = pd.DataFrame(simulated_data["frequency"].value_counts().sort_index().iloc[:max_frequency]) 65 | combined_counts = model_counts.merge(simulated_counts, how="outer", left_index=True, right_index=True).fillna(0) 66 | combined_counts.columns = labels 67 | 68 | ax = combined_counts.plot(kind="bar", **kwargs) 69 | 70 | plt.legend() 71 | plt.title(title) 72 | plt.ylabel(ylabel) 73 | plt.xlabel(xlabel) 74 | return ax 75 | 76 | 77 | def plot_calibration_purchases_vs_holdout_purchases( 78 | model, calibration_holdout_matrix, kind="frequency_cal", n=7, **kwargs 79 | ): 80 | """ 81 | Plot calibration purchases vs holdout. 82 | 83 | This currently relies too much on the lifetimes.util calibration_and_holdout_data function. 84 | 85 | Parameters 86 | ---------- 87 | model: lifetimes model 88 | A fitted lifetimes model. 89 | calibration_holdout_matrix: pandas DataFrame 90 | DataFrame from calibration_and_holdout_data function. 91 | kind: str, optional 92 | x-axis :"frequency_cal". Purchases in calibration period, 93 | "recency_cal". Age of customer at last purchase, 94 | "T_cal". Age of customer at the end of calibration period, 95 | "time_since_last_purchase". Time since user made last purchase 96 | n: int, optional 97 | Number of ticks on the x axis 98 | Returns 99 | ------- 100 | axes: matplotlib.AxesSubplot 101 | 102 | """ 103 | from matplotlib import pyplot as plt 104 | 105 | x_labels = { 106 | "frequency_cal": "Purchases in calibration period", 107 | "recency_cal": "Age of customer at last purchase", 108 | "T_cal": "Age of customer at the end of calibration period", 109 | "time_since_last_purchase": "Time since user made last purchase", 110 | } 111 | summary = calibration_holdout_matrix.copy() 112 | duration_holdout = summary.iloc[0]["duration_holdout"] 113 | 114 | summary["model_predictions"] = model.conditional_expected_number_of_purchases_up_to_time( 115 | duration_holdout, summary["frequency_cal"], summary["recency_cal"], summary["T_cal"]) 116 | 117 | if kind == "time_since_last_purchase": 118 | summary["time_since_last_purchase"] = summary["T_cal"] - summary["recency_cal"] 119 | ax = ( 120 | summary.groupby(["time_since_last_purchase"])[["frequency_holdout", "model_predictions"]] 121 | .mean() 122 | .iloc[:n] 123 | .plot(**kwargs) 124 | ) 125 | else: 126 | ax = summary.groupby(kind)[["frequency_holdout", "model_predictions"]].mean().iloc[:n].plot(**kwargs) 127 | 128 | plt.title("Actual Purchases in Holdout Period vs Predicted Purchases") 129 | plt.xlabel(x_labels[kind]) 130 | plt.ylabel("Average of Purchases in Holdout Period") 131 | plt.legend() 132 | 133 | return ax 134 | 135 | 136 | def plot_frequency_recency_matrix( 137 | model, 138 | T=1, 139 | max_frequency=None, 140 | max_recency=None, 141 | title=None, 142 | xlabel="Customer's Historical Frequency", 143 | ylabel="Customer's Recency", 144 | **kwargs 145 | ): 146 | """ 147 | Plot recency frequecy matrix as heatmap. 148 | 149 | Plot a figure of expected transactions in T next units of time by a customer's frequency and recency. 150 | 151 | Parameters 152 | ---------- 153 | model: lifetimes model 154 | A fitted lifetimes model. 
155 | T: fload, optional 156 | Next units of time to make predictions for 157 | max_frequency: int, optional 158 | The maximum frequency to plot. Default is max observed frequency. 159 | max_recency: int, optional 160 | The maximum recency to plot. This also determines the age of the customer. 161 | Default to max observed age. 162 | title: str, optional 163 | Figure title 164 | xlabel: str, optional 165 | Figure xlabel 166 | ylabel: str, optional 167 | Figure ylabel 168 | kwargs 169 | Passed into the matplotlib.imshow command. 170 | 171 | Returns 172 | ------- 173 | axes: matplotlib.AxesSubplot 174 | 175 | """ 176 | from matplotlib import pyplot as plt 177 | 178 | if max_frequency is None: 179 | max_frequency = int(model.data["frequency"].max()) 180 | 181 | if max_recency is None: 182 | max_recency = int(model.data["T"].max()) 183 | 184 | Z = np.zeros((max_recency + 1, max_frequency + 1)) 185 | for i, recency in enumerate(np.arange(max_recency + 1)): 186 | for j, frequency in enumerate(np.arange(max_frequency + 1)): 187 | Z[i, j] = model.conditional_expected_number_of_purchases_up_to_time(T, frequency, recency, max_recency) 188 | 189 | interpolation = kwargs.pop("interpolation", "none") 190 | 191 | ax = plt.subplot(111) 192 | pcm = ax.imshow(Z, interpolation=interpolation, **kwargs) 193 | plt.xlabel(xlabel) 194 | plt.ylabel(ylabel) 195 | if title is None: 196 | title = ( 197 | "Expected Number of Future Purchases for {} Unit{} of Time,".format(T, "s"[T == 1 :]) 198 | + "\nby Frequency and Recency of a Customer" 199 | ) 200 | plt.title(title) 201 | 202 | # turn matrix into square 203 | forceAspect(ax) 204 | 205 | # plot colorbar beside matrix 206 | plt.colorbar(pcm, ax=ax) 207 | 208 | return ax 209 | 210 | 211 | def plot_probability_alive_matrix( 212 | model, 213 | max_frequency=None, 214 | max_recency=None, 215 | title="Probability Customer is Alive,\nby Frequency and Recency of a Customer", 216 | xlabel="Customer's Historical Frequency", 217 | ylabel="Customer's Recency", 218 | **kwargs 219 | ): 220 | """ 221 | Plot probability alive matrix as heatmap. 222 | 223 | Plot a figure of the probability a customer is alive based on their 224 | frequency and recency. 225 | 226 | Parameters 227 | ---------- 228 | model: lifetimes model 229 | A fitted lifetimes model. 230 | max_frequency: int, optional 231 | The maximum frequency to plot. Default is max observed frequency. 232 | max_recency: int, optional 233 | The maximum recency to plot. This also determines the age of the customer. 234 | Default to max observed age. 235 | title: str, optional 236 | Figure title 237 | xlabel: str, optional 238 | Figure xlabel 239 | ylabel: str, optional 240 | Figure ylabel 241 | kwargs 242 | Passed into the matplotlib.imshow command. 
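# Illustrative usage sketch (not part of this module): the two matrix plots are
# usually drawn together after fitting a BG/NBD model; the CDNOW summary loader and
# the 10-period horizon are assumptions of this sketch.
from matplotlib import pyplot as plt
from lifetimes import BetaGeoFitter
from lifetimes.datasets import load_cdnow_summary
from lifetimes.plotting import plot_frequency_recency_matrix, plot_probability_alive_matrix

cdnow = load_cdnow_summary(index_col=[0])
bgf = BetaGeoFitter().fit(cdnow["frequency"], cdnow["recency"], cdnow["T"])

plot_frequency_recency_matrix(bgf, T=10)  # expected purchases in the next 10 periods
plt.figure()
plot_probability_alive_matrix(bgf)
plt.show()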
243 | 244 | Returns 245 | ------- 246 | axes: matplotlib.AxesSubplot 247 | 248 | """ 249 | from matplotlib import pyplot as plt 250 | 251 | z = model.conditional_probability_alive_matrix(max_frequency, max_recency) 252 | 253 | interpolation = kwargs.pop("interpolation", "none") 254 | 255 | ax = plt.subplot(111) 256 | pcm = ax.imshow(z, interpolation=interpolation, **kwargs) 257 | plt.xlabel(xlabel) 258 | plt.ylabel(ylabel) 259 | plt.title(title) 260 | 261 | # turn matrix into square 262 | forceAspect(ax) 263 | 264 | # plot colorbar beside matrix 265 | plt.colorbar(pcm, ax=ax) 266 | 267 | return ax 268 | 269 | 270 | def plot_expected_repeat_purchases( 271 | model, 272 | title="Expected Number of Repeat Purchases per Customer", 273 | xlabel="Time Since First Purchase", 274 | ax=None, 275 | label=None, 276 | **kwargs 277 | ): 278 | """ 279 | Plot expected repeat purchases on calibration period . 280 | 281 | Parameters 282 | ---------- 283 | model: lifetimes model 284 | A fitted lifetimes model. 285 | max_frequency: int, optional 286 | The maximum frequency to plot. 287 | title: str, optional 288 | Figure title 289 | xlabel: str, optional 290 | Figure xlabel 291 | ax: matplotlib.AxesSubplot, optional 292 | Using user axes 293 | label: str, optional 294 | Label for plot. 295 | kwargs 296 | Passed into the matplotlib.pyplot.plot command. 297 | 298 | Returns 299 | ------- 300 | axes: matplotlib.AxesSubplot 301 | 302 | """ 303 | from matplotlib import pyplot as plt 304 | 305 | if ax is None: 306 | ax = plt.subplot(111) 307 | 308 | if plt.matplotlib.__version__ >= "1.5": 309 | color_cycle = ax._get_lines.prop_cycler 310 | color = coalesce(kwargs.pop("c", None), kwargs.pop("color", None), next(color_cycle)["color"]) 311 | else: 312 | color_cycle = ax._get_lines.color_cycle 313 | color = coalesce(kwargs.pop("c", None), kwargs.pop("color", None), next(color_cycle)) 314 | 315 | max_T = model.data["T"].max() 316 | 317 | times = np.linspace(0, max_T, 100) 318 | ax.plot(times, model.expected_number_of_purchases_up_to_time(times), color=color, label=label, **kwargs) 319 | 320 | times = np.linspace(max_T, 1.5 * max_T, 100) 321 | ax.plot(times, model.expected_number_of_purchases_up_to_time(times), color=color, ls="--", **kwargs) 322 | 323 | plt.title(title) 324 | plt.xlabel(xlabel) 325 | plt.legend(loc="lower right") 326 | return ax 327 | 328 | 329 | def plot_history_alive(model, t, transactions, datetime_col, freq="D", start_date=None, ax=None, **kwargs): 330 | """ 331 | Draw a graph showing the probability of being alive for a customer in time. 332 | 333 | Parameters 334 | ---------- 335 | model: lifetimes model 336 | A fitted lifetimes model. 337 | t: int 338 | the number of time units since the birth we want to draw the p_alive 339 | transactions: pandas DataFrame 340 | DataFrame containing the transactions history of the customer_id 341 | datetime_col: str 342 | The column in the transactions that denotes the datetime the purchase was made 343 | freq: str, optional 344 | Default 'D' for days. Other examples= 'W' for weekly 345 | start_date: datetime, optional 346 | Limit xaxis to start date 347 | ax: matplotlib.AxesSubplot, optional 348 | Using user axes 349 | kwargs 350 | Passed into the matplotlib.pyplot.plot command. 
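# Illustrative usage sketch (not part of this module): ``plot_history_alive`` works
# on the raw transaction log of a single customer. The bundled example transaction
# data is assumed, and customer id 35 and the 200-day window are arbitrary
# illustrative choices.
from matplotlib import pyplot as plt
from lifetimes import BetaGeoFitter
from lifetimes.datasets import load_transaction_data
from lifetimes.plotting import plot_history_alive
from lifetimes.utils import summary_data_from_transaction_data

transactions = load_transaction_data()  # columns: 'date', 'id'
summary = summary_data_from_transaction_data(transactions, "id", "date")
bgf = BetaGeoFitter().fit(summary["frequency"], summary["recency"], summary["T"])

one_customer = transactions.loc[transactions["id"] == 35]
plot_history_alive(bgf, 200, one_customer, "date")
plt.show()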
351 | 352 | Returns 353 | ------- 354 | axes: matplotlib.AxesSubplot 355 | 356 | """ 357 | from matplotlib import pyplot as plt 358 | 359 | if start_date is None: 360 | start_date = min(transactions[datetime_col]) 361 | 362 | if ax is None: 363 | ax = plt.subplot(111) 364 | 365 | # Get purchasing history of user 366 | customer_history = transactions[[datetime_col]].copy() 367 | customer_history.index = pd.DatetimeIndex(customer_history[datetime_col]) 368 | 369 | # Add transactions column 370 | customer_history["transactions"] = 1 371 | customer_history = customer_history.resample(freq).sum() 372 | 373 | # plot alive_path 374 | path = calculate_alive_path(model, transactions, datetime_col, t, freq) 375 | path_dates = pd.date_range(start=min(transactions[datetime_col]), periods=len(path), freq=freq) 376 | plt.plot(path_dates, path, "-", label="P_alive") 377 | 378 | # plot buying dates 379 | payment_dates = customer_history[customer_history["transactions"] >= 1].index 380 | plt.vlines(payment_dates.values, ymin=0, ymax=1, colors="r", linestyles="dashed", label="purchases") 381 | 382 | plt.ylim(0, 1.0) 383 | plt.yticks(np.arange(0, 1.1, 0.1)) 384 | plt.xlim(start_date, path_dates[-1]) 385 | plt.legend(loc=3) 386 | plt.ylabel("P_alive") 387 | plt.title("History of P_alive") 388 | 389 | return ax 390 | 391 | 392 | def plot_cumulative_transactions( 393 | model, 394 | transactions, 395 | datetime_col, 396 | customer_id_col, 397 | t, 398 | t_cal, 399 | datetime_format=None, 400 | freq="D", 401 | set_index_date=False, 402 | title="Tracking Cumulative Transactions", 403 | xlabel="day", 404 | ylabel="Cumulative Transactions", 405 | ax=None, 406 | **kwargs 407 | ): 408 | """ 409 | Plot a figure of the predicted and actual cumulative transactions of users. 410 | 411 | Parameters 412 | ---------- 413 | model: lifetimes model 414 | A fitted lifetimes model 415 | transactions: pandas DataFrame 416 | DataFrame containing the transactions history of the customer_id 417 | datetime_col: str 418 | The column in transactions that denotes the datetime the purchase was made. 419 | customer_id_col: str 420 | The column in transactions that denotes the customer_id 421 | t: float 422 | The number of time units since the begining of 423 | data for which we want to calculate cumulative transactions 424 | t_cal: float 425 | A marker used to indicate where the vertical line for plotting should be. 426 | datetime_format: str, optional 427 | A string that represents the timestamp format. Useful if Pandas 428 | can't understand the provided format. 429 | freq: str, optional 430 | Default 'D' for days, 'W' for weeks, 'M' for months... etc. 431 | Full list here: 432 | http://pandas.pydata.org/pandas-docs/stable/timeseries.html#dateoffset-objects 433 | set_index_date: bool, optional 434 | When True set date as Pandas DataFrame index, default False - number of time units 435 | title: str, optional 436 | Figure title 437 | xlabel: str, optional 438 | Figure xlabel 439 | ylabel: str, optional 440 | Figure ylabel 441 | ax: matplotlib.AxesSubplot, optional 442 | Using user axes 443 | kwargs 444 | Passed into the pandas.DataFrame.plot command. 
445 | 446 | Returns 447 | ------- 448 | axes: matplotlib.AxesSubplot 449 | 450 | """ 451 | from matplotlib import pyplot as plt 452 | 453 | if ax is None: 454 | ax = plt.subplot(111) 455 | 456 | df_cum_transactions = expected_cumulative_transactions( 457 | model, 458 | transactions, 459 | datetime_col, 460 | customer_id_col, 461 | t, 462 | datetime_format=datetime_format, 463 | freq=freq, 464 | set_index_date=set_index_date, 465 | ) 466 | 467 | ax = df_cum_transactions.plot(ax=ax, title=title, **kwargs) 468 | 469 | if set_index_date: 470 | x_vline = df_cum_transactions.index[int(t_cal)] 471 | xlabel = "date" 472 | else: 473 | x_vline = t_cal 474 | ax.axvline(x=x_vline, color="r", linestyle="--") 475 | ax.set_xlabel(xlabel) 476 | ax.set_ylabel(ylabel) 477 | return ax 478 | 479 | 480 | def plot_incremental_transactions( 481 | model, 482 | transactions, 483 | datetime_col, 484 | customer_id_col, 485 | t, 486 | t_cal, 487 | datetime_format=None, 488 | freq="D", 489 | set_index_date=False, 490 | title="Tracking Daily Transactions", 491 | xlabel="day", 492 | ylabel="Transactions", 493 | ax=None, 494 | **kwargs 495 | ): 496 | """ 497 | Plot a figure of the predicted and actual incremental transactions of users. 498 | 499 | Parameters 500 | ---------- 501 | model: lifetimes model 502 | A fitted lifetimes model 503 | transactions: pandas DataFrame 504 | DataFrame containing the transactions history of the customer_id 505 | datetime_col: str 506 | The column in transactions that denotes the datetime the purchase was made. 507 | customer_id_col: str 508 | The column in transactions that denotes the customer_id 509 | t: float 510 | The number of time units since the begining of 511 | data for which we want to calculate cumulative transactions 512 | t_cal: float 513 | A marker used to indicate where the vertical line for plotting should be. 514 | datetime_format: str, optional 515 | A string that represents the timestamp format. Useful if Pandas 516 | can't understand the provided format. 517 | freq: str, optional 518 | Default 'D' for days, 'W' for weeks, 'M' for months... etc. 519 | Full list here: 520 | http://pandas.pydata.org/pandas-docs/stable/timeseries.html#dateoffset-objects 521 | set_index_date: bool, optional 522 | When True set date as Pandas DataFrame index, default False - number of time units 523 | title: str, optional 524 | Figure title 525 | xlabel: str, optional 526 | Figure xlabel 527 | ylabel: str, optional 528 | Figure ylabel 529 | ax: matplotlib.AxesSubplot, optional 530 | Using user axes 531 | kwargs 532 | Passed into the pandas.DataFrame.plot command. 
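# Illustrative usage sketch (not part of this module): ``plot_cumulative_transactions``
# overlays actual and model-predicted cumulative transactions, with a vertical line at
# ``t_cal`` marking the calibration boundary. The CDNOW sample file, its column names
# and the daily horizons below mirror the setup used in the test suite and are
# otherwise arbitrary.
from matplotlib import pyplot as plt
from lifetimes import BetaGeoFitter
from lifetimes.datasets import load_dataset
from lifetimes.plotting import plot_cumulative_transactions
from lifetimes.utils import summary_data_from_transaction_data

transactions = load_dataset("CDNOW_sample.txt", header=None, sep=r"\s+")
transactions.columns = ["id_total", "id_sample", "date", "num_cd_purc", "total_value"]

summary = summary_data_from_transaction_data(
    transactions, "id_sample", "date",
    datetime_format="%Y%m%d", observation_period_end="19970930", freq="D",
)
bgf = BetaGeoFitter().fit(summary["frequency"], summary["recency"], summary["T"])

plot_cumulative_transactions(
    bgf, transactions, "date", "id_sample",
    t=25 * 7, t_cal=13 * 7, datetime_format="%Y%m%d", freq="D",
)
plt.show()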
533 | 534 | Returns 535 | ------- 536 | axes: matplotlib.AxesSubplot 537 | 538 | """ 539 | from matplotlib import pyplot as plt 540 | 541 | if ax is None: 542 | ax = plt.subplot(111) 543 | 544 | df_cum_transactions = expected_cumulative_transactions( 545 | model, 546 | transactions, 547 | datetime_col, 548 | customer_id_col, 549 | t, 550 | datetime_format=datetime_format, 551 | freq=freq, 552 | set_index_date=set_index_date, 553 | ) 554 | 555 | # get incremental from cumulative transactions 556 | df_cum_transactions = df_cum_transactions.apply(lambda x: x - x.shift(1)) 557 | ax = df_cum_transactions.plot(ax=ax, title=title, **kwargs) 558 | 559 | if set_index_date: 560 | x_vline = df_cum_transactions.index[int(t_cal)] 561 | xlabel = "date" 562 | else: 563 | x_vline = t_cal 564 | ax.axvline(x=x_vline, color="r", linestyle="--") 565 | ax.set_xlabel(xlabel) 566 | ax.set_ylabel(ylabel) 567 | return ax 568 | 569 | 570 | def plot_transaction_rate_heterogeneity( 571 | model, 572 | suptitle="Heterogeneity in Transaction Rate", 573 | xlabel="Transaction Rate", 574 | ylabel="Density", 575 | suptitle_fontsize=14, 576 | **kwargs 577 | ): 578 | """ 579 | Plot the estimated gamma distribution of lambda (customers' propensities to purchase). 580 | 581 | Parameters 582 | ---------- 583 | model: lifetimes model 584 | A fitted lifetimes model, for now only for BG/NBD 585 | suptitle: str, optional 586 | Figure suptitle 587 | xlabel: str, optional 588 | Figure xlabel 589 | ylabel: str, optional 590 | Figure ylabel 591 | kwargs 592 | Passed into the matplotlib.pyplot.plot command. 593 | 594 | Returns 595 | ------- 596 | axes: matplotlib.AxesSubplot 597 | 598 | """ 599 | from matplotlib import pyplot as plt 600 | 601 | r, alpha = model._unload_params("r", "alpha") 602 | rate_mean = r / alpha 603 | rate_var = r / alpha ** 2 604 | 605 | rv = stats.gamma(r, scale=1 / alpha) 606 | lim = rv.ppf(0.99) 607 | x = np.linspace(0, lim, 100) 608 | 609 | fig, ax = plt.subplots(1) 610 | fig.suptitle("Heterogeneity in Transaction Rate", fontsize=suptitle_fontsize, fontweight="bold") 611 | 612 | ax.set_title("mean: {:.3f}, var: {:.3f}".format(rate_mean, rate_var)) 613 | ax.set_xlabel(xlabel) 614 | ax.set_ylabel(ylabel) 615 | 616 | fig.tight_layout(rect=[0, 0.03, 1, 0.95]) 617 | plt.plot(x, rv.pdf(x), **kwargs) 618 | return ax 619 | 620 | 621 | def plot_dropout_rate_heterogeneity( 622 | model, 623 | suptitle="Heterogeneity in Dropout Probability", 624 | xlabel="Dropout Probability p", 625 | ylabel="Density", 626 | suptitle_fontsize=14, 627 | **kwargs 628 | ): 629 | """ 630 | Plot the estimated beta distribution of p. 631 | 632 | p - (customers' probability of dropping out immediately after a transaction). 633 | 634 | Parameters 635 | ---------- 636 | model: lifetimes model 637 | A fitted lifetimes model, for now only for BG/NBD 638 | suptitle: str, optional 639 | Figure suptitle 640 | xlabel: str, optional 641 | Figure xlabel 642 | ylabel: str, optional 643 | Figure ylabel 644 | kwargs 645 | Passed into the matplotlib.pyplot.plot command. 
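# Illustrative usage sketch (not part of this module): the two heterogeneity plots
# visualise the fitted mixing distributions of the BG/NBD model, a Gamma(shape=r,
# scale=1/alpha) over purchase rates lambda and a Beta(a, b) over dropout
# probabilities p. The CDNOW summary loader is assumed.
from matplotlib import pyplot as plt
from lifetimes import BetaGeoFitter
from lifetimes.datasets import load_cdnow_summary
from lifetimes.plotting import plot_transaction_rate_heterogeneity, plot_dropout_rate_heterogeneity

cdnow = load_cdnow_summary(index_col=[0])
bgf = BetaGeoFitter().fit(cdnow["frequency"], cdnow["recency"], cdnow["T"])

plot_transaction_rate_heterogeneity(bgf)
plot_dropout_rate_heterogeneity(bgf)
plt.show()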
646 | 647 | Returns 648 | ------- 649 | axes: matplotlib.AxesSubplot 650 | 651 | """ 652 | from matplotlib import pyplot as plt 653 | 654 | a, b = model._unload_params("a", "b") 655 | beta_mean = a / (a + b) 656 | beta_var = a * b / ((a + b) ** 2) / (a + b + 1) 657 | 658 | rv = stats.beta(a, b) 659 | lim = rv.ppf(0.99) 660 | x = np.linspace(0, lim, 100) 661 | 662 | fig, ax = plt.subplots(1) 663 | fig.suptitle(suptitle, fontsize=suptitle_fontsize, fontweight="bold") 664 | 665 | ax.set_title("mean: {:.3f}, var: {:.3f}".format(beta_mean, beta_var)) 666 | ax.set_xlabel(xlabel) 667 | ax.set_ylabel(ylabel) 668 | 669 | fig.tight_layout(rect=[0, 0.03, 1, 0.95]) 670 | plt.plot(x, rv.pdf(x), **kwargs) 671 | return ax 672 | 673 | 674 | def forceAspect(ax, aspect=1): 675 | im = ax.get_images() 676 | extent = im[0].get_extent() 677 | ax.set_aspect(abs((extent[1] - extent[0]) / (extent[3] - extent[2])) / aspect) 678 | -------------------------------------------------------------------------------- /lifetimes/version.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | __version__ = "0.11.3" 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>1.10.0 2 | scipy>=1.0.0 3 | pandas>=0.24.0 4 | dill>=0.2.6 5 | # autograd>=1.2.0 6 | # TODO, should be updated when new pip release for the autograd would be available 7 | -e git://github.com/HIPS/autograd.git@4ec5aede380c5accf20afd4f4a96d59f1a9cd1df#egg=autograd -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | license_file=LICENSE.txt 3 | 4 | [pep8] 5 | ignore=E501,E241 6 | 7 | [flake8] 8 | # F401: imported but unused 9 | # E501: line too long 10 | # N802: function name should be lowercase 11 | # N803: argument name should be lowercase 12 | # N806: variable in function should be lowercase 13 | ignore=F401,E501,N802,N803,N806 14 | 15 | [pycodestyle] 16 | # E741: ambiguous variable name 17 | # E501: line too long 18 | ignore=E741,E501 19 | 20 | [pydocstyle] 21 | ## necessary 22 | # D212: Multi-line docstring summary should start at the first line 23 | # D203: 1 blank line required before class docstring 24 | 25 | ## should be removed in future 26 | # D406: Section name should end with a newline 27 | # D407: Missing dashed underline after section 28 | # D100: Missing docstring in public module 29 | # D104: Missing docstring in public package 30 | ignore=D406,D212,D407,D203,D100,D104 31 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import io 4 | import os 5 | from setuptools import setup 6 | 7 | 8 | exec(compile(open("lifetimes/version.py").read(), "lifetimes/version.py", "exec")) 9 | 10 | 11 | readme_path = os.path.join(os.path.dirname(__file__), "README.md") 12 | 13 | 14 | long_description = io.open(readme_path, encoding="utf8").read() 15 | 16 | 17 | setup( 18 | name="Lifetimes", 19 | version=__version__, 20 | description="Measure customer lifetime value in Python", 21 | author="Cam Davidson-Pilon", 22 | author_email="cam.davidson.pilon@gmail.com", 23 | packages=["lifetimes", 
"lifetimes.datasets"], 24 | license="MIT", 25 | keywords="customer lifetime value, clv, ltv, BG/NBD, pareto/NBD, frequency, recency", 26 | url="https://github.com/CamDavidsonPilon/lifetimes", 27 | long_description=long_description, 28 | long_description_content_type="text/markdown", 29 | classifiers=[ 30 | "Development Status :: 4 - Beta", 31 | "License :: OSI Approved :: MIT License", 32 | "Programming Language :: Python", 33 | "Programming Language :: Python :: 2.7", 34 | "Programming Language :: Python :: 3.5", 35 | "Topic :: Scientific/Engineering", 36 | ], 37 | install_requires=["numpy>=1.10.0", "scipy>=1.0.0", "pandas>=0.24.0", "autograd>=1.2.0", "dill>=0.2.6"], 38 | package_data={ 39 | "lifetimes": ["datasets/*", "../README.md", "../README.txt", "../LICENSE", "../MANIFEST.in", "fitters/*"] 40 | }, 41 | ) 42 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CamDavidsonPilon/lifetimes/4f2833f4518621343bb6983eb3e540c11f66ec6a/tests/__init__.py -------------------------------------------------------------------------------- /tests/__main__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pytest 3 | 4 | 5 | if __name__ == "__main__": 6 | # Exit with correct code 7 | sys.exit(pytest.main(["--pyargs", "tests"] + sys.argv[1:])) 8 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import numpy as np 3 | 4 | 5 | def pytest_runtest_setup(item): 6 | seed = np.random.randint(1000) 7 | print("Seed used in np.random.seed(): %d" % seed) 8 | np.random.seed(seed) 9 | -------------------------------------------------------------------------------- /tests/test_generate_data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from collections import OrderedDict 3 | import pytest 4 | import pandas as pd 5 | import numpy as np 6 | import numpy.testing as npt 7 | 8 | import scipy.stats as stats 9 | 10 | from lifetimes import BetaGeoBetaBinomFitter 11 | from lifetimes.generate_data import ( 12 | beta_geometric_nbd_model, 13 | pareto_nbd_model, 14 | modified_beta_geometric_nbd_model, 15 | beta_geometric_beta_binom_model, 16 | beta_geometric_nbd_model_transactional_data, 17 | ) 18 | from lifetimes.utils import summary_data_from_transaction_data 19 | 20 | 21 | def setup_module(module): 22 | np.random.seed(188898) 23 | 24 | 25 | class TestBetaGeoGeneration: 26 | params = [0.243, 4.414, 0.793, 2.426] 27 | 28 | 29 | class TestParetoNBDGeneration: 30 | params = [0.553, 10.578, 0.606, 11.669] 31 | 32 | 33 | class TestModifiedBetaGeoNBDGeneration: 34 | params = [0.525, 6.183, 0.891, 1.614] 35 | times = np.array([0.1429, 1.0, 3.00, 31.8571, 32.00, 78.00]) 36 | expected = np.array([0.0078, 0.0532, 0.1506, 1.0405, 1.0437, 1.8576]) 37 | 38 | 39 | class TestBetaGeoBetaBinomGeneration: 40 | @pytest.fixture() 41 | def bbgb_params(self): 42 | return OrderedDict([("alpha", 1.204), ("beta", 0.750), ("gamma", 0.657), ("delta", 2.783)]) 43 | 44 | def test_positivity(self, bbgb_params): 45 | sim_data = beta_geometric_beta_binom_model(N=6, size=5000, **bbgb_params) 46 | assert (sim_data["frequency"] >= 0).all() 47 | assert (sim_data["recency"] >= 0).all() 48 | 49 | 
def test_hitting_max(self, bbgb_params): 50 | sim_data = beta_geometric_beta_binom_model(N=6, size=5000, **bbgb_params) 51 | assert sim_data["frequency"].max() == 6 52 | assert sim_data["recency"].max() == 6 53 | 54 | def test_alive_probs(self, bbgb_params): 55 | sim_data = beta_geometric_beta_binom_model(N=6, size=50000, **bbgb_params) 56 | assert ( 57 | np.abs(sim_data.loc[(sim_data["frequency"] == 0) & (sim_data["recency"] == 0), "alive"].mean() - 0.11) 58 | < 0.01 59 | ) 60 | assert ( 61 | np.abs(sim_data.loc[(sim_data["frequency"] == 2) & (sim_data["recency"] == 4), "alive"].mean() - 0.59) 62 | < 0.01 63 | ) 64 | assert ( 65 | np.abs(sim_data.loc[(sim_data["frequency"] == 6) & (sim_data["recency"] == 6), "alive"].mean() - 0.93) 66 | < 0.01 67 | ) 68 | 69 | def test_params_same_from_sim_data(self, bbgb_params): 70 | sim_data = beta_geometric_beta_binom_model(N=6, size=100000, **bbgb_params) 71 | bbtf = BetaGeoBetaBinomFitter() 72 | grouped_data = sim_data.groupby(["frequency", "recency", "n_periods"])["customer_id"].count() 73 | grouped_data = grouped_data.reset_index().rename(columns={"customer_id": "weights"}) 74 | bbtf.fit(grouped_data["frequency"], grouped_data["recency"], grouped_data["n_periods"], grouped_data["weights"]) 75 | 76 | npt.assert_allclose( 77 | np.asarray(list(bbgb_params.values())).astype(float), 78 | np.asarray(bbtf._unload_params("alpha", "beta", "gamma", "delta")).astype(float), 79 | atol=0.1, 80 | rtol=1e-2, 81 | ) 82 | 83 | 84 | @pytest.mark.parametrize( 85 | "T,r,alpha,a,b,observation_period_end,freq,size", 86 | [ 87 | (100, 0.24, 4.41, 0.79, 2.43, "2019-1-1", "D", 500), 88 | ([400, 200, 5, 103, 198, 401], 0.24, 4.41, 0.79, 2.43, "2019-1-1", "D", 6), 89 | (100, 0.24, 4.41, 0.79, 2.43, "2019-1-1", "h", 500), 90 | ], 91 | ) 92 | def test_beta_geometric_nbd_model_transactional_data(T, r, alpha, a, b, observation_period_end, freq, size): 93 | np.random.seed(188898) 94 | transaction_data = beta_geometric_nbd_model_transactional_data( 95 | T=T, r=r, alpha=alpha, a=a, b=b, observation_period_end=observation_period_end, freq=freq, size=size 96 | ) 97 | actual = summary_data_from_transaction_data( 98 | transactions=transaction_data, 99 | customer_id_col="customer_id", 100 | datetime_col="date", 101 | observation_period_end=observation_period_end, 102 | freq=freq, 103 | ) 104 | np.random.seed(188898) 105 | expected = beta_geometric_nbd_model(T=T, r=r, alpha=alpha, a=a, b=b, size=size)[["frequency", "recency", "T"]] 106 | expected["recency"] = expected["recency"].apply(np.ceil) 107 | expected = expected.reset_index(drop=True) 108 | actual = actual.reset_index(drop=True) 109 | assert expected.equals(actual) 110 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Test lifetimes utils.""" 3 | import pytest 4 | import pandas as pd 5 | import numpy as np 6 | from pandas.util.testing import assert_frame_equal 7 | from numpy.testing import assert_almost_equal, assert_allclose 8 | 9 | from lifetimes import utils, BetaGeoFitter, ParetoNBDFitter 10 | from lifetimes.datasets import load_dataset 11 | 12 | 13 | @pytest.fixture() 14 | def example_transaction_data(): 15 | return pd.read_csv("lifetimes/datasets/example_transactions.csv", parse_dates=["date"]) 16 | 17 | 18 | @pytest.fixture() 19 | def example_summary_data(example_transaction_data): 20 | return utils.summary_data_from_transaction_data( 21 | 
example_transaction_data, "id", "date", observation_period_end=max(example_transaction_data.date) 22 | ) 23 | 24 | 25 | @pytest.fixture() 26 | def fitted_bg(example_summary_data): 27 | bg = BetaGeoFitter() 28 | bg.fit( 29 | example_summary_data["frequency"], 30 | example_summary_data["recency"], 31 | example_summary_data["T"], 32 | iterative_fitting=2, 33 | tol=1e-6, 34 | ) 35 | return bg 36 | 37 | 38 | @pytest.fixture() 39 | def transaction_level_data(): 40 | d = [ 41 | [1, "2015-02-01"], 42 | [1, "2015-02-06"], 43 | [2, "2015-01-01"], 44 | [3, "2015-01-01"], 45 | [3, "2015-01-02"], 46 | [3, "2015-01-05"], 47 | ] 48 | return pd.DataFrame(d, columns=["id", "date"]) 49 | 50 | 51 | @pytest.fixture() 52 | def large_transaction_level_data(): 53 | d = [ 54 | [1, "2015-01-01"], 55 | [1, "2015-02-06"], 56 | [2, "2015-01-01"], 57 | [3, "2015-01-01"], 58 | [3, "2015-01-02"], 59 | [3, "2015-01-05"], 60 | [4, "2015-01-16"], 61 | [4, "2015-02-02"], 62 | [4, "2015-02-05"], 63 | [5, "2015-01-16"], 64 | [5, "2015-01-17"], 65 | [5, "2015-01-18"], 66 | [6, "2015-02-02"], 67 | ] 68 | return pd.DataFrame(d, columns=["id", "date"]) 69 | 70 | 71 | @pytest.fixture() 72 | def large_transaction_level_data_with_monetary_value(): 73 | d = [ 74 | [1, "2015-01-01", 1], 75 | [1, "2015-02-06", 2], 76 | [2, "2015-01-01", 2], 77 | [3, "2015-01-01", 3], 78 | [3, "2015-01-02", 1], 79 | [3, "2015-01-05", 5], 80 | [4, "2015-01-16", 6], 81 | [4, "2015-02-02", 3], 82 | [4, "2015-02-05", 3], 83 | [5, "2015-01-16", 3], 84 | [5, "2015-01-17", 1], 85 | [5, "2015-01-18", 8], 86 | [6, "2015-02-02", 5], 87 | ] 88 | return pd.DataFrame(d, columns=["id", "date", "monetary_value"]) 89 | 90 | 91 | @pytest.fixture() 92 | def cdnow_transactions(): 93 | transactions = load_dataset("CDNOW_sample.txt", header=None, sep=r"\s+") 94 | transactions.columns = ["id_total", "id_sample", "date", "num_cd_purc", "total_value"] 95 | return transactions[["id_sample", "date"]] 96 | 97 | 98 | @pytest.fixture() 99 | def df_cum_transactions(cdnow_transactions): 100 | datetime_col = "date" 101 | customer_id_col = "id_sample" 102 | t = 25 * 7 103 | datetime_format = "%Y%m%d" 104 | freq = "D" 105 | observation_period_end = "19970930" 106 | freq_multiplier = 7 107 | 108 | transactions_summary = utils.summary_data_from_transaction_data( 109 | cdnow_transactions, 110 | customer_id_col, 111 | datetime_col, 112 | datetime_format=datetime_format, 113 | freq=freq, 114 | freq_multiplier=freq_multiplier, 115 | observation_period_end=observation_period_end, 116 | ) 117 | 118 | transactions_summary = transactions_summary.reset_index() 119 | 120 | model = ParetoNBDFitter() 121 | model.fit(transactions_summary["frequency"], transactions_summary["recency"], transactions_summary["T"]) 122 | 123 | df_cum = utils.expected_cumulative_transactions( 124 | model, 125 | cdnow_transactions, 126 | datetime_col, 127 | customer_id_col, 128 | t, 129 | datetime_format, 130 | freq, 131 | set_index_date=False, 132 | freq_multiplier=freq_multiplier, 133 | ) 134 | return df_cum 135 | 136 | 137 | def test_find_first_transactions_returns_correct_results(large_transaction_level_data): 138 | today = "2015-02-07" 139 | actual = utils._find_first_transactions(large_transaction_level_data, "id", "date", observation_period_end=today) 140 | expected = pd.DataFrame( 141 | [ 142 | [1, pd.Period("2015-01-01", "D"), True], 143 | [1, pd.Period("2015-02-06", "D"), False], 144 | [2, pd.Period("2015-01-01", "D"), True], 145 | [3, pd.Period("2015-01-01", "D"), True], 146 | [3, pd.Period("2015-01-02", "D"), 
False], 147 | [3, pd.Period("2015-01-05", "D"), False], 148 | [4, pd.Period("2015-01-16", "D"), True], 149 | [4, pd.Period("2015-02-02", "D"), False], 150 | [4, pd.Period("2015-02-05", "D"), False], 151 | [5, pd.Period("2015-01-16", "D"), True], 152 | [5, pd.Period("2015-01-17", "D"), False], 153 | [5, pd.Period("2015-01-18", "D"), False], 154 | [6, pd.Period("2015-02-02", "D"), True], 155 | ], 156 | columns=["id", "date", "first"], 157 | ) 158 | assert_frame_equal(actual, expected) 159 | 160 | 161 | def test_find_first_transactions_with_specific_non_daily_frequency(large_transaction_level_data): 162 | today = "2015-02-07" 163 | actual = utils._find_first_transactions( 164 | large_transaction_level_data, "id", "date", observation_period_end=today, freq="W" 165 | ) 166 | expected = pd.DataFrame( 167 | [ 168 | [1, pd.Period("2014-12-29/2015-01-04", "W-SUN"), True], 169 | [1, pd.Period("2015-02-02/2015-02-08", "W-SUN"), False], 170 | [2, pd.Period("2014-12-29/2015-01-04", "W-SUN"), True], 171 | [3, pd.Period("2014-12-29/2015-01-04", "W-SUN"), True], 172 | [3, pd.Period("2015-01-05/2015-01-11", "W-SUN"), False], 173 | [4, pd.Period("2015-01-12/2015-01-18", "W-SUN"), True], 174 | [4, pd.Period("2015-02-02/2015-02-08", "W-SUN"), False], 175 | [5, pd.Period("2015-01-12/2015-01-18", "W-SUN"), True], 176 | [6, pd.Period("2015-02-02/2015-02-08", "W-SUN"), True], 177 | ], 178 | columns=["id", "date", "first"], 179 | index=actual.index, 180 | ) # we shouldn't really care about row ordering or indexing, but assert_frame_equals is strict about it 181 | assert_frame_equal(actual, expected) 182 | 183 | 184 | def test_find_first_transactions_with_monetary_values(large_transaction_level_data_with_monetary_value): 185 | today = "2015-02-07" 186 | actual = utils._find_first_transactions( 187 | large_transaction_level_data_with_monetary_value, "id", "date", "monetary_value", observation_period_end=today 188 | ) 189 | expected = pd.DataFrame( 190 | [ 191 | [1, pd.Period("2015-01-01", "D"), 1, True], 192 | [1, pd.Period("2015-02-06", "D"), 2, False], 193 | [2, pd.Period("2015-01-01", "D"), 2, True], 194 | [3, pd.Period("2015-01-01", "D"), 3, True], 195 | [3, pd.Period("2015-01-02", "D"), 1, False], 196 | [3, pd.Period("2015-01-05", "D"), 5, False], 197 | [4, pd.Period("2015-01-16", "D"), 6, True], 198 | [4, pd.Period("2015-02-02", "D"), 3, False], 199 | [4, pd.Period("2015-02-05", "D"), 3, False], 200 | [5, pd.Period("2015-01-16", "D"), 3, True], 201 | [5, pd.Period("2015-01-17", "D"), 1, False], 202 | [5, pd.Period("2015-01-18", "D"), 8, False], 203 | [6, pd.Period("2015-02-02", "D"), 5, True], 204 | ], 205 | columns=["id", "date", "monetary_value", "first"], 206 | ) 207 | assert_frame_equal(actual, expected) 208 | 209 | 210 | def test_find_first_transactions_with_monetary_values_with_specific_non_daily_frequency( 211 | large_transaction_level_data_with_monetary_value 212 | ): 213 | today = "2015-02-07" 214 | actual = utils._find_first_transactions( 215 | large_transaction_level_data_with_monetary_value, 216 | "id", 217 | "date", 218 | "monetary_value", 219 | observation_period_end=today, 220 | freq="W", 221 | ) 222 | expected = pd.DataFrame( 223 | [ 224 | [1, pd.Period("2014-12-29/2015-01-04", "W-SUN"), 1, True], 225 | [1, pd.Period("2015-02-02/2015-02-08", "W-SUN"), 2, False], 226 | [2, pd.Period("2014-12-29/2015-01-04", "W-SUN"), 2, True], 227 | [3, pd.Period("2014-12-29/2015-01-04", "W-SUN"), 4, True], 228 | [3, pd.Period("2015-01-05/2015-01-11", "W-SUN"), 5, False], 229 | [4, 
pd.Period("2015-01-12/2015-01-18", "W-SUN"), 6, True], 230 | [4, pd.Period("2015-02-02/2015-02-08", "W-SUN"), 6, False], 231 | [5, pd.Period("2015-01-12/2015-01-18", "W-SUN"), 12, True], 232 | [6, pd.Period("2015-02-02/2015-02-08", "W-SUN"), 5, True], 233 | ], 234 | columns=["id", "date", "monetary_value", "first"], 235 | ) 236 | assert_frame_equal(actual, expected) 237 | 238 | 239 | def test_summary_data_from_transaction_data_returns_correct_results(transaction_level_data): 240 | today = "2015-02-07" 241 | actual = utils.summary_data_from_transaction_data( 242 | transaction_level_data, "id", "date", observation_period_end=today 243 | ) 244 | expected = pd.DataFrame( 245 | [[1, 1.0, 5.0, 6.0], [2, 0.0, 0.0, 37.0], [3, 2.0, 4.0, 37.0]], columns=["id", "frequency", "recency", "T"] 246 | ).set_index("id") 247 | assert_frame_equal(actual, expected) 248 | 249 | 250 | def test_summary_data_from_transaction_data_works_with_string_customer_ids(transaction_level_data): 251 | d = [ 252 | ["X", "2015-02-01"], 253 | ["X", "2015-02-06"], 254 | ["Y", "2015-01-01"], 255 | ["Y", "2015-01-01"], 256 | ["Y", "2015-01-02"], 257 | ["Y", "2015-01-05"], 258 | ] 259 | df = pd.DataFrame(d, columns=["id", "date"]) 260 | utils.summary_data_from_transaction_data(df, "id", "date") 261 | 262 | 263 | def test_summary_data_from_transaction_data_works_with_int_customer_ids_and_doesnt_coerce_to_float( 264 | transaction_level_data 265 | ): 266 | d = [ 267 | [1, "2015-02-01"], 268 | [1, "2015-02-06"], 269 | [1, "2015-01-01"], 270 | [2, "2015-01-01"], 271 | [2, "2015-01-02"], 272 | [2, "2015-01-05"], 273 | ] 274 | df = pd.DataFrame(d, columns=["id", "date"]) 275 | actual = utils.summary_data_from_transaction_data(df, "id", "date") 276 | assert actual.index.dtype == "int64" 277 | 278 | 279 | def test_summary_data_from_transaction_data_with_specific_datetime_format(transaction_level_data): 280 | transaction_level_data["date"] = transaction_level_data["date"].map(lambda x: x.replace("-", "")) 281 | format = "%Y%m%d" 282 | today = "20150207" 283 | actual = utils.summary_data_from_transaction_data( 284 | transaction_level_data, "id", "date", observation_period_end=today, datetime_format=format 285 | ) 286 | expected = pd.DataFrame( 287 | [[1, 1.0, 5.0, 6.0], [2, 0.0, 0.0, 37.0], [3, 2.0, 4.0, 37.0]], columns=["id", "frequency", "recency", "T"] 288 | ).set_index("id") 289 | assert_frame_equal(actual, expected) 290 | 291 | 292 | def test_summary_date_from_transaction_data_with_specific_non_daily_frequency(large_transaction_level_data): 293 | today = "20150207" 294 | actual = utils.summary_data_from_transaction_data( 295 | large_transaction_level_data, "id", "date", observation_period_end=today, freq="W" 296 | ) 297 | expected = pd.DataFrame( 298 | [ 299 | [1, 1.0, 5.0, 5.0], 300 | [2, 0.0, 0.0, 5.0], 301 | [3, 1.0, 1.0, 5.0], 302 | [4, 1.0, 3.0, 3.0], 303 | [5, 0.0, 0.0, 3.0], 304 | [6, 0.0, 0.0, 0.0], 305 | ], 306 | columns=["id", "frequency", "recency", "T"], 307 | ).set_index("id") 308 | assert_frame_equal(actual, expected) 309 | 310 | 311 | def test_summary_date_from_transaction_with_monetary_values(large_transaction_level_data_with_monetary_value): 312 | today = "20150207" 313 | actual = utils.summary_data_from_transaction_data( 314 | large_transaction_level_data_with_monetary_value, 315 | "id", 316 | "date", 317 | monetary_value_col="monetary_value", 318 | observation_period_end=today, 319 | ) 320 | expected = pd.DataFrame( 321 | [ 322 | [1, 1.0, 36.0, 37.0, 2], 323 | [2, 0.0, 0.0, 37.0, 0], 324 | [3, 2.0, 4.0, 37.0, 3], 325 | 
[4, 2.0, 20.0, 22.0, 3],
326 |             [5, 2.0, 2.0, 22.0, 4.5],
327 |             [6, 0.0, 0.0, 5.0, 0],
328 |         ],
329 |         columns=["id", "frequency", "recency", "T", "monetary_value"],
330 |     ).set_index("id")
331 |     assert_frame_equal(actual, expected)
332 | 
333 | 
334 | def test_summary_data_from_transaction_data_will_choose_the_correct_first_order_to_drop_in_monetary_transactions():
335 |     # this is the correct behaviour. See https://github.com/CamDavidsonPilon/lifetimes/issues/85
336 |     # and test_summary_statistics_are_identical_to_hardies_paper_confirming_correct_aggregations
337 |     cust = pd.Series([2, 2, 2])
338 |     dates_ordered = pd.to_datetime(pd.Series(["2014-03-14 00:00:00", "2014-04-09 00:00:00", "2014-05-21 00:00:00"]))
339 |     sales = pd.Series([10, 20, 25])
340 |     transaction_data = pd.DataFrame({"date": dates_ordered, "id": cust, "sales": sales})
341 |     summary_ordered_data = utils.summary_data_from_transaction_data(transaction_data, "id", "date", "sales")
342 | 
343 |     dates_unordered = pd.to_datetime(pd.Series(["2014-04-09 00:00:00", "2014-03-14 00:00:00", "2014-05-21 00:00:00"]))
344 |     sales = pd.Series([20, 10, 25])
345 |     transaction_data = pd.DataFrame({"date": dates_unordered, "id": cust, "sales": sales})
346 |     summary_unordered_data = utils.summary_data_from_transaction_data(transaction_data, "id", "date", "sales")
347 | 
348 |     assert_frame_equal(summary_ordered_data, summary_unordered_data)
349 |     assert summary_ordered_data["monetary_value"].loc[2] == 22.5
350 | 
351 | 
352 | def test_summary_statistics_are_identical_to_hardies_paper_confirming_correct_aggregations():
353 |     # see http://brucehardie.com/papers/rfm_clv_2005-02-16.pdf
354 |     # RFM and CLV: Using Iso-value Curves for Customer Base Analysis
355 |     df = pd.read_csv(
356 |         "lifetimes/datasets/CDNOW_sample.txt",
357 |         sep=r"\s+",
358 |         header=None,
359 |         names=["_id", "id", "date", "cds_bought", "spent"],
360 |     )
361 |     df["date"] = pd.to_datetime(df["date"], format="%Y%m%d")
362 |     df_train = df[df["date"] < "1997-10-01"]
363 |     summary = utils.summary_data_from_transaction_data(df_train, "id", "date", "spent")
364 |     results = summary[summary["frequency"] > 0]["monetary_value"].describe()
365 | 
366 |     assert np.round(results.loc["mean"]) == 35
367 |     assert np.round(results.loc["std"]) == 30
368 |     assert np.round(results.loc["min"]) == 3
369 |     assert np.round(results.loc["50%"]) == 27
370 |     assert np.round(results.loc["max"]) == 300
371 |     assert np.round(results.loc["count"]) == 946
372 | 
373 | 
374 | def test_calibration_and_holdout_data(large_transaction_level_data):
375 |     today = "2015-02-07"
376 |     calibration_end = "2015-02-01"
377 |     actual = utils.calibration_and_holdout_data(
378 |         large_transaction_level_data, "id", "date", calibration_end, observation_period_end=today
379 |     )
380 |     assert actual.loc[1]["frequency_holdout"] == 1
381 |     assert actual.loc[2]["frequency_holdout"] == 0
382 | 
383 |     with pytest.raises(KeyError):
384 |         actual.loc[6]
385 | 
386 | 
387 | def test_calibration_and_holdout_data_throws_better_error_if_observation_period_end_is_too_early(
388 |     large_transaction_level_data
389 | ):
390 |     # the earliest transaction in the fixture is 2015-01-01, so an observation period ending in 2014 has no data
391 |     today = "2014-02-07"
392 |     calibration_end = "2014-02-01"
393 | 
394 |     with pytest.raises(ValueError, match="There is no data available"):
395 |         utils.calibration_and_holdout_data(
396 |             large_transaction_level_data, "id", "date", calibration_end, observation_period_end=today
397 |         )
398 | 
399 | 
400 | def test_calibration_and_holdout_data_is_okay_with_other_indexes(large_transaction_level_data):
401 |     n =
large_transaction_level_data.shape[0] 402 | large_transaction_level_data.index = np.random.randint(0, n, size=n) 403 | today = "2015-02-07" 404 | calibration_end = "2015-02-01" 405 | actual = utils.calibration_and_holdout_data( 406 | large_transaction_level_data, "id", "date", calibration_end, observation_period_end=today 407 | ) 408 | assert actual.loc[1]["frequency_holdout"] == 1 409 | assert actual.loc[2]["frequency_holdout"] == 0 410 | 411 | 412 | def test_calibration_and_holdout_data_works_with_specific_frequency(large_transaction_level_data): 413 | today = "2015-02-07" 414 | calibration_end = "2015-02-01" 415 | actual = utils.calibration_and_holdout_data( 416 | large_transaction_level_data, "id", "date", calibration_end, observation_period_end=today, freq="W" 417 | ) 418 | expected_cols = ["id", "frequency_cal", "recency_cal", "T_cal", "frequency_holdout", "duration_holdout"] 419 | expected = pd.DataFrame( 420 | [ 421 | [1, 0.0, 0.0, 4.0, 1, 1], 422 | [2, 0.0, 0.0, 4.0, 0, 1], 423 | [3, 1.0, 1.0, 4.0, 0, 1], 424 | [4, 0.0, 0.0, 2.0, 1, 1], 425 | [5, 0.0, 0.0, 2.0, 0, 1], 426 | ], 427 | columns=expected_cols, 428 | ).set_index("id") 429 | assert_frame_equal(actual, expected, check_dtype=False) 430 | 431 | 432 | def test_calibration_and_holdout_data_gives_correct_date_boundaries(): 433 | 434 | d = [ 435 | [1, "2015-01-01"], 436 | [1, "2015-02-06"], # excluded from both holdout and calibration 437 | [2, "2015-01-01"], 438 | [3, "2015-01-01"], 439 | [3, "2015-01-02"], 440 | [3, "2015-01-05"], 441 | [4, "2015-01-16"], 442 | [4, "2015-02-02"], 443 | [4, "2015-02-05"], # excluded from both holdout and calibration 444 | [5, "2015-01-16"], 445 | [5, "2015-01-17"], 446 | [5, "2015-01-18"], 447 | [6, "2015-02-02"], 448 | ] 449 | transactions = pd.DataFrame(d, columns=["id", "date"]) 450 | actual = utils.calibration_and_holdout_data( 451 | transactions, "id", "date", calibration_period_end="2015-02-01", observation_period_end="2015-02-04" 452 | ) 453 | assert actual["frequency_holdout"].loc[1] == 0 454 | assert actual["frequency_holdout"].loc[4] == 1 455 | 456 | 457 | def test_calibration_and_holdout_data_with_monetary_value(large_transaction_level_data_with_monetary_value): 458 | today = "2015-02-07" 459 | calibration_end = "2015-02-01" 460 | actual = utils.calibration_and_holdout_data( 461 | large_transaction_level_data_with_monetary_value, 462 | "id", 463 | "date", 464 | calibration_end, 465 | observation_period_end=today, 466 | monetary_value_col="monetary_value", 467 | ) 468 | assert (actual["monetary_value_cal"] == [0, 0, 3, 0, 4.5]).all() 469 | assert (actual["monetary_value_holdout"] == [2, 0, 0, 3, 0]).all() 470 | 471 | 472 | def test_summary_data_from_transaction_data_squashes_period_purchases_to_one_purchase(): 473 | transactions = pd.DataFrame([[1, "2015-01-01"], [1, "2015-01-01"]], columns=["id", "t"]) 474 | actual = utils.summary_data_from_transaction_data(transactions, "id", "t", freq="W") 475 | assert actual.loc[1]["frequency"] == 1.0 - 1.0 476 | 477 | 478 | def test_calculate_alive_path(example_transaction_data, example_summary_data, fitted_bg): 479 | user_data = example_transaction_data[example_transaction_data["id"] == 33] 480 | frequency, recency, T = example_summary_data.loc[33] 481 | alive_path = utils.calculate_alive_path(fitted_bg, user_data, "date", 205) 482 | assert alive_path[0] == 1 483 | assert alive_path[T] == fitted_bg.conditional_probability_alive(frequency, recency, T) 484 | 485 | 486 | def test_check_inputs(): 487 | frequency = np.array([0, 1, 2]) 488 | recency 
= np.array([0, 1, 10])
489 |     T = np.array([5, 6, 15])
490 |     monetary_value = np.array([2.3, 490, 33.33])
491 |     assert utils._check_inputs(frequency, recency, T, monetary_value) is None
492 | 
493 |     with pytest.raises(ValueError):
494 |         bad_recency = T + 1
495 |         utils._check_inputs(frequency, bad_recency, T)
496 | 
497 |     with pytest.raises(ValueError):
498 |         bad_recency = recency.copy()
499 |         bad_recency[0] = 1
500 |         utils._check_inputs(frequency, bad_recency, T)
501 | 
502 |     with pytest.raises(ValueError):
503 |         bad_freq = np.array([0, 0.5, 2])
504 |         utils._check_inputs(bad_freq, recency, T)
505 | 
506 |     with pytest.raises(ValueError):
507 |         bad_monetary_value = monetary_value.copy()
508 |         bad_monetary_value[0] = 0
509 |         utils._check_inputs(frequency, recency, T, bad_monetary_value)
510 | 
511 | 
512 | def test_summary_data_from_transaction_data_obeys_data_constraints(example_summary_data):
513 |     assert (
514 |         utils._check_inputs(
515 |             example_summary_data["frequency"], example_summary_data["recency"], example_summary_data["T"]
516 |         )
517 |         is None
518 |     )
519 | 
520 | 
521 | def test_scale_time():
522 |     max_T = 200.0
523 |     T = np.arange(max_T)
524 |     assert utils._scale_time(T) == 1.0 / (max_T - 1)
525 | 
526 | 
527 | def test_customer_lifetime_value_with_known_values(fitted_bg):
528 |     """
529 |     >>> print(fitted_bg)
530 | 
531 |     >>> t = fitted_bg.data.head()
532 |     >>> t
533 |        frequency  recency    T
534 |     0          0        0  298
535 |     1          0        0  224
536 |     2          6      142  292
537 |     3          0        0  147
538 |     4          2        9  183
539 |     >>> print(fitted_bg.predict(30, t['frequency'], t['recency'], t['T']))
540 |     0    0.016053
541 |     1    0.021171
542 |     2    0.030461
543 |     3    0.031686
544 |     4    0.001607
545 |     dtype: float64
546 |     """
547 |     t = fitted_bg.data.head()
548 |     expected = np.array([0.016053, 0.021171, 0.030461, 0.031686, 0.001607])
549 |     # discount_rate=0 means the clv will be the same as the predicted
550 |     clv_d0 = utils._customer_lifetime_value(
551 |         fitted_bg,
552 |         t["frequency"],
553 |         t["recency"],
554 |         t["T"],
555 |         monetary_value=pd.Series([1, 1, 1, 1, 1]),
556 |         time=1,
557 |         discount_rate=0.0,
558 |     )
559 |     assert_almost_equal(clv_d0.values, expected, decimal=5)
560 |     # discount_rate=1 means the clv will halve over a period
561 |     clv_d1 = utils._customer_lifetime_value(
562 |         fitted_bg,
563 |         t["frequency"],
564 |         t["recency"],
565 |         t["T"],
566 |         monetary_value=pd.Series([1, 1, 1, 1, 1]),
567 |         time=1,
568 |         discount_rate=1.0,
569 |     )
570 |     assert_almost_equal(clv_d1.values, expected / 2.0, decimal=5)
571 |     # time=2, discount_rate=0 means the clv will be twice the initial
572 |     clv_t2_d0 = utils._customer_lifetime_value(
573 |         fitted_bg,
574 |         t["frequency"],
575 |         t["recency"],
576 |         t["T"],
577 |         monetary_value=pd.Series([1, 1, 1, 1, 1]),
578 |         time=2,
579 |         discount_rate=0,
580 |     )
581 |     assert_allclose(clv_t2_d0.values, expected * 2.0, rtol=0.1)
582 |     # time=2, discount_rate=1 means the clv is discounted over two periods: expected / 2 + expected / 4
583 |     clv_t2_d1 = utils._customer_lifetime_value(
584 |         fitted_bg,
585 |         t["frequency"],
586 |         t["recency"],
587 |         t["T"],
588 |         monetary_value=pd.Series([1, 1, 1, 1, 1]),
589 |         time=2,
590 |         discount_rate=1.0,
591 |     )
592 |     assert_allclose(clv_t2_d1.values, expected / 2.0 + expected / 4.0, rtol=0.1)
593 | 
594 | 
595 | def test_expected_cumulative_transactions_dedups_inside_a_time_period(fitted_bg, example_transaction_data):
596 |     by_week = utils.expected_cumulative_transactions(fitted_bg, example_transaction_data, "date", "id", 10, freq="W")
597 |     by_day = utils.expected_cumulative_transactions(fitted_bg, example_transaction_data, "date", "id", 10, freq="D")
598 |     assert (by_week["actual"] >= by_day["actual"]).all()
599 | 
600 | 
601 | def test_expected_cumulative_transactions_equals_r_btyd_walkthrough(df_cum_transactions):
602 |     """
603 |     Validate expected cumulative transactions against the BTYD walkthrough.
604 | 
605 |     https://cran.r-project.org/web/packages/BTYD/vignettes/BTYD-walkthrough.pdf
606 | 
607 |     cum.tracking[,20:25]
608 |     #          [,1] [,2] [,3] [,4] [,5] [,6]
609 |     # actual   1359 1414 1484 1517 1573 1672
610 |     # expected 1309 1385 1460 1533 1604 1674
611 | 
612 |     """
613 |     actual_btyd = [1359, 1414, 1484, 1517, 1573, 1672]
614 |     expected_btyd = [1309, 1385, 1460, 1533, 1604, 1674]
615 | 
616 |     actual = df_cum_transactions["actual"].iloc[19:25].values
617 |     predicted = df_cum_transactions["predicted"].iloc[19:25].values.round()
618 | 
619 |     assert_allclose(actual, actual_btyd)
620 |     assert_allclose(predicted, expected_btyd)
621 | 
622 | 
623 | def test_incremental_transactions_equals_r_btyd_walkthrough(df_cum_transactions):
624 |     """
625 |     Validate incremental transactions against the BTYD walkthrough.
626 | 
627 |     https://cran.r-project.org/web/packages/BTYD/vignettes/BTYD-walkthrough.pdf
628 | 
629 |     inc.tracking[,20:25]
630 |     #           [,1]  [,2]  [,3]  [,4]  [,5]  [,6]
631 |     # actual   73.00 55.00 70.00 33.00 56.00 99.00
632 |     # expected 78.31 76.42 74.65 72.98 71.41 69.93
633 | 
634 |     """
635 |     # get incremental from cumulative transactions
636 |     df_inc_transactions = df_cum_transactions.apply(lambda x: x - x.shift(1))
637 | 
638 |     actual_btyd = [73.00, 55.00, 70.00, 33.00, 56.00, 99.00]
639 |     expected_btyd = [78.31, 76.42, 74.65, 72.98, 71.41, 69.93]
640 | 
641 |     actual = df_inc_transactions["actual"].iloc[19:25].values
642 |     predicted = df_inc_transactions["predicted"].iloc[19:25].values.round(2)
643 | 
644 |     assert_allclose(actual, actual_btyd)
645 |     assert_allclose(predicted, expected_btyd, atol=1e-2)
646 | 
647 | 
648 | def test_expected_cumulative_transactions_date_index(cdnow_transactions):
649 |     """
650 |     Test set_index_date=True output of expected_cumulative_transactions with a BG/NBD fitter.
651 | 
652 |     Use the first 14 days of CDNOW transactions to validate that the date index with
653 |     freq_multiplier = 1 works, comparing the last 4 records against the reference values below.
654 | 655 | dates = ['1997-01-11', '1997-01-12', '1997-01-13', '1997-01-14'] 656 | actual_trans = [11, 12, 15, 19] 657 | expected_trans = [10.67, 12.67, 14.87, 17.24] 658 | 659 | """ 660 | datetime_col = "date" 661 | customer_id_col = "id_sample" 662 | t = 14 663 | datetime_format = "%Y%m%d" 664 | freq = "D" 665 | observation_period_end = "19970930" 666 | freq_multiplier = 1 667 | 668 | transactions_summary = utils.summary_data_from_transaction_data( 669 | cdnow_transactions, 670 | customer_id_col, 671 | datetime_col, 672 | datetime_format=datetime_format, 673 | freq=freq, 674 | freq_multiplier=freq_multiplier, 675 | observation_period_end=observation_period_end, 676 | ) 677 | 678 | transactions_summary = transactions_summary.reset_index() 679 | 680 | model = BetaGeoFitter() 681 | model.fit(transactions_summary["frequency"], transactions_summary["recency"], transactions_summary["T"]) 682 | 683 | df_cum = utils.expected_cumulative_transactions( 684 | model, 685 | cdnow_transactions, 686 | datetime_col, 687 | customer_id_col, 688 | t, 689 | datetime_format, 690 | freq, 691 | set_index_date=True, 692 | freq_multiplier=freq_multiplier, 693 | ) 694 | 695 | dates = ["1997-01-11", "1997-01-12", "1997-01-13", "1997-01-14"] 696 | actual_trans = [11, 12, 15, 19] 697 | expected_trans = [10.67, 12.67, 14.87, 17.24] 698 | 699 | date_index = df_cum.iloc[-4:].index.to_timestamp().astype(str) 700 | actual = df_cum["actual"].iloc[-4:].values 701 | predicted = df_cum["predicted"].iloc[-4:].values.round(2) 702 | 703 | assert all(dates == date_index) 704 | assert_allclose(actual, actual_trans) 705 | assert_allclose(predicted, expected_trans, atol=1e-2) 706 | --------------------------------------------------------------------------------
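
Editor's note (illustrative usage sketch, not part of the repository): the tests in tests/test_utils.py double as documentation for the public workflow — build an RFM summary from a transaction log, fit a BG/NBD model, predict future purchases, and track cumulative transactions. The sketch below strings together only the calls, argument names, and column names exercised in the fixtures and tests above (load_dataset, summary_data_from_transaction_data, BetaGeoFitter.fit/predict, expected_cumulative_transactions); the derived column name "predicted_30d" and the final printouts are illustrative choices, not part of the library's API.

    from lifetimes import BetaGeoFitter
    from lifetimes.datasets import load_dataset
    from lifetimes.utils import summary_data_from_transaction_data, expected_cumulative_transactions

    # Load the bundled CDNOW sample transactions (same call and column names as the
    # cdnow_transactions fixture in tests/test_utils.py).
    transactions = load_dataset("CDNOW_sample.txt", header=None, sep=r"\s+")
    transactions.columns = ["id_total", "id_sample", "date", "num_cd_purc", "total_value"]

    # Collapse the transaction log into per-customer (frequency, recency, T), using the same
    # date format, frequency, and observation window as the df_cum_transactions fixture.
    summary = summary_data_from_transaction_data(
        transactions,
        "id_sample",
        "date",
        datetime_format="%Y%m%d",
        freq="D",
        observation_period_end="19970930",
    )

    # Fit the BG/NBD model and rank customers by expected purchases over the next 30 days,
    # mirroring the fitted_bg.predict(30, ...) call shown in the tests.
    bgf = BetaGeoFitter()
    bgf.fit(summary["frequency"], summary["recency"], summary["T"])
    summary["predicted_30d"] = bgf.predict(30, summary["frequency"], summary["recency"], summary["T"])
    print(summary.sort_values("predicted_30d", ascending=False).head())

    # Track actual vs. predicted cumulative transactions over 25 weeks, with the same
    # arguments the df_cum_transactions fixture passes (daily data aggregated by 7).
    df_cum = expected_cumulative_transactions(
        bgf,
        transactions,
        "date",
        "id_sample",
        25 * 7,
        "%Y%m%d",
        "D",
        set_index_date=False,
        freq_multiplier=7,
    )
    print(df_cum[["actual", "predicted"]].tail())

The monetary side is only exercised here through the private helper utils._customer_lifetime_value, so the sketch deliberately stays on the public fit/predict/tracking surface demonstrated by the fixtures.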