├── tests ├── __init__.py ├── mocks │ ├── __init__.py │ ├── random_generators.py │ └── operations.py ├── notebooks │ ├── __init__.py │ └── pos_stocks.py ├── scenarios │ ├── __init__.py │ └── long_cdr.py └── unit_tests │ ├── __init__.py │ ├── test_circus.py │ ├── test_clock.py │ ├── test_attribute.py │ ├── test_random_generators.py │ ├── test_util_functions.py │ ├── test_operations.py │ ├── test_activity.py │ └── test_populations.py ├── trumania ├── __init__.py ├── core │ ├── __init__.py │ ├── attribute.py │ ├── util_functions.py │ ├── circus.py │ ├── operations.py │ └── clock.py └── components │ ├── __init__.py │ ├── geographies │ ├── __init__.py │ ├── random_geo.py │ └── uganda.py │ ├── social_networks │ ├── __init__.py │ └── erdos_renyi.py │ ├── time_patterns │ ├── __init__.py │ └── profilers.py │ └── db.py ├── examples ├── tutorial │ ├── __init__.py │ └── example4.py ├── presentation │ ├── 01_empty_circus.py │ ├── 02_circus_with_actor.py │ ├── 03_circus_with_story.py │ ├── 05_circus_with_story.py │ ├── 04_circus_with_story.py │ ├── 06_circus_with_story.py │ ├── 08_circus_with_timed_story.py │ └── 07_circus_with_story_and_relationship.py └── datacamp-blogpost │ ├── 01-a-basic-user-population.py │ ├── 02-hello-world-statements.py │ ├── 03-someone-to-say-hello-world-to.py │ ├── 04-you-always-say-that.py │ ├── 05-it-aint-what-yo-do-it-s-the-time-that-you-do-it.py │ └── 06-the-social-network.py ├── .flake8 ├── docs ├── source │ ├── modules.rst │ ├── trumania.rst │ ├── trumania.components.rst │ ├── index.rst │ ├── trumania.components.time_patterns.rst │ ├── trumania.components.social_networks.rst │ ├── trumania.components.geographies.rst │ ├── trumania.core.rst │ └── conf.py ├── REAMDE.md └── Makefile ├── setup.py ├── .gitignore ├── Pipfile ├── README.md └── LICENSE /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/tests/mocks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /trumania/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/notebooks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/scenarios/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/unit_tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /trumania/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/tutorial/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /trumania/components/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /trumania/components/geographies/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /trumania/components/social_networks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/trumania/components/time_patterns/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E121,E126,E131 3 | max-line-length = 120 4 | -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | trumania 2 | ======== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | trumania 8 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | 3 | setup( 4 | name='trumania', 5 | version='1.0', 6 | py_modules=['trumania'] 7 | ) 8 | -------------------------------------------------------------------------------- /docs/source/trumania.rst: -------------------------------------------------------------------------------- 1 | trumania package 2 | ================ 3 | 4 | Module contents 5 | --------------- 6 | 7 | .. 
automodule:: trumania 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /trumania/components/_DB/ 2 | /trumania/components/geographies/source_data/ 3 | tests/tutorial/output/ 4 | 5 | # python 6 | *.pyc 7 | .ipynb_checkpoints 8 | *.egg-info 9 | .idea/ 10 | .cache 11 | explore.ipynb 12 | cdr_output_logs/ 13 | snd_output_logs/ 14 | .venv 15 | /output/ 16 | /venv/ 17 | 18 | # scala 19 | metastore_db 20 | target 21 | *.class 22 | derby.log 23 | run.log 24 | 25 | # mac 26 | .DS_Store 27 | -------------------------------------------------------------------------------- /docs/source/trumania.components.rst: -------------------------------------------------------------------------------- 1 | trumania\.components package 2 | ============================ 3 | 4 | Submodules 5 | ---------- 6 | 7 | trumania\.components\.db module 8 | ------------------------------- 9 | 10 | .. automodule:: trumania.components.db 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. 
automodule:: trumania.components 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.python.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [dev-packages] 7 | jupyter = "*" 8 | flake8 = "*" 9 | 10 | [packages] 11 | networkx = "*" 12 | pandas = "==1.4.0" 13 | numpy = "==1.22.0" 14 | scipy = "*" 15 | pytest = "*" 16 | pytest-metadata = "*" 17 | faker = "*" 18 | pymongo = "*" 19 | "path.py" = "*" 20 | bson = "*" 21 | tabulate = "*" 22 | "e1839a8" = {path = ".", editable = true} 23 | trumania = {editable = true, path = "."} 24 | 25 | [requires] 26 | python_version = "3.9.10" 27 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. trumania documentation master file, created by 2 | sphinx-quickstart on Mon Jan 15 12:02:36 2018. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to trumania's documentation! 7 | ==================================== 8 | 9 | WIKI 10 | ==== 11 | .. 
toctree:: 12 | :maxdepth: 2 13 | 14 | wiki.md 15 | 16 | 17 | Indices and tables 18 | ================== 19 | 20 | * :ref:`genindex` 21 | * :ref:`modindex` 22 | * :ref:`search` 23 | -------------------------------------------------------------------------------- /examples/presentation/01_empty_circus.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | 4 | from trumania.core import circus 5 | import trumania.core.util_functions as util_functions 6 | 7 | util_functions.setup_logging() 8 | 9 | logging.info("building circus") 10 | 11 | example1 = circus.Circus( 12 | name="example1", 13 | master_seed=12345, 14 | start=pd.Timestamp("1 Jan 2017 00:00"), 15 | step_duration=pd.Timedelta("1h")) 16 | 17 | example1.run( 18 | duration=pd.Timedelta("48h"), 19 | log_output_folder="output/example1", 20 | delete_existing_logs=True 21 | ) 22 | -------------------------------------------------------------------------------- /docs/source/trumania.components.time_patterns.rst: -------------------------------------------------------------------------------- 1 | trumania\.components\.time\_patterns package 2 | ============================================ 3 | 4 | Submodules 5 | ---------- 6 | 7 | trumania\.components\.time\_patterns\.profilers module 8 | ------------------------------------------------------ 9 | 10 | .. automodule:: trumania.components.time_patterns.profilers 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. 
automodule:: trumania.components.time_patterns 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /docs/source/trumania.components.social_networks.rst: -------------------------------------------------------------------------------- 1 | trumania\.components\.social\_networks package 2 | ============================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | trumania\.components\.social\_networks\.erdos\_renyi module 8 | ----------------------------------------------------------- 9 | 10 | .. automodule:: trumania.components.social_networks.erdos_renyi 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. automodule:: trumania.components.social_networks 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /docs/REAMDE.md: -------------------------------------------------------------------------------- 1 | # How to generate the wiki and the doc 2 | 3 | You need to have two copies of the trumania repository, one where you work on the code and one where you generate the docs. 4 | The one with the docs should be trumania-docs/html.
5 | The structure should be 6 | ``` 7 | - trumania (repository on the master branch) 8 | - trumania-docs 9 | - html (repository on the branch gh_pages) 10 | ``` 11 | 12 | Once you have the correct structure, go to `trumania/docs` and run the following two commands 13 | ``` 14 | # Only required if the code api changed, it will update the code structure 15 | sphinx-apidoc ../trumania -o source 16 | 17 | # It will update the html pages 18 | make html 19 | ``` 20 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = trumania 8 | SOURCEDIR = source 9 | BUILDDIR = ../../trumania-docs/ 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /tests/mocks/random_generators.py: -------------------------------------------------------------------------------- 1 | from trumania.core.random_generators import Generator 2 | 3 | 4 | class ConstantsMockGenerator(Generator): 5 | """ 6 | For test only: a (non random) Generator returning pre-defined values 7 | """ 8 | def __init__(self, values): 9 | Generator.__init__(self) 10 | self.values = values 11 | 12 | def generate(self, size): 13 | # (value is ignored) 14 | return self.values 15 | 16 | 17 | class MockTimerGenerator(Generator): 18 | """ 19 | For test only: a (non random) Profiler returning pre-defined values 20 | """ 21 | def __init__(self, values_series): 22 | Generator.__init__(self) 23 | self.values_series = values_series 24 | 25 | def generate(self, observations): 26 | # (value is ignored) 27 | return self.values_series[observations.index] 28 | -------------------------------------------------------------------------------- /trumania/components/geographies/random_geo.py: -------------------------------------------------------------------------------- 1 | from trumania.core.circus import Circus 2 | from trumania.core.population import Population 3 | from trumania.core.random_generators import FakerGenerator 4 | 5 | 6 | class WithRandomGeo(Circus): 7 | """ 8 | Circus mix-in that adds the creation of random cells 9 | """ 10 | 11 | def create_random_cells(self, n_cells): 12 | """ 13 | Creation of a basic population for cells, with latitude and longitude 14 | """ 15 | 16 | cells = Population(size=n_cells) 17 | 18 | latitude_generator = FakerGenerator(method="latitude", seed=next(self.seeder)) 19 | longitude_generator = FakerGenerator(method="longitude", seed=next(self.seeder)) 20 | 21 | cells.create_attribute("latitude", init_gen=latitude_generator) 22 | 
cells.create_attribute("longitude", init_gen=longitude_generator) 23 | 24 | return cells 25 | -------------------------------------------------------------------------------- /docs/source/trumania.components.geographies.rst: -------------------------------------------------------------------------------- 1 | trumania\.components\.geographies package 2 | ========================================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | trumania\.components\.geographies\.belgium module 8 | ------------------------------------------------- 9 | 10 | .. automodule:: trumania.components.geographies.belgium 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | trumania\.components\.geographies\.random\_geo module 16 | ----------------------------------------------------- 17 | 18 | .. automodule:: trumania.components.geographies.random_geo 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | trumania\.components\.geographies\.uganda module 24 | ------------------------------------------------ 25 | 26 | .. automodule:: trumania.components.geographies.uganda 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | 32 | Module contents 33 | --------------- 34 | 35 | .. 
automodule:: trumania.components.geographies 36 | :members: 37 | :undoc-members: 38 | :show-inheritance: 39 | -------------------------------------------------------------------------------- /tests/mocks/operations.py: -------------------------------------------------------------------------------- 1 | from trumania.core import operations 2 | 3 | 4 | class FakeOp(operations.Operation): 5 | """ 6 | just returning hard-coded results as output 7 | """ 8 | 9 | def __init__(self, output, logs): 10 | self.output = output 11 | self.logs = logs 12 | 13 | def __call__(self, story_data): 14 | return self.output, self.logs 15 | 16 | 17 | class FakeRecording(operations.Operation): 18 | 19 | def __init__(self): 20 | self.last_seen_population_ids = [] 21 | 22 | def __call__(self, story_data): 23 | self.last_seen_population_ids = story_data.index.tolist() 24 | return story_data, {} 25 | 26 | def reset(self): 27 | self.last_seen_population_ids = [] 28 | 29 | 30 | class MockDropOp(operations.Operation): 31 | """ 32 | simulating an story that drops rows 33 | """ 34 | 35 | def __init__(self, from_idx, to_idx): 36 | self.from_idx = from_idx 37 | self.to_idx = to_idx 38 | 39 | def __call__(self, story_data): 40 | return story_data.iloc[self.from_idx: self.to_idx, :], {} 41 | -------------------------------------------------------------------------------- /examples/datacamp-blogpost/01-a-basic-user-population.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | from tabulate import tabulate 4 | 5 | from trumania.core import circus 6 | from trumania.core.random_generators import SequencialGenerator, FakerGenerator, NumpyRandomGenerator 7 | import trumania.core.util_functions as util_functions 8 | 9 | 10 | util_functions.setup_logging() 11 | 12 | example_circus = circus.Circus(name="example1", 13 | master_seed=12345, 14 | start=pd.Timestamp("1 Jan 2017 00:00"), 15 | step_duration=pd.Timedelta("1h")) 16 | 17 | 
id_gen = SequencialGenerator(prefix="PERSON_") 18 | age_gen = NumpyRandomGenerator(method="normal", loc=3, scale=5, 19 | seed=next(example_circus.seeder)) 20 | name_gen = FakerGenerator(method="name", seed=next(example_circus.seeder)) 21 | 22 | person = example_circus.create_population(name="person", size=1000, ids_gen=id_gen) 23 | person.create_attribute("NAME", init_gen=name_gen) 24 | person.create_attribute("AGE", init_gen=age_gen) 25 | 26 | 27 | logging.info("\n" + 28 | tabulate(person.to_dataframe().head(10), headers='keys', tablefmt='psql') 29 | ) 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /examples/presentation/02_circus_with_actor.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | from tabulate import tabulate 4 | 5 | from trumania.core import circus 6 | import trumania.core.util_functions as util_functions 7 | from trumania.core.random_generators import SequencialGenerator, FakerGenerator, NumpyRandomGenerator 8 | 9 | 10 | util_functions.setup_logging() 11 | 12 | logging.info("building circus") 13 | 14 | example = circus.Circus( 15 | name="example", 16 | master_seed=12345, 17 | start=pd.Timestamp("1 Jan 2017 00:00"), 18 | step_duration=pd.Timedelta("1h")) 19 | 20 | person = example.create_population( 21 | name="person", size=1000, 22 | ids_gen=SequencialGenerator(prefix="PERSON_")) 23 | 24 | person.create_attribute( 25 | "NAME", 26 | init_gen=FakerGenerator(method="name", 27 | seed=next(example.seeder))) 28 | 29 | person.create_attribute( 30 | "age", 31 | init_gen=NumpyRandomGenerator( 32 | method="normal", loc=35, scale=5, 33 | seed=next(example.seeder))) 34 | 35 | example.run( 36 | duration=pd.Timedelta("48h"), 37 | log_output_folder="output/example2", 38 | delete_existing_logs=True) 39 | 40 | logging.info("10 first persons: \n" + tabulate(person.to_dataframe().head(10), 41 | headers='keys', tablefmt='psql')) 42 | 
-------------------------------------------------------------------------------- /tests/scenarios/long_cdr.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from test_cdr import run_cdr_scenario 4 | from trumania.core.util_functions import setup_logging 5 | 6 | # better run this outside of PyCharm for consistent measures... 7 | # 8 | # python tests/scenarios/long_cdr.py 9 | 10 | if __name__ == "__main__": 11 | setup_logging() 12 | logging.info("starting a long CDR test ") 13 | params = { 14 | "time_step": 60, 15 | "n_cells": 200, 16 | "n_agents": 500, 17 | "n_subscribers": 25000, 18 | "average_degree": 20, 19 | "n_iterations": 200 20 | } 21 | 22 | run_cdr_scenario(params) 23 | 24 | """ 25 | result on Svends's laptop: 26 | 27 | total number of logs: 392086 28 | execution times: " 29 | - building the circus: 0 days 00:01:32.156013 30 | - running the simulation: 0 days 00:14:07.355875 31 | 32 | Note: that world is a bit irrealistic: 33 | * 200 clock steps is about 3 hours 34 | * 400k logs for 25k users is 16 actions per persons 35 | 36 | => a bit more than 5 actions per user per hour 37 | 38 | 39 | New result (25 Aug) after adding multi-sim 40 | 41 | total number of logs: 380270 42 | 2016-08-25 15:11:47,849 71736 topups logs 43 | 2016-08-25 15:11:47,849 86 cell_status logs 44 | 2016-08-25 15:11:47,849 154358 voice_cdr logs 45 | 2016-08-25 15:11:47,850 152642 sms_cdr logs 46 | 2016-08-25 15:11:47,850 1448 mobility_logs logs 47 | 2016-08-25 15:11:47,852 48 | execution times: " 49 | - building the circus: 0 days 00:02:43.163336 50 | - running the simulation: 0 days 00:30:58.923819 51 | """ 52 | -------------------------------------------------------------------------------- /tests/notebooks/pos_stocks.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import numpy as np 4 | from scipy import stats 5 | import seaborn as sns 6 | 7 | 8 
| def ev(dist): 9 | 10 | """ 11 | :param dist: dictionary representing a probability distribution 12 | :return: the expected value of this distribution 13 | """ 14 | assert sum(dist.values()) == 1 15 | return sum(dist[v] * v for v in dist.keys()) 16 | 17 | 18 | def post_pad(vect, target_size): 19 | """ 20 | pads the end of this vectors with 0s s.t. it is as long as T_size 21 | """ 22 | return np.pad(vect, [0, target_size - len(vect)], "constant", constant_values=0) 23 | 24 | 25 | def pre_pad(vect, n_pads): 26 | """ 27 | pads the beginning of this vectors with the requested amount of 0s 28 | """ 29 | return np.pad(vect, [n_pads, 0], "constant", constant_values=0) 30 | 31 | 32 | def binom_pmf(n, p): 33 | """ 34 | return a binomial(n,p) pmf 35 | """ 36 | 37 | def _pmf(k): 38 | return stats.binom.pmf(k, n, p) 39 | 40 | return _pmf 41 | 42 | 43 | def build_heatmap(transition_matrix, **kwargs): 44 | """ 45 | convenience method to show a heatmap representing this transition matrix 46 | """ 47 | return sns.heatmap(transition_matrix, 48 | xticklabels=False, 49 | yticklabels=False, 50 | **kwargs) 51 | 52 | 53 | def compute_stationary(transition_matrix): 54 | A = transition_matrix - np.identity(transition_matrix.shape[0]) 55 | 56 | # adding one more constraint force x being a probability vector 57 | prob_const = np.ones([1, transition_matrix.shape[1]]) 58 | A2 = np.concatenate([A, prob_const], axis=0) 59 | 60 | b = np.concatenate([np.zeros([transition_matrix.shape[0], 1]), [[1]]], axis=0) 61 | 62 | x, res, rank, s = np.linalg.lstsq(A2, b) 63 | 64 | return x.T[0], res 65 | -------------------------------------------------------------------------------- /examples/datacamp-blogpost/02-hello-world-statements.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | from tabulate import tabulate 4 | 5 | from trumania.core import circus, operations 6 | from trumania.core.random_generators import 
SequencialGenerator, FakerGenerator, NumpyRandomGenerator, ConstantDependentGenerator, ConstantGenerator 7 | import trumania.core.util_functions as util_functions 8 | 9 | 10 | util_functions.setup_logging() 11 | 12 | example_circus = circus.Circus(name="example", 13 | master_seed=12345, 14 | start=pd.Timestamp("1 Jan 2017 00:00"), 15 | step_duration=pd.Timedelta("1h")) 16 | 17 | id_gen = SequencialGenerator(prefix="PERSON_") 18 | age_gen = NumpyRandomGenerator(method="normal", loc=3, scale=5, 19 | seed=next(example_circus.seeder)) 20 | name_gen = FakerGenerator(method="name", seed=next(example_circus.seeder)) 21 | 22 | person = example_circus.create_population(name="person", size=1000, ids_gen=id_gen) 23 | person.create_attribute("NAME", init_gen=name_gen) 24 | person.create_attribute("AGE", init_gen=age_gen) 25 | 26 | hello_world = example_circus.create_story( 27 | name="hello_world", 28 | initiating_population=example_circus.populations["person"], 29 | member_id_field="PERSON_ID", 30 | timer_gen=ConstantDependentGenerator(value=1) 31 | ) 32 | 33 | hello_world.set_operations( 34 | example_circus.clock.ops.timestamp(named_as="TIME"), 35 | ConstantGenerator(value="hello world").ops.generate(named_as="MESSAGE"), 36 | operations.FieldLogger(log_id="hello") 37 | ) 38 | 39 | example_circus.run( 40 | duration=pd.Timedelta("48h"), 41 | log_output_folder="output/example_scenario", 42 | delete_existing_logs=True 43 | ) 44 | 45 | # -- DEBUG output printout 46 | 47 | df = pd.read_csv("output/example_scenario/hello.csv") 48 | print(df.head(10)) 49 | print(df.tail(10)) 50 | -------------------------------------------------------------------------------- /docs/source/trumania.core.rst: -------------------------------------------------------------------------------- 1 | trumania\.core package 2 | ====================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | trumania\.core\.attribute module 8 | -------------------------------- 9 | 10 | .. 
automodule:: trumania.core.attribute 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | trumania\.core\.circus module 16 | ----------------------------- 17 | 18 | .. automodule:: trumania.core.circus 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | trumania\.core\.clock module 24 | ---------------------------- 25 | 26 | .. automodule:: trumania.core.clock 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | trumania\.core\.operations module 32 | --------------------------------- 33 | 34 | .. automodule:: trumania.core.operations 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | trumania\.core\.population module 40 | --------------------------------- 41 | 42 | .. automodule:: trumania.core.population 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | trumania\.core\.random\_generators module 48 | ----------------------------------------- 49 | 50 | .. automodule:: trumania.core.random_generators 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | trumania\.core\.relationship module 56 | ----------------------------------- 57 | 58 | .. automodule:: trumania.core.relationship 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | trumania\.core\.story module 64 | ---------------------------- 65 | 66 | .. automodule:: trumania.core.story 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | 71 | trumania\.core\.util\_functions module 72 | -------------------------------------- 73 | 74 | .. automodule:: trumania.core.util_functions 75 | :members: 76 | :undoc-members: 77 | :show-inheritance: 78 | 79 | 80 | Module contents 81 | --------------- 82 | 83 | .. 
automodule:: trumania.core 84 | :members: 85 | :undoc-members: 86 | :show-inheritance: 87 | -------------------------------------------------------------------------------- /examples/presentation/03_circus_with_story.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | 4 | from trumania.core import circus 5 | import trumania.core.util_functions as util_functions 6 | from trumania.core.operations import FieldLogger 7 | from trumania.core.random_generators import SequencialGenerator, FakerGenerator, NumpyRandomGenerator 8 | from trumania.core.random_generators import ConstantDependentGenerator, ConstantGenerator 9 | 10 | 11 | util_functions.setup_logging() 12 | 13 | logging.info("building circus") 14 | 15 | 16 | def create_circus_with_population(): 17 | example_circus = circus.Circus( 18 | name="example", 19 | master_seed=12345, 20 | start=pd.Timestamp("1 Jan 2017 00:00"), 21 | step_duration=pd.Timedelta("1h")) 22 | 23 | person = example_circus.create_population( 24 | name="person", size=1000, 25 | ids_gen=SequencialGenerator(prefix="PERSON_")) 26 | 27 | person.create_attribute( 28 | "NAME", 29 | init_gen=FakerGenerator(method="name", 30 | seed=next(example_circus.seeder))) 31 | 32 | person.create_attribute( 33 | "age", 34 | init_gen=NumpyRandomGenerator( 35 | method="normal", loc=3, scale=5, 36 | seed=next(example_circus.seeder))) 37 | 38 | return example_circus 39 | 40 | 41 | example = create_circus_with_population() 42 | 43 | hello_world = example.create_story( 44 | name="hello_world", 45 | initiating_population=example.populations["person"], 46 | member_id_field="PERSON_ID", 47 | 48 | timer_gen=ConstantDependentGenerator(value=1) 49 | ) 50 | 51 | hello_world.set_operations( 52 | example.clock.ops.timestamp(named_as="TIME"), 53 | ConstantGenerator(value="hello world").ops.generate(named_as="MESSAGE"), 54 | FieldLogger(log_id="hello") 55 | ) 56 | 57 | example.run( 58 | 
duration=pd.Timedelta("48h"), 59 | log_output_folder="output/example3", 60 | delete_existing_logs=True 61 | ) 62 | 63 | with open("output/example3/hello.csv") as log: 64 | logging.info("some produced logs: \n\n" + "".join(log.readlines(1000)[:10])) 65 | -------------------------------------------------------------------------------- /trumania/components/social_networks/erdos_renyi.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import logging 3 | import networkx as nx 4 | import trumania.core.random_generators as rg 5 | from trumania.core.circus import Circus 6 | 7 | import pandas as pd 8 | 9 | 10 | class WithErdosRenyi(Circus): 11 | """ 12 | Circus mix-in that provides method to build ER random graph 13 | """ 14 | 15 | def add_er_social_network_relationship(self, population, relationship_name, average_degree): 16 | """ 17 | Adds to this population a relationship from and to its members based an ER random graph 18 | """ 19 | logging.info("Creating the social network ") 20 | 21 | # create a random A to B symmetric relationship 22 | network_weight_gen = rg.ParetoGenerator(xmin=1., a=1.2, seed=next(self.seeder)) 23 | 24 | social_network_values = create_er_social_network( 25 | customer_ids=population.ids, 26 | p=average_degree / len(population.ids), 27 | seed=next(self.seeder)) 28 | 29 | social_network = population.create_relationship(relationship_name) 30 | social_network.add_relations( 31 | from_ids=social_network_values["A"].values, 32 | to_ids=social_network_values["B"].values, 33 | weights=network_weight_gen.generate(social_network_values.shape[0])) 34 | 35 | social_network.add_relations( 36 | from_ids=social_network_values["B"].values, 37 | to_ids=social_network_values["A"].values, 38 | weights=network_weight_gen.generate(social_network_values.shape[0])) 39 | 40 | 41 | def create_er_social_network(customer_ids, p, seed): 42 | """ 43 | 44 | :type customer_ids: list 45 | :param 
customer_ids: list of IDs as defined in the data 46 | :type p: float 47 | :param p: probability of existence of 1 edge 48 | :type seed: int 49 | :param seed: seed for random generator 50 | :rtype: Pandas DataFrame, with two columns (A and B) 51 | :return: all edges in the graph 52 | """ 53 | 54 | return pd.DataFrame.from_records([(customer_ids[e[0]], customer_ids[e[1]]) 55 | for e in nx.fast_gnp_random_graph(len(customer_ids), p, seed).edges()], 56 | columns=["A", "B"]) 57 | -------------------------------------------------------------------------------- /examples/datacamp-blogpost/03-someone-to-say-hello-world-to.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | from tabulate import tabulate 4 | 5 | from trumania.core import circus, operations 6 | from trumania.core.random_generators import SequencialGenerator, FakerGenerator, NumpyRandomGenerator, ConstantDependentGenerator, ConstantGenerator 7 | import trumania.core.util_functions as util_functions 8 | 9 | 10 | util_functions.setup_logging() 11 | 12 | example_circus = circus.Circus(name="example", 13 | master_seed=12345, 14 | start=pd.Timestamp("1 Jan 2017 00:00"), 15 | step_duration=pd.Timedelta("1h")) 16 | 17 | id_gen = SequencialGenerator(prefix="PERSON_") 18 | age_gen = NumpyRandomGenerator(method="normal", loc=3, scale=5, 19 | seed=next(example_circus.seeder)) 20 | name_gen = FakerGenerator(method="name", seed=next(example_circus.seeder)) 21 | 22 | person = example_circus.create_population(name="person", size=1000, ids_gen=id_gen) 23 | person.create_attribute("NAME", init_gen=name_gen) 24 | person.create_attribute("AGE", init_gen=age_gen) 25 | 26 | hello_world = example_circus.create_story( 27 | name="hello_world", 28 | initiating_population=example_circus.populations["person"], 29 | member_id_field="PERSON_ID", 30 | timer_gen=ConstantDependentGenerator(value=1) 31 | ) 32 | 33 | hello_world.set_operations( 34 | 
example_circus.clock.ops.timestamp(named_as="TIME"), 35 | ConstantGenerator(value="hello world").ops.generate(named_as="MESSAGE"), 36 | 37 | example_circus.populations["person"].ops.select_one(named_as="OTHER_PERSON"), 38 | 39 | example_circus.populations["person"] 40 | .ops.lookup(id_field="PERSON_ID", select={"NAME": "EMITTER_NAME"}), 41 | 42 | example_circus.populations["person"] 43 | .ops.lookup(id_field="OTHER_PERSON", select={"NAME": "RECEIVER_NAME"}), 44 | 45 | operations.FieldLogger(log_id="hello_3") 46 | ) 47 | 48 | example_circus.run( 49 | duration=pd.Timedelta("48h"), 50 | log_output_folder="output/example_scenario", 51 | delete_existing_logs=True 52 | ) 53 | 54 | # -- DEBUG output printout 55 | pd.set_option('display.max_columns', 500) 56 | pd.set_option('display.width', 1000) 57 | df = pd.read_csv("output/example_scenario/hello_3.csv") 58 | print(df.head(10)) 59 | print(df.tail(10)) 60 | -------------------------------------------------------------------------------- /tests/unit_tests/test_circus.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import pytest 3 | import pandas as pd 4 | 5 | from trumania.core.random_generators import SequencialGenerator 6 | from trumania.core.circus import Circus 7 | from trumania.components.time_patterns.profilers import DefaultDailyTimerGenerator 8 | 9 | 10 | def test_create_story_get_story_should_work_as_expected(): 11 | 12 | flying = Circus(name="tested_circus", 13 | master_seed=1, 14 | start=pd.Timestamp("8 June 2016"), 15 | step_duration=pd.Timedelta("60s")) 16 | 17 | customers = flying.create_population( 18 | "the_customers", size=100, 19 | ids_gen=SequencialGenerator(prefix="a")) 20 | 21 | mobility_time_gen = DefaultDailyTimerGenerator(flying.clock, seed=1) 22 | 23 | mobility_story = flying.create_story( 24 | name="mobility", 25 | 26 | initiating_population=customers, 27 | member_id_field="A_ID", 28 | 29 | timer_gen=mobility_time_gen, 30 
| ) 31 | 32 | # add and get story by name should work as expected 33 | result = flying.get_story("mobility") 34 | 35 | assert result.name == "mobility" 36 | assert result.member_id_field == mobility_story.member_id_field 37 | 38 | # also retrieving this initiating population of that population 39 | 40 | retrieved_pop = flying.get_population_of("mobility") 41 | 42 | assert retrieved_pop == customers 43 | 44 | 45 | def test_get_non_existing_story_should_return_none(): 46 | 47 | flying = Circus(name="tested_circus", 48 | master_seed=1, 49 | start=pd.Timestamp("8 June 2016"), 50 | step_duration=pd.Timedelta("60s")) 51 | 52 | assert flying.get_story("non_existing_name") is None 53 | 54 | 55 | def test_adding_a_second_story_with_same_name_should_be_refused(): 56 | 57 | flying = Circus(name="tested_circus", 58 | master_seed=1, 59 | start=pd.Timestamp("8 June 2016"), 60 | step_duration=pd.Timedelta("60s")) 61 | 62 | customers = flying.create_population( 63 | name="tested", size=100, 64 | ids_gen=SequencialGenerator(prefix="a")) 65 | 66 | flying.create_story(name="the_story", 67 | initiating_population=customers, 68 | member_id_field="population_id") 69 | 70 | with pytest.raises(ValueError): 71 | flying.create_story(name="the_story", 72 | initiating_population=customers, 73 | member_id_field="population_id") 74 | -------------------------------------------------------------------------------- /examples/presentation/05_circus_with_story.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | 4 | from trumania.core import circus 5 | import trumania.core.util_functions as util_functions 6 | from trumania.core.operations import FieldLogger 7 | from trumania.core.random_generators import SequencialGenerator, FakerGenerator, NumpyRandomGenerator 8 | from trumania.core.random_generators import ConstantDependentGenerator 9 | 10 | 11 | util_functions.setup_logging() 12 | 13 | logging.info("building circus") 
14 | 15 | 16 | def create_circus_with_population(): 17 | example_circus = circus.Circus( 18 | name="example", 19 | master_seed=12345, 20 | start=pd.Timestamp("1 Jan 2017 00:00"), 21 | step_duration=pd.Timedelta("1h")) 22 | 23 | person = example_circus.create_population( 24 | name="person", size=1000, 25 | ids_gen=SequencialGenerator(prefix="PERSON_")) 26 | 27 | person.create_attribute( 28 | "NAME", 29 | init_gen=FakerGenerator(method="name", 30 | seed=next(example_circus.seeder))) 31 | 32 | person.create_attribute( 33 | "age", 34 | init_gen=NumpyRandomGenerator( 35 | method="normal", loc=35, scale=5, 36 | seed=next(example_circus.seeder))) 37 | 38 | return example_circus 39 | 40 | 41 | the_circus = create_circus_with_population() 42 | 43 | hello_world = the_circus.create_story( 44 | name="hello_world", 45 | initiating_population=the_circus.populations["person"], 46 | member_id_field="PERSON_ID", 47 | 48 | timer_gen=ConstantDependentGenerator(value=1) 49 | ) 50 | 51 | hello_world.set_operations( 52 | 53 | # adding a random timestamp, within the current clock step 54 | the_circus.clock.ops.timestamp(named_as="TIME"), 55 | 56 | # message is now a random sentence from Faker 57 | FakerGenerator(method="sentence", 58 | nb_words=6, variable_nb_words=True, 59 | seed=next(the_circus.seeder) 60 | ) 61 | .ops 62 | .generate(named_as="MESSAGE"), 63 | 64 | # selecting a random "other person" 65 | the_circus.populations["person"] 66 | .ops 67 | .select_one(named_as="OTHER_PERSON"), 68 | 69 | # specifying which fields to put in the log 70 | FieldLogger(log_id="hello", 71 | cols=["TIME", "PERSON_ID", "OTHER_PERSON", "MESSAGE"] 72 | ) 73 | 74 | ) 75 | 76 | the_circus.run( 77 | duration=pd.Timedelta("48h"), 78 | log_output_folder="output/example4", 79 | delete_existing_logs=True 80 | ) 81 | 82 | with open("output/example4/hello.csv") as log: 83 | logging.info("some produced logs: \n\n" + "".join(log.readlines(1000)[:10])) 84 | 
-------------------------------------------------------------------------------- /examples/presentation/04_circus_with_story.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | 4 | from trumania.core import circus 5 | import trumania.core.util_functions as util_functions 6 | from trumania.core.operations import FieldLogger 7 | from trumania.core.random_generators import SequencialGenerator, FakerGenerator, NumpyRandomGenerator 8 | from trumania.core.random_generators import ConstantDependentGenerator, ConstantGenerator 9 | 10 | 11 | util_functions.setup_logging() 12 | 13 | logging.info("building circus") 14 | 15 | 16 | def create_circus_with_population(): 17 | example_circus = circus.Circus( 18 | name="example", 19 | master_seed=12345, 20 | start=pd.Timestamp("1 Jan 2017 00:00"), 21 | step_duration=pd.Timedelta("1h")) 22 | 23 | person = example_circus.create_population( 24 | name="person", size=1000, 25 | ids_gen=SequencialGenerator(prefix="PERSON_")) 26 | 27 | person.create_attribute( 28 | "NAME", 29 | init_gen=FakerGenerator(method="name", 30 | seed=next(example_circus.seeder))) 31 | 32 | person.create_attribute( 33 | "age", 34 | init_gen=NumpyRandomGenerator( 35 | method="normal", loc=35, scale=5, 36 | seed=next(example_circus.seeder))) 37 | 38 | return example_circus 39 | 40 | 41 | the_circus = create_circus_with_population() 42 | 43 | hello_world = the_circus.create_story( 44 | name="hello_world", 45 | initiating_population=the_circus.populations["person"], 46 | member_id_field="PERSON_ID", 47 | 48 | timer_gen=ConstantDependentGenerator(value=1) 49 | ) 50 | 51 | hello_world.set_operations( 52 | 53 | # adding a random timestamp, within the current clock step 54 | the_circus.clock.ops.timestamp(named_as="TIME"), 55 | 56 | ConstantGenerator(value="hello world").ops.generate(named_as="MESSAGE"), 57 | 58 | # selecting a random "other person" 59 | 
the_circus.populations["person"].ops.select_one(named_as="OTHER_PERSON"), 60 | 61 | the_circus.populations["person"] 62 | .ops 63 | .lookup(id_field="PERSON_ID", 64 | select={"NAME": "EMITTER_NAME"}), 65 | 66 | the_circus.populations["person"] 67 | .ops 68 | .lookup(id_field="OTHER_PERSON", 69 | select={"NAME": "RECEIVER_NAME"}), 70 | 71 | # specifying which fields to put in the log 72 | FieldLogger(log_id="hello", 73 | cols=["TIME", "PERSON_ID", "OTHER_PERSON", "MESSAGE"]) 74 | 75 | ) 76 | 77 | the_circus.run( 78 | duration=pd.Timedelta("48h"), 79 | log_output_folder="output/example4", 80 | delete_existing_logs=True 81 | ) 82 | 83 | with open("output/example4/hello.csv") as log: 84 | logging.info("some produced logs: \n\n" + "".join(log.readlines(1000)[:10])) 85 | 86 | from tabulate import tabulate 87 | logging.info(tabulate(log.readlines(1000)[:10])) 88 | -------------------------------------------------------------------------------- /examples/datacamp-blogpost/04-you-always-say-that.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | from tabulate import tabulate 4 | 5 | from trumania.core import circus, operations 6 | from trumania.core.random_generators import SequencialGenerator, FakerGenerator, NumpyRandomGenerator, ConstantDependentGenerator, ConstantGenerator 7 | import trumania.core.util_functions as util_functions 8 | 9 | 10 | util_functions.setup_logging() 11 | 12 | example_circus = circus.Circus(name="example", 13 | master_seed=12345, 14 | start=pd.Timestamp("1 Jan 2017 00:00"), 15 | step_duration=pd.Timedelta("1h")) 16 | # person population 17 | 18 | id_gen = SequencialGenerator(prefix="PERSON_") 19 | age_gen = NumpyRandomGenerator(method="normal", loc=3, scale=5, 20 | seed=next(example_circus.seeder)) 21 | name_gen = FakerGenerator(method="name", seed=next(example_circus.seeder)) 22 | 23 | person = example_circus.create_population(name="person", size=1000, 
ids_gen=id_gen) 24 | person.create_attribute("NAME", init_gen=name_gen) 25 | person.create_attribute("AGE", init_gen=age_gen) 26 | 27 | # basic relationship to store people's quote 28 | 29 | quote_generator = FakerGenerator(method="sentence", nb_words=6, variable_nb_words=True, 30 | seed=next(example_circus.seeder)) 31 | 32 | quotes_rel = example_circus.populations["person"].create_relationship("quotes") 33 | 34 | for w in range(4): 35 | quotes_rel.add_relations( 36 | from_ids=person.ids, 37 | to_ids=quote_generator.generate(size=person.size), 38 | weights=w 39 | ) 40 | 41 | # message story 42 | 43 | hello_world = example_circus.create_story( 44 | name="hello_world", 45 | initiating_population=example_circus.populations["person"], 46 | member_id_field="PERSON_ID", 47 | timer_gen=ConstantDependentGenerator(value=1) 48 | ) 49 | 50 | hello_world.set_operations( 51 | example_circus.clock.ops.timestamp(named_as="TIME"), 52 | 53 | example_circus.populations["person"].get_relationship("quotes") 54 | .ops.select_one(from_field="PERSON_ID",named_as="MESSAGE"), 55 | 56 | example_circus.populations["person"].ops.select_one(named_as="OTHER_PERSON"), 57 | 58 | example_circus.populations["person"] 59 | .ops.lookup(id_field="PERSON_ID", select={"NAME": "EMITTER_NAME"}), 60 | 61 | example_circus.populations["person"] 62 | .ops.lookup(id_field="OTHER_PERSON", select={"NAME": "RECEIVER_NAME"}), 63 | 64 | operations.FieldLogger(log_id="hello_4") 65 | ) 66 | 67 | # message story 68 | 69 | example_circus.run( 70 | duration=pd.Timedelta("48h"), 71 | log_output_folder="output/example_scenario", 72 | delete_existing_logs=True 73 | ) 74 | 75 | # -- DEBUG output printout 76 | pd.set_option('display.max_columns', 500) 77 | pd.set_option('display.width', 1000) 78 | df = pd.read_csv("output/example_scenario/hello_4.csv") 79 | print(df.head(10)) 80 | print(df.tail(10)) 81 | -------------------------------------------------------------------------------- 
/examples/presentation/06_circus_with_story.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | 4 | from trumania.core import circus 5 | import trumania.core.util_functions as util_functions 6 | from trumania.core.operations import FieldLogger 7 | from trumania.core.random_generators import SequencialGenerator, FakerGenerator, NumpyRandomGenerator 8 | from trumania.core.random_generators import ConstantDependentGenerator 9 | 10 | 11 | util_functions.setup_logging() 12 | 13 | logging.info("building circus") 14 | 15 | 16 | def create_circus_with_population(): 17 | example_circus = circus.Circus( 18 | name="example", 19 | master_seed=12345, 20 | start=pd.Timestamp("1 Jan 2017 00:00"), 21 | step_duration=pd.Timedelta("1h")) 22 | 23 | person = example_circus.create_population( 24 | name="person", size=1000, 25 | ids_gen=SequencialGenerator(prefix="PERSON_")) 26 | 27 | person.create_attribute( 28 | "NAME", 29 | init_gen=FakerGenerator(method="name", 30 | seed=next(example_circus.seeder))) 31 | 32 | person.create_attribute( 33 | "age", 34 | init_gen=NumpyRandomGenerator( 35 | method="normal", loc=35, scale=5, 36 | seed=next(example_circus.seeder))) 37 | 38 | return example_circus 39 | 40 | 41 | the_circus = create_circus_with_population() 42 | 43 | hello_world = the_circus.create_story( 44 | name="hello_world", 45 | initiating_population=the_circus.populations["person"], 46 | member_id_field="PERSON_ID", 47 | 48 | timer_gen=ConstantDependentGenerator(value=1) 49 | ) 50 | 51 | hello_world.set_operations( 52 | 53 | # adding a random timestamp, within the current clock step 54 | the_circus.clock 55 | .ops 56 | .timestamp(named_as="TIME"), 57 | 58 | # message is now a random sentence from Faker 59 | FakerGenerator(method="sentence", 60 | nb_words=6, variable_nb_words=True, 61 | seed=next(the_circus.seeder) 62 | ) 63 | .ops 64 | .generate(named_as="MESSAGE"), 65 | 66 | # selecting a random "other 
person" 67 | the_circus.populations["person"] 68 | .ops 69 | .select_one(named_as="OTHER_PERSON"), 70 | 71 | the_circus.populations["person"] 72 | .ops 73 | .lookup(id_field="PERSON_ID", 74 | select={"NAME": "EMITTER_NAME"}), 75 | 76 | the_circus.populations["person"] 77 | .ops 78 | .lookup(id_field="OTHER_PERSON", 79 | select={"NAME": "RECEIVER_NAME"}), 80 | 81 | # specifying which fields to put in the log 82 | FieldLogger(log_id="hello", 83 | cols=["TIME", "EMITTER_NAME", "RECEIVER_NAME", "MESSAGE"] 84 | ) 85 | 86 | ) 87 | 88 | the_circus.run( 89 | duration=pd.Timedelta("48h"), 90 | log_output_folder="output/example4", 91 | delete_existing_logs=True 92 | ) 93 | 94 | with open("output/example4/hello.csv") as log: 95 | logging.info("some produced logs: \n\n" + "".join(log.readlines(10)[:10])) 96 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Trumania 2 | 3 | ## Documentation and tutorial 4 | 5 | Trumania is a scenario-based random dataset generator library in python 3. 6 | 7 | A [detailed step-by-step tutorial has is available on Datacamp](https://www.datacamp.com/community/tutorials/generate-data-trumania). 8 | 9 | The [Trumania github page](http://realimpactanalytics.github.io/trumania/) also contains 10 | a detailed documentation of each of the concepts as well as a step-by-step explanation of 4 example scenarios. Those scenarios, and more, are present in the [examples/](examples/) folder in this repository. 11 | 12 | The code pydoc documentation is available [here](http://realimpactanalytics.github.io/trumania/py-modindex.html). 13 | 14 | You can also join the Trumania slack channel: [trumania.slack.com](https://trumania.slack.com) 15 | 16 | ## How to install 17 | 18 | Trumania is not packaged in any special way, the way it is used at the moment is simply to clone the code and install the required dependencies. 
This section describes how to do that.

Pre-requisites:

- If you installed python 3 with homebrew, then the executable is called `python3` and pip is called `pip3`. See [homebrew python documentation](https://docs.brew.sh/Homebrew-and-Python.html) for details
- If you installed python 3 with Conda, make sure you understand how environments work since they might end up conflicting with pipenv environments. See [this ticket](https://github.com/pypa/pipenv/issues/699) for a discussion
- In any case, in order to specify the exact path of the python to be used, you can always specify `--python /path/to/python` among the `pipenv` arguments.

That being said, start by installing `pipenv` if necessary:

```sh
# this could be called "pip", depending on the environment, and must be linked to python 3
pip3 install --user pipenv
```

then install all python dependencies for this project:

```sh
pipenv install --three --python /Library/Frameworks/Python.framework/Versions/3.6/bin/python3.6
```

The steps below mention prefixing the commands with `pipenv run` whenever necessary in order to have access to those python dependencies. Alternatively, you can enter the corresponding virtualenv once with `pipenv shell`, in which case that prefix is no longer necessary. See [https://docs.pipenv.org](https://docs.pipenv.org) for more details about how to use pipenv to handle python dependencies.


## Where and how to create a scenario

To create a scenario, simply create another python project that depends on trumania:

```sh
mkdir -p /path/to/your/project
cd /path/to/your/project

# make sure /path/to/trumania/ is the absolute path where trumania is stored
pipenv install -e /path/to/trumania/
```

You can then create your scenario in python, let's call it `burbanks_and_friends_talking.py`.
In order to execute it, simply launch it from pipenv: 55 | 56 | ```sh 57 | pipenv run python burbanks_and_friends_talking.py 58 | ``` 59 | 60 | ## Contributing 61 | 62 | This section provides a few pointers on how to handle the trumania codebase. 63 | 64 | ### Running Trumania unit tests locally 65 | 66 | ```sh 67 | # make sure you are not inside another pipenv shell when running this 68 | pipenv run py.test -s -v 69 | ``` 70 | 71 | ### Python linting 72 | Run `pipenv run flake8`. If nothing is returned, the correct styling has been applied. 73 | -------------------------------------------------------------------------------- /examples/presentation/08_circus_with_timed_story.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | 4 | from trumania.core import circus 5 | import trumania.core.util_functions as util_functions 6 | from trumania.core.operations import FieldLogger 7 | from trumania.core.random_generators import SequencialGenerator, FakerGenerator, NumpyRandomGenerator 8 | from trumania. 
components.time_patterns.profilers import WorkHoursTimerGenerator 9 | 10 | 11 | util_functions.setup_logging() 12 | 13 | logging.info("building circus") 14 | 15 | 16 | def create_circus_with_population(): 17 | example_circus = circus.Circus( 18 | name="example", 19 | master_seed=12345, 20 | start=pd.Timestamp("1 Jan 2017 00:00"), 21 | step_duration=pd.Timedelta("1h")) 22 | 23 | person = example_circus.create_population( 24 | name="person", size=1000, 25 | ids_gen=SequencialGenerator(prefix="PERSON_")) 26 | 27 | person.create_attribute( 28 | "NAME", 29 | init_gen=FakerGenerator(method="name", 30 | seed=next(example_circus.seeder))) 31 | 32 | person.create_attribute( 33 | "age", 34 | init_gen=NumpyRandomGenerator( 35 | method="normal", loc=35, scale=5, 36 | seed=next(example_circus.seeder))) 37 | 38 | return example_circus 39 | 40 | 41 | the_circus = create_circus_with_population() 42 | 43 | hello_world = the_circus.create_story( 44 | name="hello_world", 45 | initiating_population=the_circus.populations["person"], 46 | member_id_field="PERSON_ID", 47 | 48 | # each population instance is now going to have 10, 20 or 30 49 | # trigger of this story per week 50 | activity_gen=NumpyRandomGenerator( 51 | method="choice", a=[10, 20, 30], 52 | seed=next(the_circus.seeder) 53 | ), 54 | 55 | # story now only tiggers during office hours 56 | timer_gen=WorkHoursTimerGenerator( 57 | clock=the_circus.clock, 58 | seed=next(the_circus.seeder)) 59 | ) 60 | 61 | hello_world.set_operations( 62 | 63 | # adding a random timestamp, within the current clock step 64 | the_circus.clock 65 | .ops 66 | .timestamp(named_as="TIME"), 67 | 68 | # message is now a random sentence from Faker 69 | FakerGenerator(method="sentence", 70 | nb_words=6, variable_nb_words=True, 71 | seed=next(the_circus.seeder) 72 | ) 73 | .ops 74 | .generate(named_as="MESSAGE"), 75 | 76 | # selecting a random "other person" 77 | the_circus.populations["person"] 78 | .ops 79 | .select_one(named_as="OTHER_PERSON"), 80 | 81 | 
the_circus.populations["person"] 82 | .ops 83 | .lookup(id_field="PERSON_ID", 84 | select={"NAME": "EMITTER_NAME"}), 85 | 86 | the_circus.populations["person"] 87 | .ops 88 | .lookup(id_field="OTHER_PERSON", 89 | select={"NAME": "RECEIVER_NAME"}), 90 | 91 | # specifying which fields to put in the log 92 | FieldLogger(log_id="hello", 93 | cols=["TIME", "EMITTER_NAME", "RECEIVER_NAME", "MESSAGE"] 94 | ) 95 | 96 | ) 97 | 98 | the_circus.run( 99 | duration=pd.Timedelta("48h"), 100 | log_output_folder="output/example8", 101 | delete_existing_logs=True 102 | ) 103 | 104 | with open("output/example8/hello.csv") as log: 105 | logging.info("some produced logs: \n\n" + "".join(log.readlines(1000)[:10])) 106 | -------------------------------------------------------------------------------- /examples/datacamp-blogpost/05-it-aint-what-yo-do-it-s-the-time-that-you-do-it.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | from tabulate import tabulate 4 | 5 | from trumania.core import circus, operations 6 | from trumania.core.random_generators import SequencialGenerator, FakerGenerator, NumpyRandomGenerator, ConstantDependentGenerator, ConstantGenerator 7 | import trumania.core.util_functions as util_functions 8 | from trumania.components.time_patterns.profilers import DefaultDailyTimerGenerator 9 | 10 | 11 | util_functions.setup_logging() 12 | 13 | example_circus = circus.Circus(name="example", 14 | master_seed=12345, 15 | start=pd.Timestamp("1 Jan 2017 00:00"), 16 | step_duration=pd.Timedelta("1h")) 17 | # person population 18 | 19 | id_gen = SequencialGenerator(prefix="PERSON_") 20 | age_gen = NumpyRandomGenerator(method="normal", loc=3, scale=5, 21 | seed=next(example_circus.seeder)) 22 | name_gen = FakerGenerator(method="name", seed=next(example_circus.seeder)) 23 | 24 | person = example_circus.create_population(name="person", size=1000, ids_gen=id_gen) 25 | person.create_attribute("NAME", 
init_gen=name_gen) 26 | person.create_attribute("AGE", init_gen=age_gen) 27 | 28 | # basic relationship to store people's quote 29 | 30 | quote_generator = FakerGenerator(method="sentence", nb_words=6, variable_nb_words=True, 31 | seed=next(example_circus.seeder)) 32 | 33 | quotes_rel = example_circus.populations["person"].create_relationship("quotes") 34 | 35 | for w in range(4): 36 | quotes_rel.add_relations( 37 | from_ids=person.ids, 38 | to_ids=quote_generator.generate(size=person.size), 39 | weights=w 40 | ) 41 | 42 | # message story 43 | 44 | story_timer_gen = DefaultDailyTimerGenerator( 45 | clock=example_circus.clock, 46 | seed=next(example_circus.seeder)) 47 | 48 | low_activity = story_timer_gen.activity(n=3, per=pd.Timedelta("1 day")) 49 | med_activity = story_timer_gen.activity(n=10, per=pd.Timedelta("1 day")) 50 | high_activity = story_timer_gen.activity(n=20, per=pd.Timedelta("1 day")) 51 | 52 | activity_gen = NumpyRandomGenerator( 53 | method="choice", 54 | a=[low_activity, med_activity, high_activity], 55 | p=[.2, .7, .1], 56 | seed=next(example_circus.seeder)) 57 | 58 | hello_world = example_circus.create_story( 59 | name="hello_world", 60 | initiating_population=example_circus.populations["person"], 61 | member_id_field="PERSON_ID", 62 | 63 | timer_gen=story_timer_gen, 64 | activity_gen=activity_gen 65 | ) 66 | 67 | hello_world.set_operations( 68 | example_circus.clock.ops.timestamp(named_as="TIME"), 69 | 70 | example_circus.populations["person"].get_relationship("quotes") 71 | .ops.select_one(from_field="PERSON_ID",named_as="MESSAGE"), 72 | 73 | example_circus.populations["person"].ops.select_one(named_as="OTHER_PERSON"), 74 | 75 | example_circus.populations["person"] 76 | .ops.lookup(id_field="PERSON_ID", select={"NAME": "EMITTER_NAME"}), 77 | 78 | example_circus.populations["person"] 79 | .ops.lookup(id_field="OTHER_PERSON", select={"NAME": "RECEIVER_NAME"}), 80 | 81 | operations.FieldLogger(log_id="hello_5") 82 | ) 83 | 84 | # message story 85 
| 86 | example_circus.run( 87 | duration=pd.Timedelta("72h"), 88 | log_output_folder="output/example_scenario", 89 | delete_existing_logs=True 90 | ) 91 | 92 | # -- DEBUG output printout 93 | pd.set_option('display.max_columns', 500) 94 | pd.set_option('display.width', 1000) 95 | df = pd.read_csv("output/example_scenario/hello_5.csv") 96 | print(df.head(10)) 97 | print(df.tail(10)) 98 | -------------------------------------------------------------------------------- /examples/presentation/07_circus_with_story_and_relationship.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | 4 | from trumania.core import circus 5 | import trumania.core.util_functions as util_functions 6 | from trumania.core.operations import FieldLogger 7 | from trumania.core.random_generators import SequencialGenerator, FakerGenerator, NumpyRandomGenerator 8 | from trumania.core.random_generators import ConstantDependentGenerator 9 | 10 | 11 | util_functions.setup_logging() 12 | 13 | logging.info("building circus") 14 | 15 | 16 | def create_circus_with_population(): 17 | example_circus = circus.Circus( 18 | name="example", 19 | master_seed=12345, 20 | start=pd.Timestamp("1 Jan 2017 00:00"), 21 | step_duration=pd.Timedelta("1h")) 22 | 23 | person = example_circus.create_population( 24 | name="person", size=1000, 25 | ids_gen=SequencialGenerator(prefix="PERSON_")) 26 | 27 | person.create_attribute( 28 | "NAME", 29 | init_gen=FakerGenerator(method="name", 30 | seed=next(example_circus.seeder))) 31 | 32 | person.create_attribute( 33 | "age", 34 | init_gen=NumpyRandomGenerator( 35 | method="normal", loc=35, scale=5, 36 | seed=next(example_circus.seeder))) 37 | 38 | return example_circus 39 | 40 | 41 | def add_quotes(the_circus): 42 | 43 | quote_generator = FakerGenerator(method="sentence", 44 | nb_words=6, 45 | variable_nb_words=True, 46 | seed=next(the_circus.seeder)) 47 | 48 | person = the_circus.populations["person"] 
49 | 50 | quotes_rel = person.create_relationship("quotes") 51 | 52 | for w in range(4): 53 | quotes_rel.add_relations( 54 | from_ids=person.ids, 55 | to_ids=quote_generator.generate(size=person.size), 56 | weights=w 57 | ) 58 | 59 | 60 | the_circus = create_circus_with_population() 61 | add_quotes(the_circus) 62 | 63 | hello_world = the_circus.create_story( 64 | name="hello_world", 65 | initiating_population=the_circus.populations["person"], 66 | member_id_field="PERSON_ID", 67 | timer_gen=ConstantDependentGenerator(value=1) 68 | ) 69 | 70 | hello_world.set_operations( 71 | 72 | # adding a random timestamp, within the current clock step 73 | the_circus.clock 74 | .ops 75 | .timestamp(named_as="TIME"), 76 | 77 | # message is now selected from the favourite quotes of the speaker 78 | the_circus.populations["person"].get_relationship("quotes") 79 | .ops 80 | .select_one( 81 | from_field="PERSON_ID", 82 | named_as="MESSAGE"), 83 | 84 | # selecting a random "other person" 85 | the_circus.populations["person"] 86 | .ops 87 | .select_one(named_as="OTHER_PERSON"), 88 | 89 | the_circus.populations["person"] 90 | .ops 91 | .lookup(id_field="PERSON_ID", 92 | select={"NAME": "EMITTER_NAME"}), 93 | 94 | the_circus.populations["person"] 95 | .ops 96 | .lookup(id_field="OTHER_PERSON", 97 | select={"NAME": "RECEIVER_NAME"}), 98 | 99 | # specifying which fields to put in the log 100 | FieldLogger(log_id="hello") 101 | 102 | ) 103 | 104 | the_circus.run( 105 | duration=pd.Timedelta("12h"), 106 | log_output_folder="output/example4", 107 | delete_existing_logs=True 108 | ) 109 | 110 | with open("output/example4/hello.csv") as log: 111 | logging.info("some produced logs: \n\n" + "".join(log.readlines(1000)[:10])) 112 | -------------------------------------------------------------------------------- /trumania/components/time_patterns/profilers.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from trumania.core.clock 
import CyclicTimerGenerator, CyclicTimerProfile 4 | 5 | 6 | class HighWeekDaysTimerGenerator(CyclicTimerGenerator): 7 | """ 8 | Basic CyclicTimerGenerator with a one week period that allocates higher 9 | probabilities to week-day vs week-ends 10 | """ 11 | def __init__(self, clock, seed): 12 | 13 | start_date = pd.Timestamp("6 June 2016 00:00:00") 14 | CyclicTimerGenerator.__init__(self, 15 | clock=clock, 16 | seed=seed, 17 | config=CyclicTimerProfile( 18 | profile=[5., 5., 5., 5., 5., 3., 3.], 19 | profile_time_steps="1D", 20 | start_date=start_date), 21 | ) 22 | 23 | 24 | class WorkHoursTimerGenerator(CyclicTimerGenerator): 25 | """ 26 | Basic CyclicTimerGenerator with a one week period that allocates uniform 27 | probabilities to work hours. 28 | 29 | Work hours happen during week days (Monday to Friday), 30 | and between start_hour and end_hour, both included 31 | 32 | """ 33 | def __init__(self, clock, seed, start_hour=9, end_hour=17): 34 | 35 | assert start_hour >= 0 36 | assert end_hour < 24 37 | assert start_hour <= end_hour 38 | 39 | # if start_hour = 0, before_work is empty 40 | before_work = [0] * start_hour 41 | during_work = [1.] 
* (end_hour - start_hour + 1) 42 | # if end_hour = 23, after_work is empty 43 | after_work = [0] * (23 - end_hour) 44 | 45 | # the sum of before_work, during_work and after_work is always 24 46 | week_day_profile = before_work + during_work + after_work 47 | weekend_day_profile = [0] * 24 48 | 49 | week_profile = week_day_profile * 5 + weekend_day_profile * 2 50 | 51 | start_date = pd.Timestamp("6 June 2016 00:00:00") 52 | CyclicTimerGenerator.__init__(self, 53 | clock=clock, 54 | seed=seed, 55 | config=CyclicTimerProfile( 56 | profile=week_profile, 57 | profile_time_steps="1h", 58 | start_date=start_date)) 59 | 60 | 61 | class DefaultDailyTimerGenerator(CyclicTimerGenerator): 62 | """ 63 | Basic CyclicTimerGenerator with a one dat period with hourly weights 64 | vaguely inspired from 65 | 66 | https://github.com/RealImpactAnalytics/lab-home-work-detection/blob/3bacb58a53f69824102437a27218149f75d322e2/pub/chimayblue/01%20basic%20exploration.ipynb 67 | 68 | """ 69 | def __init__(self, clock, seed): 70 | # any date starting at midnight is ok... 
71 | start_date = pd.Timestamp("6 June 2016 00:00:00") 72 | CyclicTimerGenerator.__init__(self, 73 | clock=clock, 74 | seed=seed, 75 | config=CyclicTimerProfile( 76 | profile=[1, .5, .2, .15, .2, .4, 3.8, 77 | 7.2, 8.4, 9.1, 9.0, 8.3, 8.1, 78 | 7.7, 7.4, 7.8, 8.0, 7.9, 9.7, 79 | 10.4, 10.5, 8.8, 5.7, 2.8], 80 | profile_time_steps="1h", 81 | start_date=start_date, 82 | ), 83 | ) 84 | -------------------------------------------------------------------------------- /tests/unit_tests/test_clock.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from trumania.core.clock import CyclicTimerProfile, CyclicTimerGenerator 4 | from trumania.core.clock import Clock 5 | from trumania.components.time_patterns.profilers import DefaultDailyTimerGenerator 6 | 7 | 8 | def test_clock_tick_per_day(): 9 | 10 | clock = Clock(start=pd.Timestamp("10 June 2016 5:45pm"), 11 | step_duration=pd.Timedelta("15 min"), 12 | seed=1234) 13 | 14 | # time steps is 900 s, i.e 15 min 15 | assert clock.n_iterations(pd.Timedelta("7D")) == 7 * 24 * 4 16 | assert clock.n_iterations(pd.Timedelta("1D")) == 24 * 4 17 | 18 | # 47 min should be rounded up to 4 quarters 19 | assert clock.n_iterations(pd.Timedelta("47min")) == 4 20 | 21 | 22 | def test_init_cyclictimergenerator(): 23 | 24 | # say we have a clock at 5.45pm on 10th June 25 | clock = Clock(start=pd.Timestamp("10 June 2016 5:45pm"), 26 | # time steps by 15min 27 | step_duration=pd.Timedelta("15 min"), 28 | seed=1234) 29 | 30 | # 1 to 12 then 12 to 1, from midnight to midnight 31 | timer_gen = CyclicTimerGenerator( 32 | clock=clock, 33 | config=CyclicTimerProfile( 34 | profile=list(range(1, 13)) + list(range(12, 0, -1)), 35 | profile_time_steps="1H", 36 | start_date=pd.Timestamp("1 January 2014 00:00:00"), 37 | ), 38 | seed=1234 39 | ) 40 | 41 | # after the initialization, the 1h time delta of the profile should have 42 | # been aligned to the 15min of the clock 43 | assert 
timer_gen.profile.index.shape[0] == 24 * 4 44 | 45 | # the first index should be shifted to the time of the clock 46 | assert timer_gen.profile.index[0] == pd.Timestamp("10 June 2016 5:45pm") 47 | 48 | 49 | def test_DefaultDailyTimerGenerator_should_be_initialized_correctly(): 50 | 51 | clock = Clock(start=pd.Timestamp("12 Sept 2016"), 52 | step_duration=pd.Timedelta("60 s"), 53 | seed=1234) 54 | 55 | daily = DefaultDailyTimerGenerator(clock=clock, seed=1234) 56 | 57 | assert daily.profile.index[0] == pd.Timestamp("12 Sept 2016") 58 | 59 | 60 | def test_cyclic_timer_profile_should_compute_duration_correct(): 61 | 62 | tested = CyclicTimerProfile( 63 | profile=[10, 20, 10, 40], 64 | profile_time_steps="2h", 65 | start_date=pd.Timestamp("21 March 1956") 66 | ) 67 | 68 | assert tested.duration() == pd.Timedelta("8h") 69 | 70 | 71 | def test_activity_level_should_be_scaled_according_to_profile_duration(): 72 | 73 | clock = Clock(start=pd.Timestamp("10 June 2016 5:45pm"), 74 | # time steps by 15min 75 | step_duration=pd.Timedelta("1 h"), 76 | seed=1234) 77 | 78 | # 1 to 12 then 12 to 1, from midnight to midnight 79 | one_day_timer = CyclicTimerGenerator( 80 | clock=clock, 81 | config=CyclicTimerProfile( 82 | profile=list(range(24)), 83 | profile_time_steps="1H", 84 | start_date=pd.Timestamp("1 January 2014 00:00:00"), 85 | ), 86 | seed=1234 87 | ) 88 | 89 | # 14 actions/week should be scaled to activity 2 since profile lasts 1 day 90 | assert 2 == one_day_timer.activity(n=14, per=pd.Timedelta("7 days")) 91 | 92 | # this one should generate a warning log since the corresponding freq 93 | # is shorter than the clock step 94 | assert 48 == one_day_timer.activity(n=4, per=pd.Timedelta("2h")) 95 | 96 | assert .5 == one_day_timer.activity(n=1, per=pd.Timedelta("2 days")) 97 | 98 | assert .5 == one_day_timer.activity(n=.25, per=pd.Timedelta("12h")) 99 | 100 | assert 1. 
/ 360 - one_day_timer.activity( 101 | n=1, per=pd.Timedelta("360 days")) < 1e-10 102 | -------------------------------------------------------------------------------- /examples/datacamp-blogpost/06-the-social-network.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | from tabulate import tabulate 4 | 5 | from trumania.core.circus import Circus 6 | from trumania.core import circus, operations 7 | from trumania.core.random_generators import SequencialGenerator, FakerGenerator, NumpyRandomGenerator, ConstantDependentGenerator, ConstantGenerator 8 | import trumania.core.util_functions as util_functions 9 | from trumania.components.time_patterns.profilers import DefaultDailyTimerGenerator 10 | from trumania.components.social_networks.erdos_renyi import WithErdosRenyi 11 | 12 | util_functions.setup_logging() 13 | 14 | class Calling_scenario(WithErdosRenyi, Circus): 15 | 16 | 17 | def __init__(self): 18 | 19 | Circus.__init__(self, 20 | name="example", 21 | master_seed=12345, 22 | start=pd.Timestamp("1 Jan 2017 00:00"), 23 | step_duration=pd.Timedelta("1h")) 24 | 25 | self._add_person_population() 26 | 27 | self.add_er_social_network_relationship( 28 | self.populations["person"], 29 | relationship_name="friends", 30 | average_degree=20) 31 | 32 | self._add_message_story() 33 | 34 | def _add_person_population(self): 35 | 36 | id_gen = SequencialGenerator(prefix="PERSON_") 37 | age_gen = NumpyRandomGenerator(method="normal", loc=3, scale=5, 38 | seed=next(self.seeder)) 39 | name_gen = FakerGenerator(method="name", seed=next(self.seeder)) 40 | 41 | person = self.create_population(name="person", size=1000, ids_gen=id_gen) 42 | person.create_attribute("NAME", init_gen=name_gen) 43 | person.create_attribute("AGE", init_gen=age_gen) 44 | 45 | quote_generator = FakerGenerator(method="sentence", nb_words=6, variable_nb_words=True, 46 | seed=next(self.seeder)) 47 | 48 | quotes_rel = 
self.populations["person"].create_relationship("quotes") 49 | 50 | for w in range(4): 51 | quotes_rel.add_relations( 52 | from_ids=person.ids, 53 | to_ids=quote_generator.generate(size=person.size), 54 | weights=w 55 | ) 56 | 57 | def _add_message_story(self): 58 | 59 | story_timer_gen = DefaultDailyTimerGenerator( 60 | clock=self.clock, 61 | seed=next(self.seeder)) 62 | 63 | low_activity = story_timer_gen.activity(n=3, per=pd.Timedelta("1 day")) 64 | med_activity = story_timer_gen.activity(n=10, per=pd.Timedelta("1 day")) 65 | high_activity = story_timer_gen.activity(n=20, per=pd.Timedelta("1 day")) 66 | 67 | activity_gen = NumpyRandomGenerator( 68 | method="choice", 69 | a=[low_activity, med_activity, high_activity], 70 | p=[.2, .7, .1], 71 | seed=next(self.seeder)) 72 | 73 | hello_world = self.create_story( 74 | name="hello_world", 75 | initiating_population=self.populations["person"], 76 | member_id_field="PERSON_ID", 77 | 78 | timer_gen=story_timer_gen, 79 | activity_gen=activity_gen 80 | ) 81 | 82 | hello_world.set_operations( 83 | self.clock.ops.timestamp(named_as="TIME"), 84 | 85 | self.populations["person"].get_relationship("quotes") 86 | .ops.select_one(from_field="PERSON_ID",named_as="MESSAGE"), 87 | 88 | self.populations["person"] 89 | .get_relationship("friends") 90 | .ops.select_one(from_field="PERSON_ID", named_as="OTHER_PERSON"), 91 | 92 | self.populations["person"] 93 | .ops.lookup(id_field="PERSON_ID", select={"NAME": "EMITTER_NAME"}), 94 | 95 | self.populations["person"] 96 | .ops.lookup(id_field="OTHER_PERSON", select={"NAME": "RECEIVER_NAME"}), 97 | 98 | operations.FieldLogger(log_id="hello_6") 99 | ) 100 | 101 | # message story 102 | example = Calling_scenario() 103 | 104 | example.run( 105 | duration=pd.Timedelta("72h"), 106 | log_output_folder="output/example_scenario", 107 | delete_existing_logs=True 108 | ) 109 | 110 | # -- DEBUG output printout 111 | pd.set_option('display.max_columns', 500) 112 | pd.set_option('display.width', 1000) 113 
| df = pd.read_csv("output/example_scenario/hello_6.csv") 114 | print(df.head(10)) 115 | print(df.tail(10)) 116 | -------------------------------------------------------------------------------- /tests/unit_tests/test_attribute.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | 3 | import path 4 | import pandas as pd 5 | import os 6 | 7 | from trumania.core.random_generators import SequencialGenerator 8 | from trumania.core.circus import Circus 9 | from trumania.core.population import Population, Attribute 10 | 11 | tc = Circus("c", master_seed=1234, start=pd.Timestamp("1 Jan 2011"), 12 | step_duration=pd.Timedelta("1h")) 13 | 14 | 15 | def test_set_and_read_values_in_attribute_should_be_equal(): 16 | 17 | population = Population(circus=None, size=5, 18 | ids_gen=SequencialGenerator(prefix="abc", max_length=1)) 19 | 20 | tested = Attribute(population, init_values=[10, 20, 30, 40, 50]) 21 | 22 | assert tested.get_values(["abc0"]).tolist() == [10] 23 | assert tested.get_values(["abc0", "abc3", "abc1"]).tolist() == [10, 40, 20] 24 | 25 | # getting no id should return empty list 26 | assert tested.get_values([]).tolist() == [] 27 | 28 | 29 | def test_updated_and_read_values_in_attribute_should_be_equal(): 30 | population = Population(circus=tc, size=5, ids_gen=SequencialGenerator( 31 | prefix="abc", max_length=1)) 32 | tested = Attribute(population, init_values=[10, 20, 30, 40, 50]) 33 | 34 | tested.update(pd.Series([22, 44], index=["abc1", "abc3"])) 35 | 36 | # value of a should untouched 37 | assert tested.get_values(["abc0"]).tolist() == [10] 38 | 39 | # arbitrary order should not be impacted 40 | assert tested.get_values(["abc0", "abc3", "abc1"]).tolist() == [10, 44, 22] 41 | 42 | 43 | def test_updating_non_existing_population_ids_should_add_them(): 44 | population = Population(circus=tc, size=5, ids_gen=SequencialGenerator( 45 | prefix="abc", max_length=1)) 46 | tested = Attribute(population, 
init_values=[10, 20, 30, 40, 50]) 47 | 48 | tested.update(pd.Series([22, 1000, 44], index=["abc1", "not_yet_there", "abc3"])) 49 | 50 | assert tested.get_values(["not_yet_there", "abc0", "abc3", "abc4"]).tolist() == [1000, 10, 44, 50] 51 | 52 | 53 | def test_initializing_attribute_from_relationship_must_have_a_value_for_all(): 54 | 55 | population = Population(circus=tc, size=5, ids_gen=SequencialGenerator( 56 | prefix="abc", max_length=1)) 57 | oneto1 = population.create_relationship("rel") 58 | oneto1.add_relations(from_ids=["abc0", "abc1", "abc2", "abc3", "abc4"], 59 | to_ids=["ta", "tb", "tc", "td", "te"]) 60 | 61 | attr = Attribute(population, init_relationship="rel") 62 | 63 | expected = pd.DataFrame({"value": ["ta", "tb", "tc", "td", "te"]}, 64 | index=["abc0", "abc1", "abc2", "abc3", "abc4"]) 65 | 66 | assert attr._table.sort_index().equals(expected) 67 | 68 | 69 | def test_overwrite_attribute(): 70 | 71 | population = Population(circus=tc, size=10, 72 | ids_gen=SequencialGenerator(prefix="u_", max_length=1)) 73 | 74 | ages = [10, 20, 40, 10, 100, 98, 12, 39, 76, 23] 75 | age_attr = population.create_attribute("age", init_values=ages) 76 | 77 | # before modification 78 | ages = age_attr.get_values(["u_0", "u_4", "u_9"]).tolist() 79 | assert ages == [10, 100, 23] 80 | 81 | story_data = pd.DataFrame({ 82 | # id of the populations to update 83 | "A_ID": ["u_4", "u_0"], 84 | 85 | # new values to copy 86 | "new_ages": [34, 30]}, 87 | 88 | # index of the story data has, in general, nothing to do with the 89 | # updated population 90 | index=["cust_1", "cust_2"] 91 | ) 92 | 93 | update = age_attr.ops.update( 94 | member_id_field="A_ID", 95 | copy_from_field="new_ages" 96 | ) 97 | 98 | _, logs = update(story_data) 99 | 100 | assert logs == {} 101 | # before modification 102 | ages = age_attr.get_values(["u_0", "u_4", "u_9"]).tolist() 103 | assert ages == [30, 34, 23] 104 | 105 | 106 | def test_added_and_read_values_in_attribute_should_be_equal(): 107 | population = 
Population(circus=tc, size=5, 108 | ids_gen=SequencialGenerator(prefix="abc", max_length=1)) 109 | tested = Attribute(population, init_values=[10, 20, 30, 40, 50]) 110 | 111 | tested.add(["abc1", "abc3"], [22, 44]) 112 | 113 | assert tested.get_values(["abc0", "abc1", "abc2", "abc3", "abc4"]).tolist() == [10, 20 + 22, 30, 40 + 44, 50] 114 | 115 | 116 | def test_adding_several_times_to_the_same_from_should_pile_up(): 117 | population = Population(circus=tc, size=5, 118 | ids_gen=SequencialGenerator(prefix="abc", max_length=1)) 119 | tested = Attribute(population, init_values=[10, 20, 30, 40, 50]) 120 | 121 | tested.add(["abc1", "abc3", "abc1"], [22, 44, 10]) 122 | 123 | assert tested.get_values(["abc0", "abc1", "abc2", "abc3", "abc4"]).tolist() == [10, 20 + 22 + 10, 30, 40 + 44, 50] 124 | 125 | 126 | def test_io_round_trip(): 127 | 128 | with tempfile.TemporaryDirectory() as root_dir: 129 | 130 | population = Population(circus=tc, size=5, 131 | ids_gen=SequencialGenerator(prefix="abc", max_length=1)) 132 | orig = Attribute(population, init_values=[10, 20, 30, 40, 50]) 133 | 134 | full_path = os.path.join(root_dir, "attribute.csv") 135 | 136 | orig.save_to(full_path) 137 | retrieved = Attribute.load_from(full_path) 138 | 139 | assert orig._table.equals(retrieved._table) 140 | -------------------------------------------------------------------------------- /trumania/components/db.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is just the provider of the IO methods save and retrieve various 3 | simulation components to/from persistence. 4 | 5 | A namespace defines a place where to put objects that belong together 6 | (typically, from the same scenario or component, e.g. "Uganda"). 7 | 8 | """ 9 | 10 | # TODO: we should store this elsewhere than in the git repo... 11 | 12 | # TODO: would be cool to also be able to store empirical probability 13 | # distribution here, for the random generators... 
14 | 15 | import pandas as pd 16 | import os 17 | 18 | from trumania.core.util_functions import ensure_folder_exists, ensure_non_existing_dir 19 | from trumania.core.population import Population 20 | import trumania.core.clock as clock 21 | from trumania.core.random_generators import Generator, NumpyRandomGenerator 22 | 23 | 24 | def save_population(population, namespace, population_id): 25 | population.save_to(population_folder(namespace, population_id)) 26 | 27 | 28 | def load_population(namespace, population_id, circus): 29 | return Population.load_from(population_folder(namespace, population_id), circus) 30 | 31 | 32 | def list_populations(namespace): 33 | folder = _population_folder(namespace) 34 | return [d for d in os.listdir(folder) 35 | if os.path.isdir(os.path.join(folder, d))] 36 | 37 | 38 | def save_generator(generator, namespace, gen_id): 39 | 40 | output_folder = _gen_folder(namespace=namespace, 41 | gen_type=generator.__class__.__name__,) 42 | 43 | ensure_folder_exists(output_folder) 44 | generator.save_to(json_item_path(output_folder, gen_id)) 45 | 46 | 47 | def list_generators(namespace): 48 | folder = _generators_folder(namespace) 49 | if not os.path.exists(folder): 50 | return [] 51 | 52 | def _list(): 53 | for gen_type in os.listdir(folder): 54 | for gen_file in os.listdir(os.path.join(folder, gen_type)): 55 | gen_id = gen_file.split(".")[0] 56 | yield [gen_type, gen_id] 57 | 58 | return list(_list()) 59 | 60 | 61 | def load_generator(namespace, gen_type, gen_id): 62 | 63 | input_file = json_item_path( 64 | _gen_folder(namespace=namespace, gen_type=gen_type), gen_id) 65 | 66 | return Generator.load_generator(gen_type, input_file) 67 | 68 | 69 | # TODO: this can now be refactored to save as NumpyGenerator, togheter with 70 | # its state 71 | def save_timer_gen(timer_gen, namespace, timer_gen_id): 72 | 73 | timer_gen_folder = _timer_gens_root_folder(namespace) 74 | ensure_folder_exists(timer_gen_folder) 75 | 
timer_gen.save_to(csv_item_path(timer_gen_folder, timer_gen_id)) 76 | 77 | 78 | def load_timer_gen_config(namespace, timer_gen_id): 79 | timer_gen_folder = _timer_gens_root_folder(namespace) 80 | 81 | return clock.CyclicTimerProfile.load_from( 82 | csv_item_path(timer_gen_folder, timer_gen_id)) 83 | 84 | 85 | def save_empirical_discrete_generator(distribution, values, namespace, gen_id): 86 | assert distribution.sum() - 1 < 1e-6 87 | 88 | root_folder = _empirical_discrete_gen_folder(namespace) 89 | ensure_folder_exists(root_folder) 90 | gen_file_path = csv_item_path(root_folder, gen_id) 91 | 92 | df = pd.DataFrame({ 93 | "px": distribution, 94 | }, index=pd.Series(values, name="x")) 95 | 96 | df.to_csv(gen_file_path, index=True) 97 | 98 | 99 | def load_empirical_discrete_generator(namespace, gen_id, seed): 100 | root_folder = _empirical_discrete_gen_folder(namespace) 101 | gen_file_path = os.path.join(root_folder, "%s.csv" % gen_id) 102 | df = pd.read_csv(gen_file_path) 103 | 104 | gen = NumpyRandomGenerator( 105 | method="choice", 106 | a=df["x"].tolist(), 107 | p=df["px"].tolist(), 108 | seed=seed) 109 | 110 | return gen 111 | 112 | 113 | def is_namespace_existing(namespace): 114 | return os.path.exists(namespace_folder(namespace)) 115 | 116 | 117 | def namespace_folder(namespace): 118 | return os.path.join(_db_folder(), namespace) 119 | 120 | 121 | def create_namespace(namespace): 122 | folder = namespace_folder(namespace) 123 | if not os.path.exists(folder): 124 | os.makedirs(folder) 125 | return folder 126 | 127 | 128 | def remove_namespace(namespace): 129 | ensure_non_existing_dir(namespace_folder(namespace)) 130 | 131 | 132 | def _population_folder(namespace): 133 | return os.path.join(namespace_folder(namespace), "populations") 134 | 135 | 136 | def population_folder(namespace, population_id): 137 | return os.path.join(_population_folder(namespace), population_id) 138 | 139 | 140 | def _generators_folder(namespace): 141 | return os.path.join( 142 | 
namespace_folder(namespace), 143 | "generators") 144 | 145 | 146 | def _gen_folder(namespace, gen_type): 147 | return os.path.join(_generators_folder(namespace), gen_type) 148 | 149 | 150 | def csv_item_path(folder, item_id): 151 | return os.path.join(folder, "{}.csv".format(item_id)) 152 | 153 | 154 | def json_item_path(folder, item_id): 155 | return os.path.join(folder, "{}.json".format(item_id)) 156 | 157 | 158 | def _timer_gens_root_folder(namespace): 159 | return os.path.join( 160 | _generators_folder(namespace), 161 | "timer_gens") 162 | 163 | 164 | def _empirical_discrete_gen_folder(namespace): 165 | return os.path.join( 166 | _generators_folder(namespace), 167 | "empirical_discrete_gens") 168 | 169 | 170 | def _db_folder(): 171 | this_folder = os.path.dirname(os.path.realpath(__file__)) 172 | return os.path.join(this_folder, "_DB") 173 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # trumania documentation build configuration file, created by 5 | # sphinx-quickstart on Mon Jan 15 12:02:36 2018. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
19 | # 20 | # import os 21 | # import sys 22 | # sys.path.insert(0, os.path.abspath('.')) 23 | 24 | 25 | # -- General configuration ------------------------------------------------ 26 | 27 | # If your documentation needs a minimal Sphinx version, state it here. 28 | # 29 | # needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = ['sphinx.ext.autodoc', 35 | 'sphinx.ext.doctest', 36 | 'sphinx.ext.coverage', 37 | 'sphinx.ext.mathjax', 38 | 'sphinx.ext.viewcode', 39 | 'sphinx.ext.githubpages'] 40 | 41 | # Add any paths that contain templates here, relative to this directory. 42 | templates_path = ['_templates'] 43 | 44 | # The suffix(es) of source filenames. 45 | # You can specify multiple suffix as a list of string: 46 | # 47 | from recommonmark.parser import CommonMarkParser 48 | source_parsers = { 49 | '.md': CommonMarkParser, 50 | } 51 | 52 | source_suffix = ['.rst', '.md'] 53 | #source_suffix = '.rst' 54 | 55 | # The master toctree document. 56 | master_doc = 'index' 57 | 58 | # General information about the project. 59 | project = 'trumania' 60 | copyright = '2018, RIA' 61 | author = 'RIA' 62 | 63 | # The version info for the project you're documenting, acts as replacement for 64 | # |version| and |release|, also used in various other places throughout the 65 | # built documents. 66 | # 67 | # The short X.Y version. 68 | version = '1.0.1' 69 | # The full version, including alpha/beta/rc tags. 70 | release = '1.0.1' 71 | 72 | # The language for content autogenerated by Sphinx. Refer to documentation 73 | # for a list of supported languages. 74 | # 75 | # This is also used if you do content translation via gettext catalogs. 76 | # Usually you set "language" from the command line for these cases. 
77 | language = None 78 | 79 | # List of patterns, relative to source directory, that match files and 80 | # directories to ignore when looking for source files. 81 | # This patterns also effect to html_static_path and html_extra_path 82 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 83 | 84 | # The name of the Pygments (syntax highlighting) style to use. 85 | pygments_style = 'sphinx' 86 | 87 | # If true, `todo` and `todoList` produce output, else they produce nothing. 88 | todo_include_todos = False 89 | 90 | 91 | # -- Options for HTML output ---------------------------------------------- 92 | 93 | # The theme to use for HTML and HTML Help pages. See the documentation for 94 | # a list of builtin themes. 95 | # 96 | html_theme = "classic" 97 | 98 | # Theme options are theme-specific and customize the look and feel of a theme 99 | # further. For a list of options available for each theme, see the 100 | # documentation. 101 | # 102 | html_theme_options = { 103 | "rightsidebar": "false", 104 | "relbarbgcolor": "black" 105 | } 106 | 107 | # Add any paths that contain custom static files (such as style sheets) here, 108 | # relative to this directory. They are copied after the builtin static files, 109 | # so a file named "default.css" will overwrite the builtin "default.css". 110 | html_static_path = ['_static'] 111 | 112 | # Custom sidebar templates, must be a dictionary that maps document names 113 | # to template names. 114 | # 115 | # This is required for the alabaster theme 116 | # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars 117 | html_sidebars = { 118 | '**': [ 119 | 'relations.html', # needs 'show_related': True theme option to display 120 | 'searchbox.html', 121 | ] 122 | } 123 | 124 | 125 | # -- Options for HTMLHelp output ------------------------------------------ 126 | 127 | # Output file base name for HTML help builder. 
128 | htmlhelp_basename = 'trumaniadoc' 129 | 130 | 131 | # -- Options for LaTeX output --------------------------------------------- 132 | 133 | latex_elements = { 134 | # The paper size ('letterpaper' or 'a4paper'). 135 | # 136 | # 'papersize': 'letterpaper', 137 | 138 | # The font size ('10pt', '11pt' or '12pt'). 139 | # 140 | # 'pointsize': '10pt', 141 | 142 | # Additional stuff for the LaTeX preamble. 143 | # 144 | # 'preamble': '', 145 | 146 | # Latex figure (float) alignment 147 | # 148 | # 'figure_align': 'htbp', 149 | } 150 | 151 | # Grouping the document tree into LaTeX files. List of tuples 152 | # (source start file, target name, title, 153 | # author, documentclass [howto, manual, or own class]). 154 | latex_documents = [ 155 | (master_doc, 'trumania.tex', 'trumania Documentation', 156 | 'RIA', 'manual'), 157 | ] 158 | 159 | 160 | # -- Options for manual page output --------------------------------------- 161 | 162 | # One entry per manual page. List of tuples 163 | # (source start file, name, description, authors, manual section). 164 | man_pages = [ 165 | (master_doc, 'trumania', 'trumania Documentation', 166 | [author], 1) 167 | ] 168 | 169 | 170 | # -- Options for Texinfo output ------------------------------------------- 171 | 172 | # Grouping the document tree into Texinfo files. 
List of tuples 173 | # (source start file, target name, title, author, 174 | # dir menu entry, description, category) 175 | texinfo_documents = [ 176 | (master_doc, 'trumania', 'trumania Documentation', 177 | author, 'trumania', 'One line description of project.', 178 | 'Miscellaneous'), 179 | ] 180 | 181 | 182 | 183 | -------------------------------------------------------------------------------- /tests/unit_tests/test_random_generators.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | 4 | import path 5 | import functools 6 | from itertools import islice 7 | import pandas as pd 8 | import numpy as np 9 | 10 | from trumania.core.random_generators import SequencialGenerator, NumpyRandomGenerator, ConstantGenerator, seed_provider 11 | from trumania.core.random_generators import DependentTriggerGenerator, FakerGenerator, Generator 12 | 13 | 14 | def test_constant_generator_should_produce_constant_values(): 15 | tested = ConstantGenerator(value="c") 16 | 17 | assert [] == tested.generate(size=0) 18 | assert ["c"] == tested.generate(size=1) 19 | assert ["c", "c", "c", "c", "c"] == tested.generate(size=5) 20 | 21 | 22 | def test_numpy_random_generator_should_delegate_to_numpy_correctly(): 23 | 24 | # basic "smoke" test, if it does not crash it at least proves it's able 25 | # to load the appropriate method 26 | tested = NumpyRandomGenerator(method="normal", loc=10, scale=4, seed=1) 27 | assert len(tested.generate(size=10)) == 10 28 | 29 | 30 | def test_seeder_should_be_deterministic(): 31 | """ 32 | makes sure the seeds always provides the same sequence of seeds 33 | """ 34 | 35 | master_seed = 12345 36 | 37 | seeder1 = seed_provider(master_seed) 38 | seeder2 = seed_provider(master_seed) 39 | 40 | assert list(islice(seeder1, 1000)) == list(islice(seeder2, 1000)) 41 | 42 | 43 | def test_depend_trigger_should_trigger_given_constant_value(): 44 | 45 | # returns 6 hard-coded 1 and zero 46 | def 
fake_mapper(x): 47 | return [1, 1, 0, 0, 1, 0] 48 | 49 | g = DependentTriggerGenerator(value_to_proba_mapper=fake_mapper) 50 | 51 | triggers = g.generate(observations=pd.Series([10, 20, 30, 0, 1, 2])) 52 | 53 | # because the fake_mapper returns fake values, we should always have the 54 | # following triggers, no matter what the internal uniform distro provided 55 | assert triggers.tolist() == [True, True, False, False, True, False] 56 | 57 | 58 | def test_sequencial_generator_should_create_unique_values(): 59 | 60 | tested = SequencialGenerator(start=10, prefix="test_p_", max_length=10) 61 | 62 | sizes = [100, 200, 300, 400, 500] 63 | sets = [set(tested.generate(size)) for size in sizes] 64 | 65 | # generated values should be unique within each set 66 | all_values = functools.reduce(lambda s1, s2: s1 | s2, sets) 67 | 68 | assert len(all_values) == np.sum(sizes) 69 | 70 | 71 | def test_random_generator_should_provide_correct_amount_of_single_values(): 72 | 73 | tested = NumpyRandomGenerator(method="gamma", scale=10, shape=1.8, seed=1) 74 | 75 | genops = tested.ops.generate(named_as="rand") 76 | 77 | story_data = pd.DataFrame( 78 | np.random.rand(10, 5), columns=["A", "B", "C", "D", "E"]) 79 | 80 | result, logs = genops(story_data) 81 | 82 | assert result.columns.tolist() == ["A", "B", "C", "D", "E", "rand"] 83 | 84 | # should be float and not list of values 85 | assert result["rand"].dtype == float 86 | 87 | 88 | def test_random_generator_should_provide_correct_amount_of_list_of_values(): 89 | 90 | tested = NumpyRandomGenerator(method="gamma", scale=10, shape=1.8, seed=1) 91 | 92 | story_data = pd.DataFrame( 93 | np.random.rand(10, 5), columns=["A", "B", "C", "D", "E"], 94 | ) 95 | story_data["how_many"] = pd.Series([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]) 96 | 97 | genops = tested.ops.generate(named_as="rand", quantity_field="how_many") 98 | 99 | result, logs = genops(story_data) 100 | 101 | assert result.columns.tolist() == ["A", "B", "C", "D", "E", "how_many", 
"rand"] 102 | 103 | # should be list of the expected sizes 104 | assert result["rand"].dtype == list 105 | assert result["rand"].apply(len).tolist() == [10, 20, 30, 40, 50, 60, 70, 80, 90, 100] 106 | 107 | 108 | def test_faker_generator_should_delegate_to_faker_correct(): 109 | 110 | tested_name = FakerGenerator(seed=1234, method="name") 111 | some_names = tested_name.generate(10) 112 | assert len(some_names) == 10 113 | 114 | tested_text = FakerGenerator(seed=1234, method="text") 115 | some_text = tested_text.generate(20) 116 | assert len(some_text) == 20 117 | 118 | tested_address = FakerGenerator(seed=1234, method="address") 119 | some_addresses = tested_address.generate(30) 120 | assert len(some_addresses) == 30 121 | 122 | 123 | def test_sequencial_generator_read_from_disk_should_continue_sequence(): 124 | 125 | with tempfile.TemporaryDirectory() as p: 126 | 127 | tested = SequencialGenerator(start=10, prefix="o_", max_length=2) 128 | 129 | list_1 = tested.generate(size=4) 130 | assert list_1 == ["o_10", "o_11", "o_12", "o_13"] 131 | 132 | gen_file = os.path.join(p, "tested.json") 133 | tested.save_to(gen_file) 134 | 135 | tested2 = Generator.load_generator(gen_type="SequencialGenerator", 136 | input_file=gen_file) 137 | 138 | list_2 = tested2.generate(size=4) 139 | assert list_2 == ["o_14", "o_15", "o_16", "o_17"] 140 | 141 | # loading it again => we should have the same result 142 | tested3 = Generator.load_generator(gen_type="SequencialGenerator", 143 | input_file=gen_file) 144 | 145 | list_3 = tested3.generate(size=4) 146 | assert list_3 == ["o_14", "o_15", "o_16", "o_17"] 147 | 148 | 149 | def numpy_generators_read_from_disk_should_generate_same_sequence_as_original(): 150 | 151 | with tempfile.TemporaryDirectory() as p: 152 | 153 | # making sure we're not using the default seed 154 | tested = NumpyRandomGenerator(method="normal", loc=10, scale=4, 155 | seed=123456) 156 | 157 | gen_file = os.path.join(p, "tested2.json") 158 | tested.save_to(gen_file) 159 
| 160 | reloaded = Generator.load_generator(gen_type="NumpyRandomGenerator", 161 | input_file=gen_file) 162 | 163 | assert tested.generate(size=10000) == reloaded.generate(size=10000) 164 | 165 | 166 | def test_sequencial_generator_must_not_change_format_when_size_is_float(): 167 | 168 | seq = SequencialGenerator(prefix="sq", max_length=2) 169 | 170 | # bugfix: this was previously generating "sq00.0", "sq01.0",... 171 | assert ["sq00", "sq01", "sq02"] == seq.generate(size=3.3) 172 | assert ["sq03", "sq04", "sq05"] == seq.generate(size=3.3) 173 | -------------------------------------------------------------------------------- /trumania/core/attribute.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import logging 3 | from trumania.core.operations import SideEffectOnly 4 | 5 | 6 | class Attribute(object): 7 | """ 8 | Static population attribute, with various ways to initialize it randomly 9 | """ 10 | 11 | def __init__(self, 12 | population, 13 | 14 | # if initializing with value, must provide ids and one of the 15 | # init values 16 | init_values=None, 17 | init_gen=None, 18 | 19 | # otherwise, we can also initialise randomly from a 20 | # relationship (in which case the ids are extracted from the 21 | # "from" field. 
init_relationship is a string that contains 22 | # the name of the 23 | init_relationship=None): 24 | self.ops = self.AttributeOps(self) 25 | 26 | if population.size == 0: 27 | self._table = pd.DataFrame(columns=["value"]) 28 | 29 | elif init_relationship is None: 30 | if not ((init_values is None) ^ (init_gen is None)): 31 | raise ValueError("if init_relationship is not provided, " 32 | "you must also provide init_values or " 33 | "init_values_gen") 34 | 35 | elif init_values is None: 36 | init_values = init_gen.generate(size=population.size) 37 | 38 | if type(init_values) == pd.Series: 39 | logging.warn(" Trying to create attribute with a series " 40 | "but indices will be lost.") 41 | init_values = init_values.tolist() 42 | 43 | self._table = pd.DataFrame({"value": init_values}, index=population.ids) 44 | 45 | else: 46 | if init_relationship is None: 47 | raise ValueError("must provide either ids or relationship to " 48 | "initialize the attribute") 49 | 50 | self._table = population.get_relationship(init_relationship).select_one() 51 | self._table.set_index("from", drop=True, inplace=True) 52 | self._table.rename(columns={"to": "value"}, inplace=True) 53 | 54 | def get_values(self, ids=None): 55 | """ 56 | :param ids: members ids for which the attribute values are desired 57 | :return: the current attribute values for those members, as Series 58 | """ 59 | if ids is None: 60 | return self._table["value"] 61 | else: 62 | return self._table.loc[ids]["value"] 63 | 64 | def update(self, series): 65 | """ 66 | updates or adds values of this attributes from the values of the provided 67 | series, using its index as member id 68 | """ 69 | self._table = self._table.reindex(self._table.index | series.index) 70 | self._table.loc[series.index, "value"] = series.values 71 | 72 | def add(self, ids, added_values): 73 | """ 74 | This only makes sense for attributes that support a + operation (e.g. 
numerical values or list) 75 | : this simply performs a += operation 76 | """ 77 | assert len(ids) == len(added_values) 78 | 79 | # putting together any add to the same attribute id 80 | to_add = pd.Series(added_values, index=ids).groupby(level=0).agg(sum) 81 | 82 | self._table.loc[to_add.index, "value"] = self._table.loc[to_add.index, "value"] + to_add 83 | 84 | def transform_inplace(self, f): 85 | """ 86 | transform the values of this attribute inplace with f 87 | """ 88 | self._table["value"] = self._table["value"].map(f) 89 | 90 | ############ 91 | # IO 92 | def save_to(self, file_path): 93 | logging.info("saving attribute to {}".format(file_path)) 94 | self._table.to_csv(file_path) 95 | 96 | @staticmethod 97 | def load_from(file_path): 98 | table = pd.read_csv(file_path, index_col=0) 99 | 100 | # we're basically hacking our own constructor, feeding it fake data 101 | # so it's initialized correctly. 102 | # 103 | # Don't do that outside this class! 104 | class FakePopulation(object): 105 | def __init__(self): 106 | self.size = table.shape[0] 107 | self.ids = table.index 108 | 109 | return Attribute(population=FakePopulation(), init_values=table["value"]) 110 | 111 | ############ 112 | # operations 113 | 114 | class AttributeOps(object): 115 | def __init__(self, attribute): 116 | self.attribute = attribute 117 | 118 | class Update(SideEffectOnly): 119 | def __init__(self, attribute, member_id_field, copy_from_field): 120 | self.attribute = attribute 121 | self.copy_from_field = copy_from_field 122 | self.member_id_field = member_id_field 123 | 124 | def side_effect(self, story_data): 125 | if story_data.shape[0] > 0: 126 | update_series = pd.Series( 127 | data=story_data[self.copy_from_field].values, 128 | index=story_data[self.member_id_field].values) 129 | self.attribute.update(update_series) 130 | 131 | def update(self, member_id_field, copy_from_field): 132 | """ 133 | Overwrite the value of this attribute with values in this field 134 | 135 | :param 
member_id_field: name of the field of the story data 136 | containing the member ids whose attribute should be updated 137 | :param copy_from_field: name of the field of the story data 138 | containing the new values of the attribute 139 | :return: 140 | """ 141 | return self.Update(self.attribute, member_id_field, 142 | copy_from_field) 143 | 144 | class Add(SideEffectOnly): 145 | def __init__(self, attribute, member_id_field, 146 | added_value_field, subtract): 147 | self.attribute = attribute 148 | self.added_value_field = added_value_field 149 | self.member_id_field = member_id_field 150 | self.subtract = subtract 151 | 152 | def side_effect(self, story_data): 153 | if story_data.shape[0] > 0: 154 | 155 | values = story_data[self.added_value_field].values 156 | if self.subtract: 157 | values = -values 158 | 159 | self.attribute.add( 160 | ids=story_data[self.member_id_field].values, 161 | added_values=values) 162 | 163 | def add(self, member_id_field, added_value_field): 164 | return self.Add(self.attribute, member_id_field, added_value_field, subtract=False) 165 | 166 | def subtract(self, member_id_field, subtracted_value_field): 167 | return self.Add(self.attribute, member_id_field, subtracted_value_field, subtract=True) 168 | -------------------------------------------------------------------------------- /trumania/components/geographies/uganda.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is just an illustration of how to persist various scenario components 3 | """ 4 | import logging 5 | import pandas as pd 6 | 7 | from trumania.core import operations 8 | from trumania.components import db 9 | from trumania.core.circus import Circus 10 | from trumania.core.util_functions import make_random_assign, setup_logging 11 | from trumania.core.random_generators import NumpyRandomGenerator, ParetoGenerator, seed_provider, SequencialGenerator 12 | from trumania.core.random_generators import FakerGenerator 13 | 
from trumania.core.clock import CyclicTimerGenerator, CyclicTimerProfile 14 | 15 | 16 | def build_unhealthy_level_gen(seed): 17 | return NumpyRandomGenerator(method="beta", a=1, b=999, seed=seed) 18 | 19 | 20 | def build_healthy_level_gen(seed): 21 | return NumpyRandomGenerator(method="beta", a=1, b=999, seed=seed) 22 | 23 | 24 | class WithUganda(Circus): 25 | 26 | def add_uganda_geography(self, force_build=False): 27 | """ 28 | Loads the cells definition from Uganda + adds 2 stories to control 29 | """ 30 | logging.info(" adding Uganda Geography") 31 | seeder = seed_provider(12345) 32 | 33 | if force_build: 34 | uganda_cells, uganda_cities, timer_config = build_uganda_populations( 35 | self) 36 | 37 | else: 38 | uganda_cells = db.load_population(namespace="uganda", population_id="cells") 39 | uganda_cities = db.load_population(namespace="uganda", population_id="cities") 40 | timer_config = db.load_timer_gen_config("uganda", 41 | "cell_repair_timer_profile") 42 | 43 | repair_n_fix_timer = CyclicTimerGenerator( 44 | clock=self.clock, 45 | seed=next(self.seeder), 46 | config=timer_config) 47 | 48 | unhealthy_level_gen = build_unhealthy_level_gen(next(seeder)) 49 | healthy_level_gen = build_healthy_level_gen(next(seeder)) 50 | 51 | # tendency is inversed in case of broken cell: it's probability of 52 | # accepting a call is much lower 53 | 54 | # same profiler for breakdown and repair: they are both related to 55 | # typical human activity 56 | 57 | logging.info(" adding Uganda Geography6") 58 | cell_break_down_story = self.create_story( 59 | name="cell_break_down", 60 | 61 | initiating_population=uganda_cells, 62 | member_id_field="CELL_ID", 63 | 64 | timer_gen=repair_n_fix_timer, 65 | 66 | # fault activity is very low: most cell tend never to break down ( 67 | # hopefully...) 
68 | activity_gen=ParetoGenerator(xmin=5, a=1.4, seed=next(self.seeder)) 69 | ) 70 | 71 | cell_repair_story = self.create_story( 72 | name="cell_repair_down", 73 | 74 | initiating_population=uganda_cells, 75 | member_id_field="CELL_ID", 76 | 77 | timer_gen=repair_n_fix_timer, 78 | 79 | # repair activity is much higher 80 | activity_gen=ParetoGenerator(xmin=100, a=1.2, 81 | seed=next(self.seeder)), 82 | 83 | # repair is not re-scheduled at the end of a repair, but only triggered 84 | # from a "break-down" story 85 | auto_reset_timer=False 86 | ) 87 | 88 | cell_break_down_story.set_operations( 89 | unhealthy_level_gen.ops.generate(named_as="NEW_HEALTH_LEVEL"), 90 | 91 | uganda_cells.get_attribute("HEALTH").ops.update( 92 | member_id_field="CELL_ID", 93 | copy_from_field="NEW_HEALTH_LEVEL"), 94 | 95 | cell_repair_story.ops.reset_timers(member_id_field="CELL_ID"), 96 | self.clock.ops.timestamp(named_as="TIME"), 97 | 98 | operations.FieldLogger(log_id="cell_status", 99 | cols=["TIME", "CELL_ID", 100 | "NEW_HEALTH_LEVEL"]), 101 | ) 102 | 103 | cell_repair_story.set_operations( 104 | healthy_level_gen.ops.generate(named_as="NEW_HEALTH_LEVEL"), 105 | 106 | uganda_cells.get_attribute("HEALTH").ops.update( 107 | member_id_field="CELL_ID", 108 | copy_from_field="NEW_HEALTH_LEVEL"), 109 | 110 | self.clock.ops.timestamp(named_as="TIME"), 111 | 112 | # note that both stories are contributing to the same 113 | # "cell_status" log 114 | operations.FieldLogger(log_id="cell_status", 115 | cols=["TIME", "CELL_ID", 116 | "NEW_HEALTH_LEVEL"]), 117 | ) 118 | 119 | return uganda_cells, uganda_cities 120 | 121 | 122 | def build_uganda_populations(circus): 123 | 124 | seeder = seed_provider(12345) 125 | 126 | cells = circus.create_population(name="cells", 127 | ids_gen=SequencialGenerator(prefix="CELL_"), 128 | size=200) 129 | latitude_generator = FakerGenerator(method="latitude", 130 | seed=next(seeder)) 131 | cells.create_attribute("latitude", init_gen=latitude_generator) 132 | 133 | 
longitude_generator = FakerGenerator(method="longitude", 134 | seed=next(seeder)) 135 | cells.create_attribute("longitude", init_gen=longitude_generator) 136 | 137 | # the cell "health" is its probability of accepting a call. By default 138 | # let's says it's one expected failure every 1000 calls 139 | healthy_level_gen = build_healthy_level_gen(next(seeder)) 140 | 141 | cells.create_attribute(name="HEALTH", init_gen=healthy_level_gen) 142 | 143 | city_gen = FakerGenerator(method="city", seed=next(seeder)) 144 | cities_values = pd.unique(city_gen.generate(500))[:200] 145 | cities = circus.create_population(name="cities", ids=cities_values) 146 | 147 | cell_city_rel = cities.create_relationship("CELLS") 148 | 149 | cell_city_df = make_random_assign(cells.ids, cities.ids, next(seeder)) 150 | cell_city_rel.add_relations( 151 | from_ids=cell_city_df["chosen_from_set2"], 152 | to_ids=cell_city_df["set1"]) 153 | 154 | pop_gen = ParetoGenerator(xmin=10000, a=1.4, seed=next(seeder)) 155 | cities.create_attribute("population", init_gen=pop_gen) 156 | 157 | timer_config = CyclicTimerProfile( 158 | profile=[1, .5, .2, .15, .2, .4, 3.8, 159 | 7.2, 8.4, 9.1, 9.0, 8.3, 8.1, 160 | 7.7, 7.4, 7.8, 8.0, 7.9, 9.7, 161 | 10.4, 10.5, 8.8, 5.7, 2.8], 162 | profile_time_steps="1h", 163 | start_date=pd.Timestamp("6 June 2016 00:00:00")) 164 | 165 | return cells, cities, timer_config 166 | 167 | 168 | if __name__ == "__main__": 169 | # This is meant to be executed only once, to create the data on disk. 170 | 171 | # Note: using generators and persisting the result could make sense 172 | # if such generation is costly or for facilitating reproduceability, 173 | # though a more common use cas might be to build such Populations and 174 | # relationship from empirical exploration of a dataset. 175 | 176 | # Note2: only the "static" properties of an environment are saved here, 177 | # whereas the "dynamic parts" (e.g. stories) are stored "in code", i.e. 
178 | # in the withXYZ() class above that then need to be mixed in a Circus. 179 | 180 | setup_logging() 181 | 182 | cells, cities, timer_config = build_uganda_populations() 183 | 184 | db.remove_namespace("uganda") 185 | db.save_population(population=cells, namespace="uganda", population_id="cells") 186 | db.save_population(population=cities, namespace="uganda", population_id="cities") 187 | 188 | db.save_timer_gen(timer_gen=timer_config, namespace="uganda", 189 | timer_gen_id="cell_repair_timer_profile") 190 | -------------------------------------------------------------------------------- /trumania/core/util_functions.py: -------------------------------------------------------------------------------- 1 | """ 2 | Collection of utility functions 3 | """ 4 | 5 | from numpy.random import RandomState 6 | import pandas as pd 7 | import numpy as np 8 | import os 9 | import functools 10 | from networkx.algorithms import bipartite 11 | import logging 12 | 13 | 14 | def make_random_bipartite_data(group1, group2, p, seed): 15 | """ 16 | 17 | :type group1: list 18 | :param group1: Ids of first group 19 | :type group2: list 20 | :param group2: Ids of second group 21 | :type p: float 22 | :param p: probability of existence of 1 edge 23 | :type seed: int 24 | :param seed: seed for random generator 25 | :rtype: list 26 | :return: all edges in the graph 27 | """ 28 | logging.info(" creating a bipartite graph between {} items in group1, {} " 29 | "items in group2 and edge probability {}".format( 30 | len(group1), len(group2), p)) 31 | 32 | if len(group1) == 0 or len(group2) == 0 or p == 0: 33 | return [] 34 | 35 | bp = pd.DataFrame.from_records(list(bipartite.random_graph(len(group1), len(group2), p, seed).edges()), 36 | columns=["from", "to"]) 37 | logging.info(" (bipartite index created, now resolving item values)") 38 | 39 | # as all "to" nodes are from the second group, 40 | # but numbered by networkx in range(len(group1),len(group1)+len(group2)) 41 | # we need to deduct 
len(group1) to have proper indexes. 42 | bp["to"] -= len(group1) 43 | 44 | bp["from"] = bp.apply(lambda x: group1[x["from"]], axis=1) 45 | bp["to"] = bp.apply(lambda x: group2[x["to"]], axis=1) 46 | logging.info(" (resolution done, now converting to tuples)") 47 | out = [tuple(x) for x in bp.to_records(index=False)] 48 | logging.info(" (exiting bipartite)") 49 | return out 50 | 51 | 52 | def assign_random_proportions(name1, name2, group1, group2, seed): 53 | 54 | state = RandomState(seed) 55 | assignments = state.rand(len(group1), len(group2)) 56 | assignments = assignments / assignments.sum(axis=1, keepdims=True) 57 | data = pd.DataFrame(assignments, index=group1, 58 | columns=group2).stack().reset_index(level=[0, 1]) 59 | data.rename(columns={"level_0": name1, 60 | "level_1": name2, 61 | 0: "weight"}, 62 | inplace=True) 63 | return data 64 | 65 | 66 | def make_random_assign(set1, set2, seed): 67 | """Assign randomly a member of set2 to each member of set1 68 | :return: a dataframe with as many rows as set1 69 | """ 70 | chosen_froms = RandomState(seed).choice(set2, size=len(set1)) 71 | return pd.DataFrame({"set1": set1, "chosen_from_set2": chosen_froms}) 72 | 73 | 74 | def merge_2_dicts(dict1, dict2, value_merge_func=None): 75 | """ 76 | :param dict1: first dictionary to be merged 77 | :param dict2: first dictionary to be merged 78 | :param value_merge_func: specifies how to merge 2 values if present in 79 | both dictionaries 80 | :type value_merge_func: function (value1, value) => value 81 | :return: 82 | """ 83 | if dict1 is None and dict2 is None: 84 | return {} 85 | 86 | if dict2 is None: 87 | return dict1 88 | 89 | if dict1 is None: 90 | return dict2 91 | 92 | def merged_value(key): 93 | if key not in dict1: 94 | return dict2[key] 95 | elif key not in dict2: 96 | return dict1[key] 97 | else: 98 | if value_merge_func is None: 99 | raise ValueError( 100 | "Conflict in merged dictionaries: merge function not " 101 | "provided but key {} exists in both 
dictionaries".format( 102 | key)) 103 | 104 | return value_merge_func(dict1[key], dict2[key]) 105 | 106 | keys = set(dict1.keys()) | set(dict2.keys()) 107 | 108 | return {key: merged_value(key) for key in keys} 109 | 110 | 111 | def df_concat(d1, d2): 112 | return pd.concat([d1, d2], ignore_index=True, copy=False) 113 | 114 | 115 | def merge_dicts(dicts, merge_func=None): 116 | """ 117 | :param dicts: list of dictionnaries to be merged 118 | :type dicts: list[dict] 119 | :param merge_func: 120 | :type merge_func: function 121 | :return: one single dictionary containing all entries received 122 | """ 123 | from itertools import tee 124 | 125 | # check if the input list or iterator is empty 126 | dict_backup, test = tee(iter(dicts)) 127 | try: 128 | next(test) 129 | except StopIteration: 130 | return {} 131 | 132 | return functools.reduce(lambda d1, d2: merge_2_dicts(d1, d2, merge_func), dict_backup) 133 | 134 | 135 | def setup_logging(): 136 | logging.basicConfig( 137 | format='%(asctime)s %(message)s', 138 | level=logging.INFO) 139 | 140 | 141 | # stolen from http://stackoverflow.com/questions/1835018/ 142 | # python-check-if-an-object-is-a-list-or-tuple-but-not-string#answer-1835259 143 | def is_sequence(arg): 144 | return type(arg) is list or type(arg) is tuple or type(arg) is set 145 | 146 | 147 | def build_ids(size, id_start=0, prefix="id_", max_length=10): 148 | """ 149 | builds a sequencial list of string ids of specified size 150 | """ 151 | return [prefix + str(x).zfill(max_length) 152 | for x in np.arange(id_start, id_start + size)] 153 | 154 | 155 | def log_dataframe_sample(msg, df): 156 | 157 | if df.shape[0] == 0: 158 | logging.info("{}: [empty]".format(msg)) 159 | else: 160 | logging.info("{}: \n {}".format(msg, df.sample(min(df.shape[0], 15)))) 161 | 162 | 163 | def cap_to_total(values, target_total): 164 | """ 165 | return a copy of values with the largest values possible s.t.: 166 | - all return values are <= the original ones 167 | - their sum is 
== total 168 | - 169 | """ 170 | 171 | excedent = np.sum(values) - target_total 172 | if excedent <= 0: 173 | return values 174 | elif values[-1] >= excedent: 175 | return values[:-1] + [values[-1] - excedent] 176 | else: 177 | return cap_to_total(values[:-1], target_total) + [0] 178 | 179 | 180 | def ensure_folder_exists(folder): 181 | if not os.path.exists(folder): 182 | os.makedirs(folder) 183 | 184 | 185 | def ensure_non_existing_dir(folder): 186 | """ 187 | makes sure the specified directory does not exist, potentially deleting 188 | any file or folder it contains 189 | """ 190 | 191 | if not os.path.exists(folder): 192 | return 193 | 194 | if os.path.isfile(folder): 195 | os.remove(folder) 196 | 197 | else: 198 | for f in os.listdir(folder): 199 | full_path = os.path.join(folder, f) 200 | ensure_non_existing_dir(full_path) 201 | os.rmdir(folder) 202 | 203 | 204 | def latest_date_before(starting_date, upper_bound, time_step): 205 | """ 206 | Looks for the latest result_date s.t 207 | 208 | result_date = starting_date + n * time_step for any integer n 209 | result_date <= upper_bound 210 | 211 | :type starting_date: pd.Timestamp 212 | :type upper_bound: pd.Timestamp 213 | :type time_step: pd.Timedelta 214 | :return: pd.Timestamp 215 | """ 216 | 217 | result = starting_date 218 | 219 | while result > upper_bound: 220 | result -= time_step 221 | 222 | while upper_bound - result >= time_step: 223 | result += time_step 224 | 225 | return result 226 | 227 | 228 | def load_all_logs(folder): 229 | """ 230 | loads all csv file contained in this folder and retun them as one 231 | dictionary where the key is the filename without the extension 232 | """ 233 | 234 | all_logs = {} 235 | 236 | for file_name in os.listdir(folder): 237 | full_path = os.path.join(folder, file_name) 238 | logs = pd.read_csv(full_path, index_col=None) 239 | log_id = file_name[:-4] 240 | 241 | all_logs[log_id] = logs 242 | 243 | return all_logs 244 | 
-------------------------------------------------------------------------------- /tests/unit_tests/test_util_functions.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import functools 3 | 4 | from trumania.core.util_functions import merge_2_dicts, merge_dicts, is_sequence, make_random_assign, cap_to_total 5 | from trumania.core.util_functions import build_ids, latest_date_before, bipartite, make_random_bipartite_data 6 | 7 | 8 | def test_merge_two_empty_dict_should_return_empty_dict(): 9 | assert {} == merge_2_dicts({}, {}) 10 | 11 | 12 | def test_merge_two_none_dict_should_return_empty_dict(): 13 | assert {} == merge_2_dicts(None, None) 14 | 15 | 16 | def test_merging_one_dict_with_none_should_yield_dict(): 17 | d1 = {"a": 1, "b": 2} 18 | assert d1 == merge_2_dicts(d1, None) 19 | 20 | 21 | def test_merging_none_with_one_dict_should_yield_dict(): 22 | d2 = {"a": 1, "b": 2} 23 | assert d2 == merge_2_dicts(None, d2) 24 | 25 | 26 | def test_merge_empty_with_dict_should_return_itself(): 27 | 28 | d1 = {"a": 1, "b": 2} 29 | assert d1 == merge_2_dicts(d1, {}) 30 | assert d1 == merge_2_dicts({}, d1) 31 | 32 | 33 | def test_merge_non_overlapping_dict_should_return_all_values(): 34 | 35 | d1 = {"a": 1, "b": 2} 36 | d2 = {"c": 3, "d": 4} 37 | assert {"a": 1, "b": 2, "c": 3, "d": 4} == merge_2_dicts(d1, d2) 38 | 39 | 40 | def test_merge_dict_to_itself_should_return_doubled_values(): 41 | 42 | d1 = {"a": 1, "b": 2} 43 | assert {"a": 2, "b": 4} == merge_2_dicts(d1, d1, lambda a, b: a + b) 44 | 45 | 46 | def test_merging_one_dictionary_should_yield_itself(): 47 | d1 = {"a": 1, "b": 2} 48 | assert d1 == merge_dicts([d1], lambda a, b: a + b) 49 | 50 | 51 | def test_merging_an_empty_list_of_dicts_should_yield_empty_dict(): 52 | assert {} == merge_dicts([]) 53 | 54 | 55 | def test_merging_an_empty_gen_of_dicts_should_yield_empty_dict(): 56 | emtpy_gen = ({"a": 1} for _ in []) 57 | assert {} == merge_dicts(emtpy_gen) 58 
| 59 | 60 | def test_merging_many_dictionary_should_yield_expected_result(): 61 | d1 = {"a": 10, "b": 20} 62 | d2 = {"a": 100, "c": 30} 63 | d3 = {} 64 | d4 = {"b": 200, "z": 1000} 65 | d5 = {"z": -10} 66 | 67 | merged = merge_dicts([d1, d2, d3, d4, d5], lambda a, b: a + b) 68 | 69 | assert {"a": 110, "b": 220, "c": 30, "z": 990} == merged 70 | 71 | 72 | def test_merging_many_dictionary_from_gen_should_yield_expected_result(): 73 | ds = [{"a": 10, "b": 20}, 74 | {"a": 100, "c": 30}, 75 | {}, 76 | {"b": 200, "z": 1000}, 77 | {"z": -10}] 78 | 79 | dicts_gens = (d for d in ds) 80 | 81 | merged = merge_dicts(dicts_gens, lambda a, b: a + b) 82 | 83 | assert {"a": 110, "b": 220, "c": 30, "z": 990} == merged 84 | 85 | 86 | def test_is_sequence(): 87 | assert is_sequence([]) 88 | assert is_sequence([1, 2, 3, 1]) 89 | assert is_sequence({1, 2, 3, 1}) 90 | assert not is_sequence(1) 91 | assert not is_sequence("hello") 92 | 93 | 94 | def test_make_random_assign_shoud_assign_each_element_only_once(): 95 | 96 | dealers = build_ids(size=10, prefix="DEALER_", max_length=2) 97 | sims = build_ids(size=1000, prefix="SIM_", max_length=4) 98 | 99 | assignment = make_random_assign(set1=sims, set2=dealers, seed=10) 100 | 101 | # all sims should have been assigned 102 | assert assignment.shape == (1000, 2) 103 | 104 | # all SIM should have been given 105 | assert set(assignment["set1"].unique().tolist()) == set(sims) 106 | 107 | # all owners should be part of the dealers 108 | assert set(assignment["chosen_from_set2"].unique().tolist()) <= set(dealers) 109 | 110 | 111 | def test_cap_to_total_should_leave_untouched_values_below_target(): 112 | assert [10, 20, 30] == cap_to_total([10, 20, 30], target_total=100) 113 | 114 | 115 | def test_cap_to_total_should_leave_untouched_equal_to_target(): 116 | assert [50, 40, 20] == cap_to_total([50, 40, 20], target_total=110) 117 | 118 | 119 | def test_cap_to_total_should_lower_last_correctly(): 120 | assert [50, 40, 5] == cap_to_total([50, 40, 20], 
target_total=95) 121 | 122 | 123 | def test_cap_to_total_should_zero_last_correctly(): 124 | assert [50, 40, 0] == cap_to_total([50, 40, 20], target_total=90) 125 | 126 | 127 | def test_cap_to_total_should_zero_several_correctly(): 128 | assert [38, 0, 0] == cap_to_total([50, 40, 20], target_total=38) 129 | 130 | 131 | def test_latest_date_before_should_return_input_if_within_range(): 132 | 133 | starting_date = pd.Timestamp("6 June 2016") 134 | upper_bound = pd.Timestamp("8 June 2016") 135 | time_step = pd.Timedelta("7D") 136 | 137 | result = latest_date_before(starting_date, upper_bound, time_step) 138 | 139 | assert result == starting_date 140 | 141 | 142 | def test_latest_date_before_should_return_input_if_start_equals_ub(): 143 | 144 | starting_date = pd.Timestamp("8 June 2016") 145 | upper_bound = pd.Timestamp("8 June 2016") 146 | time_step = pd.Timedelta("7D") 147 | 148 | result = latest_date_before(starting_date, upper_bound, time_step) 149 | 150 | assert result == starting_date 151 | 152 | 153 | def test_latest_date_before_should_shift_backward_ne_week_input_as_required(): 154 | 155 | starting_date = pd.Timestamp("10 June 2016") 156 | expected_date = pd.Timestamp("3 June 2016") 157 | upper_bound = pd.Timestamp("8 June 2016") 158 | time_step = pd.Timedelta("7D") 159 | 160 | result = latest_date_before(starting_date, upper_bound, time_step) 161 | 162 | assert result == expected_date 163 | 164 | 165 | def test_latest_date_before_should_shift_backward_n_weeks_input_as_required(): 166 | 167 | starting_date = pd.Timestamp("10 June 2016") 168 | expected_date = pd.Timestamp("25 March 2016") 169 | upper_bound = pd.Timestamp("31 March 2016") 170 | time_step = pd.Timedelta("7D") 171 | 172 | result = latest_date_before(starting_date, upper_bound, time_step) 173 | 174 | assert result == expected_date 175 | 176 | 177 | def test_latest_date_before_should_shift_forward_n_weeks_input_as_required(): 178 | 179 | starting_date = pd.Timestamp("10 June 2016") 180 | 
expected_date = pd.Timestamp("27 January 2017") 181 | upper_bound = pd.Timestamp("29 January 2017") 182 | time_step = pd.Timedelta("7D") 183 | 184 | result = latest_date_before(starting_date, upper_bound, time_step) 185 | 186 | assert result == expected_date 187 | 188 | 189 | def test_latest_date_before_should_shift_forward_until_upper_bound(): 190 | 191 | # here the upper bound IS the expected date => makes sure we go up to 192 | # thsi ons 193 | starting_date = pd.Timestamp("10 June 2016") 194 | upper_bound = pd.Timestamp("24 June 2016") 195 | time_step = pd.Timedelta("7D") 196 | 197 | result = latest_date_before(starting_date, upper_bound, time_step) 198 | 199 | assert result == upper_bound 200 | 201 | 202 | def test_if_networkx_bipartite_keeps_actual_structure(): 203 | 204 | # Currently, Netorkx.bipartite returns bipartite networks where the first node 205 | # is always in the first group, and the second node is always in the second group 206 | RB = bipartite.random_graph(5, 10, 0.9, 1234) 207 | 208 | assert functools.reduce(lambda x, y: x & y, [e[0] < 5 for e in RB.edges()]) 209 | 210 | 211 | def test_random_bipartite_network_generation_returns_empty_list_if_first_entry_is_empty(): 212 | 213 | assert [] == make_random_bipartite_data([], [1, 2], 1., 1234) 214 | 215 | 216 | def test_random_bipartite_network_generation_returns_empty_list_if_second_entry_is_empty(): 217 | 218 | assert [] == make_random_bipartite_data([1, 2], [], 1., 1234) 219 | 220 | 221 | def test_random_bipartite_network_generation_returns_empty_list_if_both_entries_are_empty(): 222 | 223 | assert [] == make_random_bipartite_data([], [], 1., 1234) 224 | 225 | 226 | def test_random_bipartite_network_generation_returns_empty_list_if_prob_is_zero(): 227 | 228 | assert [] == make_random_bipartite_data([1, 2], [5, 6], 0., 1234) 229 | 230 | 231 | def test_random_bipartite_network_generation_returns_bipartite_network(): 232 | 233 | all_edges = [(1, 5), (1, 6), (2, 5), (2, 6)] 234 | bp = 
make_random_bipartite_data([1, 2], [5, 6], 1., 1234) 235 | 236 | assert functools.reduce(lambda x, y: x & y, [e in bp for e in all_edges]) 237 | -------------------------------------------------------------------------------- /tests/unit_tests/test_operations.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | import tests.mocks.operations as mockops 5 | from trumania.core import operations 6 | from trumania.core.util_functions import build_ids 7 | 8 | 9 | def test_apply_should_delegate_to_single_col_dataframe_function_correctly(): 10 | 11 | # some function that expect a dataframe as input => must return 12 | # dataframe with "result" column 13 | def f(df): 14 | return pd.DataFrame({"result": df["A"] + df["D"] - df["C"]}) 15 | 16 | tested = operations.Apply(source_fields=["A", "C", "D"], 17 | named_as="r", 18 | f=f, f_args="dataframe") 19 | 20 | story_data = pd.DataFrame( 21 | np.random.rand(10, 5), columns=["A", "B", "C", "D", "E"]) 22 | 23 | result = tested.build_output(story_data) 24 | 25 | assert result["r"].equals(story_data["A"] + story_data["D"] - story_data[ 26 | "C"]) 27 | 28 | 29 | def test_apply_should_delegate_to_multi_col_dataframe_function_correctly(): 30 | 31 | # now f returns several columns 32 | def f(df): 33 | return pd.DataFrame({ 34 | "r1": df["A"] + df["D"] - df["C"], 35 | "r2": df["A"] + df["C"], 36 | "r3": df["A"] * df["C"], 37 | }) 38 | 39 | tested = operations.Apply(source_fields=["A", "C", "D"], 40 | named_as=["op1", "op2", "op3"], 41 | f=f, f_args="dataframe") 42 | 43 | story_data = pd.DataFrame( 44 | np.random.rand(10, 5), columns=["A", "B", "C", "D", "E"]) 45 | 46 | result = tested.transform(story_data) 47 | assert result.columns.tolist() == ["A", "B", "C", "D", "E", "op1", "op2", 48 | "op3"] 49 | 50 | assert result["op1"].equals( 51 | story_data["A"] + story_data["D"] - story_data["C"]) 52 | assert result["op2"].equals( 53 | story_data["A"] + 
story_data["C"]) 54 | assert result["op3"].equals( 55 | story_data["A"] * story_data["C"]) 56 | 57 | 58 | def test_apply_should_delegate_to_columns_function_correctly(): 59 | """ 60 | same as the above, but this time f input and output arguments are 61 | pandas Series 62 | """ 63 | 64 | def f(ca, cc, cd): 65 | return ca + cd - cc 66 | 67 | tested = operations.Apply(source_fields=["A", "C", "D"], 68 | named_as="r", 69 | f=f, f_args="series") 70 | 71 | story_data = pd.DataFrame( 72 | np.random.rand(10, 5), columns=["A", "B", "C", "D", "E"]) 73 | 74 | result = tested.build_output(story_data) 75 | 76 | assert result["r"].equals( 77 | story_data["A"] + story_data["D"] - story_data["C"]) 78 | 79 | 80 | def test_one_execution_should_merge_empty_data_correctly(): 81 | 82 | # empty previous 83 | prev_df = pd.DataFrame(columns=[]) 84 | prev_log = {} 85 | nop = operations.Operation() 86 | 87 | output, logs = operations.Chain._execute_operation((prev_df, prev_log), nop) 88 | 89 | assert logs == {} 90 | assert output.equals(prev_df) 91 | 92 | 93 | def test_one_execution_should_merge_one_op_with_nothing_into_one_result(): 94 | 95 | # empty previous 96 | prev = pd.DataFrame(columns=[]), {} 97 | 98 | cdrs = pd.DataFrame(np.random.rand(12, 3), columns=["A", "B", "duration"]) 99 | input = pd.DataFrame(np.random.rand(10, 2), columns=["C", "D"]) 100 | op = mockops.FakeOp(input, logs={"cdrs": cdrs}) 101 | 102 | output, logs = operations.Chain._execute_operation(prev, op) 103 | 104 | assert logs == {"cdrs": cdrs} 105 | assert input.equals(output) 106 | 107 | 108 | def test_one_execution_should_merge_2_ops_correctly(): 109 | 110 | # previous results 111 | init = pd.DataFrame(columns=[]) 112 | mobility_logs = pd.DataFrame(np.random.rand(12, 3), 113 | columns=["A", "CELL", "duration"]) 114 | 115 | cdrs = pd.DataFrame(np.random.rand(12, 3), columns=["A", "B", "duration"]) 116 | input = pd.DataFrame(np.random.rand(10, 2), columns=["C", "D"]) 117 | op = mockops.FakeOp(input, {"cdrs": cdrs}) 
def test_chain_of_3_operation_should_return_merged_logs():
    """
    A Chain of 3 operations must merge the logs emitted by every op.

    Bug fixed here: the original passed the Python *builtin* `input`
    function as each FakeOp's output — a leftover from earlier tests
    where `input` was a local DataFrame. Use a real dataframe instead.
    """
    fake_output = pd.DataFrame(np.random.rand(10, 2), columns=["C", "D"])

    cdrs1 = pd.DataFrame(np.random.rand(12, 3), columns=["A", "B", "duration"])
    op1 = mockops.FakeOp(fake_output, {"cdrs1": cdrs1})

    cdrs2 = pd.DataFrame(np.random.rand(12, 3), columns=["A", "B", "duration"])
    op2 = mockops.FakeOp(fake_output, {"cdrs2": cdrs2})

    cdrs3 = pd.DataFrame(np.random.rand(12, 3), columns=["A", "B", "duration"])
    op3 = mockops.FakeOp(fake_output, {"cdrs3": cdrs3})

    chain = operations.Chain(op1, op2, op3)

    prev_data = pd.DataFrame(columns=[])
    story_data, all_logs = chain(prev_data)

    # the last op's output flows out of the chain (FakeOp returns its
    # stored output regardless of its input, see the single-op test above)
    assert story_data.equals(fake_output)

    # the logs of all 3 ops are merged into one dict
    assert set(all_logs.keys()) == {"cdrs1", "cdrs2", "cdrs3"}
    assert all_logs["cdrs1"].equals(cdrs1)
    assert all_logs["cdrs2"].equals(cdrs2)
    assert all_logs["cdrs3"].equals(cdrs3)
def test_increasing_bounded_sigmoid_must_reach_min_and_max_at_boundaries():
    """An incrementing bounded sigmoid must saturate at 0 below x_min, at 1
    above x_max, and stay within [0, 1] in between."""

    sigmoid = operations.bounded_sigmoid(x_min=2, x_max=15, shape=5,
                                         incrementing=True)

    # saturation at the lower boundary: 0 everywhere up to x_min
    assert all(sigmoid(x) == 0 for x in np.linspace(-100, 2, 200))

    # saturation at the upper boundary: 1 everywhere from x_max onwards
    assert all(sigmoid(x) == 1 for x in np.linspace(15, 100, 200))

    # everything in between stays inside the [0, 1] band
    assert all(0 <= sigmoid(x) <= 1 for x in np.linspace(0, 1, 200))
def test_bounding_function_should_not_modify_unbounded_values():
    """bound_value without lower or upper bound must be the identity.

    Bug fixed here: the original used np.arange(-1000, 2000, 10000),
    whose step exceeds the range, so it produced the single value -1000
    and the loop checked only one input.
    """
    bound_f = operations.bound_value(lb=None, ub=None)

    # step of 100 actually sweeps the whole [-1000, 2000) range
    for x in np.arange(-1000, 2000, 100):
        assert x == bound_f(x)
def run_test_scenario_1(clock_step, simulation_duration,
                        n_stories, per,
                        log_folder):
    """
    Builds and runs a minimal circus: a 1000-member population whose single
    story just logs a timestamp, driven by a flat (uniform) daily timer
    profile.

    :param clock_step: clock step duration string, e.g. "15 min"
        (fed to pd.Timedelta)
    :param simulation_duration: total simulated duration, e.g. "2 days"
    :param n_stories: desired number of story executions per `per` period
    :param per: pd.Timedelta period over which `n_stories` should happen
    :param log_folder: folder where the circus writes the "the_logs" csv
    """

    circus = Circus(name="tested_circus", master_seed=1,
                    start=pd.Timestamp("8 June 2016"),
                    step_duration=pd.Timedelta(clock_step))

    population = circus.create_population(
        name="a",
        size=1000,
        ids_gen=SequencialGenerator(max_length=3, prefix="id_"))

    # flat profile: every hour of the day is equally likely
    daily_profile = CyclicTimerGenerator(
        clock=circus.clock,
        config=CyclicTimerProfile(
            profile=[1] * 24,
            profile_time_steps="1h",
            start_date=pd.Timestamp("8 June 2016")
        ),
        seed=1234)

    # every one of the 1000 population members gets the same constant
    # activity: n_stories executions per `per` period
    activity_gen = ConstantGenerator(
        value=daily_profile.activity(
            n=n_stories, per=per
        ))

    # just a dummy operation to produce some logs
    story = circus.create_story(
        name="test_story",
        initiating_population=population,
        member_id_field="some_id",
        timer_gen=daily_profile,
        activity_gen=activity_gen)

    story.set_operations(
        circus.clock.ops.timestamp(named_as="TIME"),
        FieldLogger(log_id="the_logs")
    )

    circus.run(duration=pd.Timedelta(simulation_duration), log_output_folder=log_folder)
def test_1000_populations_with_activity_12perday_should_yield_60k_logs_in_5days():
    """
    same test as above, with bigger clock step => typically more "rounding
    errors", and longer total simulation duration
    """

    with tempfile.TemporaryDirectory() as log_parent_folder:
        log_folder = os.path.join(log_parent_folder, "logs")

        # NOTE(review): this comment was truncated in the original source
        # ("note that we cannot have clock_step > 2h since that").
        # Presumably: at 12 stories/day there is one story every 2h on
        # average, so a clock step above 2h could not schedule them all
        # — confirm against CyclicTimerGenerator.
        run_test_scenario_1(clock_step="1h",
                            simulation_duration="5 days",
                            n_stories=12,
                            per=pd.Timedelta("1day"),
                            log_folder=log_folder)

        logging.info("loading produced logs")
        logs = load_all_logs(log_folder)["the_logs"]

        logging.info("number of produced logs: {} logs".format(logs.shape[0]))

        # 5 days of simulation should produce 1000 * 12 * 5 == 60k logs
        assert 55e3 <= logs.shape[0] <= 65e3
122 | run_test_scenario_1(clock_step="1 h", 123 | simulation_duration="20days", 124 | n_stories=1, 125 | per=pd.Timedelta("5 days"), 126 | log_folder=log_folder) 127 | 128 | logging.info("loading produced logs") 129 | logs = load_all_logs(log_folder)["the_logs"] 130 | 131 | logging.info("number of produced logs: {} logs".format(logs.shape[0])) 132 | 133 | # 20 days of simulation should produce 1000 * .2 * 20 == 4000 logs 134 | assert 3500 <= logs.shape[0] <= 4500 135 | 136 | 137 | def test_1000_populations_with_low_activity2(): 138 | """ 139 | 140 | This is a low activity test, where the populations have less than one activity 141 | per cycle 142 | 143 | """ 144 | 145 | with tempfile.TemporaryDirectory() as log_parent_folder: 146 | log_folder = os.path.join(log_parent_folder, "logs") 147 | 148 | run_test_scenario_1(clock_step="3 h", 149 | simulation_duration="15days", 150 | n_stories=1, 151 | per=pd.Timedelta("5 days"), 152 | log_folder=log_folder) 153 | 154 | logging.info("loading produced logs") 155 | logs = load_all_logs(log_folder)["the_logs"] 156 | 157 | # 2 days of simulation should produce 1000 * 15 * 1/5 == 3000 logs 158 | assert 2600 <= logs.shape[0] <= 3400 159 | 160 | 161 | def test_1000_populations_with_activity_one_per_cycle(): 162 | """ 163 | This is a border case between low and high activity, where the desired 164 | amount of logs per cycle is close to 1 (i.e. 
close to 1 per day with our 165 | timer) => we still need to have generated timers a bit above or below one 166 | day, and achieve the expected total amount of logs 167 | """ 168 | 169 | with tempfile.TemporaryDirectory() as log_parent_folder: 170 | log_folder = os.path.join(log_parent_folder, "logs") 171 | 172 | run_test_scenario_1(clock_step="15 min", 173 | simulation_duration="10days", 174 | n_stories=1, 175 | per=pd.Timedelta("1 day"), 176 | log_folder=log_folder) 177 | 178 | logging.info("loading produced logs") 179 | logs = load_all_logs(log_folder)["the_logs"] 180 | 181 | logging.info("number of produced logs: {} logs".format(logs.shape[0])) 182 | 183 | # 10 days of simulation should produce 1000 * 1 * 10 == 10000 logs 184 | assert 9500 <= logs.shape[0] <= 10500 185 | 186 | 187 | def test_populations_during_default_daily(): 188 | 189 | with tempfile.TemporaryDirectory() as log_parent_folder: 190 | log_folder = os.path.join(log_parent_folder, "logs") 191 | 192 | circus = Circus(name="tested_circus", 193 | master_seed=1, 194 | start=pd.Timestamp("8 June 2016"), 195 | step_duration=pd.Timedelta("1h")) 196 | 197 | field_agents = circus.create_population( 198 | name="fa", 199 | size=100, 200 | ids_gen=SequencialGenerator(max_length=3, prefix="id_")) 201 | 202 | mobility_time_gen = DefaultDailyTimerGenerator( 203 | clock=circus.clock, seed=next(circus.seeder)) 204 | 205 | gaussian_activity = NumpyRandomGenerator( 206 | method="normal", loc=5, 207 | scale=.5, seed=1) 208 | mobility_activity_gen = gaussian_activity.map(bound_value(lb=1)) 209 | 210 | # just a dummy operation to produce some logs 211 | story = circus.create_story( 212 | name="test_story", 213 | initiating_population=field_agents, 214 | member_id_field="some_id", 215 | timer_gen=mobility_time_gen, 216 | activity_gen=mobility_activity_gen) 217 | 218 | story.set_operations( 219 | circus.clock.ops.timestamp(named_as="TIME"), 220 | FieldLogger(log_id="the_logs") 221 | ) 222 | 223 | 
circus.run(duration=pd.Timedelta("30 days"), log_output_folder=log_folder) 224 | 225 | logging.info("loading produced logs") 226 | logs = load_all_logs(log_folder)["the_logs"] 227 | 228 | logging.info("number of produced logs: {} logs".format(logs.shape[0])) 229 | 230 | # 30 days of simulation should produce 100 * 5 * 30 == 15k logs 231 | assert 14e3 <= logs.shape[0] <= 16.5e3 232 | 233 | 234 | def test_populations_during_working_hours(): 235 | 236 | with tempfile.TemporaryDirectory() as log_parent_folder: 237 | log_folder = os.path.join(log_parent_folder, "logs") 238 | 239 | circus = Circus(name="tested_circus", 240 | master_seed=1, 241 | start=pd.Timestamp("8 June 2016"), 242 | step_duration=pd.Timedelta("1h")) 243 | 244 | field_agents = circus.create_population( 245 | name="fa", 246 | size=100, 247 | ids_gen=SequencialGenerator(max_length=3, prefix="id_")) 248 | 249 | mobility_time_gen = WorkHoursTimerGenerator( 250 | clock=circus.clock, seed=next(circus.seeder)) 251 | 252 | five_per_day = mobility_time_gen.activity( 253 | n=5, per=pd.Timedelta("1day")) 254 | 255 | std_per_day = mobility_time_gen.activity( 256 | n=.5, per=pd.Timedelta("1day")) 257 | 258 | gaussian_activity = NumpyRandomGenerator( 259 | method="normal", loc=five_per_day, 260 | scale=std_per_day, seed=1) 261 | mobility_activity_gen = gaussian_activity.map(bound_value(lb=1)) 262 | 263 | # just a dummy operation to produce some logs 264 | story = circus.create_story( 265 | name="test_story", 266 | initiating_population=field_agents, 267 | member_id_field="some_id", 268 | timer_gen=mobility_time_gen, 269 | activity_gen=mobility_activity_gen) 270 | 271 | story.set_operations( 272 | circus.clock.ops.timestamp(named_as="TIME"), 273 | FieldLogger(log_id="the_logs") 274 | ) 275 | 276 | circus.run(duration=pd.Timedelta("30 days"), log_output_folder=log_folder) 277 | 278 | logging.info("loading produced logs") 279 | logs = load_all_logs(log_folder)["the_logs"] 280 | 281 | logging.info("number of produced 
def test_population_constructor_should_refuse_duplicated_ids():
    """Population must reject an explicit id list containing duplicates."""

    duplicated_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9, 9]  # 9 appears twice

    with pytest.raises(ValueError):
        Population(circus=None, size=10, ids=duplicated_ids)
def test_lookup_values_by_scalar_should_return_correct_values():
    """Looking up attributes through a scalar id field must append one
    column per selected attribute, aligned with the pointed-to members."""

    lookup_op = dummy_population.ops.lookup(
        id_field="NEIGHBOUR",
        select={"age": "neighbour_age", "city": "neighbour_city"})

    result, logs = lookup_op(story_data)

    # a lookup emits no logs
    assert logs == {}

    assert sorted(result.columns) == [
        "A", "B", "COUSINS", "NEIGHBOUR", "neighbour_age", "neighbour_city"]

    # ages of the members pointed to by NEIGHBOUR (id_2, id_4, id_7, id_2)
    assert result["neighbour_age"].tolist() == [40, 100, 39, 40]

    # cities of those same members
    assert result["neighbour_city"].tolist() == ["b", "d", "a", "b"]
def test_insert_population_value_for_existing_populations_should_update_all_values():
    """update() on ids that already exist must overwrite their attribute
    values without changing the number of members.

    (Also fixes the "poppulation" typo in the original test name.)
    """

    # copy of dummy population that will be updated
    tested_population = Population(
        circus=None,
        size=10,
        ids_gen=SequencialGenerator(max_length=1, prefix="a_")
    )
    ages = [10, 20, 40, 10, 100, 98, 12, 39, 76, 23]
    tested_population.create_attribute("age", init_values=ages)
    city = ["a", "b", "b", "a", "d", "e", "r", "a", "z", "c"]
    tested_population.create_attribute("city", init_values=city)

    current = tested_population.get_attribute_values("age", ["a_0", "a_7", "a_9"])
    assert current.tolist() == [10, 39, 23]

    update = pd.DataFrame(
        {
            "age": [139, 123],
            "city": ["city_7", "city_9"]
        },
        index=["a_7", "a_9"]
    )

    tested_population.update(update)

    # we should still have the same number of members
    assert tested_population.ids.shape[0] == 10

    updated_age = tested_population.get_attribute_values("age", ["a_0", "a_7", "a_9"])
    updated_city = tested_population.get_attribute_values("city", ["a_0", "a_7", "a_9"])

    # a_0 untouched, a_7 and a_9 overwritten
    assert updated_age.tolist() == [10, 139, 123]
    assert updated_city.tolist() == ["a", "city_7", "city_9"]
population that will be updated 181 | tested_population = Population( 182 | circus=None, size=10, 183 | ids_gen=SequencialGenerator(max_length=1, prefix="a_")) 184 | ages = [10, 20, 40, 10, 100, 98, 12, 39, 76, 23] 185 | tested_population.create_attribute("age", init_values=ages) 186 | city = ["a", "b", "b", "a", "d", "e", "r", "a", "z", "c"] 187 | tested_population.create_attribute("city", init_values=city) 188 | 189 | current = tested_population.get_attribute_values("age", ["a_0", "a_7", "a_9"]) 190 | assert current.tolist() == [10, 39, 23] 191 | 192 | update = pd.DataFrame( 193 | { 194 | "age": [139, 123, 54, 25], 195 | "city": ["city_7", "city_9", "city_11", "city_10"] 196 | }, 197 | index=["a_7", "a_9", "a_11", "a_10"] 198 | ) 199 | 200 | tested_population.update(update) 201 | 202 | # we should have 2 new populations 203 | assert tested_population.ids.shape[0] == 12 204 | 205 | updated_age = tested_population.get_attribute_values("age", ["a_0", "a_7", "a_9", "a_10", "a_11"]) 206 | updated_city = tested_population.get_attribute_values("city", ["a_0", "a_7", "a_9", "a_10", "a_11"]) 207 | 208 | assert updated_age.tolist() == [10, 139, 123, 25, 54] 209 | assert updated_city.tolist() == ["a", "city_7", "city_9", "city_10", "city_11"] 210 | 211 | 212 | def test_insert_op_population_value_for_existing_populations_should_update_all_values(): 213 | # same as test above but triggered as an Operation on story data 214 | 215 | # copy of dummy population that will be updated 216 | tested_population = Population( 217 | circus=None, size=10, 218 | ids_gen=SequencialGenerator(max_length=1, prefix="a_")) 219 | ages = [10, 20, 40, 10, 100, 98, 12, 39, 76, 23] 220 | tested_population.create_attribute("age", init_values=ages) 221 | city = ["a", "b", "b", "a", "d", "e", "r", "a", "z", "c"] 222 | tested_population.create_attribute("city", init_values=city) 223 | 224 | story_data = pd.DataFrame( 225 | { 226 | "the_new_age": [139, 123, 1, 2], 227 | "location": ["city_7", "city_9", 
def test_creating_an_empty_population_and_adding_attributes_later_should_be_possible():
    """A size-0 population with empty attributes must accept members and
    attribute values injected later through update()."""

    empty_pop = Population(circus=None, size=0)
    assert empty_pop.ids.shape[0] == 0

    # declare the attributes up-front, without any values
    empty_pop.create_attribute("att1")
    empty_pop.create_attribute("att2")

    new_member_ids = ["ac1", "ac2", "ac3"]
    new_members = pd.DataFrame(
        {
            "att1": [1, 2, 3],
            "att2": [11, 12, 13],
        },
        index=new_member_ids)

    empty_pop.update(new_members)

    assert empty_pop.ids.tolist() == new_member_ids
    assert empty_pop.get_attribute_values("att1", new_member_ids).tolist() == [1, 2, 3]
    assert empty_pop.get_attribute_values("att2", new_member_ids).tolist() == [11, 12, 13]
dummy_population.save_to(population_path) 286 | retrieved = Population.load_from(circus=None, folder=population_path) 287 | 288 | assert dummy_population.size == retrieved.size 289 | assert dummy_population.ids.tolist() == retrieved.ids.tolist() 290 | 291 | ids = dummy_population.ids.tolist() 292 | 293 | for att_name in dummy_population.attribute_names(): 294 | assert dummy_population.get_attribute_values(att_name, ids).equals( 295 | retrieved.get_attribute_values(att_name, ids) 296 | ) 297 | 298 | for rel_name in dummy_population.relationship_names(): 299 | assert dummy_population.get_relationship(rel_name)._table.equals( 300 | retrieved.get_relationship(rel_name)._table 301 | ) 302 | -------------------------------------------------------------------------------- /trumania/core/circus.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import json 4 | import pandas as pd 5 | 6 | from trumania.core import population 7 | from trumania.components import db 8 | from trumania.core.random_generators import seed_provider 9 | from trumania.core.util_functions import ensure_non_existing_dir 10 | from trumania.core.clock import Clock 11 | from trumania.core.story import Story 12 | 13 | 14 | class Circus(object): 15 | """ 16 | A Circus is just a container of a lot of objects that are required to make the simulation 17 | It is also the object that will execute the stories required for 1 iteration 18 | """ 19 | 20 | def __init__(self, name, master_seed, **clock_params): 21 | """Create a new Circus object 22 | 23 | :param master_seed: seed used to initialized random generatof of 24 | other seeds 25 | :type master_seed: int 26 | 27 | :rtype: Circus 28 | :return: a new Circus object, with the clock, is created 29 | """ 30 | self.name = name 31 | 32 | self.master_seed = master_seed 33 | self.clock_params = clock_params 34 | 35 | self.seeder = seed_provider(master_seed=master_seed) 36 | self.clock = 
Clock(seed=next(self.seeder), **clock_params) 37 | self.stories = [] 38 | self.populations = {} 39 | self.generators = {} 40 | 41 | def create_population(self, name, **population_params): 42 | """ 43 | Creates a population with the specifed parameters and attach it to this 44 | circus. 45 | """ 46 | if name in self.populations: 47 | raise ValueError("refusing to overwrite existing population: {} " 48 | "".format(name)) 49 | 50 | self.populations[name] = population.Population(circus=self, **population_params) 51 | return self.populations[name] 52 | 53 | def load_population(self, population_id, namespace=None): 54 | """ 55 | Load this population definition add attach it to this circus 56 | """ 57 | 58 | # Defaulting to the namespace associated to this circus if none 59 | # specified 60 | if namespace is None: 61 | namespace = self.name 62 | 63 | loaded = db.load_population(namespace=namespace, 64 | population_id=population_id, circus=self) 65 | self.populations[population_id] = loaded 66 | return loaded 67 | 68 | def create_story(self, name, **story_params): 69 | """ 70 | Creates a story with the provided parameters and attach it to this 71 | circus. 72 | """ 73 | 74 | existing = self.get_story(name) 75 | 76 | if existing is None: 77 | story = Story(name=name, **story_params) 78 | self.stories.append(story) 79 | return story 80 | 81 | else: 82 | raise ValueError("Cannot add story {}: another story with " 83 | "identical name is already in the circus".format(name)) 84 | 85 | def get_story(self, story_name): 86 | """ 87 | Looks up and story by name in this circus and returns it. Returns none 88 | if not found. 
89 | """ 90 | remaining_stories = filter(lambda a: a.name == story_name, self.stories) 91 | try: 92 | return next(remaining_stories) 93 | except StopIteration: 94 | logging.warn("story not found: {}".format(story_name)) 95 | return None 96 | 97 | def get_population_of(self, story_name): 98 | """ 99 | Looks up the initiating population associated to this story 100 | """ 101 | return self.get_story(story_name).triggering_population 102 | 103 | def attach_generator(self, gen_id, generator): 104 | """ 105 | "attach" a random generator to this circus, s.t. it gets persisted 106 | with the rest 107 | """ 108 | if gen_id in self.generators: 109 | raise ValueError("refusing to replace existing generator: {} " 110 | "".format(gen_id)) 111 | 112 | self.generators[gen_id] = generator 113 | 114 | def load_generator(self, gen_type, gen_id): 115 | """ 116 | Load this generator definition add attach it to this circus 117 | """ 118 | gen = db.load_generator( 119 | namespace=self.name, gen_type=gen_type, gen_id=gen_id) 120 | 121 | self.attach_generator(gen_id, gen) 122 | return gen 123 | 124 | @staticmethod 125 | def save_logs(log_id, logs, log_output_folder): 126 | """ 127 | Appends those logs to the corresponding output file, creating it if 128 | it does not exist or appending lines to it otherwise. 129 | """ 130 | 131 | output_file = os.path.join(log_output_folder, "{}.csv".format(log_id)) 132 | 133 | if not os.path.exists(log_output_folder): 134 | os.makedirs(log_output_folder) 135 | 136 | if logs.shape[0] > 0: 137 | logging.info("appending {} rows to {}".format(logs.shape[0], output_file)) 138 | 139 | if not os.path.exists(output_file): 140 | # If these are this first persisted logs, we create the file 141 | # and include the field names as column header. 142 | logs.to_csv(output_file, index=False, header=True) 143 | 144 | else: 145 | # Otherwise, open the existing log file in append mode and add 146 | # the new logs at the end, this time without columns headers. 
    def run(self, duration, log_output_folder, delete_existing_logs=False):
        """
        Executes all stories in the circus for as long as requested.

        :param duration: duration of the desired simulation (start date is
        dictated by the clock)
        :type duration: pd.TimeDelta

        :param log_output_folder: folder where to write the logs.
        :type log_output_folder: string

        :param delete_existing_logs: if True, an existing log folder is
            wiped before the run; if False (default), an existing folder
            makes the run fail fast with EnvironmentError.
        """

        # the clock translates the requested duration into a whole number
        # of simulation steps
        n_iterations = self.clock.n_iterations(duration)
        logging.info("Starting circus for {} iterations of {} for a "
                     "total duration of {}".format(
                         n_iterations, self.clock.step_duration, duration
                     ))

        if os.path.exists(log_output_folder):
            if delete_existing_logs:
                ensure_non_existing_dir(log_output_folder)
            else:
                # refuse to silently append to previous results
                raise EnvironmentError("{} exists and delete_existing_logs is "
                                       "False => refusing to start and "
                                       "overwrite logs".format(log_output_folder))

        for step_number in range(n_iterations):
            logging.info("step : {}".format(step_number))

            # each story may emit several log collections per step; each
            # log_id is appended to its own csv file
            for story in self.stories:
                for log_id, logs in story.execute().items():
                    self.save_logs(log_id, logs, log_output_folder)

            self.clock.increment()
master_seed=config["master_seed"],
205 | **clock_config)
206 | 
207 | for population_id in db.list_populations(namespace=circus_name):
208 | circus.load_population(population_id)
209 | 
210 | for gen_type, gen_id in db.list_generators(namespace=circus_name):
211 | circus.load_generator(gen_type=gen_type, gen_id=gen_id)
212 | 
213 | return circus
214 | 
215 | def save_to_db(self, overwrite=False):
216 | """
217 | Creates a db namespace named after this circus and saves all the
218 | populations there.
219 | 
220 | Only static data is saved, not the stories.
221 | """
222 | 
223 | logging.info("saving circus {}".format(self.name))
224 | 
225 | if db.is_namespace_existing(namespace=self.name):
226 | if overwrite:
227 | logging.warning(
228 | "overwriting existing circus {}".format(self.name))
229 | db.remove_namespace(namespace=self.name)
230 | 
231 | else:
232 | raise IOError("refusing to remove existing {} namespace since "
233 | "overwrite parameter is False".format(self.name))
234 | 
235 | namespace_folder = db.create_namespace(namespace=self.name)
236 | config_file = os.path.join(namespace_folder, "circus_config.json")
237 | with open(config_file, "w") as o:
238 | config = {"master_seed": self.master_seed,
239 | "clock_config": {
240 | "start": self.clock_params["start"].isoformat(),
241 | "step_duration": str(self.clock_params["step_duration"])}
242 | }
243 | json.dump(config, o, indent=4)
244 | 
245 | logging.info("saving all populations")
246 | for population_id, ac in self.populations.items():
247 | db.save_population(ac, namespace=self.name,
248 | population_id=population_id)
249 | 
250 | logging.info("saving all generators")
251 | for gen_id, generator in self.generators.items():
252 | db.save_generator(generator, namespace=self.name, gen_id=gen_id)
253 | 
254 | logging.info("circus saved")
255 | 
256 | def save_params_to_db(self, params_type, params):
257 | """
258 | Saves the params object to the circus folder in the DB for future reference
259 | :param params_type:
"build", "run" or "target" 260 | :param params: the params object 261 | """ 262 | target_file = os.path.join(db.namespace_folder(self.name), 263 | "params_{}.json".format(params_type)) 264 | 265 | with open(target_file, "w") as outfile: 266 | json.dump(params, outfile) 267 | 268 | def description(self): 269 | 270 | return { 271 | "circus_name": self.name, 272 | "master_seed": self.master_seed, 273 | "populations": {id: population.description() 274 | for id, population in self.populations.items() 275 | }, 276 | "generators": {gen_id: gen.description() 277 | for gen_id, gen in self.generators.items() 278 | }, 279 | } 280 | 281 | def __str__(self): 282 | return json.dumps(self.description(), indent=4) 283 | -------------------------------------------------------------------------------- /examples/tutorial/example4.py: -------------------------------------------------------------------------------- 1 | from trumania.core import circus 2 | import trumania.core.population as population 3 | import trumania.core.random_generators as gen 4 | import trumania.core.operations as ops 5 | import trumania.core.story as story 6 | import trumania.components.time_patterns.profilers as profilers 7 | import trumania.core.util_functions as util_functions 8 | import trumania.components.db as DB 9 | import pandas as pd 10 | 11 | # each step?() function below implement one step of the fourth example of the 12 | # tutorial documented at 13 | # https://realimpactanalytics.atlassian.net/wiki/display/LM/Data+generator+tutorial 14 | # this is essentially a modification of example3, with some supplementary 15 | # features demonstrating persistence 16 | 17 | 18 | def build_music_repo(): 19 | 20 | # this time we create a "detached" population, not connected to a circus 21 | repo = population.Population( 22 | circus=None, 23 | size=5, 24 | ids_gen=gen.SequencialGenerator(prefix="GENRE_")) 25 | 26 | repo.create_attribute( 27 | name="genre_name", 28 | init_values=["blues", "jazz", "electro", "pop", 
"rock"]) 29 | 30 | repo.create_relationship(name="songs", seed=18) 31 | 32 | return repo 33 | 34 | 35 | def add_song_to_repo(repo_population): 36 | 37 | songs = population.Population( 38 | circus=None, 39 | size=0, 40 | ids_gen=gen.SequencialGenerator(prefix="SONG_")) 41 | 42 | # since the size of the population is 0, we can create attribute without 43 | # providing any initialization 44 | songs.create_attribute(name="artist_name") 45 | songs.create_attribute(name="song_genre") 46 | songs.create_attribute(name="title") 47 | songs.create_attribute(name="duration_seconds") 48 | songs.create_attribute(name="recording_year") 49 | 50 | song_id_gen = gen.SequencialGenerator(prefix="S_") 51 | 52 | # generate artist names from a list of randomly generated ones, so we have 53 | # some redundancy in the generated dataset 54 | artist_name_gen = gen.NumpyRandomGenerator( 55 | method="choice", 56 | a=gen.FakerGenerator( 57 | method="name", 58 | seed=1234).generate(size=200), 59 | seed=5678) 60 | 61 | title_gen = gen.FakerGenerator(method="sentence", 62 | seed=78961, 63 | nb_words=4, 64 | variable_nb_words=True) 65 | 66 | # generates recording years within a desired date range 67 | year_gen = gen.FakerGenerator( 68 | method="date_time_between_dates", 69 | seed=184, 70 | datetime_start=pd.Timestamp("1910-10-20"), 71 | datetime_end=pd.Timestamp("2016-12-02")) \ 72 | .map(f=lambda d: d.year) 73 | 74 | duration_gen = gen.ParetoGenerator(xmin=60, 75 | seed=9874, 76 | force_int=True, 77 | a=1.2) 78 | 79 | repo_genre_rel = repo_population.get_attribute("genre_name") 80 | for genre_id, genre_name in repo_genre_rel.get_values().items(): 81 | 82 | # an operation capable of creating songs of that genre 83 | init_attribute = ops.Chain( 84 | artist_name_gen.ops.generate(named_as="artist_name"), 85 | title_gen.ops.generate(named_as="title"), 86 | year_gen.ops.generate(named_as="recording_year"), 87 | duration_gen.ops.generate(named_as="duration_seconds"), 88 | 
gen.ConstantGenerator(value=genre_name).ops.generate(named_as="song_genre")
89 | )
90 | 
91 | # dataframe of empty songs: just with one SONG_ID column for now
92 | song_ids = song_id_gen.generate(size=1000)
93 | emtpy_songs = story.Story.init_story_data(
94 | member_id_field_name="SONG_ID",
95 | active_ids=song_ids
96 | )
97 | 
98 | # we can already add the generated songs to the music repo relationship
99 | repo_population.get_relationship("songs").add_grouped_relations(
100 | from_ids=[genre_id],
101 | grouped_ids=[song_ids]
102 | )
103 | 
104 | # here we generate all desired columns in the dataframe
105 | initialized_songs, _ = init_attribute(emtpy_songs)
106 | initialized_songs.drop(["SONG_ID"], axis=1, inplace=True)
107 | 
108 | # this works because the columns of init_attribute match exactly the
109 | # ones of the attributes of the populations
110 | songs.update(initialized_songs)
111 | 
112 | # makes sure year and duration are handled as integers
113 | songs.get_attribute("recording_year").transform_inplace(int)
114 | songs.get_attribute("duration_seconds").transform_inplace(int)
115 | 
116 | return songs
117 | 
118 | 
119 | def build_circus(name):
120 | return circus.Circus(
121 | name=name,
122 | master_seed=12345,
123 | start=pd.Timestamp("1 Jan 2017 00:00"),
124 | step_duration=pd.Timedelta("1h"))
125 | 
126 | 
127 | def add_listener(the_circus):
128 | 
129 | users = the_circus.create_population(
130 | name="user", size=5,
131 | ids_gen=gen.SequencialGenerator(prefix="user_"))
132 | 
133 | users.create_attribute(
134 | name="FIRST_NAME",
135 | init_gen=gen.FakerGenerator(method="first_name",
136 | seed=next(the_circus.seeder)))
137 | users.create_attribute(
138 | name="LAST_NAME",
139 | init_gen=gen.FakerGenerator(method="last_name",
140 | seed=next(the_circus.seeder)))
141 | 
142 | 
143 | def add_listen_and_share_stories_with_details(the_circus):
144 | 
145 | users = the_circus.populations["user"]
146 | 
147 | # using this timer means POS are more likely to trigger a
re-stock during
148 | # day hours rather than at night.
149 | timer_gen = profilers.HighWeekDaysTimerGenerator(
150 | clock=the_circus.clock, seed=next(the_circus.seeder))
151 | 
152 | # this generates activity levels distributed as a "truncated normal
153 | # distribution", i.e. very high and low activities are prevented.
154 | bounded_gaussian_activity_gen = gen.NumpyRandomGenerator(
155 | method="normal",
156 | seed=next(the_circus.seeder),
157 | loc=timer_gen.activity(n=20, per=pd.Timedelta("1 day")),
158 | scale=5
159 | ).map(ops.bound_value(lb=10, ub=30))
160 | 
161 | listen = the_circus.create_story(
162 | name="listen_events",
163 | initiating_population=users,
164 | member_id_field="UID",
165 | 
166 | timer_gen=timer_gen,
167 | activity_gen=bounded_gaussian_activity_gen
168 | )
169 | 
170 | share = the_circus.create_story(
171 | name="share_events",
172 | initiating_population=users,
173 | member_id_field="UID",
174 | 
175 | timer_gen=timer_gen,
176 | activity_gen=bounded_gaussian_activity_gen
177 | )
178 | 
179 | repo = the_circus.populations["music_repository"]
180 | songs = the_circus.populations["songs"]
181 | 
182 | select_genre_and_song = ops.Chain(
183 | 
184 | users.ops.lookup(
185 | id_field="UID",
186 | select={
187 | "FIRST_NAME": "USER_FIRST_NAME",
188 | "LAST_NAME": "USER_LAST_NAME",
189 | }
190 | ),
191 | 
192 | # picks a genre at random
193 | repo.ops.select_one(named_as="GENRE"),
194 | 
195 | # picks a song at random for that genre
196 | repo.get_relationship("songs").ops.select_one(
197 | from_field="GENRE",
198 | named_as="SONG_ID"),
199 | 
200 | # now also reporting details of listened or shared songs
201 | songs.ops.lookup(
202 | id_field="SONG_ID",
203 | select={
204 | "artist_name": "SONG_ARTIST",
205 | "title": "SONG_TITLE",
206 | "recording_year": "SONG_YEAR",
207 | "duration_seconds": "SONG_DURATION",
208 | }
209 | ),
210 | )
211 | 
212 | listen.set_operations(
213 | select_genre_and_song,
214 | ops.FieldLogger("listen_events")
215 | )
216 | 
217
| share.set_operations(
218 | select_genre_and_song,
219 | 
220 | # picks a user this song is shared to
221 | users.ops.select_one(named_as="SHARED_TO_UID"),
222 | 
223 | # note we could post-check when user shared a song to their own uid
224 | # here, in which case we can use DropRow to discard that share event
225 | 
226 | ops.FieldLogger("share_events")
227 | )
228 | 
229 | 
230 | def step1():
231 | 
232 | # this creates 2 populations: music_repo and songs
233 | music_repo = build_music_repo()
234 | songs = add_song_to_repo(music_repo)
235 | 
236 | # saves them to persistence
237 | DB.remove_namespace(namespace="tutorial_example4")
238 | DB.save_population(music_repo, namespace="tutorial_example4",
239 | population_id="music_repository")
240 | DB.save_population(songs, namespace="tutorial_example4",
241 | population_id="songs")
242 | 
243 | # build a new circus, then load and attach the persisted populations to it
244 | example4_circus = build_circus(name="example4_circus")
245 | example4_circus.load_population(namespace="tutorial_example4",
246 | population_id="music_repository")
247 | example4_circus.load_population(namespace="tutorial_example4",
248 | population_id="songs")
249 | 
250 | add_listener(example4_circus)
251 | 
252 | 
253 | def step2():
254 | 
255 | # this creates 2 populations: music_repo and songs
256 | music_repo = build_music_repo()
257 | songs = add_song_to_repo(music_repo)
258 | 
259 | # saves them to persistence
260 | DB.remove_namespace(namespace="tutorial_example4")
261 | DB.save_population(music_repo, namespace="tutorial_example4",
262 | population_id="music_repository")
263 | DB.save_population(songs, namespace="tutorial_example4",
264 | population_id="songs")
265 | 
266 | # build a new circus, then load and attach the persisted populations to it
267 | example4_circus = build_circus(name="example4_circus")
268 | example4_circus.load_population(namespace="tutorial_example4",
269 | population_id="music_repository")
270 | 
example4_circus.load_population(namespace="tutorial_example4",
271 | population_id="songs")
272 | 
273 | add_listener(example4_circus)
274 | 
275 | # This saves the whole circus to persistence, with all its populations,
276 | # relationships, generators,...
277 | # This is independent from the 2 populations saved above: this time we no longer
278 | # have direct control on the namespace: the persistence mechanism uses the
279 | # circus name as namespace
280 | example4_circus.save_to_db(overwrite=True)
281 | 
282 | # example4bis should be an exact deep copy of example4_circus
283 | example4bis = circus.Circus.load_from_db(circus_name="example4_circus")
284 | 
285 | # Stories are not serialized to CSV but rather serialized in code,
286 | # using humans as transducers
287 | add_listen_and_share_stories_with_details(example4bis)
288 | 
289 | example4bis.run(
290 | duration=pd.Timedelta("5 days"),
291 | log_output_folder="output/example4",
292 | delete_existing_logs=True)
293 | 
294 | 
295 | if __name__ == "__main__":
296 | util_functions.setup_logging()
297 | step2()
298 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 | 
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 | 1. Definitions.
8 | 
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 | 
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 | 
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity.
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 | 
189 | Copyright 2020 Riaktr
190 | 
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 | 
195 | http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 | 
--------------------------------------------------------------------------------
/trumania/core/operations.py:
--------------------------------------------------------------------------------
1 | from scipy import stats
2 | from abc import ABCMeta, abstractmethod
3 | import pandas as pd
4 | import numpy as np
5 | from trumania.core.util_functions import merge_dicts, df_concat
6 | import functools
7 | 
8 | 
9 | class Operation(object):
10 | """
11 | An Operation is able to transform input into an output and
12 | produce logs.
13 | """
14 | 
15 | def transform(self, story_data):
16 | """
17 | :param story_data: dataframe as produced by the previous operation
18 | :return: a dataframe that replaces the previous one in the pipeline
19 | """
20 | 
21 | return story_data
22 | 
23 | def emit_logs(self, story_data):
24 | """
25 | This method is used to produce logs (e.g. CDRs, mobility, topus...)
26 | 
27 | :param story_data: output of this operation, as produced by transform()
28 | :return: emitted logs, as a dictionary of {"log_id": some_data_frame}
29 | """
30 | 
31 | return {}
32 | 
33 | def __call__(self, story_data):
34 | 
35 | output = self.transform(story_data)
36 | logs = self.emit_logs(output)
37 | 
38 | return output, logs
39 | 
40 | 
41 | class Chain(Operation):
42 | """
43 | A chain is a list of operations to be executed sequentially
44 | """
45 | 
46 | def __init__(self, *operations):
47 | self.operations = list(operations)
48 | 
49 | def append(self, *operations):
50 | """
51 | adds operations to be executed at the end of this chain
52 | """
53 | self.operations += list(operations)
54 | 
55 | @staticmethod
56 | def _execute_operation(story_data__prev_logs, operation):
57 | """
58 | 
59 | executes this operation and merges its logs with the previous one
60 | :param operation: the operation to call
61 | :return: the merged story data and logs
62 | """
63 | 
64 | (story_data, prev_logs) = story_data__prev_logs
65 | 
66 | output, supp_logs = operation(story_data)
67 | # merging the logs of each operation of this story.
68 | return output, merge_dicts([prev_logs, supp_logs], df_concat)
69 | 
70 | def __call__(self, story_data):
71 | init = [(story_data, {})]
72 | return functools.reduce(self._execute_operation, init + self.operations)
73 | 
74 | 
75 | class FieldLogger(Operation):
76 | """
77 | Log creator that simply selects a set of columns and creates a logged
78 | dataframe from it
79 | """
80 | 
81 | def __init__(self, log_id, cols=None, exploded_cols=None):
82 | """
83 | :param log_id: the id of the logs in the dictionary of logs returned
84 | by the Circus, at the end of the simulation
85 | :param cols: sub-sets of fields from the story data that will be
86 | selected in order to build the logs
87 | :param exploded_cols: name of one or several columns containing list of
88 | values.
If provided, we explode the story_data dataframe and log one per value 89 | in that list (which is more that one line per row in story_data). 90 | In each row, all lists must have the same length 91 | """ 92 | self.log_id = log_id 93 | 94 | if type(exploded_cols) == str: 95 | self.exploded_cols = [exploded_cols] 96 | else: 97 | self.exploded_cols = [] if exploded_cols is None else exploded_cols 98 | 99 | if type(cols) == str: 100 | self.cols = [cols] 101 | else: 102 | self.cols = [] if cols is None else cols 103 | 104 | self.cols += self.exploded_cols 105 | 106 | def emit_logs(self, story_data): 107 | 108 | # explode lists, cf constructor documentation 109 | if self.exploded_cols: 110 | 111 | def explo(df): 112 | explosion_len = len(df[self.exploded_cols[0]]) 113 | df2 = pd.DataFrame( 114 | [df.drop(self.exploded_cols) for _ in range(explosion_len)]) 115 | for col in self.exploded_cols: 116 | df2[col] = df[col] 117 | 118 | return df2 119 | 120 | logged_data = pd.concat(explo(row) 121 | for _, row in story_data.iterrows()) 122 | 123 | else: 124 | logged_data = story_data 125 | 126 | if not self.cols: 127 | return {self.log_id: logged_data} 128 | else: 129 | return {self.log_id: logged_data[self.cols]} 130 | 131 | 132 | class SideEffectOnly(Operation): 133 | """ 134 | Operation that does not produce logs nor supplementary columns: just have 135 | side effect 136 | """ 137 | __metaclass__ = ABCMeta 138 | 139 | def transform(self, story_data): 140 | self.side_effect(story_data) 141 | return story_data 142 | 143 | @abstractmethod 144 | def side_effect(self, story_data): 145 | """ 146 | :param story_data: 147 | :return: nothing 148 | """ 149 | pass 150 | 151 | 152 | class AddColumns(Operation): 153 | """ 154 | Very typical case of an operation that appends (i.e. 
joins) columns to
155 | the previous result
156 | """
157 | __metaclass__ = ABCMeta
158 | 
159 | def __init__(self, join_kind="left"):
160 | self.join_kind = join_kind
161 | 
162 | @abstractmethod
163 | def build_output(self, story_data):
164 | """
165 | Produces a dataframe with one or several columns and an index aligned
166 | with the one of input. The columns of this will be merged with input.
167 | 
168 | :param story_data: current dataframe
169 | :return: the column(s) to append to it, as a dataframe
170 | """
171 | pass
172 | 
173 | def transform(self, story_data):
174 | output = self.build_output(story_data)
175 | # logging.info(" adding column(s) {}".format(output.columns.tolist()))
176 | return pd.merge(left=story_data, right=output,
177 | left_index=True, right_index=True,
178 | how=self.join_kind)
179 | 
180 | 
181 | class DropRow(Operation):
182 | """
183 | Discards any row in the story data where the condition field is True.
184 | """
185 | 
186 | def __init__(self, condition_field):
187 | self.condition_field = condition_field
188 | 
189 | def transform(self, story_data):
190 | return story_data[~story_data[self.condition_field]]
191 | 
192 | 
193 | class Apply(AddColumns):
194 | """
195 | Custom operation adding one single column computed from a user-provided
196 | function.
197 | 198 | The length of the source_fields must match the number columns 199 | in the dataframe expected by the user f function 200 | 201 | """ 202 | 203 | def __init__(self, source_fields, named_as, f, f_args="dataframe"): 204 | """ 205 | :param source_fields: input field from the story data 206 | :param named_as: name of the resulting fields added to the story data 207 | :param f: transforming function 208 | :param f_args: "dataframe" or "columns", depending on the signature 209 | of f: 210 | 211 | - "dataframe": input and output of the function is a dataframe 212 | as many columns as there are values in "named_as" 213 | 214 | - "columns" input of f is a list of columns and output is 1 215 | column (like many numpy built-it function). In that case, 216 | "named_as" can obviously only contain one name 217 | """ 218 | 219 | AddColumns.__init__(self) 220 | if type(source_fields) == str: 221 | self.source_fields = [source_fields] 222 | else: 223 | self.source_fields = source_fields 224 | 225 | if type(named_as) == str: 226 | self.named_as = [named_as] 227 | else: 228 | self.named_as = named_as 229 | 230 | self.f = f 231 | if f_args not in ["dataframe", "series"]: 232 | raise ValueError("unrecognized f input type: {}".format(f_args)) 233 | 234 | if f_args == "series": 235 | assert len(self.named_as) == 1, \ 236 | "'series' functions can only return 1 column" 237 | 238 | self.f_input = f_args 239 | 240 | def build_output(self, story_data): 241 | if self.f_input == "dataframe": 242 | result = self.f(story_data[self.source_fields]) 243 | renamed = result.rename( 244 | columns=dict(zip(result.columns, self.named_as))) 245 | 246 | return renamed 247 | else: 248 | cols = [story_data[c] for c in self.source_fields] 249 | result = pd.DataFrame({self.named_as[0]: self.f(*cols)}) 250 | return result 251 | 252 | 253 | ##################### 254 | # Collection of functions directly usable in Apply 255 | 256 | def copy_if(story_data): 257 | """ 258 | Copies values from the source 
to the "named_as" if the condition is True,
    otherwise inserts NA

    usage:

        Apply(source_fields=["some_condition_field", "some_source_field"],
              named_as="some_result_field",
              f=copy_if)
    """

    # NOTE(review): the code below treats the FIRST column as the condition
    # and the SECOND as the source — callers must order source_fields
    # accordingly (the usage example above reflects that order)
    condition_field, source_field = story_data.columns
    # keep source values where the condition holds, NA elsewhere
    copied = story_data.where(story_data[condition_field])[[source_field]]
    return copied.rename(columns={source_field: "result"})


def bound_value(lb=None, ub=None):
    """
    builds a function that limits the range of a value

    :param lb: lower bound, or None for no lower bound
    :param ub: upper bound, or None for no upper bound
    :return: a function clipping its scalar argument to [lb, ub]
    """

    def _f(value):
        limited = value if lb is None else max(lb, value)
        if ub is not None:
            limited = min(ub, limited)
        return limited

    return _f


def scale(factor):
    """
    Returns a function multiplying its (scalar or vectorized) argument by
    the provided constant factor.
    """
    def _f_vect(value):
        return value * factor

    return _f_vect


def logistic(k, x0=0, L=1):
    """

    Returns a function, usable in an Apply operation, that transforms the
    specified field with a sigmoid with the provided parameters

    :param k: the steepness of the curve
    :param x0: the x-value of the sigmoid's midpoint (default: 0)
    :param L: maximum value of the logistic (default: 1)

    same parameter naming conventions as in:
    https://en.wikipedia.org/wiki/Logistic_function

    usage:
        Apply(source_fields=["some_source_field"],
              named_as="some_result_field",
              f=logistic(k=-0.01, x0=1000))
    """

    def _logistic(x):
        # clamp the exponent at 10 to avoid overflow in np.exp for
        # extreme inputs
        the_exp = np.minimum(-k * (x - x0), 10)
        return L / (1 + np.exp(the_exp))

    return _logistic


def bounded_sigmoid(x_min, x_max, shape, incrementing=True):
    """
    Builds an S-shaped curve that has y values evolving between 0 and 1 over
    the x domain [x_min, x_max]

    This is preferable to the logistic function for cases where we want to
    make sure that the curve actually reaches 0 and 1 at some point
(e.g.
    probability of triggering a "restock" story must be 1 if stock is as
    low as 1).

    See /tests/notebooks/bounded_sigmoid.ipynb for examples

    :param x_min: lower bound of the x domain
    :param x_max: upper bound of the x domain
    :param incrementing: if True, evolve from 0 to 1, or from 1 to 0 otherwise
    :param shape: strictly positive number controlling the shape of the
        resulting function
        * 1 corresponds to a linear transition
        * higher values yield a sharper and sharper, i.e. more
          vertical S shape, converging towards a step function
          transiting at (x_max-x_min)/2 for very large values of S (
          e.g. 10000)
        * values in ]0,1[ yield vertically shaped sigmoids, sharply
          rising/falling at the boundary of the x domain and
          transiting more smoothly in the middle of it.
    """

    # used to clip x to [x_min, x_max] before evaluating the beta cdf/sf
    bounded = bound_value(lb=x_min, ub=x_max)

    def f(x):
        # values outside the sigmoid are just the repetition of what's
        # happening at the boundaries
        x_b = bounded(x)

        # a symmetric beta(shape, shape) cdf over the normalized x gives the
        # desired S shape; sf is its mirror image for the decreasing case
        if incrementing:
            return stats.beta.cdf((x_b - x_min) / (x_max - x_min),
                                  a=shape,
                                  b=shape)
        else:
            return stats.beta.sf((x_b - x_min) / (x_max - x_min),
                                 a=shape,
                                 b=shape)

    return np.frompyfunc(f, 1, 1)


def identity(x):
    """Pass-through transform, usable as a no-op f in Apply."""
    return x
--------------------------------------------------------------------------------
/trumania/core/clock.py:
--------------------------------------------------------------------------------
from __future__ import division

import pandas as pd
import logging
import numpy as np
from numpy.random import RandomState

from trumania.core.operations import AddColumns
from trumania.core.random_generators import DependentGenerator
from trumania.core.util_functions import latest_date_before


class Clock(object):
    """
    A Clock is the central
object managing the evolution of time of the whole circus.
    It's generating timestamps on demand, and provides information for
    TimeProfiler objects.
    """

    def __init__(self, start, step_duration, seed):
        """Create a Clock object.

        :type start: pd.Timestamp
        :param start: instant of start of the generation

        :type step_duration: pd.Timedelta
        :param step_duration: duration of a clock step

        :type seed: int
        :param seed: seed for timestamp generator (if steps are more than
            1 sec)

        :return: a new Clock object, initialised
        """

        self.current_date = start
        self.step_duration = step_duration

        # private RNG dedicated to intra-step timestamp jitter
        self.__state = RandomState(seed)
        self.ops = self.ClockOps(self)

        # objects whose increment() must be called at every clock step
        self.__increment_listeners = []

    def register_increment_listener(self, listener):
        """Add an object to be incremented at each step (such as a
        TimeProfiler)
        """
        self.__increment_listeners.append(listener)

    def increment(self):
        """Increments the clock by 1 step and notifies all registered
        listeners.

        :rtype: NoneType
        :return: None
        """
        self.current_date += self.step_duration

        for listener in self.__increment_listeners:
            listener.increment()

    def get_timestamp(self, size=1, random=True, log_format=None):
        """
        Returns timestamps formatted as string

        :type size: int
        :param size: number of timestamps to generate, default 1

        :type random: boolean
        :param random: if True, the timestamps are randomly drawn at whole-
            second offsets in [self.current_date,
            self.current_date + self.step_duration); otherwise they all
            equal self.current_date

        :type log_format: string
        :param log_format: string format of the generated timestamps
            (default "%Y-%m-%d %H:%M:%S")

        :rtype: Pandas Series
        :return: random timestamps in the form of strings
        """

        if log_format is None:
            log_format = "%Y-%m-%d %H:%M:%S"

        def make_ts(delta_secs):
            # offset the current date by the drawn number of seconds
            date = self.current_date + pd.Timedelta(seconds=delta_secs)
            return date.strftime(log_format)

        if random:
            # draw integer second offsets in [0, step_secs)
            step_secs = int(self.step_duration.total_seconds())
            return pd.Series(self.__state.choice(step_secs, size)).apply(make_ts)
        else:
            return pd.Series([self.current_date.strftime(log_format)] * size)

    def n_iterations(self, duration):
        """
        :type duration: pd.Timedelta

        :return: the smallest number of iteration of this clock s.t. the
            corresponding duration is >= duration
        """
        step_secs = self.step_duration.total_seconds()
        return int(np.ceil(duration.total_seconds() / step_secs))

    class ClockOps(object):
        """Namespace of story operations backed by this clock."""
        def __init__(self, clock):
            self.clock = clock

        class Timestamp(AddColumns):
            """AddColumns operation inserting one timestamp column per row
            of the story data, generated by the clock."""
            def __init__(self, clock, named_as, random, log_format):
                AddColumns.__init__(self)
                self.clock = clock
                self.named_as = named_as
                self.random = random
                self.log_format = log_format

            def build_output(self, story_data):
                # one timestamp per story row, preserving the story index
                values = self.clock.get_timestamp(
                    size=story_data.shape[0], random=self.random,
                    log_format=self.log_format).values

                df = pd.DataFrame({self.named_as: values},
                                  index=story_data.index)
                return df

        def timestamp(self, named_as, random=True, log_format=None):
            """
            Generates a random timestamp within the current time slice
            """
            return self.Timestamp(self.clock, named_as, random, log_format)


class CyclicTimerGenerator(DependentGenerator):
    """A TimeProfiler contains an activity profile over a defined time range.
    It's mostly a super class, normally only its child classes should be used.

    The goal of a TimeProfiler is to keep a track of the expected level of
    activity of users over a cyclic time range.
    It will store a vector with probabilities of activity per time step, as
    well as a cumulative sum of the probabilities starting with the current
    time step.

    This allows to quickly produce random waiting times until the next event
    for the users

    """
    def __init__(self, clock, seed, config):
        """
        This should not be used, only child classes

        :type clock: Clock
        :param clock: the master clock driving this simulator

        :type seed: int
        :param seed: seed for random number generator, default None

        :param config: object exposing start_date, profile and
            profile_time_steps (presumably a CyclicTimerProfile — the only
            attributes read here are those three)

        :return: A new TimeProfiler is created
        """
        DependentGenerator.__init__(self)
        self._state = RandomState(seed)
        self.config = config
        self.clock = clock

        # "macro" time shift: we shift the whole profile n times in the future
        # or the past until it overlaps with the current clock date
        init_date = latest_date_before(
            starting_date=config.start_date,
            upper_bound=clock.current_date,
            time_step=pd.Timedelta(config.profile_time_steps) * len(
                config.profile))

        # Un-scaled weight profile. We artificially add a nan to force the
        # up-scaling to multiply the last element
        profile_idx = pd.date_range(start=init_date,
                                    freq=config.profile_time_steps,
                                    periods=len(config.profile) + 1)
        profile_ser = pd.Series(data=config.profile + [np.nan],
                                index=profile_idx)

        # scaled weight profile, s.t.
one clock step == one profile value
        # NOTE(review): .pad() is a legacy alias of .ffill() in pandas —
        # consider migrating when bumping the pandas version
        profile_ser = profile_ser.resample(rule=clock.step_duration).pad()[:-1]

        self.n_time_bin = profile_ser.shape[0]

        # cumulative distribution of activity over one full cycle
        profile_cdf = (profile_ser / profile_ser.sum()).cumsum()
        self.profile = pd.DataFrame({"cdf": profile_cdf,

                                     # for debugging
                                     "timeframe": np.arange(len(profile_cdf))})

        # "micro" time shift: we step forward along the profile until it is
        # aligned with the current date
        while self.profile.index[0] < clock.current_date:
            self.increment()

        # makes sure we'll get notified when the clock goes forward
        clock.register_increment_listener(self)

    def increment(self):
        """
        Increment the time generator by 1 step.

        This has as effect to move the cdf of one step to the left, decrease
        all values by the value of the original first entry, and placing the
        previous first entry at the end of the cdf, with value 1.
        """

        # shift the whole cdf down by the consumed probability mass
        self.profile["cdf"] -= self.profile["cdf"].iloc[0]

        # rotate: the consumed first bin moves to the end of the cycle
        self.profile = pd.concat([self.profile.iloc[1:], self.profile.iloc[:1]])
        self.profile.loc[self.profile.index[-1], "cdf"] = 1

    def generate(self, observations):
        """Generate random waiting times, based on some observed activity
        levels. The higher the level of activity, the shorter the waiting
        times will be

        :type observations: Pandas Series
        :param observations: contains an array of floats
        :return: Pandas Series
        """

        activities = observations

        # activities less often than once per cycle length
        low_activities = activities.where((activities <= 2) & (activities > 0)).dropna()
        if low_activities.shape[0] > 0:

            draw = self._state.uniform(size=low_activities.shape[0])

            # A uniform [0, 2/activity] yields an expected freqs == 1/activity
            # == average period between story.
            # => n_cycles is the number of full timer cycles from now until
            # next story. It's typically not an integer and possibly be > 1
            # since we have on average less than 1 activity per cycle of this
            # timer.
            n_cycles = 2 * draw / low_activities.values

            # split into the fractional position within a cycle and the
            # whole number of cycles to wait
            timer_slots = n_cycles % 1
            n_cycles_int = n_cycles - timer_slots

            # position within the cycle is mapped to a time bin through the
            # inverse cdf; whole cycles add n_time_bin steps each
            timers = self.profile["cdf"].searchsorted(timer_slots) + \
                self.n_time_bin * n_cycles_int

            low_activity_timer = pd.Series(timers, index=low_activities.index)

        else:
            # NOTE(review): dtype-less empty Series defaults to object dtype
            # and is deprecated in recent pandas — confirm before upgrading
            low_activity_timer = pd.Series()

        high_activities = activities.where(activities > 2).dropna()
        if high_activities.shape[0] > 0:

            # A beta(1, activity-1) will yield expected frequencies of
            # 1/(1+activity-1) == 1/activity == average period between story.
            # This just stops to work for activities < 1, or even close to one
            # => we use the uniform mechanism above for activities <= 2 and
            # rely on betas here for expected frequencies of 2 per cycle or
            # higher
            timer_slots = high_activities.apply(
                lambda activity: self._state.beta(1, activity - 1))

            timers = self.profile["cdf"].searchsorted(timer_slots, side="left")
            high_activity_timer = pd.Series(timers, index=high_activities.index)

        else:
            high_activity_timer = pd.Series()

        all_timers = pd.concat([low_activity_timer, high_activity_timer])

        # Not sure about that one, there seem to be a bias somewhere that
        # systematically generates too large timer. Maybe it's a rounding
        # effect of searchsorted() or so. Or a bug elsewhere ?
        all_timers = all_timers.apply(lambda d: max(0, d - 1))

        # makes sure all_timers is in the same order and with the same index
        # as input observations, even in case of duplicate index values
        all_timers = all_timers.reindex_like(observations)
        return all_timers

    def activity(self, n, per):
        """

        :param n: number of stories
        :param per: time period for that number of stories
        :type per: pd.Timedelta
        :return: the activity level corresponding to the specified number of n
            executions per time period
        """

        # activity is expressed per full profile cycle, so rescale the
        # requested rate from "per" to the profile duration
        scale = self.config.duration().total_seconds() / per.total_seconds()
        activity = n * scale

        # warn when the requested rate is finer than the clock resolution:
        # the clock then cannot emit stories fast enough
        requested_period = pd.Timedelta(seconds=per.total_seconds() / n)
        if requested_period < self.clock.step_duration:
            logging.warning(
                "Warning: Creating activity level for {} stories per "
                "{} => activity is {} but period is {}, which is "
                "shorter than the clock period ({}). This clock "
                "cannot keep up with such rate and less events will be"
                " produced".format(n, per, activity, requested_period,
                                   self.clock.step_duration)
            )

        return activity


class CyclicTimerProfile(object):
    """
    Static parameters of the Timer profile. Separated from the timer gen
    itself to facilitate persistence.

    :type profile: python array
    :param profile: Weight of each period

    :type profile_time_steps: string
    :param profile_time_steps: duration of the time-steps in the profile
        (e.g.
"15min")

    :type start_date: pd.Timestamp
    :param start_date: date of the origin of the specified profile =>
        this is used to align with the values of the clock

    """
    def __init__(self, profile, profile_time_steps, start_date):
        self.start_date = start_date
        self.profile = profile
        self.profile_time_steps = profile_time_steps

    def save_to(self, file_path):
        """
        Persists this profile as a CSV file: one row per profile value,
        plus one row each for start_date and profile_time_steps.
        """

        logging.info("saving timer generator to {}".format(file_path))

        # store the profile values as strings under a 2-level index
        # ("profile", position) so scalars can share the same file layout
        saved_df = pd.DataFrame({("value", "profile"): self.profile},
                                dtype=str).stack()
        saved_df.index = saved_df.index.reorder_levels([1, 0])
        saved_df.loc[("start_date", 0)] = self.start_date
        saved_df.loc[("profile_time_steps", 0)] = self.profile_time_steps
        saved_df.to_csv(file_path)

    @staticmethod
    def load_from(file_path):
        """
        Re-builds a CyclicTimerProfile from a CSV produced by save_to().
        """
        saved_df = pd.read_csv(file_path, index_col=[0, 1])

        # profile values were saved as strings => cast back to float
        profile = saved_df.loc[("profile", slice(None))]\
            .unstack()\
            .astype(float)\
            .tolist()

        profile_time_steps = saved_df.loc["profile_time_steps"].values[0][0]
        start_date = pd.Timestamp(saved_df.loc["start_date"].values[0][0])

        return CyclicTimerProfile(profile, profile_time_steps, start_date)

    def duration(self):
        """
        :return: the total duration corresponding to this time profile
        """

        return len(self.profile) * pd.Timedelta(self.profile_time_steps)
--------------------------------------------------------------------------------