├── tests ├── __init__.py ├── mocks │ ├── __init__.py │ ├── random_generators.py │ └── operations.py ├── notebooks │ ├── __init__.py │ └── pos_stocks.py ├── scenarios │ ├── __init__.py │ └── long_cdr.py └── unit_tests │ ├── __init__.py │ ├── test_circus.py │ ├── test_clock.py │ ├── test_attribute.py │ ├── test_random_generators.py │ ├── test_util_functions.py │ ├── test_operations.py │ ├── test_activity.py │ └── test_populations.py ├── trumania ├── __init__.py ├── core │ ├── __init__.py │ ├── attribute.py │ ├── util_functions.py │ ├── circus.py │ ├── operations.py │ └── clock.py └── components │ ├── __init__.py │ ├── geographies │ ├── __init__.py │ ├── random_geo.py │ └── uganda.py │ ├── social_networks │ ├── __init__.py │ └── erdos_renyi.py │ ├── time_patterns │ ├── __init__.py │ └── profilers.py │ └── db.py ├── examples ├── tutorial │ ├── __init__.py │ └── example4.py ├── presentation │ ├── 01_empty_circus.py │ ├── 02_circus_with_actor.py │ ├── 03_circus_with_story.py │ ├── 05_circus_with_story.py │ ├── 04_circus_with_story.py │ ├── 06_circus_with_story.py │ ├── 08_circus_with_timed_story.py │ └── 07_circus_with_story_and_relationship.py └── datacamp-blogpost │ ├── 01-a-basic-user-population.py │ ├── 02-hello-world-statements.py │ ├── 03-someone-to-say-hello-world-to.py │ ├── 04-you-always-say-that.py │ ├── 05-it-aint-what-yo-do-it-s-the-time-that-you-do-it.py │ └── 06-the-social-network.py ├── .flake8 ├── docs ├── source │ ├── modules.rst │ ├── trumania.rst │ ├── trumania.components.rst │ ├── index.rst │ ├── trumania.components.time_patterns.rst │ ├── trumania.components.social_networks.rst │ ├── trumania.components.geographies.rst │ ├── trumania.core.rst │ └── conf.py ├── REAMDE.md └── Makefile ├── setup.py ├── .gitignore ├── Pipfile ├── README.md └── LICENSE /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/tests/mocks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /trumania/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/notebooks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/scenarios/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/unit_tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /trumania/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/tutorial/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /trumania/components/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /trumania/components/geographies/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /trumania/components/social_networks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/trumania/components/time_patterns/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E121,E126,E131 3 | max-line-length = 120 4 | -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | trumania 2 | ======== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | trumania 8 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | 3 | setup( 4 | name='trumania', 5 | version='1.0', 6 | py_modules=['trumania'] 7 | ) 8 | -------------------------------------------------------------------------------- /docs/source/trumania.rst: -------------------------------------------------------------------------------- 1 | trumania package 2 | ================ 3 | 4 | Module contents 5 | --------------- 6 | 7 | .. 
automodule:: trumania 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /trumania/components/_DB/ 2 | /trumania/components/geographies/source_data/ 3 | tests/tutorial/output/ 4 | 5 | # python 6 | *.pyc 7 | .ipynb_checkpoints 8 | *.egg-info 9 | .idea/ 10 | .cache 11 | explore.ipynb 12 | cdr_output_logs/ 13 | snd_output_logs/ 14 | .venv 15 | /output/ 16 | /venv/ 17 | 18 | # scala 19 | metastore_db 20 | target 21 | *.class 22 | derby.log 23 | run.log 24 | 25 | # mac 26 | .DS_Store 27 | -------------------------------------------------------------------------------- /docs/source/trumania.components.rst: -------------------------------------------------------------------------------- 1 | trumania\.components package 2 | ============================ 3 | 4 | Submodules 5 | ---------- 6 | 7 | trumania\.components\.db module 8 | ------------------------------- 9 | 10 | .. automodule:: trumania.components.db 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. 
automodule:: trumania.components 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.python.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [dev-packages] 7 | jupyter = "*" 8 | flake8 = "*" 9 | 10 | [packages] 11 | networkx = "*" 12 | pandas = "==1.4.0" 13 | numpy = "==1.22.0" 14 | scipy = "*" 15 | pytest = "*" 16 | pytest-metadata = "*" 17 | faker = "*" 18 | pymongo = "*" 19 | "path.py" = "*" 20 | bson = "*" 21 | tabulate = "*" 22 | "e1839a8" = {path = ".", editable = true} 23 | trumania = {editable = true, path = "."} 24 | 25 | [requires] 26 | python_version = "3.9.10" 27 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. trumania documentation master file, created by 2 | sphinx-quickstart on Mon Jan 15 12:02:36 2018. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to trumania's documentation! 7 | ==================================== 8 | 9 | WIKI 10 | ==== 11 | .. 
toctree:: 12 | :maxdepth: 2 13 | 14 | wiki.md 15 | 16 | 17 | Indices and tables 18 | ================== 19 | 20 | * :ref:`genindex` 21 | * :ref:`modindex` 22 | * :ref:`search` 23 | -------------------------------------------------------------------------------- /examples/presentation/01_empty_circus.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | 4 | from trumania.core import circus 5 | import trumania.core.util_functions as util_functions 6 | 7 | util_functions.setup_logging() 8 | 9 | logging.info("building circus") 10 | 11 | example1 = circus.Circus( 12 | name="example1", 13 | master_seed=12345, 14 | start=pd.Timestamp("1 Jan 2017 00:00"), 15 | step_duration=pd.Timedelta("1h")) 16 | 17 | example1.run( 18 | duration=pd.Timedelta("48h"), 19 | log_output_folder="output/example1", 20 | delete_existing_logs=True 21 | ) 22 | -------------------------------------------------------------------------------- /docs/source/trumania.components.time_patterns.rst: -------------------------------------------------------------------------------- 1 | trumania\.components\.time\_patterns package 2 | ============================================ 3 | 4 | Submodules 5 | ---------- 6 | 7 | trumania\.components\.time\_patterns\.profilers module 8 | ------------------------------------------------------ 9 | 10 | .. automodule:: trumania.components.time_patterns.profilers 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. 
automodule:: trumania.components.time_patterns 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /docs/source/trumania.components.social_networks.rst: -------------------------------------------------------------------------------- 1 | trumania\.components\.social\_networks package 2 | ============================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | trumania\.components\.social\_networks\.erdos\_renyi module 8 | ----------------------------------------------------------- 9 | 10 | .. automodule:: trumania.components.social_networks.erdos_renyi 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. automodule:: trumania.components.social_networks 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /docs/REAMDE.md: -------------------------------------------------------------------------------- 1 | # How to generate the wiki and the doc 2 | 3 | You need to have two copies of the trumania repository, one where you work on the code and one where you generate the docs. 4 | The one with the docs should be trumania-docs/html.
5 | The structure should be 6 | ``` 7 | - trumania (repository on the master branch) 8 | - trumania-docs 9 | - html (repository on the branch gh_pages) 10 | ``` 11 | 12 | Once you have the correct structure, go to `trumania/docs` and run the following two commands 13 | ``` 14 | # Only required if the code api changed, it will update the code structure 15 | sphinx-apidoc ../trumania -o source 16 | 17 | # It will update the html pages 18 | make html 19 | ``` 20 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = trumania 8 | SOURCEDIR = source 9 | BUILDDIR = ../../trumania-docs/ 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /tests/mocks/random_generators.py: -------------------------------------------------------------------------------- 1 | from trumania.core.random_generators import Generator 2 | 3 | 4 | class ConstantsMockGenerator(Generator): 5 | """ 6 | For test only: a (non random) Generator returning pre-defined values 7 | """ 8 | def __init__(self, values): 9 | Generator.__init__(self) 10 | self.values = values 11 | 12 | def generate(self, size): 13 | # (value is ignored) 14 | return self.values 15 | 16 | 17 | class MockTimerGenerator(Generator): 18 | """ 19 | For test only: a (non random) Profiler returning pre-defined values 20 | """ 21 | def __init__(self, values_series): 22 | Generator.__init__(self) 23 | self.values_series = values_series 24 | 25 | def generate(self, observations): 26 | # (value is ignored) 27 | return self.values_series[observations.index] 28 | -------------------------------------------------------------------------------- /trumania/components/geographies/random_geo.py: -------------------------------------------------------------------------------- 1 | from trumania.core.circus import Circus 2 | from trumania.core.population import Population 3 | from trumania.core.random_generators import FakerGenerator 4 | 5 | 6 | class WithRandomGeo(Circus): 7 | """ 8 | Circus mix-in that adds the creation of random cells 9 | """ 10 | 11 | def create_random_cells(self, n_cells): 12 | """ 13 | Creation of a basic population for cells, with latitude and longitude 14 | """ 15 | 16 | cells = Population(size=n_cells) 17 | 18 | latitude_generator = FakerGenerator(method="latitude", seed=next(self.seeder)) 19 | longitude_generator = FakerGenerator(method="longitude", seed=next(self.seeder)) 20 | 21 | cells.create_attribute("latitude", init_gen=latitude_generator) 22 | 
cells.create_attribute("longitude", init_gen=longitude_generator) 23 | 24 | return cells 25 | -------------------------------------------------------------------------------- /docs/source/trumania.components.geographies.rst: -------------------------------------------------------------------------------- 1 | trumania\.components\.geographies package 2 | ========================================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | trumania\.components\.geographies\.belgium module 8 | ------------------------------------------------- 9 | 10 | .. automodule:: trumania.components.geographies.belgium 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | trumania\.components\.geographies\.random\_geo module 16 | ----------------------------------------------------- 17 | 18 | .. automodule:: trumania.components.geographies.random_geo 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | trumania\.components\.geographies\.uganda module 24 | ------------------------------------------------ 25 | 26 | .. automodule:: trumania.components.geographies.uganda 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | 32 | Module contents 33 | --------------- 34 | 35 | .. 
automodule:: trumania.components.geographies 36 | :members: 37 | :undoc-members: 38 | :show-inheritance: 39 | -------------------------------------------------------------------------------- /tests/mocks/operations.py: -------------------------------------------------------------------------------- 1 | from trumania.core import operations 2 | 3 | 4 | class FakeOp(operations.Operation): 5 | """ 6 | just returning hard-coded results as output 7 | """ 8 | 9 | def __init__(self, output, logs): 10 | self.output = output 11 | self.logs = logs 12 | 13 | def __call__(self, story_data): 14 | return self.output, self.logs 15 | 16 | 17 | class FakeRecording(operations.Operation): 18 | 19 | def __init__(self): 20 | self.last_seen_population_ids = [] 21 | 22 | def __call__(self, story_data): 23 | self.last_seen_population_ids = story_data.index.tolist() 24 | return story_data, {} 25 | 26 | def reset(self): 27 | self.last_seen_population_ids = [] 28 | 29 | 30 | class MockDropOp(operations.Operation): 31 | """ 32 | simulating an story that drops rows 33 | """ 34 | 35 | def __init__(self, from_idx, to_idx): 36 | self.from_idx = from_idx 37 | self.to_idx = to_idx 38 | 39 | def __call__(self, story_data): 40 | return story_data.iloc[self.from_idx: self.to_idx, :], {} 41 | -------------------------------------------------------------------------------- /examples/datacamp-blogpost/01-a-basic-user-population.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | from tabulate import tabulate 4 | 5 | from trumania.core import circus 6 | from trumania.core.random_generators import SequencialGenerator, FakerGenerator, NumpyRandomGenerator 7 | import trumania.core.util_functions as util_functions 8 | 9 | 10 | util_functions.setup_logging() 11 | 12 | example_circus = circus.Circus(name="example1", 13 | master_seed=12345, 14 | start=pd.Timestamp("1 Jan 2017 00:00"), 15 | step_duration=pd.Timedelta("1h")) 16 | 17 | 
id_gen = SequencialGenerator(prefix="PERSON_") 18 | age_gen = NumpyRandomGenerator(method="normal", loc=3, scale=5, 19 | seed=next(example_circus.seeder)) 20 | name_gen = FakerGenerator(method="name", seed=next(example_circus.seeder)) 21 | 22 | person = example_circus.create_population(name="person", size=1000, ids_gen=id_gen) 23 | person.create_attribute("NAME", init_gen=name_gen) 24 | person.create_attribute("AGE", init_gen=age_gen) 25 | 26 | 27 | logging.info("\n" + 28 | tabulate(person.to_dataframe().head(10), headers='keys', tablefmt='psql') 29 | ) 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /examples/presentation/02_circus_with_actor.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | from tabulate import tabulate 4 | 5 | from trumania.core import circus 6 | import trumania.core.util_functions as util_functions 7 | from trumania.core.random_generators import SequencialGenerator, FakerGenerator, NumpyRandomGenerator 8 | 9 | 10 | util_functions.setup_logging() 11 | 12 | logging.info("building circus") 13 | 14 | example = circus.Circus( 15 | name="example", 16 | master_seed=12345, 17 | start=pd.Timestamp("1 Jan 2017 00:00"), 18 | step_duration=pd.Timedelta("1h")) 19 | 20 | person = example.create_population( 21 | name="person", size=1000, 22 | ids_gen=SequencialGenerator(prefix="PERSON_")) 23 | 24 | person.create_attribute( 25 | "NAME", 26 | init_gen=FakerGenerator(method="name", 27 | seed=next(example.seeder))) 28 | 29 | person.create_attribute( 30 | "age", 31 | init_gen=NumpyRandomGenerator( 32 | method="normal", loc=35, scale=5, 33 | seed=next(example.seeder))) 34 | 35 | example.run( 36 | duration=pd.Timedelta("48h"), 37 | log_output_folder="output/example2", 38 | delete_existing_logs=True) 39 | 40 | logging.info("10 first persons: \n" + tabulate(person.to_dataframe().head(10), 41 | headers='keys', tablefmt='psql')) 42 | 
-------------------------------------------------------------------------------- /tests/scenarios/long_cdr.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from test_cdr import run_cdr_scenario 4 | from trumania.core.util_functions import setup_logging 5 | 6 | # better run this outside of PyCharm for consistent measures... 7 | # 8 | # python tests/scenarios/long_cdr.py 9 | 10 | if __name__ == "__main__": 11 | setup_logging() 12 | logging.info("starting a long CDR test ") 13 | params = { 14 | "time_step": 60, 15 | "n_cells": 200, 16 | "n_agents": 500, 17 | "n_subscribers": 25000, 18 | "average_degree": 20, 19 | "n_iterations": 200 20 | } 21 | 22 | run_cdr_scenario(params) 23 | 24 | """ 25 | result on Svends's laptop: 26 | 27 | total number of logs: 392086 28 | execution times: " 29 | - building the circus: 0 days 00:01:32.156013 30 | - running the simulation: 0 days 00:14:07.355875 31 | 32 | Note: that world is a bit irrealistic: 33 | * 200 clock steps is about 3 hours 34 | * 400k logs for 25k users is 16 actions per persons 35 | 36 | => a bit more than 5 actions per user per hour 37 | 38 | 39 | New result (25 Aug) after adding multi-sim 40 | 41 | total number of logs: 380270 42 | 2016-08-25 15:11:47,849 71736 topups logs 43 | 2016-08-25 15:11:47,849 86 cell_status logs 44 | 2016-08-25 15:11:47,849 154358 voice_cdr logs 45 | 2016-08-25 15:11:47,850 152642 sms_cdr logs 46 | 2016-08-25 15:11:47,850 1448 mobility_logs logs 47 | 2016-08-25 15:11:47,852 48 | execution times: " 49 | - building the circus: 0 days 00:02:43.163336 50 | - running the simulation: 0 days 00:30:58.923819 51 | """ 52 | -------------------------------------------------------------------------------- /tests/notebooks/pos_stocks.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import numpy as np 4 | from scipy import stats 5 | import seaborn as sns 6 | 7 | 8 
| def ev(dist): 9 | 10 | """ 11 | :param dist: dictionary representing a probability distribution 12 | :return: the expected value of this distribution 13 | """ 14 | assert sum(dist.values()) == 1 15 | return sum(dist[v] * v for v in dist.keys()) 16 | 17 | 18 | def post_pad(vect, target_size): 19 | """ 20 | pads the end of this vectors with 0s s.t. it is as long as T_size 21 | """ 22 | return np.pad(vect, [0, target_size - len(vect)], "constant", constant_values=0) 23 | 24 | 25 | def pre_pad(vect, n_pads): 26 | """ 27 | pads the beginning of this vectors with the requested amount of 0s 28 | """ 29 | return np.pad(vect, [n_pads, 0], "constant", constant_values=0) 30 | 31 | 32 | def binom_pmf(n, p): 33 | """ 34 | return a binomial(n,p) pmf 35 | """ 36 | 37 | def _pmf(k): 38 | return stats.binom.pmf(k, n, p) 39 | 40 | return _pmf 41 | 42 | 43 | def build_heatmap(transition_matrix, **kwargs): 44 | """ 45 | convenience method to show a heatmap representing this transition matrix 46 | """ 47 | return sns.heatmap(transition_matrix, 48 | xticklabels=False, 49 | yticklabels=False, 50 | **kwargs) 51 | 52 | 53 | def compute_stationary(transition_matrix): 54 | A = transition_matrix - np.identity(transition_matrix.shape[0]) 55 | 56 | # adding one more constraint force x being a probability vector 57 | prob_const = np.ones([1, transition_matrix.shape[1]]) 58 | A2 = np.concatenate([A, prob_const], axis=0) 59 | 60 | b = np.concatenate([np.zeros([transition_matrix.shape[0], 1]), [[1]]], axis=0) 61 | 62 | x, res, rank, s = np.linalg.lstsq(A2, b) 63 | 64 | return x.T[0], res 65 | -------------------------------------------------------------------------------- /examples/datacamp-blogpost/02-hello-world-statements.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | from tabulate import tabulate 4 | 5 | from trumania.core import circus, operations 6 | from trumania.core.random_generators import 
SequencialGenerator, FakerGenerator, NumpyRandomGenerator, ConstantDependentGenerator, ConstantGenerator 7 | import trumania.core.util_functions as util_functions 8 | 9 | 10 | util_functions.setup_logging() 11 | 12 | example_circus = circus.Circus(name="example", 13 | master_seed=12345, 14 | start=pd.Timestamp("1 Jan 2017 00:00"), 15 | step_duration=pd.Timedelta("1h")) 16 | 17 | id_gen = SequencialGenerator(prefix="PERSON_") 18 | age_gen = NumpyRandomGenerator(method="normal", loc=3, scale=5, 19 | seed=next(example_circus.seeder)) 20 | name_gen = FakerGenerator(method="name", seed=next(example_circus.seeder)) 21 | 22 | person = example_circus.create_population(name="person", size=1000, ids_gen=id_gen) 23 | person.create_attribute("NAME", init_gen=name_gen) 24 | person.create_attribute("AGE", init_gen=age_gen) 25 | 26 | hello_world = example_circus.create_story( 27 | name="hello_world", 28 | initiating_population=example_circus.populations["person"], 29 | member_id_field="PERSON_ID", 30 | timer_gen=ConstantDependentGenerator(value=1) 31 | ) 32 | 33 | hello_world.set_operations( 34 | example_circus.clock.ops.timestamp(named_as="TIME"), 35 | ConstantGenerator(value="hello world").ops.generate(named_as="MESSAGE"), 36 | operations.FieldLogger(log_id="hello") 37 | ) 38 | 39 | example_circus.run( 40 | duration=pd.Timedelta("48h"), 41 | log_output_folder="output/example_scenario", 42 | delete_existing_logs=True 43 | ) 44 | 45 | # -- DEBUG output printout 46 | 47 | df = pd.read_csv("output/example_scenario/hello.csv") 48 | print(df.head(10)) 49 | print(df.tail(10)) 50 | -------------------------------------------------------------------------------- /docs/source/trumania.core.rst: -------------------------------------------------------------------------------- 1 | trumania\.core package 2 | ====================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | trumania\.core\.attribute module 8 | -------------------------------- 9 | 10 | .. 
automodule:: trumania.core.attribute 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | trumania\.core\.circus module 16 | ----------------------------- 17 | 18 | .. automodule:: trumania.core.circus 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | trumania\.core\.clock module 24 | ---------------------------- 25 | 26 | .. automodule:: trumania.core.clock 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | trumania\.core\.operations module 32 | --------------------------------- 33 | 34 | .. automodule:: trumania.core.operations 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | trumania\.core\.population module 40 | --------------------------------- 41 | 42 | .. automodule:: trumania.core.population 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | trumania\.core\.random\_generators module 48 | ----------------------------------------- 49 | 50 | .. automodule:: trumania.core.random_generators 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | trumania\.core\.relationship module 56 | ----------------------------------- 57 | 58 | .. automodule:: trumania.core.relationship 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | trumania\.core\.story module 64 | ---------------------------- 65 | 66 | .. automodule:: trumania.core.story 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | 71 | trumania\.core\.util\_functions module 72 | -------------------------------------- 73 | 74 | .. automodule:: trumania.core.util_functions 75 | :members: 76 | :undoc-members: 77 | :show-inheritance: 78 | 79 | 80 | Module contents 81 | --------------- 82 | 83 | .. 
automodule:: trumania.core 84 | :members: 85 | :undoc-members: 86 | :show-inheritance: 87 | -------------------------------------------------------------------------------- /examples/presentation/03_circus_with_story.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | 4 | from trumania.core import circus 5 | import trumania.core.util_functions as util_functions 6 | from trumania.core.operations import FieldLogger 7 | from trumania.core.random_generators import SequencialGenerator, FakerGenerator, NumpyRandomGenerator 8 | from trumania.core.random_generators import ConstantDependentGenerator, ConstantGenerator 9 | 10 | 11 | util_functions.setup_logging() 12 | 13 | logging.info("building circus") 14 | 15 | 16 | def create_circus_with_population(): 17 | example_circus = circus.Circus( 18 | name="example", 19 | master_seed=12345, 20 | start=pd.Timestamp("1 Jan 2017 00:00"), 21 | step_duration=pd.Timedelta("1h")) 22 | 23 | person = example_circus.create_population( 24 | name="person", size=1000, 25 | ids_gen=SequencialGenerator(prefix="PERSON_")) 26 | 27 | person.create_attribute( 28 | "NAME", 29 | init_gen=FakerGenerator(method="name", 30 | seed=next(example_circus.seeder))) 31 | 32 | person.create_attribute( 33 | "age", 34 | init_gen=NumpyRandomGenerator( 35 | method="normal", loc=3, scale=5, 36 | seed=next(example_circus.seeder))) 37 | 38 | return example_circus 39 | 40 | 41 | example = create_circus_with_population() 42 | 43 | hello_world = example.create_story( 44 | name="hello_world", 45 | initiating_population=example.populations["person"], 46 | member_id_field="PERSON_ID", 47 | 48 | timer_gen=ConstantDependentGenerator(value=1) 49 | ) 50 | 51 | hello_world.set_operations( 52 | example.clock.ops.timestamp(named_as="TIME"), 53 | ConstantGenerator(value="hello world").ops.generate(named_as="MESSAGE"), 54 | FieldLogger(log_id="hello") 55 | ) 56 | 57 | example.run( 58 | 
duration=pd.Timedelta("48h"), 59 | log_output_folder="output/example3", 60 | delete_existing_logs=True 61 | ) 62 | 63 | with open("output/example3/hello.csv") as log: 64 | logging.info("some produced logs: \n\n" + "".join(log.readlines(1000)[:10])) 65 | -------------------------------------------------------------------------------- /trumania/components/social_networks/erdos_renyi.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import logging 3 | import networkx as nx 4 | import trumania.core.random_generators as rg 5 | from trumania.core.circus import Circus 6 | 7 | import pandas as pd 8 | 9 | 10 | class WithErdosRenyi(Circus): 11 | """ 12 | Circus mix-in that provides method to build ER random graph 13 | """ 14 | 15 | def add_er_social_network_relationship(self, population, relationship_name, average_degree): 16 | """ 17 | Adds to this population a relationship from and to its members based an ER random graph 18 | """ 19 | logging.info("Creating the social network ") 20 | 21 | # create a random A to B symmetric relationship 22 | network_weight_gen = rg.ParetoGenerator(xmin=1., a=1.2, seed=next(self.seeder)) 23 | 24 | social_network_values = create_er_social_network( 25 | customer_ids=population.ids, 26 | p=average_degree / len(population.ids), 27 | seed=next(self.seeder)) 28 | 29 | social_network = population.create_relationship(relationship_name) 30 | social_network.add_relations( 31 | from_ids=social_network_values["A"].values, 32 | to_ids=social_network_values["B"].values, 33 | weights=network_weight_gen.generate(social_network_values.shape[0])) 34 | 35 | social_network.add_relations( 36 | from_ids=social_network_values["B"].values, 37 | to_ids=social_network_values["A"].values, 38 | weights=network_weight_gen.generate(social_network_values.shape[0])) 39 | 40 | 41 | def create_er_social_network(customer_ids, p, seed): 42 | """ 43 | 44 | :type customer_ids: list 45 | :param 
customer_ids: list of IDs as defined in the data 46 | :type p: float 47 | :param p: probability of existence of 1 edge 48 | :type seed: int 49 | :param seed: seed for random generator 50 | :rtype: Pandas DataFrame, with two columns (A and B) 51 | :return: all edges in the graph 52 | """ 53 | 54 | return pd.DataFrame.from_records([(customer_ids[e[0]], customer_ids[e[1]]) 55 | for e in nx.fast_gnp_random_graph(len(customer_ids), p, seed).edges()], 56 | columns=["A", "B"]) 57 | -------------------------------------------------------------------------------- /examples/datacamp-blogpost/03-someone-to-say-hello-world-to.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | from tabulate import tabulate 4 | 5 | from trumania.core import circus, operations 6 | from trumania.core.random_generators import SequencialGenerator, FakerGenerator, NumpyRandomGenerator, ConstantDependentGenerator, ConstantGenerator 7 | import trumania.core.util_functions as util_functions 8 | 9 | 10 | util_functions.setup_logging() 11 | 12 | example_circus = circus.Circus(name="example", 13 | master_seed=12345, 14 | start=pd.Timestamp("1 Jan 2017 00:00"), 15 | step_duration=pd.Timedelta("1h")) 16 | 17 | id_gen = SequencialGenerator(prefix="PERSON_") 18 | age_gen = NumpyRandomGenerator(method="normal", loc=3, scale=5, 19 | seed=next(example_circus.seeder)) 20 | name_gen = FakerGenerator(method="name", seed=next(example_circus.seeder)) 21 | 22 | person = example_circus.create_population(name="person", size=1000, ids_gen=id_gen) 23 | person.create_attribute("NAME", init_gen=name_gen) 24 | person.create_attribute("AGE", init_gen=age_gen) 25 | 26 | hello_world = example_circus.create_story( 27 | name="hello_world", 28 | initiating_population=example_circus.populations["person"], 29 | member_id_field="PERSON_ID", 30 | timer_gen=ConstantDependentGenerator(value=1) 31 | ) 32 | 33 | hello_world.set_operations( 34 | 
example_circus.clock.ops.timestamp(named_as="TIME"), 35 | ConstantGenerator(value="hello world").ops.generate(named_as="MESSAGE"), 36 | 37 | example_circus.populations["person"].ops.select_one(named_as="OTHER_PERSON"), 38 | 39 | example_circus.populations["person"] 40 | .ops.lookup(id_field="PERSON_ID", select={"NAME": "EMITTER_NAME"}), 41 | 42 | example_circus.populations["person"] 43 | .ops.lookup(id_field="OTHER_PERSON", select={"NAME": "RECEIVER_NAME"}), 44 | 45 | operations.FieldLogger(log_id="hello_3") 46 | ) 47 | 48 | example_circus.run( 49 | duration=pd.Timedelta("48h"), 50 | log_output_folder="output/example_scenario", 51 | delete_existing_logs=True 52 | ) 53 | 54 | # -- DEBUG output printout 55 | pd.set_option('display.max_columns', 500) 56 | pd.set_option('display.width', 1000) 57 | df = pd.read_csv("output/example_scenario/hello_3.csv") 58 | print(df.head(10)) 59 | print(df.tail(10)) 60 | -------------------------------------------------------------------------------- /tests/unit_tests/test_circus.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import pytest 3 | import pandas as pd 4 | 5 | from trumania.core.random_generators import SequencialGenerator 6 | from trumania.core.circus import Circus 7 | from trumania.components.time_patterns.profilers import DefaultDailyTimerGenerator 8 | 9 | 10 | def test_create_story_get_story_should_work_as_expected(): 11 | 12 | flying = Circus(name="tested_circus", 13 | master_seed=1, 14 | start=pd.Timestamp("8 June 2016"), 15 | step_duration=pd.Timedelta("60s")) 16 | 17 | customers = flying.create_population( 18 | "the_customers", size=100, 19 | ids_gen=SequencialGenerator(prefix="a")) 20 | 21 | mobility_time_gen = DefaultDailyTimerGenerator(flying.clock, seed=1) 22 | 23 | mobility_story = flying.create_story( 24 | name="mobility", 25 | 26 | initiating_population=customers, 27 | member_id_field="A_ID", 28 | 29 | timer_gen=mobility_time_gen, 30 
| ) 31 | 32 | # add and get story by name should work as expected 33 | result = flying.get_story("mobility") 34 | 35 | assert result.name == "mobility" 36 | assert result.member_id_field == mobility_story.member_id_field 37 | 38 | # also retrieving this initiating population of that population 39 | 40 | retrieved_pop = flying.get_population_of("mobility") 41 | 42 | assert retrieved_pop == customers 43 | 44 | 45 | def test_get_non_existing_story_should_return_none(): 46 | 47 | flying = Circus(name="tested_circus", 48 | master_seed=1, 49 | start=pd.Timestamp("8 June 2016"), 50 | step_duration=pd.Timedelta("60s")) 51 | 52 | assert flying.get_story("non_existing_name") is None 53 | 54 | 55 | def test_adding_a_second_story_with_same_name_should_be_refused(): 56 | 57 | flying = Circus(name="tested_circus", 58 | master_seed=1, 59 | start=pd.Timestamp("8 June 2016"), 60 | step_duration=pd.Timedelta("60s")) 61 | 62 | customers = flying.create_population( 63 | name="tested", size=100, 64 | ids_gen=SequencialGenerator(prefix="a")) 65 | 66 | flying.create_story(name="the_story", 67 | initiating_population=customers, 68 | member_id_field="population_id") 69 | 70 | with pytest.raises(ValueError): 71 | flying.create_story(name="the_story", 72 | initiating_population=customers, 73 | member_id_field="population_id") 74 | -------------------------------------------------------------------------------- /examples/presentation/05_circus_with_story.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | 4 | from trumania.core import circus 5 | import trumania.core.util_functions as util_functions 6 | from trumania.core.operations import FieldLogger 7 | from trumania.core.random_generators import SequencialGenerator, FakerGenerator, NumpyRandomGenerator 8 | from trumania.core.random_generators import ConstantDependentGenerator 9 | 10 | 11 | util_functions.setup_logging() 12 | 13 | logging.info("building circus") 
14 | 15 | 16 | def create_circus_with_population(): 17 | example_circus = circus.Circus( 18 | name="example", 19 | master_seed=12345, 20 | start=pd.Timestamp("1 Jan 2017 00:00"), 21 | step_duration=pd.Timedelta("1h")) 22 | 23 | person = example_circus.create_population( 24 | name="person", size=1000, 25 | ids_gen=SequencialGenerator(prefix="PERSON_")) 26 | 27 | person.create_attribute( 28 | "NAME", 29 | init_gen=FakerGenerator(method="name", 30 | seed=next(example_circus.seeder))) 31 | 32 | person.create_attribute( 33 | "age", 34 | init_gen=NumpyRandomGenerator( 35 | method="normal", loc=35, scale=5, 36 | seed=next(example_circus.seeder))) 37 | 38 | return example_circus 39 | 40 | 41 | the_circus = create_circus_with_population() 42 | 43 | hello_world = the_circus.create_story( 44 | name="hello_world", 45 | initiating_population=the_circus.populations["person"], 46 | member_id_field="PERSON_ID", 47 | 48 | timer_gen=ConstantDependentGenerator(value=1) 49 | ) 50 | 51 | hello_world.set_operations( 52 | 53 | # adding a random timestamp, within the current clock step 54 | the_circus.clock.ops.timestamp(named_as="TIME"), 55 | 56 | # message is now a random sentence from Faker 57 | FakerGenerator(method="sentence", 58 | nb_words=6, variable_nb_words=True, 59 | seed=next(the_circus.seeder) 60 | ) 61 | .ops 62 | .generate(named_as="MESSAGE"), 63 | 64 | # selecting a random "other person" 65 | the_circus.populations["person"] 66 | .ops 67 | .select_one(named_as="OTHER_PERSON"), 68 | 69 | # specifying which fields to put in the log 70 | FieldLogger(log_id="hello", 71 | cols=["TIME", "PERSON_ID", "OTHER_PERSON", "MESSAGE"] 72 | ) 73 | 74 | ) 75 | 76 | the_circus.run( 77 | duration=pd.Timedelta("48h"), 78 | log_output_folder="output/example4", 79 | delete_existing_logs=True 80 | ) 81 | 82 | with open("output/example4/hello.csv") as log: 83 | logging.info("some produced logs: \n\n" + "".join(log.readlines(1000)[:10])) 84 | 
-------------------------------------------------------------------------------- /examples/presentation/04_circus_with_story.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | 4 | from trumania.core import circus 5 | import trumania.core.util_functions as util_functions 6 | from trumania.core.operations import FieldLogger 7 | from trumania.core.random_generators import SequencialGenerator, FakerGenerator, NumpyRandomGenerator 8 | from trumania.core.random_generators import ConstantDependentGenerator, ConstantGenerator 9 | 10 | 11 | util_functions.setup_logging() 12 | 13 | logging.info("building circus") 14 | 15 | 16 | def create_circus_with_population(): 17 | example_circus = circus.Circus( 18 | name="example", 19 | master_seed=12345, 20 | start=pd.Timestamp("1 Jan 2017 00:00"), 21 | step_duration=pd.Timedelta("1h")) 22 | 23 | person = example_circus.create_population( 24 | name="person", size=1000, 25 | ids_gen=SequencialGenerator(prefix="PERSON_")) 26 | 27 | person.create_attribute( 28 | "NAME", 29 | init_gen=FakerGenerator(method="name", 30 | seed=next(example_circus.seeder))) 31 | 32 | person.create_attribute( 33 | "age", 34 | init_gen=NumpyRandomGenerator( 35 | method="normal", loc=35, scale=5, 36 | seed=next(example_circus.seeder))) 37 | 38 | return example_circus 39 | 40 | 41 | the_circus = create_circus_with_population() 42 | 43 | hello_world = the_circus.create_story( 44 | name="hello_world", 45 | initiating_population=the_circus.populations["person"], 46 | member_id_field="PERSON_ID", 47 | 48 | timer_gen=ConstantDependentGenerator(value=1) 49 | ) 50 | 51 | hello_world.set_operations( 52 | 53 | # adding a random timestamp, within the current clock step 54 | the_circus.clock.ops.timestamp(named_as="TIME"), 55 | 56 | ConstantGenerator(value="hello world").ops.generate(named_as="MESSAGE"), 57 | 58 | # selecting a random "other person" 59 | 
the_circus.populations["person"].ops.select_one(named_as="OTHER_PERSON"), 60 | 61 | the_circus.populations["person"] 62 | .ops 63 | .lookup(id_field="PERSON_ID", 64 | select={"NAME": "EMITTER_NAME"}), 65 | 66 | the_circus.populations["person"] 67 | .ops 68 | .lookup(id_field="OTHER_PERSON", 69 | select={"NAME": "RECEIVER_NAME"}), 70 | 71 | # specifying which fields to put in the log 72 | FieldLogger(log_id="hello", 73 | cols=["TIME", "PERSON_ID", "OTHER_PERSON", "MESSAGE"]) 74 | 75 | ) 76 | 77 | the_circus.run( 78 | duration=pd.Timedelta("48h"), 79 | log_output_folder="output/example4", 80 | delete_existing_logs=True 81 | ) 82 | 83 | with open("output/example4/hello.csv") as log: 84 | logging.info("some produced logs: \n\n" + "".join(log.readlines(1000)[:10])) 85 | 86 | from tabulate import tabulate 87 | logging.info(tabulate(log.readlines(1000)[:10])) 88 | -------------------------------------------------------------------------------- /examples/datacamp-blogpost/04-you-always-say-that.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | from tabulate import tabulate 4 | 5 | from trumania.core import circus, operations 6 | from trumania.core.random_generators import SequencialGenerator, FakerGenerator, NumpyRandomGenerator, ConstantDependentGenerator, ConstantGenerator 7 | import trumania.core.util_functions as util_functions 8 | 9 | 10 | util_functions.setup_logging() 11 | 12 | example_circus = circus.Circus(name="example", 13 | master_seed=12345, 14 | start=pd.Timestamp("1 Jan 2017 00:00"), 15 | step_duration=pd.Timedelta("1h")) 16 | # person population 17 | 18 | id_gen = SequencialGenerator(prefix="PERSON_") 19 | age_gen = NumpyRandomGenerator(method="normal", loc=3, scale=5, 20 | seed=next(example_circus.seeder)) 21 | name_gen = FakerGenerator(method="name", seed=next(example_circus.seeder)) 22 | 23 | person = example_circus.create_population(name="person", size=1000, 
ids_gen=id_gen) 24 | person.create_attribute("NAME", init_gen=name_gen) 25 | person.create_attribute("AGE", init_gen=age_gen) 26 | 27 | # basic relationship to store people's quote 28 | 29 | quote_generator = FakerGenerator(method="sentence", nb_words=6, variable_nb_words=True, 30 | seed=next(example_circus.seeder)) 31 | 32 | quotes_rel = example_circus.populations["person"].create_relationship("quotes") 33 | 34 | for w in range(4): 35 | quotes_rel.add_relations( 36 | from_ids=person.ids, 37 | to_ids=quote_generator.generate(size=person.size), 38 | weights=w 39 | ) 40 | 41 | # message story 42 | 43 | hello_world = example_circus.create_story( 44 | name="hello_world", 45 | initiating_population=example_circus.populations["person"], 46 | member_id_field="PERSON_ID", 47 | timer_gen=ConstantDependentGenerator(value=1) 48 | ) 49 | 50 | hello_world.set_operations( 51 | example_circus.clock.ops.timestamp(named_as="TIME"), 52 | 53 | example_circus.populations["person"].get_relationship("quotes") 54 | .ops.select_one(from_field="PERSON_ID",named_as="MESSAGE"), 55 | 56 | example_circus.populations["person"].ops.select_one(named_as="OTHER_PERSON"), 57 | 58 | example_circus.populations["person"] 59 | .ops.lookup(id_field="PERSON_ID", select={"NAME": "EMITTER_NAME"}), 60 | 61 | example_circus.populations["person"] 62 | .ops.lookup(id_field="OTHER_PERSON", select={"NAME": "RECEIVER_NAME"}), 63 | 64 | operations.FieldLogger(log_id="hello_4") 65 | ) 66 | 67 | # message story 68 | 69 | example_circus.run( 70 | duration=pd.Timedelta("48h"), 71 | log_output_folder="output/example_scenario", 72 | delete_existing_logs=True 73 | ) 74 | 75 | # -- DEBUG output printout 76 | pd.set_option('display.max_columns', 500) 77 | pd.set_option('display.width', 1000) 78 | df = pd.read_csv("output/example_scenario/hello_4.csv") 79 | print(df.head(10)) 80 | print(df.tail(10)) 81 | -------------------------------------------------------------------------------- 
/examples/presentation/06_circus_with_story.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | 4 | from trumania.core import circus 5 | import trumania.core.util_functions as util_functions 6 | from trumania.core.operations import FieldLogger 7 | from trumania.core.random_generators import SequencialGenerator, FakerGenerator, NumpyRandomGenerator 8 | from trumania.core.random_generators import ConstantDependentGenerator 9 | 10 | 11 | util_functions.setup_logging() 12 | 13 | logging.info("building circus") 14 | 15 | 16 | def create_circus_with_population(): 17 | example_circus = circus.Circus( 18 | name="example", 19 | master_seed=12345, 20 | start=pd.Timestamp("1 Jan 2017 00:00"), 21 | step_duration=pd.Timedelta("1h")) 22 | 23 | person = example_circus.create_population( 24 | name="person", size=1000, 25 | ids_gen=SequencialGenerator(prefix="PERSON_")) 26 | 27 | person.create_attribute( 28 | "NAME", 29 | init_gen=FakerGenerator(method="name", 30 | seed=next(example_circus.seeder))) 31 | 32 | person.create_attribute( 33 | "age", 34 | init_gen=NumpyRandomGenerator( 35 | method="normal", loc=35, scale=5, 36 | seed=next(example_circus.seeder))) 37 | 38 | return example_circus 39 | 40 | 41 | the_circus = create_circus_with_population() 42 | 43 | hello_world = the_circus.create_story( 44 | name="hello_world", 45 | initiating_population=the_circus.populations["person"], 46 | member_id_field="PERSON_ID", 47 | 48 | timer_gen=ConstantDependentGenerator(value=1) 49 | ) 50 | 51 | hello_world.set_operations( 52 | 53 | # adding a random timestamp, within the current clock step 54 | the_circus.clock 55 | .ops 56 | .timestamp(named_as="TIME"), 57 | 58 | # message is now a random sentence from Faker 59 | FakerGenerator(method="sentence", 60 | nb_words=6, variable_nb_words=True, 61 | seed=next(the_circus.seeder) 62 | ) 63 | .ops 64 | .generate(named_as="MESSAGE"), 65 | 66 | # selecting a random "other 
person" 67 | the_circus.populations["person"] 68 | .ops 69 | .select_one(named_as="OTHER_PERSON"), 70 | 71 | the_circus.populations["person"] 72 | .ops 73 | .lookup(id_field="PERSON_ID", 74 | select={"NAME": "EMITTER_NAME"}), 75 | 76 | the_circus.populations["person"] 77 | .ops 78 | .lookup(id_field="OTHER_PERSON", 79 | select={"NAME": "RECEIVER_NAME"}), 80 | 81 | # specifying which fields to put in the log 82 | FieldLogger(log_id="hello", 83 | cols=["TIME", "EMITTER_NAME", "RECEIVER_NAME", "MESSAGE"] 84 | ) 85 | 86 | ) 87 | 88 | the_circus.run( 89 | duration=pd.Timedelta("48h"), 90 | log_output_folder="output/example4", 91 | delete_existing_logs=True 92 | ) 93 | 94 | with open("output/example4/hello.csv") as log: 95 | logging.info("some produced logs: \n\n" + "".join(log.readlines(10)[:10])) 96 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Trumania 2 | 3 | ## Documentation and tutorial 4 | 5 | Trumania is a scenario-based random dataset generator library in python 3. 6 | 7 | A [detailed step-by-step tutorial has is available on Datacamp](https://www.datacamp.com/community/tutorials/generate-data-trumania). 8 | 9 | The [Trumania github page](http://realimpactanalytics.github.io/trumania/) also contains 10 | a detailed documentation of each of the concepts as well as a step-by-step explanation of 4 example scenarios. Those scenarios, and more, are present in the [examples/](examples/) folder in this repository. 11 | 12 | The code pydoc documentation is available [here](http://realimpactanalytics.github.io/trumania/py-modindex.html). 13 | 14 | You can also join the Trumania slack channel: [trumania.slack.com](https://trumania.slack.com) 15 | 16 | ## How to install 17 | 18 | Trumania is not packaged in any special way, the way it is used at the moment is simply to clone the code and install the required dependencies. 
This section describes how to do that.

Pre-requisites:

- If you installed python 3 with homebrew, then the executable is called `python3` and pip is called `pip3`. See [homebrew python documentation](https://docs.brew.sh/Homebrew-and-Python.html) for details
- If you installed python 3 with Conda, make sure you understand how environments work since they might end up conflicting with pipenv environments. See [this ticket](https://github.com/pypa/pipenv/issues/699) for a discussion
- In any case, in order to specify the exact path of the python to be used, you can always specify `--python /path/to/python` among the `pipenv` arguments.

That being said, start by installing `pipenv` if necessary:

```sh
# this could be called "pip", depending on the environment, and must be linked to python 3
pip3 install --user pipenv
```

then install all python dependencies for this project:

```sh
pipenv install --three --python /Library/Frameworks/Python.framework/Versions/3.6/bin/python3.6
```

The steps below mention prefixing the commands with `pipenv run` whenever necessary in order to have access to those python dependencies. Alternatively, you can enter the corresponding virtualenv once with `pipenv shell`, in which case that prefix is no longer necessary. See [https://docs.pipenv.org](https://docs.pipenv.org) for more details about how to use pipenv to handle python dependencies.


## Where and how to create a scenario

To create a scenario, simply create another python project that depends on trumania:

```sh
mkdir -p /path/to/your/project
cd /path/to/your/project

# make sure /path/to/trumania/ is the absolute path where trumania is stored
pipenv install -e /path/to/trumania/
```

You can then create your scenario in python, let's call it `burbanks_and_friends_talking.py`.
In order to execute it, simply launch it from pipenv: 55 | 56 | ```sh 57 | pipenv run python burbanks_and_friends_talking.py 58 | ``` 59 | 60 | ## Contributing 61 | 62 | This section provides a few pointers on how to handle the trumania codebase. 63 | 64 | ### Running Trumania unit tests locally 65 | 66 | ```sh 67 | # make sure you are not inside another pipenv shell when running this 68 | pipenv run py.test -s -v 69 | ``` 70 | 71 | ### Python linting 72 | Run `pipenv run flake8`. If nothing is returned, the correct styling has been applied. 73 | -------------------------------------------------------------------------------- /examples/presentation/08_circus_with_timed_story.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | 4 | from trumania.core import circus 5 | import trumania.core.util_functions as util_functions 6 | from trumania.core.operations import FieldLogger 7 | from trumania.core.random_generators import SequencialGenerator, FakerGenerator, NumpyRandomGenerator 8 | from trumania. 
components.time_patterns.profilers import WorkHoursTimerGenerator 9 | 10 | 11 | util_functions.setup_logging() 12 | 13 | logging.info("building circus") 14 | 15 | 16 | def create_circus_with_population(): 17 | example_circus = circus.Circus( 18 | name="example", 19 | master_seed=12345, 20 | start=pd.Timestamp("1 Jan 2017 00:00"), 21 | step_duration=pd.Timedelta("1h")) 22 | 23 | person = example_circus.create_population( 24 | name="person", size=1000, 25 | ids_gen=SequencialGenerator(prefix="PERSON_")) 26 | 27 | person.create_attribute( 28 | "NAME", 29 | init_gen=FakerGenerator(method="name", 30 | seed=next(example_circus.seeder))) 31 | 32 | person.create_attribute( 33 | "age", 34 | init_gen=NumpyRandomGenerator( 35 | method="normal", loc=35, scale=5, 36 | seed=next(example_circus.seeder))) 37 | 38 | return example_circus 39 | 40 | 41 | the_circus = create_circus_with_population() 42 | 43 | hello_world = the_circus.create_story( 44 | name="hello_world", 45 | initiating_population=the_circus.populations["person"], 46 | member_id_field="PERSON_ID", 47 | 48 | # each population instance is now going to have 10, 20 or 30 49 | # trigger of this story per week 50 | activity_gen=NumpyRandomGenerator( 51 | method="choice", a=[10, 20, 30], 52 | seed=next(the_circus.seeder) 53 | ), 54 | 55 | # story now only tiggers during office hours 56 | timer_gen=WorkHoursTimerGenerator( 57 | clock=the_circus.clock, 58 | seed=next(the_circus.seeder)) 59 | ) 60 | 61 | hello_world.set_operations( 62 | 63 | # adding a random timestamp, within the current clock step 64 | the_circus.clock 65 | .ops 66 | .timestamp(named_as="TIME"), 67 | 68 | # message is now a random sentence from Faker 69 | FakerGenerator(method="sentence", 70 | nb_words=6, variable_nb_words=True, 71 | seed=next(the_circus.seeder) 72 | ) 73 | .ops 74 | .generate(named_as="MESSAGE"), 75 | 76 | # selecting a random "other person" 77 | the_circus.populations["person"] 78 | .ops 79 | .select_one(named_as="OTHER_PERSON"), 80 | 81 | 
the_circus.populations["person"] 82 | .ops 83 | .lookup(id_field="PERSON_ID", 84 | select={"NAME": "EMITTER_NAME"}), 85 | 86 | the_circus.populations["person"] 87 | .ops 88 | .lookup(id_field="OTHER_PERSON", 89 | select={"NAME": "RECEIVER_NAME"}), 90 | 91 | # specifying which fields to put in the log 92 | FieldLogger(log_id="hello", 93 | cols=["TIME", "EMITTER_NAME", "RECEIVER_NAME", "MESSAGE"] 94 | ) 95 | 96 | ) 97 | 98 | the_circus.run( 99 | duration=pd.Timedelta("48h"), 100 | log_output_folder="output/example8", 101 | delete_existing_logs=True 102 | ) 103 | 104 | with open("output/example8/hello.csv") as log: 105 | logging.info("some produced logs: \n\n" + "".join(log.readlines(1000)[:10])) 106 | -------------------------------------------------------------------------------- /examples/datacamp-blogpost/05-it-aint-what-yo-do-it-s-the-time-that-you-do-it.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | from tabulate import tabulate 4 | 5 | from trumania.core import circus, operations 6 | from trumania.core.random_generators import SequencialGenerator, FakerGenerator, NumpyRandomGenerator, ConstantDependentGenerator, ConstantGenerator 7 | import trumania.core.util_functions as util_functions 8 | from trumania.components.time_patterns.profilers import DefaultDailyTimerGenerator 9 | 10 | 11 | util_functions.setup_logging() 12 | 13 | example_circus = circus.Circus(name="example", 14 | master_seed=12345, 15 | start=pd.Timestamp("1 Jan 2017 00:00"), 16 | step_duration=pd.Timedelta("1h")) 17 | # person population 18 | 19 | id_gen = SequencialGenerator(prefix="PERSON_") 20 | age_gen = NumpyRandomGenerator(method="normal", loc=3, scale=5, 21 | seed=next(example_circus.seeder)) 22 | name_gen = FakerGenerator(method="name", seed=next(example_circus.seeder)) 23 | 24 | person = example_circus.create_population(name="person", size=1000, ids_gen=id_gen) 25 | person.create_attribute("NAME", 
init_gen=name_gen) 26 | person.create_attribute("AGE", init_gen=age_gen) 27 | 28 | # basic relationship to store people's quote 29 | 30 | quote_generator = FakerGenerator(method="sentence", nb_words=6, variable_nb_words=True, 31 | seed=next(example_circus.seeder)) 32 | 33 | quotes_rel = example_circus.populations["person"].create_relationship("quotes") 34 | 35 | for w in range(4): 36 | quotes_rel.add_relations( 37 | from_ids=person.ids, 38 | to_ids=quote_generator.generate(size=person.size), 39 | weights=w 40 | ) 41 | 42 | # message story 43 | 44 | story_timer_gen = DefaultDailyTimerGenerator( 45 | clock=example_circus.clock, 46 | seed=next(example_circus.seeder)) 47 | 48 | low_activity = story_timer_gen.activity(n=3, per=pd.Timedelta("1 day")) 49 | med_activity = story_timer_gen.activity(n=10, per=pd.Timedelta("1 day")) 50 | high_activity = story_timer_gen.activity(n=20, per=pd.Timedelta("1 day")) 51 | 52 | activity_gen = NumpyRandomGenerator( 53 | method="choice", 54 | a=[low_activity, med_activity, high_activity], 55 | p=[.2, .7, .1], 56 | seed=next(example_circus.seeder)) 57 | 58 | hello_world = example_circus.create_story( 59 | name="hello_world", 60 | initiating_population=example_circus.populations["person"], 61 | member_id_field="PERSON_ID", 62 | 63 | timer_gen=story_timer_gen, 64 | activity_gen=activity_gen 65 | ) 66 | 67 | hello_world.set_operations( 68 | example_circus.clock.ops.timestamp(named_as="TIME"), 69 | 70 | example_circus.populations["person"].get_relationship("quotes") 71 | .ops.select_one(from_field="PERSON_ID",named_as="MESSAGE"), 72 | 73 | example_circus.populations["person"].ops.select_one(named_as="OTHER_PERSON"), 74 | 75 | example_circus.populations["person"] 76 | .ops.lookup(id_field="PERSON_ID", select={"NAME": "EMITTER_NAME"}), 77 | 78 | example_circus.populations["person"] 79 | .ops.lookup(id_field="OTHER_PERSON", select={"NAME": "RECEIVER_NAME"}), 80 | 81 | operations.FieldLogger(log_id="hello_5") 82 | ) 83 | 84 | # message story 85 
| 86 | example_circus.run( 87 | duration=pd.Timedelta("72h"), 88 | log_output_folder="output/example_scenario", 89 | delete_existing_logs=True 90 | ) 91 | 92 | # -- DEBUG output printout 93 | pd.set_option('display.max_columns', 500) 94 | pd.set_option('display.width', 1000) 95 | df = pd.read_csv("output/example_scenario/hello_5.csv") 96 | print(df.head(10)) 97 | print(df.tail(10)) 98 | -------------------------------------------------------------------------------- /examples/presentation/07_circus_with_story_and_relationship.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | 4 | from trumania.core import circus 5 | import trumania.core.util_functions as util_functions 6 | from trumania.core.operations import FieldLogger 7 | from trumania.core.random_generators import SequencialGenerator, FakerGenerator, NumpyRandomGenerator 8 | from trumania.core.random_generators import ConstantDependentGenerator 9 | 10 | 11 | util_functions.setup_logging() 12 | 13 | logging.info("building circus") 14 | 15 | 16 | def create_circus_with_population(): 17 | example_circus = circus.Circus( 18 | name="example", 19 | master_seed=12345, 20 | start=pd.Timestamp("1 Jan 2017 00:00"), 21 | step_duration=pd.Timedelta("1h")) 22 | 23 | person = example_circus.create_population( 24 | name="person", size=1000, 25 | ids_gen=SequencialGenerator(prefix="PERSON_")) 26 | 27 | person.create_attribute( 28 | "NAME", 29 | init_gen=FakerGenerator(method="name", 30 | seed=next(example_circus.seeder))) 31 | 32 | person.create_attribute( 33 | "age", 34 | init_gen=NumpyRandomGenerator( 35 | method="normal", loc=35, scale=5, 36 | seed=next(example_circus.seeder))) 37 | 38 | return example_circus 39 | 40 | 41 | def add_quotes(the_circus): 42 | 43 | quote_generator = FakerGenerator(method="sentence", 44 | nb_words=6, 45 | variable_nb_words=True, 46 | seed=next(the_circus.seeder)) 47 | 48 | person = the_circus.populations["person"] 
49 | 50 | quotes_rel = person.create_relationship("quotes") 51 | 52 | for w in range(4): 53 | quotes_rel.add_relations( 54 | from_ids=person.ids, 55 | to_ids=quote_generator.generate(size=person.size), 56 | weights=w 57 | ) 58 | 59 | 60 | the_circus = create_circus_with_population() 61 | add_quotes(the_circus) 62 | 63 | hello_world = the_circus.create_story( 64 | name="hello_world", 65 | initiating_population=the_circus.populations["person"], 66 | member_id_field="PERSON_ID", 67 | timer_gen=ConstantDependentGenerator(value=1) 68 | ) 69 | 70 | hello_world.set_operations( 71 | 72 | # adding a random timestamp, within the current clock step 73 | the_circus.clock 74 | .ops 75 | .timestamp(named_as="TIME"), 76 | 77 | # message is now selected from the favourite quotes of the speaker 78 | the_circus.populations["person"].get_relationship("quotes") 79 | .ops 80 | .select_one( 81 | from_field="PERSON_ID", 82 | named_as="MESSAGE"), 83 | 84 | # selecting a random "other person" 85 | the_circus.populations["person"] 86 | .ops 87 | .select_one(named_as="OTHER_PERSON"), 88 | 89 | the_circus.populations["person"] 90 | .ops 91 | .lookup(id_field="PERSON_ID", 92 | select={"NAME": "EMITTER_NAME"}), 93 | 94 | the_circus.populations["person"] 95 | .ops 96 | .lookup(id_field="OTHER_PERSON", 97 | select={"NAME": "RECEIVER_NAME"}), 98 | 99 | # specifying which fields to put in the log 100 | FieldLogger(log_id="hello") 101 | 102 | ) 103 | 104 | the_circus.run( 105 | duration=pd.Timedelta("12h"), 106 | log_output_folder="output/example4", 107 | delete_existing_logs=True 108 | ) 109 | 110 | with open("output/example4/hello.csv") as log: 111 | logging.info("some produced logs: \n\n" + "".join(log.readlines(1000)[:10])) 112 | -------------------------------------------------------------------------------- /trumania/components/time_patterns/profilers.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from trumania.core.clock 
import CyclicTimerGenerator, CyclicTimerProfile 4 | 5 | 6 | class HighWeekDaysTimerGenerator(CyclicTimerGenerator): 7 | """ 8 | Basic CyclicTimerGenerator with a one week period that allocates higher 9 | probabilities to week-day vs week-ends 10 | """ 11 | def __init__(self, clock, seed): 12 | 13 | start_date = pd.Timestamp("6 June 2016 00:00:00") 14 | CyclicTimerGenerator.__init__(self, 15 | clock=clock, 16 | seed=seed, 17 | config=CyclicTimerProfile( 18 | profile=[5., 5., 5., 5., 5., 3., 3.], 19 | profile_time_steps="1D", 20 | start_date=start_date), 21 | ) 22 | 23 | 24 | class WorkHoursTimerGenerator(CyclicTimerGenerator): 25 | """ 26 | Basic CyclicTimerGenerator with a one week period that allocates uniform 27 | probabilities to work hours. 28 | 29 | Work hours happen during week days (Monday to Friday), 30 | and between start_hour and end_hour, both included 31 | 32 | """ 33 | def __init__(self, clock, seed, start_hour=9, end_hour=17): 34 | 35 | assert start_hour >= 0 36 | assert end_hour < 24 37 | assert start_hour <= end_hour 38 | 39 | # if start_hour = 0, before_work is empty 40 | before_work = [0] * start_hour 41 | during_work = [1.] 
* (end_hour - start_hour + 1) 42 | # if end_hour = 23, after_work is empty 43 | after_work = [0] * (23 - end_hour) 44 | 45 | # the sum of before_work, during_work and after_work is always 24 46 | week_day_profile = before_work + during_work + after_work 47 | weekend_day_profile = [0] * 24 48 | 49 | week_profile = week_day_profile * 5 + weekend_day_profile * 2 50 | 51 | start_date = pd.Timestamp("6 June 2016 00:00:00") 52 | CyclicTimerGenerator.__init__(self, 53 | clock=clock, 54 | seed=seed, 55 | config=CyclicTimerProfile( 56 | profile=week_profile, 57 | profile_time_steps="1h", 58 | start_date=start_date)) 59 | 60 | 61 | class DefaultDailyTimerGenerator(CyclicTimerGenerator): 62 | """ 63 | Basic CyclicTimerGenerator with a one dat period with hourly weights 64 | vaguely inspired from 65 | 66 | https://github.com/RealImpactAnalytics/lab-home-work-detection/blob/3bacb58a53f69824102437a27218149f75d322e2/pub/chimayblue/01%20basic%20exploration.ipynb 67 | 68 | """ 69 | def __init__(self, clock, seed): 70 | # any date starting at midnight is ok... 
71 | start_date = pd.Timestamp("6 June 2016 00:00:00") 72 | CyclicTimerGenerator.__init__(self, 73 | clock=clock, 74 | seed=seed, 75 | config=CyclicTimerProfile( 76 | profile=[1, .5, .2, .15, .2, .4, 3.8, 77 | 7.2, 8.4, 9.1, 9.0, 8.3, 8.1, 78 | 7.7, 7.4, 7.8, 8.0, 7.9, 9.7, 79 | 10.4, 10.5, 8.8, 5.7, 2.8], 80 | profile_time_steps="1h", 81 | start_date=start_date, 82 | ), 83 | ) 84 | -------------------------------------------------------------------------------- /tests/unit_tests/test_clock.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from trumania.core.clock import CyclicTimerProfile, CyclicTimerGenerator 4 | from trumania.core.clock import Clock 5 | from trumania.components.time_patterns.profilers import DefaultDailyTimerGenerator 6 | 7 | 8 | def test_clock_tick_per_day(): 9 | 10 | clock = Clock(start=pd.Timestamp("10 June 2016 5:45pm"), 11 | step_duration=pd.Timedelta("15 min"), 12 | seed=1234) 13 | 14 | # time steps is 900 s, i.e 15 min 15 | assert clock.n_iterations(pd.Timedelta("7D")) == 7 * 24 * 4 16 | assert clock.n_iterations(pd.Timedelta("1D")) == 24 * 4 17 | 18 | # 47 min should be rounded up to 4 quarters 19 | assert clock.n_iterations(pd.Timedelta("47min")) == 4 20 | 21 | 22 | def test_init_cyclictimergenerator(): 23 | 24 | # say we have a clock at 5.45pm on 10th June 25 | clock = Clock(start=pd.Timestamp("10 June 2016 5:45pm"), 26 | # time steps by 15min 27 | step_duration=pd.Timedelta("15 min"), 28 | seed=1234) 29 | 30 | # 1 to 12 then 12 to 1, from midnight to midnight 31 | timer_gen = CyclicTimerGenerator( 32 | clock=clock, 33 | config=CyclicTimerProfile( 34 | profile=list(range(1, 13)) + list(range(12, 0, -1)), 35 | profile_time_steps="1H", 36 | start_date=pd.Timestamp("1 January 2014 00:00:00"), 37 | ), 38 | seed=1234 39 | ) 40 | 41 | # after the initialization, the 1h time delta of the profile should have 42 | # been aligned to the 15min of the clock 43 | assert 
timer_gen.profile.index.shape[0] == 24 * 4 44 | 45 | # the first index should be shifted to the time of the clock 46 | assert timer_gen.profile.index[0] == pd.Timestamp("10 June 2016 5:45pm") 47 | 48 | 49 | def test_DefaultDailyTimerGenerator_should_be_initialized_correctly(): 50 | 51 | clock = Clock(start=pd.Timestamp("12 Sept 2016"), 52 | step_duration=pd.Timedelta("60 s"), 53 | seed=1234) 54 | 55 | daily = DefaultDailyTimerGenerator(clock=clock, seed=1234) 56 | 57 | assert daily.profile.index[0] == pd.Timestamp("12 Sept 2016") 58 | 59 | 60 | def test_cyclic_timer_profile_should_compute_duration_correct(): 61 | 62 | tested = CyclicTimerProfile( 63 | profile=[10, 20, 10, 40], 64 | profile_time_steps="2h", 65 | start_date=pd.Timestamp("21 March 1956") 66 | ) 67 | 68 | assert tested.duration() == pd.Timedelta("8h") 69 | 70 | 71 | def test_activity_level_should_be_scaled_according_to_profile_duration(): 72 | 73 | clock = Clock(start=pd.Timestamp("10 June 2016 5:45pm"), 74 | # time steps by 15min 75 | step_duration=pd.Timedelta("1 h"), 76 | seed=1234) 77 | 78 | # 1 to 12 then 12 to 1, from midnight to midnight 79 | one_day_timer = CyclicTimerGenerator( 80 | clock=clock, 81 | config=CyclicTimerProfile( 82 | profile=list(range(24)), 83 | profile_time_steps="1H", 84 | start_date=pd.Timestamp("1 January 2014 00:00:00"), 85 | ), 86 | seed=1234 87 | ) 88 | 89 | # 14 actions/week should be scaled to activity 2 since profile lasts 1 day 90 | assert 2 == one_day_timer.activity(n=14, per=pd.Timedelta("7 days")) 91 | 92 | # this one should generate a warning log since the corresponding freq 93 | # is shorter than the clock step 94 | assert 48 == one_day_timer.activity(n=4, per=pd.Timedelta("2h")) 95 | 96 | assert .5 == one_day_timer.activity(n=1, per=pd.Timedelta("2 days")) 97 | 98 | assert .5 == one_day_timer.activity(n=.25, per=pd.Timedelta("12h")) 99 | 100 | assert 1. 
/ 360 - one_day_timer.activity( 101 | n=1, per=pd.Timedelta("360 days")) < 1e-10 102 | -------------------------------------------------------------------------------- /examples/datacamp-blogpost/06-the-social-network.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | from tabulate import tabulate 4 | 5 | from trumania.core.circus import Circus 6 | from trumania.core import circus, operations 7 | from trumania.core.random_generators import SequencialGenerator, FakerGenerator, NumpyRandomGenerator, ConstantDependentGenerator, ConstantGenerator 8 | import trumania.core.util_functions as util_functions 9 | from trumania.components.time_patterns.profilers import DefaultDailyTimerGenerator 10 | from trumania.components.social_networks.erdos_renyi import WithErdosRenyi 11 | 12 | util_functions.setup_logging() 13 | 14 | class Calling_scenario(WithErdosRenyi, Circus): 15 | 16 | 17 | def __init__(self): 18 | 19 | Circus.__init__(self, 20 | name="example", 21 | master_seed=12345, 22 | start=pd.Timestamp("1 Jan 2017 00:00"), 23 | step_duration=pd.Timedelta("1h")) 24 | 25 | self._add_person_population() 26 | 27 | self.add_er_social_network_relationship( 28 | self.populations["person"], 29 | relationship_name="friends", 30 | average_degree=20) 31 | 32 | self._add_message_story() 33 | 34 | def _add_person_population(self): 35 | 36 | id_gen = SequencialGenerator(prefix="PERSON_") 37 | age_gen = NumpyRandomGenerator(method="normal", loc=3, scale=5, 38 | seed=next(self.seeder)) 39 | name_gen = FakerGenerator(method="name", seed=next(self.seeder)) 40 | 41 | person = self.create_population(name="person", size=1000, ids_gen=id_gen) 42 | person.create_attribute("NAME", init_gen=name_gen) 43 | person.create_attribute("AGE", init_gen=age_gen) 44 | 45 | quote_generator = FakerGenerator(method="sentence", nb_words=6, variable_nb_words=True, 46 | seed=next(self.seeder)) 47 | 48 | quotes_rel = 
self.populations["person"].create_relationship("quotes") 49 | 50 | for w in range(4): 51 | quotes_rel.add_relations( 52 | from_ids=person.ids, 53 | to_ids=quote_generator.generate(size=person.size), 54 | weights=w 55 | ) 56 | 57 | def _add_message_story(self): 58 | 59 | story_timer_gen = DefaultDailyTimerGenerator( 60 | clock=self.clock, 61 | seed=next(self.seeder)) 62 | 63 | low_activity = story_timer_gen.activity(n=3, per=pd.Timedelta("1 day")) 64 | med_activity = story_timer_gen.activity(n=10, per=pd.Timedelta("1 day")) 65 | high_activity = story_timer_gen.activity(n=20, per=pd.Timedelta("1 day")) 66 | 67 | activity_gen = NumpyRandomGenerator( 68 | method="choice", 69 | a=[low_activity, med_activity, high_activity], 70 | p=[.2, .7, .1], 71 | seed=next(self.seeder)) 72 | 73 | hello_world = self.create_story( 74 | name="hello_world", 75 | initiating_population=self.populations["person"], 76 | member_id_field="PERSON_ID", 77 | 78 | timer_gen=story_timer_gen, 79 | activity_gen=activity_gen 80 | ) 81 | 82 | hello_world.set_operations( 83 | self.clock.ops.timestamp(named_as="TIME"), 84 | 85 | self.populations["person"].get_relationship("quotes") 86 | .ops.select_one(from_field="PERSON_ID",named_as="MESSAGE"), 87 | 88 | self.populations["person"] 89 | .get_relationship("friends") 90 | .ops.select_one(from_field="PERSON_ID", named_as="OTHER_PERSON"), 91 | 92 | self.populations["person"] 93 | .ops.lookup(id_field="PERSON_ID", select={"NAME": "EMITTER_NAME"}), 94 | 95 | self.populations["person"] 96 | .ops.lookup(id_field="OTHER_PERSON", select={"NAME": "RECEIVER_NAME"}), 97 | 98 | operations.FieldLogger(log_id="hello_6") 99 | ) 100 | 101 | # message story 102 | example = Calling_scenario() 103 | 104 | example.run( 105 | duration=pd.Timedelta("72h"), 106 | log_output_folder="output/example_scenario", 107 | delete_existing_logs=True 108 | ) 109 | 110 | # -- DEBUG output printout 111 | pd.set_option('display.max_columns', 500) 112 | pd.set_option('display.width', 1000) 113 
| df = pd.read_csv("output/example_scenario/hello_6.csv") 114 | print(df.head(10)) 115 | print(df.tail(10)) 116 | -------------------------------------------------------------------------------- /tests/unit_tests/test_attribute.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | 3 | import path 4 | import pandas as pd 5 | import os 6 | 7 | from trumania.core.random_generators import SequencialGenerator 8 | from trumania.core.circus import Circus 9 | from trumania.core.population import Population, Attribute 10 | 11 | tc = Circus("c", master_seed=1234, start=pd.Timestamp("1 Jan 2011"), 12 | step_duration=pd.Timedelta("1h")) 13 | 14 | 15 | def test_set_and_read_values_in_attribute_should_be_equal(): 16 | 17 | population = Population(circus=None, size=5, 18 | ids_gen=SequencialGenerator(prefix="abc", max_length=1)) 19 | 20 | tested = Attribute(population, init_values=[10, 20, 30, 40, 50]) 21 | 22 | assert tested.get_values(["abc0"]).tolist() == [10] 23 | assert tested.get_values(["abc0", "abc3", "abc1"]).tolist() == [10, 40, 20] 24 | 25 | # getting no id should return empty list 26 | assert tested.get_values([]).tolist() == [] 27 | 28 | 29 | def test_updated_and_read_values_in_attribute_should_be_equal(): 30 | population = Population(circus=tc, size=5, ids_gen=SequencialGenerator( 31 | prefix="abc", max_length=1)) 32 | tested = Attribute(population, init_values=[10, 20, 30, 40, 50]) 33 | 34 | tested.update(pd.Series([22, 44], index=["abc1", "abc3"])) 35 | 36 | # value of a should untouched 37 | assert tested.get_values(["abc0"]).tolist() == [10] 38 | 39 | # arbitrary order should not be impacted 40 | assert tested.get_values(["abc0", "abc3", "abc1"]).tolist() == [10, 44, 22] 41 | 42 | 43 | def test_updating_non_existing_population_ids_should_add_them(): 44 | population = Population(circus=tc, size=5, ids_gen=SequencialGenerator( 45 | prefix="abc", max_length=1)) 46 | tested = Attribute(population, 
init_values=[10, 20, 30, 40, 50]) 47 | 48 | tested.update(pd.Series([22, 1000, 44], index=["abc1", "not_yet_there", "abc3"])) 49 | 50 | assert tested.get_values(["not_yet_there", "abc0", "abc3", "abc4"]).tolist() == [1000, 10, 44, 50] 51 | 52 | 53 | def test_initializing_attribute_from_relationship_must_have_a_value_for_all(): 54 | 55 | population = Population(circus=tc, size=5, ids_gen=SequencialGenerator( 56 | prefix="abc", max_length=1)) 57 | oneto1 = population.create_relationship("rel") 58 | oneto1.add_relations(from_ids=["abc0", "abc1", "abc2", "abc3", "abc4"], 59 | to_ids=["ta", "tb", "tc", "td", "te"]) 60 | 61 | attr = Attribute(population, init_relationship="rel") 62 | 63 | expected = pd.DataFrame({"value": ["ta", "tb", "tc", "td", "te"]}, 64 | index=["abc0", "abc1", "abc2", "abc3", "abc4"]) 65 | 66 | assert attr._table.sort_index().equals(expected) 67 | 68 | 69 | def test_overwrite_attribute(): 70 | 71 | population = Population(circus=tc, size=10, 72 | ids_gen=SequencialGenerator(prefix="u_", max_length=1)) 73 | 74 | ages = [10, 20, 40, 10, 100, 98, 12, 39, 76, 23] 75 | age_attr = population.create_attribute("age", init_values=ages) 76 | 77 | # before modification 78 | ages = age_attr.get_values(["u_0", "u_4", "u_9"]).tolist() 79 | assert ages == [10, 100, 23] 80 | 81 | story_data = pd.DataFrame({ 82 | # id of the populations to update 83 | "A_ID": ["u_4", "u_0"], 84 | 85 | # new values to copy 86 | "new_ages": [34, 30]}, 87 | 88 | # index of the story data has, in general, nothing to do with the 89 | # updated population 90 | index=["cust_1", "cust_2"] 91 | ) 92 | 93 | update = age_attr.ops.update( 94 | member_id_field="A_ID", 95 | copy_from_field="new_ages" 96 | ) 97 | 98 | _, logs = update(story_data) 99 | 100 | assert logs == {} 101 | # before modification 102 | ages = age_attr.get_values(["u_0", "u_4", "u_9"]).tolist() 103 | assert ages == [30, 34, 23] 104 | 105 | 106 | def test_added_and_read_values_in_attribute_should_be_equal(): 107 | population = 
Population(circus=tc, size=5, 108 | ids_gen=SequencialGenerator(prefix="abc", max_length=1)) 109 | tested = Attribute(population, init_values=[10, 20, 30, 40, 50]) 110 | 111 | tested.add(["abc1", "abc3"], [22, 44]) 112 | 113 | assert tested.get_values(["abc0", "abc1", "abc2", "abc3", "abc4"]).tolist() == [10, 20 + 22, 30, 40 + 44, 50] 114 | 115 | 116 | def test_adding_several_times_to_the_same_from_should_pile_up(): 117 | population = Population(circus=tc, size=5, 118 | ids_gen=SequencialGenerator(prefix="abc", max_length=1)) 119 | tested = Attribute(population, init_values=[10, 20, 30, 40, 50]) 120 | 121 | tested.add(["abc1", "abc3", "abc1"], [22, 44, 10]) 122 | 123 | assert tested.get_values(["abc0", "abc1", "abc2", "abc3", "abc4"]).tolist() == [10, 20 + 22 + 10, 30, 40 + 44, 50] 124 | 125 | 126 | def test_io_round_trip(): 127 | 128 | with tempfile.TemporaryDirectory() as root_dir: 129 | 130 | population = Population(circus=tc, size=5, 131 | ids_gen=SequencialGenerator(prefix="abc", max_length=1)) 132 | orig = Attribute(population, init_values=[10, 20, 30, 40, 50]) 133 | 134 | full_path = os.path.join(root_dir, "attribute.csv") 135 | 136 | orig.save_to(full_path) 137 | retrieved = Attribute.load_from(full_path) 138 | 139 | assert orig._table.equals(retrieved._table) 140 | -------------------------------------------------------------------------------- /trumania/components/db.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is just the provider of the IO methods save and retrieve various 3 | simulation components to/from persistence. 4 | 5 | A namespace defines a place where to put objects that belong together 6 | (typically, from the same scenario or component, e.g. "Uganda"). 7 | 8 | """ 9 | 10 | # TODO: we should store this elsewhere than in the git repo... 11 | 12 | # TODO: would be cool to also be able to store empirical probability 13 | # distribution here, for the random generators... 
14 | 15 | import pandas as pd 16 | import os 17 | 18 | from trumania.core.util_functions import ensure_folder_exists, ensure_non_existing_dir 19 | from trumania.core.population import Population 20 | import trumania.core.clock as clock 21 | from trumania.core.random_generators import Generator, NumpyRandomGenerator 22 | 23 | 24 | def save_population(population, namespace, population_id): 25 | population.save_to(population_folder(namespace, population_id)) 26 | 27 | 28 | def load_population(namespace, population_id, circus): 29 | return Population.load_from(population_folder(namespace, population_id), circus) 30 | 31 | 32 | def list_populations(namespace): 33 | folder = _population_folder(namespace) 34 | return [d for d in os.listdir(folder) 35 | if os.path.isdir(os.path.join(folder, d))] 36 | 37 | 38 | def save_generator(generator, namespace, gen_id): 39 | 40 | output_folder = _gen_folder(namespace=namespace, 41 | gen_type=generator.__class__.__name__,) 42 | 43 | ensure_folder_exists(output_folder) 44 | generator.save_to(json_item_path(output_folder, gen_id)) 45 | 46 | 47 | def list_generators(namespace): 48 | folder = _generators_folder(namespace) 49 | if not os.path.exists(folder): 50 | return [] 51 | 52 | def _list(): 53 | for gen_type in os.listdir(folder): 54 | for gen_file in os.listdir(os.path.join(folder, gen_type)): 55 | gen_id = gen_file.split(".")[0] 56 | yield [gen_type, gen_id] 57 | 58 | return list(_list()) 59 | 60 | 61 | def load_generator(namespace, gen_type, gen_id): 62 | 63 | input_file = json_item_path( 64 | _gen_folder(namespace=namespace, gen_type=gen_type), gen_id) 65 | 66 | return Generator.load_generator(gen_type, input_file) 67 | 68 | 69 | # TODO: this can now be refactored to save as NumpyGenerator, togheter with 70 | # its state 71 | def save_timer_gen(timer_gen, namespace, timer_gen_id): 72 | 73 | timer_gen_folder = _timer_gens_root_folder(namespace) 74 | ensure_folder_exists(timer_gen_folder) 75 | 
timer_gen.save_to(csv_item_path(timer_gen_folder, timer_gen_id)) 76 | 77 | 78 | def load_timer_gen_config(namespace, timer_gen_id): 79 | timer_gen_folder = _timer_gens_root_folder(namespace) 80 | 81 | return clock.CyclicTimerProfile.load_from( 82 | csv_item_path(timer_gen_folder, timer_gen_id)) 83 | 84 | 85 | def save_empirical_discrete_generator(distribution, values, namespace, gen_id): 86 | assert distribution.sum() - 1 < 1e-6 87 | 88 | root_folder = _empirical_discrete_gen_folder(namespace) 89 | ensure_folder_exists(root_folder) 90 | gen_file_path = csv_item_path(root_folder, gen_id) 91 | 92 | df = pd.DataFrame({ 93 | "px": distribution, 94 | }, index=pd.Series(values, name="x")) 95 | 96 | df.to_csv(gen_file_path, index=True) 97 | 98 | 99 | def load_empirical_discrete_generator(namespace, gen_id, seed): 100 | root_folder = _empirical_discrete_gen_folder(namespace) 101 | gen_file_path = os.path.join(root_folder, "%s.csv" % gen_id) 102 | df = pd.read_csv(gen_file_path) 103 | 104 | gen = NumpyRandomGenerator( 105 | method="choice", 106 | a=df["x"].tolist(), 107 | p=df["px"].tolist(), 108 | seed=seed) 109 | 110 | return gen 111 | 112 | 113 | def is_namespace_existing(namespace): 114 | return os.path.exists(namespace_folder(namespace)) 115 | 116 | 117 | def namespace_folder(namespace): 118 | return os.path.join(_db_folder(), namespace) 119 | 120 | 121 | def create_namespace(namespace): 122 | folder = namespace_folder(namespace) 123 | if not os.path.exists(folder): 124 | os.makedirs(folder) 125 | return folder 126 | 127 | 128 | def remove_namespace(namespace): 129 | ensure_non_existing_dir(namespace_folder(namespace)) 130 | 131 | 132 | def _population_folder(namespace): 133 | return os.path.join(namespace_folder(namespace), "populations") 134 | 135 | 136 | def population_folder(namespace, population_id): 137 | return os.path.join(_population_folder(namespace), population_id) 138 | 139 | 140 | def _generators_folder(namespace): 141 | return os.path.join( 142 | 
namespace_folder(namespace), 143 | "generators") 144 | 145 | 146 | def _gen_folder(namespace, gen_type): 147 | return os.path.join(_generators_folder(namespace), gen_type) 148 | 149 | 150 | def csv_item_path(folder, item_id): 151 | return os.path.join(folder, "{}.csv".format(item_id)) 152 | 153 | 154 | def json_item_path(folder, item_id): 155 | return os.path.join(folder, "{}.json".format(item_id)) 156 | 157 | 158 | def _timer_gens_root_folder(namespace): 159 | return os.path.join( 160 | _generators_folder(namespace), 161 | "timer_gens") 162 | 163 | 164 | def _empirical_discrete_gen_folder(namespace): 165 | return os.path.join( 166 | _generators_folder(namespace), 167 | "empirical_discrete_gens") 168 | 169 | 170 | def _db_folder(): 171 | this_folder = os.path.dirname(os.path.realpath(__file__)) 172 | return os.path.join(this_folder, "_DB") 173 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # trumania documentation build configuration file, created by 5 | # sphinx-quickstart on Mon Jan 15 12:02:36 2018. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
19 | # 20 | # import os 21 | # import sys 22 | # sys.path.insert(0, os.path.abspath('.')) 23 | 24 | 25 | # -- General configuration ------------------------------------------------ 26 | 27 | # If your documentation needs a minimal Sphinx version, state it here. 28 | # 29 | # needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = ['sphinx.ext.autodoc', 35 | 'sphinx.ext.doctest', 36 | 'sphinx.ext.coverage', 37 | 'sphinx.ext.mathjax', 38 | 'sphinx.ext.viewcode', 39 | 'sphinx.ext.githubpages'] 40 | 41 | # Add any paths that contain templates here, relative to this directory. 42 | templates_path = ['_templates'] 43 | 44 | # The suffix(es) of source filenames. 45 | # You can specify multiple suffix as a list of string: 46 | # 47 | from recommonmark.parser import CommonMarkParser 48 | source_parsers = { 49 | '.md': CommonMarkParser, 50 | } 51 | 52 | source_suffix = ['.rst', '.md'] 53 | #source_suffix = '.rst' 54 | 55 | # The master toctree document. 56 | master_doc = 'index' 57 | 58 | # General information about the project. 59 | project = 'trumania' 60 | copyright = '2018, RIA' 61 | author = 'RIA' 62 | 63 | # The version info for the project you're documenting, acts as replacement for 64 | # |version| and |release|, also used in various other places throughout the 65 | # built documents. 66 | # 67 | # The short X.Y version. 68 | version = '1.0.1' 69 | # The full version, including alpha/beta/rc tags. 70 | release = '1.0.1' 71 | 72 | # The language for content autogenerated by Sphinx. Refer to documentation 73 | # for a list of supported languages. 74 | # 75 | # This is also used if you do content translation via gettext catalogs. 76 | # Usually you set "language" from the command line for these cases. 
77 | language = None 78 | 79 | # List of patterns, relative to source directory, that match files and 80 | # directories to ignore when looking for source files. 81 | # This patterns also effect to html_static_path and html_extra_path 82 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 83 | 84 | # The name of the Pygments (syntax highlighting) style to use. 85 | pygments_style = 'sphinx' 86 | 87 | # If true, `todo` and `todoList` produce output, else they produce nothing. 88 | todo_include_todos = False 89 | 90 | 91 | # -- Options for HTML output ---------------------------------------------- 92 | 93 | # The theme to use for HTML and HTML Help pages. See the documentation for 94 | # a list of builtin themes. 95 | # 96 | html_theme = "classic" 97 | 98 | # Theme options are theme-specific and customize the look and feel of a theme 99 | # further. For a list of options available for each theme, see the 100 | # documentation. 101 | # 102 | html_theme_options = { 103 | "rightsidebar": "false", 104 | "relbarbgcolor": "black" 105 | } 106 | 107 | # Add any paths that contain custom static files (such as style sheets) here, 108 | # relative to this directory. They are copied after the builtin static files, 109 | # so a file named "default.css" will overwrite the builtin "default.css". 110 | html_static_path = ['_static'] 111 | 112 | # Custom sidebar templates, must be a dictionary that maps document names 113 | # to template names. 114 | # 115 | # This is required for the alabaster theme 116 | # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars 117 | html_sidebars = { 118 | '**': [ 119 | 'relations.html', # needs 'show_related': True theme option to display 120 | 'searchbox.html', 121 | ] 122 | } 123 | 124 | 125 | # -- Options for HTMLHelp output ------------------------------------------ 126 | 127 | # Output file base name for HTML help builder. 
128 | htmlhelp_basename = 'trumaniadoc' 129 | 130 | 131 | # -- Options for LaTeX output --------------------------------------------- 132 | 133 | latex_elements = { 134 | # The paper size ('letterpaper' or 'a4paper'). 135 | # 136 | # 'papersize': 'letterpaper', 137 | 138 | # The font size ('10pt', '11pt' or '12pt'). 139 | # 140 | # 'pointsize': '10pt', 141 | 142 | # Additional stuff for the LaTeX preamble. 143 | # 144 | # 'preamble': '', 145 | 146 | # Latex figure (float) alignment 147 | # 148 | # 'figure_align': 'htbp', 149 | } 150 | 151 | # Grouping the document tree into LaTeX files. List of tuples 152 | # (source start file, target name, title, 153 | # author, documentclass [howto, manual, or own class]). 154 | latex_documents = [ 155 | (master_doc, 'trumania.tex', 'trumania Documentation', 156 | 'RIA', 'manual'), 157 | ] 158 | 159 | 160 | # -- Options for manual page output --------------------------------------- 161 | 162 | # One entry per manual page. List of tuples 163 | # (source start file, name, description, authors, manual section). 164 | man_pages = [ 165 | (master_doc, 'trumania', 'trumania Documentation', 166 | [author], 1) 167 | ] 168 | 169 | 170 | # -- Options for Texinfo output ------------------------------------------- 171 | 172 | # Grouping the document tree into Texinfo files. 
List of tuples 173 | # (source start file, target name, title, author, 174 | # dir menu entry, description, category) 175 | texinfo_documents = [ 176 | (master_doc, 'trumania', 'trumania Documentation', 177 | author, 'trumania', 'One line description of project.', 178 | 'Miscellaneous'), 179 | ] 180 | 181 | 182 | 183 | -------------------------------------------------------------------------------- /tests/unit_tests/test_random_generators.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | 4 | import path 5 | import functools 6 | from itertools import islice 7 | import pandas as pd 8 | import numpy as np 9 | 10 | from trumania.core.random_generators import SequencialGenerator, NumpyRandomGenerator, ConstantGenerator, seed_provider 11 | from trumania.core.random_generators import DependentTriggerGenerator, FakerGenerator, Generator 12 | 13 | 14 | def test_constant_generator_should_produce_constant_values(): 15 | tested = ConstantGenerator(value="c") 16 | 17 | assert [] == tested.generate(size=0) 18 | assert ["c"] == tested.generate(size=1) 19 | assert ["c", "c", "c", "c", "c"] == tested.generate(size=5) 20 | 21 | 22 | def test_numpy_random_generator_should_delegate_to_numpy_correctly(): 23 | 24 | # basic "smoke" test, if it does not crash it at least proves it's able 25 | # to load the appropriate method 26 | tested = NumpyRandomGenerator(method="normal", loc=10, scale=4, seed=1) 27 | assert len(tested.generate(size=10)) == 10 28 | 29 | 30 | def test_seeder_should_be_deterministic(): 31 | """ 32 | makes sure the seeds always provides the same sequence of seeds 33 | """ 34 | 35 | master_seed = 12345 36 | 37 | seeder1 = seed_provider(master_seed) 38 | seeder2 = seed_provider(master_seed) 39 | 40 | assert list(islice(seeder1, 1000)) == list(islice(seeder2, 1000)) 41 | 42 | 43 | def test_depend_trigger_should_trigger_given_constant_value(): 44 | 45 | # returns 6 hard-coded 1 and zero 46 | def 
fake_mapper(x): 47 | return [1, 1, 0, 0, 1, 0] 48 | 49 | g = DependentTriggerGenerator(value_to_proba_mapper=fake_mapper) 50 | 51 | triggers = g.generate(observations=pd.Series([10, 20, 30, 0, 1, 2])) 52 | 53 | # because the fake_mapper returns fake values, we should always have the 54 | # following triggers, no matter what the internal uniform distro provided 55 | assert triggers.tolist() == [True, True, False, False, True, False] 56 | 57 | 58 | def test_sequencial_generator_should_create_unique_values(): 59 | 60 | tested = SequencialGenerator(start=10, prefix="test_p_", max_length=10) 61 | 62 | sizes = [100, 200, 300, 400, 500] 63 | sets = [set(tested.generate(size)) for size in sizes] 64 | 65 | # generated values should be unique within each set 66 | all_values = functools.reduce(lambda s1, s2: s1 | s2, sets) 67 | 68 | assert len(all_values) == np.sum(sizes) 69 | 70 | 71 | def test_random_generator_should_provide_correct_amount_of_single_values(): 72 | 73 | tested = NumpyRandomGenerator(method="gamma", scale=10, shape=1.8, seed=1) 74 | 75 | genops = tested.ops.generate(named_as="rand") 76 | 77 | story_data = pd.DataFrame( 78 | np.random.rand(10, 5), columns=["A", "B", "C", "D", "E"]) 79 | 80 | result, logs = genops(story_data) 81 | 82 | assert result.columns.tolist() == ["A", "B", "C", "D", "E", "rand"] 83 | 84 | # should be float and not list of values 85 | assert result["rand"].dtype == float 86 | 87 | 88 | def test_random_generator_should_provide_correct_amount_of_list_of_values(): 89 | 90 | tested = NumpyRandomGenerator(method="gamma", scale=10, shape=1.8, seed=1) 91 | 92 | story_data = pd.DataFrame( 93 | np.random.rand(10, 5), columns=["A", "B", "C", "D", "E"], 94 | ) 95 | story_data["how_many"] = pd.Series([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]) 96 | 97 | genops = tested.ops.generate(named_as="rand", quantity_field="how_many") 98 | 99 | result, logs = genops(story_data) 100 | 101 | assert result.columns.tolist() == ["A", "B", "C", "D", "E", "how_many", 
"rand"] 102 | 103 | # should be list of the expected sizes 104 | assert result["rand"].dtype == list 105 | assert result["rand"].apply(len).tolist() == [10, 20, 30, 40, 50, 60, 70, 80, 90, 100] 106 | 107 | 108 | def test_faker_generator_should_delegate_to_faker_correct(): 109 | 110 | tested_name = FakerGenerator(seed=1234, method="name") 111 | some_names = tested_name.generate(10) 112 | assert len(some_names) == 10 113 | 114 | tested_text = FakerGenerator(seed=1234, method="text") 115 | some_text = tested_text.generate(20) 116 | assert len(some_text) == 20 117 | 118 | tested_address = FakerGenerator(seed=1234, method="address") 119 | some_addresses = tested_address.generate(30) 120 | assert len(some_addresses) == 30 121 | 122 | 123 | def test_sequencial_generator_read_from_disk_should_continue_sequence(): 124 | 125 | with tempfile.TemporaryDirectory() as p: 126 | 127 | tested = SequencialGenerator(start=10, prefix="o_", max_length=2) 128 | 129 | list_1 = tested.generate(size=4) 130 | assert list_1 == ["o_10", "o_11", "o_12", "o_13"] 131 | 132 | gen_file = os.path.join(p, "tested.json") 133 | tested.save_to(gen_file) 134 | 135 | tested2 = Generator.load_generator(gen_type="SequencialGenerator", 136 | input_file=gen_file) 137 | 138 | list_2 = tested2.generate(size=4) 139 | assert list_2 == ["o_14", "o_15", "o_16", "o_17"] 140 | 141 | # loading it again => we should have the same result 142 | tested3 = Generator.load_generator(gen_type="SequencialGenerator", 143 | input_file=gen_file) 144 | 145 | list_3 = tested3.generate(size=4) 146 | assert list_3 == ["o_14", "o_15", "o_16", "o_17"] 147 | 148 | 149 | def numpy_generators_read_from_disk_should_generate_same_sequence_as_original(): 150 | 151 | with tempfile.TemporaryDirectory() as p: 152 | 153 | # making sure we're not using the default seed 154 | tested = NumpyRandomGenerator(method="normal", loc=10, scale=4, 155 | seed=123456) 156 | 157 | gen_file = os.path.join(p, "tested2.json") 158 | tested.save_to(gen_file) 159 
| 160 | reloaded = Generator.load_generator(gen_type="NumpyRandomGenerator", 161 | input_file=gen_file) 162 | 163 | assert tested.generate(size=10000) == reloaded.generate(size=10000) 164 | 165 | 166 | def test_sequencial_generator_must_not_change_format_when_size_is_float(): 167 | 168 | seq = SequencialGenerator(prefix="sq", max_length=2) 169 | 170 | # bugfix: this was previously generating "sq00.0", "sq01.0",... 171 | assert ["sq00", "sq01", "sq02"] == seq.generate(size=3.3) 172 | assert ["sq03", "sq04", "sq05"] == seq.generate(size=3.3) 173 | -------------------------------------------------------------------------------- /trumania/core/attribute.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import logging 3 | from trumania.core.operations import SideEffectOnly 4 | 5 | 6 | class Attribute(object): 7 | """ 8 | Static population attribute, with various ways to initialize it randomly 9 | """ 10 | 11 | def __init__(self, 12 | population, 13 | 14 | # if initializing with value, must provide ids and one of the 15 | # init values 16 | init_values=None, 17 | init_gen=None, 18 | 19 | # otherwise, we can also initialise randomly from a 20 | # relationship (in which case the ids are extracted from the 21 | # "from" field. 
init_relationship is a string that contains 22 | # the name of the 23 | init_relationship=None): 24 | self.ops = self.AttributeOps(self) 25 | 26 | if population.size == 0: 27 | self._table = pd.DataFrame(columns=["value"]) 28 | 29 | elif init_relationship is None: 30 | if not ((init_values is None) ^ (init_gen is None)): 31 | raise ValueError("if init_relationship is not provided, " 32 | "you must also provide init_values or " 33 | "init_values_gen") 34 | 35 | elif init_values is None: 36 | init_values = init_gen.generate(size=population.size) 37 | 38 | if type(init_values) == pd.Series: 39 | logging.warn(" Trying to create attribute with a series " 40 | "but indices will be lost.") 41 | init_values = init_values.tolist() 42 | 43 | self._table = pd.DataFrame({"value": init_values}, index=population.ids) 44 | 45 | else: 46 | if init_relationship is None: 47 | raise ValueError("must provide either ids or relationship to " 48 | "initialize the attribute") 49 | 50 | self._table = population.get_relationship(init_relationship).select_one() 51 | self._table.set_index("from", drop=True, inplace=True) 52 | self._table.rename(columns={"to": "value"}, inplace=True) 53 | 54 | def get_values(self, ids=None): 55 | """ 56 | :param ids: members ids for which the attribute values are desired 57 | :return: the current attribute values for those members, as Series 58 | """ 59 | if ids is None: 60 | return self._table["value"] 61 | else: 62 | return self._table.loc[ids]["value"] 63 | 64 | def update(self, series): 65 | """ 66 | updates or adds values of this attributes from the values of the provided 67 | series, using its index as member id 68 | """ 69 | self._table = self._table.reindex(self._table.index | series.index) 70 | self._table.loc[series.index, "value"] = series.values 71 | 72 | def add(self, ids, added_values): 73 | """ 74 | This only makes sense for attributes that support a + operation (e.g. 
numerical values or list) 75 | : this simply performs a += operation 76 | """ 77 | assert len(ids) == len(added_values) 78 | 79 | # putting together any add to the same attribute id 80 | to_add = pd.Series(added_values, index=ids).groupby(level=0).agg(sum) 81 | 82 | self._table.loc[to_add.index, "value"] = self._table.loc[to_add.index, "value"] + to_add 83 | 84 | def transform_inplace(self, f): 85 | """ 86 | transform the values of this attribute inplace with f 87 | """ 88 | self._table["value"] = self._table["value"].map(f) 89 | 90 | ############ 91 | # IO 92 | def save_to(self, file_path): 93 | logging.info("saving attribute to {}".format(file_path)) 94 | self._table.to_csv(file_path) 95 | 96 | @staticmethod 97 | def load_from(file_path): 98 | table = pd.read_csv(file_path, index_col=0) 99 | 100 | # we're basically hacking our own constructor, feeding it fake data 101 | # so it's initialized correctly. 102 | # 103 | # Don't do that outside this class! 104 | class FakePopulation(object): 105 | def __init__(self): 106 | self.size = table.shape[0] 107 | self.ids = table.index 108 | 109 | return Attribute(population=FakePopulation(), init_values=table["value"]) 110 | 111 | ############ 112 | # operations 113 | 114 | class AttributeOps(object): 115 | def __init__(self, attribute): 116 | self.attribute = attribute 117 | 118 | class Update(SideEffectOnly): 119 | def __init__(self, attribute, member_id_field, copy_from_field): 120 | self.attribute = attribute 121 | self.copy_from_field = copy_from_field 122 | self.member_id_field = member_id_field 123 | 124 | def side_effect(self, story_data): 125 | if story_data.shape[0] > 0: 126 | update_series = pd.Series( 127 | data=story_data[self.copy_from_field].values, 128 | index=story_data[self.member_id_field].values) 129 | self.attribute.update(update_series) 130 | 131 | def update(self, member_id_field, copy_from_field): 132 | """ 133 | Overwrite the value of this attribute with values in this field 134 | 135 | :param 
member_id_field: name of the field of the story data 136 | containing the member ids whose attribute should be updated 137 | :param copy_from_field: name of the field of the story data 138 | containing the new values of the attribute 139 | :return: 140 | """ 141 | return self.Update(self.attribute, member_id_field, 142 | copy_from_field) 143 | 144 | class Add(SideEffectOnly): 145 | def __init__(self, attribute, member_id_field, 146 | added_value_field, subtract): 147 | self.attribute = attribute 148 | self.added_value_field = added_value_field 149 | self.member_id_field = member_id_field 150 | self.subtract = subtract 151 | 152 | def side_effect(self, story_data): 153 | if story_data.shape[0] > 0: 154 | 155 | values = story_data[self.added_value_field].values 156 | if self.subtract: 157 | values = -values 158 | 159 | self.attribute.add( 160 | ids=story_data[self.member_id_field].values, 161 | added_values=values) 162 | 163 | def add(self, member_id_field, added_value_field): 164 | return self.Add(self.attribute, member_id_field, added_value_field, subtract=False) 165 | 166 | def subtract(self, member_id_field, subtracted_value_field): 167 | return self.Add(self.attribute, member_id_field, subtracted_value_field, subtract=True) 168 | -------------------------------------------------------------------------------- /trumania/components/geographies/uganda.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is just an illustration of how to persist various scenario components 3 | """ 4 | import logging 5 | import pandas as pd 6 | 7 | from trumania.core import operations 8 | from trumania.components import db 9 | from trumania.core.circus import Circus 10 | from trumania.core.util_functions import make_random_assign, setup_logging 11 | from trumania.core.random_generators import NumpyRandomGenerator, ParetoGenerator, seed_provider, SequencialGenerator 12 | from trumania.core.random_generators import FakerGenerator 13 | 
from trumania.core.clock import CyclicTimerGenerator, CyclicTimerProfile 14 | 15 | 16 | def build_unhealthy_level_gen(seed): 17 | return NumpyRandomGenerator(method="beta", a=1, b=999, seed=seed) 18 | 19 | 20 | def build_healthy_level_gen(seed): 21 | return NumpyRandomGenerator(method="beta", a=1, b=999, seed=seed) 22 | 23 | 24 | class WithUganda(Circus): 25 | 26 | def add_uganda_geography(self, force_build=False): 27 | """ 28 | Loads the cells definition from Uganda + adds 2 stories to control 29 | """ 30 | logging.info(" adding Uganda Geography") 31 | seeder = seed_provider(12345) 32 | 33 | if force_build: 34 | uganda_cells, uganda_cities, timer_config = build_uganda_populations( 35 | self) 36 | 37 | else: 38 | uganda_cells = db.load_population(namespace="uganda", population_id="cells") 39 | uganda_cities = db.load_population(namespace="uganda", population_id="cities") 40 | timer_config = db.load_timer_gen_config("uganda", 41 | "cell_repair_timer_profile") 42 | 43 | repair_n_fix_timer = CyclicTimerGenerator( 44 | clock=self.clock, 45 | seed=next(self.seeder), 46 | config=timer_config) 47 | 48 | unhealthy_level_gen = build_unhealthy_level_gen(next(seeder)) 49 | healthy_level_gen = build_healthy_level_gen(next(seeder)) 50 | 51 | # tendency is inversed in case of broken cell: it's probability of 52 | # accepting a call is much lower 53 | 54 | # same profiler for breakdown and repair: they are both related to 55 | # typical human activity 56 | 57 | logging.info(" adding Uganda Geography6") 58 | cell_break_down_story = self.create_story( 59 | name="cell_break_down", 60 | 61 | initiating_population=uganda_cells, 62 | member_id_field="CELL_ID", 63 | 64 | timer_gen=repair_n_fix_timer, 65 | 66 | # fault activity is very low: most cell tend never to break down ( 67 | # hopefully...) 
68 | activity_gen=ParetoGenerator(xmin=5, a=1.4, seed=next(self.seeder)) 69 | ) 70 | 71 | cell_repair_story = self.create_story( 72 | name="cell_repair_down", 73 | 74 | initiating_population=uganda_cells, 75 | member_id_field="CELL_ID", 76 | 77 | timer_gen=repair_n_fix_timer, 78 | 79 | # repair activity is much higher 80 | activity_gen=ParetoGenerator(xmin=100, a=1.2, 81 | seed=next(self.seeder)), 82 | 83 | # repair is not re-scheduled at the end of a repair, but only triggered 84 | # from a "break-down" story 85 | auto_reset_timer=False 86 | ) 87 | 88 | cell_break_down_story.set_operations( 89 | unhealthy_level_gen.ops.generate(named_as="NEW_HEALTH_LEVEL"), 90 | 91 | uganda_cells.get_attribute("HEALTH").ops.update( 92 | member_id_field="CELL_ID", 93 | copy_from_field="NEW_HEALTH_LEVEL"), 94 | 95 | cell_repair_story.ops.reset_timers(member_id_field="CELL_ID"), 96 | self.clock.ops.timestamp(named_as="TIME"), 97 | 98 | operations.FieldLogger(log_id="cell_status", 99 | cols=["TIME", "CELL_ID", 100 | "NEW_HEALTH_LEVEL"]), 101 | ) 102 | 103 | cell_repair_story.set_operations( 104 | healthy_level_gen.ops.generate(named_as="NEW_HEALTH_LEVEL"), 105 | 106 | uganda_cells.get_attribute("HEALTH").ops.update( 107 | member_id_field="CELL_ID", 108 | copy_from_field="NEW_HEALTH_LEVEL"), 109 | 110 | self.clock.ops.timestamp(named_as="TIME"), 111 | 112 | # note that both stories are contributing to the same 113 | # "cell_status" log 114 | operations.FieldLogger(log_id="cell_status", 115 | cols=["TIME", "CELL_ID", 116 | "NEW_HEALTH_LEVEL"]), 117 | ) 118 | 119 | return uganda_cells, uganda_cities 120 | 121 | 122 | def build_uganda_populations(circus): 123 | 124 | seeder = seed_provider(12345) 125 | 126 | cells = circus.create_population(name="cells", 127 | ids_gen=SequencialGenerator(prefix="CELL_"), 128 | size=200) 129 | latitude_generator = FakerGenerator(method="latitude", 130 | seed=next(seeder)) 131 | cells.create_attribute("latitude", init_gen=latitude_generator) 132 | 133 | 
longitude_generator = FakerGenerator(method="longitude", 134 | seed=next(seeder)) 135 | cells.create_attribute("longitude", init_gen=longitude_generator) 136 | 137 | # the cell "health" is its probability of accepting a call. By default 138 | # let's says it's one expected failure every 1000 calls 139 | healthy_level_gen = build_healthy_level_gen(next(seeder)) 140 | 141 | cells.create_attribute(name="HEALTH", init_gen=healthy_level_gen) 142 | 143 | city_gen = FakerGenerator(method="city", seed=next(seeder)) 144 | cities_values = pd.unique(city_gen.generate(500))[:200] 145 | cities = circus.create_population(name="cities", ids=cities_values) 146 | 147 | cell_city_rel = cities.create_relationship("CELLS") 148 | 149 | cell_city_df = make_random_assign(cells.ids, cities.ids, next(seeder)) 150 | cell_city_rel.add_relations( 151 | from_ids=cell_city_df["chosen_from_set2"], 152 | to_ids=cell_city_df["set1"]) 153 | 154 | pop_gen = ParetoGenerator(xmin=10000, a=1.4, seed=next(seeder)) 155 | cities.create_attribute("population", init_gen=pop_gen) 156 | 157 | timer_config = CyclicTimerProfile( 158 | profile=[1, .5, .2, .15, .2, .4, 3.8, 159 | 7.2, 8.4, 9.1, 9.0, 8.3, 8.1, 160 | 7.7, 7.4, 7.8, 8.0, 7.9, 9.7, 161 | 10.4, 10.5, 8.8, 5.7, 2.8], 162 | profile_time_steps="1h", 163 | start_date=pd.Timestamp("6 June 2016 00:00:00")) 164 | 165 | return cells, cities, timer_config 166 | 167 | 168 | if __name__ == "__main__": 169 | # This is meant to be executed only once, to create the data on disk. 170 | 171 | # Note: using generators and persisting the result could make sense 172 | # if such generation is costly or for facilitating reproduceability, 173 | # though a more common use cas might be to build such Populations and 174 | # relationship from empirical exploration of a dataset. 175 | 176 | # Note2: only the "static" properties of an environment are saved here, 177 | # whereas the "dynamic parts" (e.g. stories) are stored "in code", i.e. 
178 | # in the withXYZ() class above that then need to be mixed in a Circus. 179 | 180 | setup_logging() 181 | 182 | cells, cities, timer_config = build_uganda_populations() 183 | 184 | db.remove_namespace("uganda") 185 | db.save_population(population=cells, namespace="uganda", population_id="cells") 186 | db.save_population(population=cities, namespace="uganda", population_id="cities") 187 | 188 | db.save_timer_gen(timer_gen=timer_config, namespace="uganda", 189 | timer_gen_id="cell_repair_timer_profile") 190 | -------------------------------------------------------------------------------- /trumania/core/util_functions.py: -------------------------------------------------------------------------------- 1 | """ 2 | Collection of utility functions 3 | """ 4 | 5 | from numpy.random import RandomState 6 | import pandas as pd 7 | import numpy as np 8 | import os 9 | import functools 10 | from networkx.algorithms import bipartite 11 | import logging 12 | 13 | 14 | def make_random_bipartite_data(group1, group2, p, seed): 15 | """ 16 | 17 | :type group1: list 18 | :param group1: Ids of first group 19 | :type group2: list 20 | :param group2: Ids of second group 21 | :type p: float 22 | :param p: probability of existence of 1 edge 23 | :type seed: int 24 | :param seed: seed for random generator 25 | :rtype: list 26 | :return: all edges in the graph 27 | """ 28 | logging.info(" creating a bipartite graph between {} items in group1, {} " 29 | "items in group2 and edge probability {}".format( 30 | len(group1), len(group2), p)) 31 | 32 | if len(group1) == 0 or len(group2) == 0 or p == 0: 33 | return [] 34 | 35 | bp = pd.DataFrame.from_records(list(bipartite.random_graph(len(group1), len(group2), p, seed).edges()), 36 | columns=["from", "to"]) 37 | logging.info(" (bipartite index created, now resolving item values)") 38 | 39 | # as all "to" nodes are from the second group, 40 | # but numbered by networkx in range(len(group1),len(group1)+len(group2)) 41 | # we need to deduct 
len(group1) to have proper indexes. 42 | bp["to"] -= len(group1) 43 | 44 | bp["from"] = bp.apply(lambda x: group1[x["from"]], axis=1) 45 | bp["to"] = bp.apply(lambda x: group2[x["to"]], axis=1) 46 | logging.info(" (resolution done, now converting to tuples)") 47 | out = [tuple(x) for x in bp.to_records(index=False)] 48 | logging.info(" (exiting bipartite)") 49 | return out 50 | 51 | 52 | def assign_random_proportions(name1, name2, group1, group2, seed): 53 | 54 | state = RandomState(seed) 55 | assignments = state.rand(len(group1), len(group2)) 56 | assignments = assignments / assignments.sum(axis=1, keepdims=True) 57 | data = pd.DataFrame(assignments, index=group1, 58 | columns=group2).stack().reset_index(level=[0, 1]) 59 | data.rename(columns={"level_0": name1, 60 | "level_1": name2, 61 | 0: "weight"}, 62 | inplace=True) 63 | return data 64 | 65 | 66 | def make_random_assign(set1, set2, seed): 67 | """Assign randomly a member of set2 to each member of set1 68 | :return: a dataframe with as many rows as set1 69 | """ 70 | chosen_froms = RandomState(seed).choice(set2, size=len(set1)) 71 | return pd.DataFrame({"set1": set1, "chosen_from_set2": chosen_froms}) 72 | 73 | 74 | def merge_2_dicts(dict1, dict2, value_merge_func=None): 75 | """ 76 | :param dict1: first dictionary to be merged 77 | :param dict2: first dictionary to be merged 78 | :param value_merge_func: specifies how to merge 2 values if present in 79 | both dictionaries 80 | :type value_merge_func: function (value1, value) => value 81 | :return: 82 | """ 83 | if dict1 is None and dict2 is None: 84 | return {} 85 | 86 | if dict2 is None: 87 | return dict1 88 | 89 | if dict1 is None: 90 | return dict2 91 | 92 | def merged_value(key): 93 | if key not in dict1: 94 | return dict2[key] 95 | elif key not in dict2: 96 | return dict1[key] 97 | else: 98 | if value_merge_func is None: 99 | raise ValueError( 100 | "Conflict in merged dictionaries: merge function not " 101 | "provided but key {} exists in both 
dictionaries".format( 102 | key)) 103 | 104 | return value_merge_func(dict1[key], dict2[key]) 105 | 106 | keys = set(dict1.keys()) | set(dict2.keys()) 107 | 108 | return {key: merged_value(key) for key in keys} 109 | 110 | 111 | def df_concat(d1, d2): 112 | return pd.concat([d1, d2], ignore_index=True, copy=False) 113 | 114 | 115 | def merge_dicts(dicts, merge_func=None): 116 | """ 117 | :param dicts: list of dictionnaries to be merged 118 | :type dicts: list[dict] 119 | :param merge_func: 120 | :type merge_func: function 121 | :return: one single dictionary containing all entries received 122 | """ 123 | from itertools import tee 124 | 125 | # check if the input list or iterator is empty 126 | dict_backup, test = tee(iter(dicts)) 127 | try: 128 | next(test) 129 | except StopIteration: 130 | return {} 131 | 132 | return functools.reduce(lambda d1, d2: merge_2_dicts(d1, d2, merge_func), dict_backup) 133 | 134 | 135 | def setup_logging(): 136 | logging.basicConfig( 137 | format='%(asctime)s %(message)s', 138 | level=logging.INFO) 139 | 140 | 141 | # stolen from http://stackoverflow.com/questions/1835018/ 142 | # python-check-if-an-object-is-a-list-or-tuple-but-not-string#answer-1835259 143 | def is_sequence(arg): 144 | return type(arg) is list or type(arg) is tuple or type(arg) is set 145 | 146 | 147 | def build_ids(size, id_start=0, prefix="id_", max_length=10): 148 | """ 149 | builds a sequencial list of string ids of specified size 150 | """ 151 | return [prefix + str(x).zfill(max_length) 152 | for x in np.arange(id_start, id_start + size)] 153 | 154 | 155 | def log_dataframe_sample(msg, df): 156 | 157 | if df.shape[0] == 0: 158 | logging.info("{}: [empty]".format(msg)) 159 | else: 160 | logging.info("{}: \n {}".format(msg, df.sample(min(df.shape[0], 15)))) 161 | 162 | 163 | def cap_to_total(values, target_total): 164 | """ 165 | return a copy of values with the largest values possible s.t.: 166 | - all return values are <= the original ones 167 | - their sum is 
== total 168 | - 169 | """ 170 | 171 | excedent = np.sum(values) - target_total 172 | if excedent <= 0: 173 | return values 174 | elif values[-1] >= excedent: 175 | return values[:-1] + [values[-1] - excedent] 176 | else: 177 | return cap_to_total(values[:-1], target_total) + [0] 178 | 179 | 180 | def ensure_folder_exists(folder): 181 | if not os.path.exists(folder): 182 | os.makedirs(folder) 183 | 184 | 185 | def ensure_non_existing_dir(folder): 186 | """ 187 | makes sure the specified directory does not exist, potentially deleting 188 | any file or folder it contains 189 | """ 190 | 191 | if not os.path.exists(folder): 192 | return 193 | 194 | if os.path.isfile(folder): 195 | os.remove(folder) 196 | 197 | else: 198 | for f in os.listdir(folder): 199 | full_path = os.path.join(folder, f) 200 | ensure_non_existing_dir(full_path) 201 | os.rmdir(folder) 202 | 203 | 204 | def latest_date_before(starting_date, upper_bound, time_step): 205 | """ 206 | Looks for the latest result_date s.t 207 | 208 | result_date = starting_date + n * time_step for any integer n 209 | result_date <= upper_bound 210 | 211 | :type starting_date: pd.Timestamp 212 | :type upper_bound: pd.Timestamp 213 | :type time_step: pd.Timedelta 214 | :return: pd.Timestamp 215 | """ 216 | 217 | result = starting_date 218 | 219 | while result > upper_bound: 220 | result -= time_step 221 | 222 | while upper_bound - result >= time_step: 223 | result += time_step 224 | 225 | return result 226 | 227 | 228 | def load_all_logs(folder): 229 | """ 230 | loads all csv file contained in this folder and retun them as one 231 | dictionary where the key is the filename without the extension 232 | """ 233 | 234 | all_logs = {} 235 | 236 | for file_name in os.listdir(folder): 237 | full_path = os.path.join(folder, file_name) 238 | logs = pd.read_csv(full_path, index_col=None) 239 | log_id = file_name[:-4] 240 | 241 | all_logs[log_id] = logs 242 | 243 | return all_logs 244 | 
-------------------------------------------------------------------------------- /tests/unit_tests/test_util_functions.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import functools 3 | 4 | from trumania.core.util_functions import merge_2_dicts, merge_dicts, is_sequence, make_random_assign, cap_to_total 5 | from trumania.core.util_functions import build_ids, latest_date_before, bipartite, make_random_bipartite_data 6 | 7 | 8 | def test_merge_two_empty_dict_should_return_empty_dict(): 9 | assert {} == merge_2_dicts({}, {}) 10 | 11 | 12 | def test_merge_two_none_dict_should_return_empty_dict(): 13 | assert {} == merge_2_dicts(None, None) 14 | 15 | 16 | def test_merging_one_dict_with_none_should_yield_dict(): 17 | d1 = {"a": 1, "b": 2} 18 | assert d1 == merge_2_dicts(d1, None) 19 | 20 | 21 | def test_merging_none_with_one_dict_should_yield_dict(): 22 | d2 = {"a": 1, "b": 2} 23 | assert d2 == merge_2_dicts(None, d2) 24 | 25 | 26 | def test_merge_empty_with_dict_should_return_itself(): 27 | 28 | d1 = {"a": 1, "b": 2} 29 | assert d1 == merge_2_dicts(d1, {}) 30 | assert d1 == merge_2_dicts({}, d1) 31 | 32 | 33 | def test_merge_non_overlapping_dict_should_return_all_values(): 34 | 35 | d1 = {"a": 1, "b": 2} 36 | d2 = {"c": 3, "d": 4} 37 | assert {"a": 1, "b": 2, "c": 3, "d": 4} == merge_2_dicts(d1, d2) 38 | 39 | 40 | def test_merge_dict_to_itself_should_return_doubled_values(): 41 | 42 | d1 = {"a": 1, "b": 2} 43 | assert {"a": 2, "b": 4} == merge_2_dicts(d1, d1, lambda a, b: a + b) 44 | 45 | 46 | def test_merging_one_dictionary_should_yield_itself(): 47 | d1 = {"a": 1, "b": 2} 48 | assert d1 == merge_dicts([d1], lambda a, b: a + b) 49 | 50 | 51 | def test_merging_an_empty_list_of_dicts_should_yield_empty_dict(): 52 | assert {} == merge_dicts([]) 53 | 54 | 55 | def test_merging_an_empty_gen_of_dicts_should_yield_empty_dict(): 56 | emtpy_gen = ({"a": 1} for _ in []) 57 | assert {} == merge_dicts(emtpy_gen) 58 
| 59 | 60 | def test_merging_many_dictionary_should_yield_expected_result(): 61 | d1 = {"a": 10, "b": 20} 62 | d2 = {"a": 100, "c": 30} 63 | d3 = {} 64 | d4 = {"b": 200, "z": 1000} 65 | d5 = {"z": -10} 66 | 67 | merged = merge_dicts([d1, d2, d3, d4, d5], lambda a, b: a + b) 68 | 69 | assert {"a": 110, "b": 220, "c": 30, "z": 990} == merged 70 | 71 | 72 | def test_merging_many_dictionary_from_gen_should_yield_expected_result(): 73 | ds = [{"a": 10, "b": 20}, 74 | {"a": 100, "c": 30}, 75 | {}, 76 | {"b": 200, "z": 1000}, 77 | {"z": -10}] 78 | 79 | dicts_gens = (d for d in ds) 80 | 81 | merged = merge_dicts(dicts_gens, lambda a, b: a + b) 82 | 83 | assert {"a": 110, "b": 220, "c": 30, "z": 990} == merged 84 | 85 | 86 | def test_is_sequence(): 87 | assert is_sequence([]) 88 | assert is_sequence([1, 2, 3, 1]) 89 | assert is_sequence({1, 2, 3, 1}) 90 | assert not is_sequence(1) 91 | assert not is_sequence("hello") 92 | 93 | 94 | def test_make_random_assign_shoud_assign_each_element_only_once(): 95 | 96 | dealers = build_ids(size=10, prefix="DEALER_", max_length=2) 97 | sims = build_ids(size=1000, prefix="SIM_", max_length=4) 98 | 99 | assignment = make_random_assign(set1=sims, set2=dealers, seed=10) 100 | 101 | # all sims should have been assigned 102 | assert assignment.shape == (1000, 2) 103 | 104 | # all SIM should have been given 105 | assert set(assignment["set1"].unique().tolist()) == set(sims) 106 | 107 | # all owners should be part of the dealers 108 | assert set(assignment["chosen_from_set2"].unique().tolist()) <= set(dealers) 109 | 110 | 111 | def test_cap_to_total_should_leave_untouched_values_below_target(): 112 | assert [10, 20, 30] == cap_to_total([10, 20, 30], target_total=100) 113 | 114 | 115 | def test_cap_to_total_should_leave_untouched_equal_to_target(): 116 | assert [50, 40, 20] == cap_to_total([50, 40, 20], target_total=110) 117 | 118 | 119 | def test_cap_to_total_should_lower_last_correctly(): 120 | assert [50, 40, 5] == cap_to_total([50, 40, 20], 
target_total=95) 121 | 122 | 123 | def test_cap_to_total_should_zero_last_correctly(): 124 | assert [50, 40, 0] == cap_to_total([50, 40, 20], target_total=90) 125 | 126 | 127 | def test_cap_to_total_should_zero_several_correctly(): 128 | assert [38, 0, 0] == cap_to_total([50, 40, 20], target_total=38) 129 | 130 | 131 | def test_latest_date_before_should_return_input_if_within_range(): 132 | 133 | starting_date = pd.Timestamp("6 June 2016") 134 | upper_bound = pd.Timestamp("8 June 2016") 135 | time_step = pd.Timedelta("7D") 136 | 137 | result = latest_date_before(starting_date, upper_bound, time_step) 138 | 139 | assert result == starting_date 140 | 141 | 142 | def test_latest_date_before_should_return_input_if_start_equals_ub(): 143 | 144 | starting_date = pd.Timestamp("8 June 2016") 145 | upper_bound = pd.Timestamp("8 June 2016") 146 | time_step = pd.Timedelta("7D") 147 | 148 | result = latest_date_before(starting_date, upper_bound, time_step) 149 | 150 | assert result == starting_date 151 | 152 | 153 | def test_latest_date_before_should_shift_backward_ne_week_input_as_required(): 154 | 155 | starting_date = pd.Timestamp("10 June 2016") 156 | expected_date = pd.Timestamp("3 June 2016") 157 | upper_bound = pd.Timestamp("8 June 2016") 158 | time_step = pd.Timedelta("7D") 159 | 160 | result = latest_date_before(starting_date, upper_bound, time_step) 161 | 162 | assert result == expected_date 163 | 164 | 165 | def test_latest_date_before_should_shift_backward_n_weeks_input_as_required(): 166 | 167 | starting_date = pd.Timestamp("10 June 2016") 168 | expected_date = pd.Timestamp("25 March 2016") 169 | upper_bound = pd.Timestamp("31 March 2016") 170 | time_step = pd.Timedelta("7D") 171 | 172 | result = latest_date_before(starting_date, upper_bound, time_step) 173 | 174 | assert result == expected_date 175 | 176 | 177 | def test_latest_date_before_should_shift_forward_n_weeks_input_as_required(): 178 | 179 | starting_date = pd.Timestamp("10 June 2016") 180 | 
expected_date = pd.Timestamp("27 January 2017") 181 | upper_bound = pd.Timestamp("29 January 2017") 182 | time_step = pd.Timedelta("7D") 183 | 184 | result = latest_date_before(starting_date, upper_bound, time_step) 185 | 186 | assert result == expected_date 187 | 188 | 189 | def test_latest_date_before_should_shift_forward_until_upper_bound(): 190 | 191 | # here the upper bound IS the expected date => makes sure we go up to 192 | # thsi ons 193 | starting_date = pd.Timestamp("10 June 2016") 194 | upper_bound = pd.Timestamp("24 June 2016") 195 | time_step = pd.Timedelta("7D") 196 | 197 | result = latest_date_before(starting_date, upper_bound, time_step) 198 | 199 | assert result == upper_bound 200 | 201 | 202 | def test_if_networkx_bipartite_keeps_actual_structure(): 203 | 204 | # Currently, Netorkx.bipartite returns bipartite networks where the first node 205 | # is always in the first group, and the second node is always in the second group 206 | RB = bipartite.random_graph(5, 10, 0.9, 1234) 207 | 208 | assert functools.reduce(lambda x, y: x & y, [e[0] < 5 for e in RB.edges()]) 209 | 210 | 211 | def test_random_bipartite_network_generation_returns_empty_list_if_first_entry_is_empty(): 212 | 213 | assert [] == make_random_bipartite_data([], [1, 2], 1., 1234) 214 | 215 | 216 | def test_random_bipartite_network_generation_returns_empty_list_if_second_entry_is_empty(): 217 | 218 | assert [] == make_random_bipartite_data([1, 2], [], 1., 1234) 219 | 220 | 221 | def test_random_bipartite_network_generation_returns_empty_list_if_both_entries_are_empty(): 222 | 223 | assert [] == make_random_bipartite_data([], [], 1., 1234) 224 | 225 | 226 | def test_random_bipartite_network_generation_returns_empty_list_if_prob_is_zero(): 227 | 228 | assert [] == make_random_bipartite_data([1, 2], [5, 6], 0., 1234) 229 | 230 | 231 | def test_random_bipartite_network_generation_returns_bipartite_network(): 232 | 233 | all_edges = [(1, 5), (1, 6), (2, 5), (2, 6)] 234 | bp = 
make_random_bipartite_data([1, 2], [5, 6], 1., 1234) 235 | 236 | assert functools.reduce(lambda x, y: x & y, [e in bp for e in all_edges]) 237 | -------------------------------------------------------------------------------- /tests/unit_tests/test_operations.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | import tests.mocks.operations as mockops 5 | from trumania.core import operations 6 | from trumania.core.util_functions import build_ids 7 | 8 | 9 | def test_apply_should_delegate_to_single_col_dataframe_function_correctly(): 10 | 11 | # some function that expect a dataframe as input => must return 12 | # dataframe with "result" column 13 | def f(df): 14 | return pd.DataFrame({"result": df["A"] + df["D"] - df["C"]}) 15 | 16 | tested = operations.Apply(source_fields=["A", "C", "D"], 17 | named_as="r", 18 | f=f, f_args="dataframe") 19 | 20 | story_data = pd.DataFrame( 21 | np.random.rand(10, 5), columns=["A", "B", "C", "D", "E"]) 22 | 23 | result = tested.build_output(story_data) 24 | 25 | assert result["r"].equals(story_data["A"] + story_data["D"] - story_data[ 26 | "C"]) 27 | 28 | 29 | def test_apply_should_delegate_to_multi_col_dataframe_function_correctly(): 30 | 31 | # now f returns several columns 32 | def f(df): 33 | return pd.DataFrame({ 34 | "r1": df["A"] + df["D"] - df["C"], 35 | "r2": df["A"] + df["C"], 36 | "r3": df["A"] * df["C"], 37 | }) 38 | 39 | tested = operations.Apply(source_fields=["A", "C", "D"], 40 | named_as=["op1", "op2", "op3"], 41 | f=f, f_args="dataframe") 42 | 43 | story_data = pd.DataFrame( 44 | np.random.rand(10, 5), columns=["A", "B", "C", "D", "E"]) 45 | 46 | result = tested.transform(story_data) 47 | assert result.columns.tolist() == ["A", "B", "C", "D", "E", "op1", "op2", 48 | "op3"] 49 | 50 | assert result["op1"].equals( 51 | story_data["A"] + story_data["D"] - story_data["C"]) 52 | assert result["op2"].equals( 53 | story_data["A"] + 
story_data["C"]) 54 | assert result["op3"].equals( 55 | story_data["A"] * story_data["C"]) 56 | 57 | 58 | def test_apply_should_delegate_to_columns_function_correctly(): 59 | """ 60 | same as the above, but this time f input and output arguments are 61 | pandas Series 62 | """ 63 | 64 | def f(ca, cc, cd): 65 | return ca + cd - cc 66 | 67 | tested = operations.Apply(source_fields=["A", "C", "D"], 68 | named_as="r", 69 | f=f, f_args="series") 70 | 71 | story_data = pd.DataFrame( 72 | np.random.rand(10, 5), columns=["A", "B", "C", "D", "E"]) 73 | 74 | result = tested.build_output(story_data) 75 | 76 | assert result["r"].equals( 77 | story_data["A"] + story_data["D"] - story_data["C"]) 78 | 79 | 80 | def test_one_execution_should_merge_empty_data_correctly(): 81 | 82 | # empty previous 83 | prev_df = pd.DataFrame(columns=[]) 84 | prev_log = {} 85 | nop = operations.Operation() 86 | 87 | output, logs = operations.Chain._execute_operation((prev_df, prev_log), nop) 88 | 89 | assert logs == {} 90 | assert output.equals(prev_df) 91 | 92 | 93 | def test_one_execution_should_merge_one_op_with_nothing_into_one_result(): 94 | 95 | # empty previous 96 | prev = pd.DataFrame(columns=[]), {} 97 | 98 | cdrs = pd.DataFrame(np.random.rand(12, 3), columns=["A", "B", "duration"]) 99 | input = pd.DataFrame(np.random.rand(10, 2), columns=["C", "D"]) 100 | op = mockops.FakeOp(input, logs={"cdrs": cdrs}) 101 | 102 | output, logs = operations.Chain._execute_operation(prev, op) 103 | 104 | assert logs == {"cdrs": cdrs} 105 | assert input.equals(output) 106 | 107 | 108 | def test_one_execution_should_merge_2_ops_correctly(): 109 | 110 | # previous results 111 | init = pd.DataFrame(columns=[]) 112 | mobility_logs = pd.DataFrame(np.random.rand(12, 3), 113 | columns=["A", "CELL", "duration"]) 114 | 115 | cdrs = pd.DataFrame(np.random.rand(12, 3), columns=["A", "B", "duration"]) 116 | input = pd.DataFrame(np.random.rand(10, 2), columns=["C", "D"]) 117 | op = mockops.FakeOp(input, {"cdrs": cdrs}) 
def test_chain_of_3_operation_should_return_merged_logs():
    """
    A Chain of 3 operations must merge the logs emitted by every op.

    Bug fixed here: the original passed the Python *builtin* `input`
    function as each FakeOp's output — a leftover from earlier tests
    where `input` was a local DataFrame. Use a real dataframe instead.
    """
    fake_output = pd.DataFrame(np.random.rand(10, 2), columns=["C", "D"])

    cdrs1 = pd.DataFrame(np.random.rand(12, 3), columns=["A", "B", "duration"])
    op1 = mockops.FakeOp(fake_output, {"cdrs1": cdrs1})

    cdrs2 = pd.DataFrame(np.random.rand(12, 3), columns=["A", "B", "duration"])
    op2 = mockops.FakeOp(fake_output, {"cdrs2": cdrs2})

    cdrs3 = pd.DataFrame(np.random.rand(12, 3), columns=["A", "B", "duration"])
    op3 = mockops.FakeOp(fake_output, {"cdrs3": cdrs3})

    chain = operations.Chain(op1, op2, op3)

    prev_data = pd.DataFrame(columns=[])
    story_data, all_logs = chain(prev_data)

    # the last op's output flows out of the chain (FakeOp returns its
    # stored output regardless of its input, see the single-op test above)
    assert story_data.equals(fake_output)

    # the logs of all 3 ops are merged into one dict
    assert set(all_logs.keys()) == {"cdrs1", "cdrs2", "cdrs3"}
    assert all_logs["cdrs1"].equals(cdrs1)
    assert all_logs["cdrs2"].equals(cdrs2)
    assert all_logs["cdrs3"].equals(cdrs3)
def test_increasing_bounded_sigmoid_must_reach_min_and_max_at_boundaries():
    """An incrementing bounded sigmoid must saturate at 0 below x_min, at 1
    above x_max, and stay within [0, 1] in between."""

    sigmoid = operations.bounded_sigmoid(x_min=2, x_max=15, shape=5,
                                         incrementing=True)

    # saturation at the lower boundary: 0 everywhere up to x_min
    assert all(sigmoid(x) == 0 for x in np.linspace(-100, 2, 200))

    # saturation at the upper boundary: 1 everywhere from x_max onwards
    assert all(sigmoid(x) == 1 for x in np.linspace(15, 100, 200))

    # everything in between stays inside the [0, 1] band
    assert all(0 <= sigmoid(x) <= 1 for x in np.linspace(0, 1, 200))
def test_bounding_function_should_not_modify_unbounded_values():
    """bound_value without lower or upper bound must be the identity.

    Bug fixed here: the original used np.arange(-1000, 2000, 10000),
    whose step exceeds the range, so it produced the single value -1000
    and the loop checked only one input.
    """
    bound_f = operations.bound_value(lb=None, ub=None)

    # step of 100 actually sweeps the whole [-1000, 2000) range
    for x in np.arange(-1000, 2000, 100):
        assert x == bound_f(x)
def run_test_scenario_1(clock_step, simulation_duration,
                        n_stories, per,
                        log_folder):
    """
    Builds and runs a minimal circus: a 1000-member population whose single
    story just logs a timestamp, driven by a flat (uniform) daily timer
    profile.

    :param clock_step: clock step duration string, e.g. "15 min"
        (fed to pd.Timedelta)
    :param simulation_duration: total simulated duration, e.g. "2 days"
    :param n_stories: desired number of story executions per `per` period
    :param per: pd.Timedelta period over which `n_stories` should happen
    :param log_folder: folder where the circus writes the "the_logs" csv
    """

    circus = Circus(name="tested_circus", master_seed=1,
                    start=pd.Timestamp("8 June 2016"),
                    step_duration=pd.Timedelta(clock_step))

    population = circus.create_population(
        name="a",
        size=1000,
        ids_gen=SequencialGenerator(max_length=3, prefix="id_"))

    # flat profile: every hour of the day is equally likely
    daily_profile = CyclicTimerGenerator(
        clock=circus.clock,
        config=CyclicTimerProfile(
            profile=[1] * 24,
            profile_time_steps="1h",
            start_date=pd.Timestamp("8 June 2016")
        ),
        seed=1234)

    # every one of the 1000 population members gets the same constant
    # activity: n_stories executions per `per` period
    activity_gen = ConstantGenerator(
        value=daily_profile.activity(
            n=n_stories, per=per
        ))

    # just a dummy operation to produce some logs
    story = circus.create_story(
        name="test_story",
        initiating_population=population,
        member_id_field="some_id",
        timer_gen=daily_profile,
        activity_gen=activity_gen)

    story.set_operations(
        circus.clock.ops.timestamp(named_as="TIME"),
        FieldLogger(log_id="the_logs")
    )

    circus.run(duration=pd.Timedelta(simulation_duration), log_output_folder=log_folder)
def test_1000_populations_with_activity_12perday_should_yield_60k_logs_in_5days():
    """
    same test as above, with bigger clock step => typically more "rounding
    errors", and longer total simulation duration
    """

    with tempfile.TemporaryDirectory() as log_parent_folder:
        log_folder = os.path.join(log_parent_folder, "logs")

        # NOTE(review): this comment was truncated in the original source
        # ("note that we cannot have clock_step > 2h since that").
        # Presumably: at 12 stories/day there is one story every 2h on
        # average, so a clock step above 2h could not schedule them all
        # — confirm against CyclicTimerGenerator.
        run_test_scenario_1(clock_step="1h",
                            simulation_duration="5 days",
                            n_stories=12,
                            per=pd.Timedelta("1day"),
                            log_folder=log_folder)

        logging.info("loading produced logs")
        logs = load_all_logs(log_folder)["the_logs"]

        logging.info("number of produced logs: {} logs".format(logs.shape[0]))

        # 5 days of simulation should produce 1000 * 12 * 5 == 60k logs
        assert 55e3 <= logs.shape[0] <= 65e3
122 | run_test_scenario_1(clock_step="1 h", 123 | simulation_duration="20days", 124 | n_stories=1, 125 | per=pd.Timedelta("5 days"), 126 | log_folder=log_folder) 127 | 128 | logging.info("loading produced logs") 129 | logs = load_all_logs(log_folder)["the_logs"] 130 | 131 | logging.info("number of produced logs: {} logs".format(logs.shape[0])) 132 | 133 | # 20 days of simulation should produce 1000 * .2 * 20 == 4000 logs 134 | assert 3500 <= logs.shape[0] <= 4500 135 | 136 | 137 | def test_1000_populations_with_low_activity2(): 138 | """ 139 | 140 | This is a low activity test, where the populations have less than one activity 141 | per cycle 142 | 143 | """ 144 | 145 | with tempfile.TemporaryDirectory() as log_parent_folder: 146 | log_folder = os.path.join(log_parent_folder, "logs") 147 | 148 | run_test_scenario_1(clock_step="3 h", 149 | simulation_duration="15days", 150 | n_stories=1, 151 | per=pd.Timedelta("5 days"), 152 | log_folder=log_folder) 153 | 154 | logging.info("loading produced logs") 155 | logs = load_all_logs(log_folder)["the_logs"] 156 | 157 | # 2 days of simulation should produce 1000 * 15 * 1/5 == 3000 logs 158 | assert 2600 <= logs.shape[0] <= 3400 159 | 160 | 161 | def test_1000_populations_with_activity_one_per_cycle(): 162 | """ 163 | This is a border case between low and high activity, where the desired 164 | amount of logs per cycle is close to 1 (i.e. 
close to 1 per day with our 165 | timer) => we still need to have generated timers a bit above or below one 166 | day, and achieve the expected total amount of logs 167 | """ 168 | 169 | with tempfile.TemporaryDirectory() as log_parent_folder: 170 | log_folder = os.path.join(log_parent_folder, "logs") 171 | 172 | run_test_scenario_1(clock_step="15 min", 173 | simulation_duration="10days", 174 | n_stories=1, 175 | per=pd.Timedelta("1 day"), 176 | log_folder=log_folder) 177 | 178 | logging.info("loading produced logs") 179 | logs = load_all_logs(log_folder)["the_logs"] 180 | 181 | logging.info("number of produced logs: {} logs".format(logs.shape[0])) 182 | 183 | # 10 days of simulation should produce 1000 * 1 * 10 == 10000 logs 184 | assert 9500 <= logs.shape[0] <= 10500 185 | 186 | 187 | def test_populations_during_default_daily(): 188 | 189 | with tempfile.TemporaryDirectory() as log_parent_folder: 190 | log_folder = os.path.join(log_parent_folder, "logs") 191 | 192 | circus = Circus(name="tested_circus", 193 | master_seed=1, 194 | start=pd.Timestamp("8 June 2016"), 195 | step_duration=pd.Timedelta("1h")) 196 | 197 | field_agents = circus.create_population( 198 | name="fa", 199 | size=100, 200 | ids_gen=SequencialGenerator(max_length=3, prefix="id_")) 201 | 202 | mobility_time_gen = DefaultDailyTimerGenerator( 203 | clock=circus.clock, seed=next(circus.seeder)) 204 | 205 | gaussian_activity = NumpyRandomGenerator( 206 | method="normal", loc=5, 207 | scale=.5, seed=1) 208 | mobility_activity_gen = gaussian_activity.map(bound_value(lb=1)) 209 | 210 | # just a dummy operation to produce some logs 211 | story = circus.create_story( 212 | name="test_story", 213 | initiating_population=field_agents, 214 | member_id_field="some_id", 215 | timer_gen=mobility_time_gen, 216 | activity_gen=mobility_activity_gen) 217 | 218 | story.set_operations( 219 | circus.clock.ops.timestamp(named_as="TIME"), 220 | FieldLogger(log_id="the_logs") 221 | ) 222 | 223 | 
circus.run(duration=pd.Timedelta("30 days"), log_output_folder=log_folder) 224 | 225 | logging.info("loading produced logs") 226 | logs = load_all_logs(log_folder)["the_logs"] 227 | 228 | logging.info("number of produced logs: {} logs".format(logs.shape[0])) 229 | 230 | # 30 days of simulation should produce 100 * 5 * 30 == 15k logs 231 | assert 14e3 <= logs.shape[0] <= 16.5e3 232 | 233 | 234 | def test_populations_during_working_hours(): 235 | 236 | with tempfile.TemporaryDirectory() as log_parent_folder: 237 | log_folder = os.path.join(log_parent_folder, "logs") 238 | 239 | circus = Circus(name="tested_circus", 240 | master_seed=1, 241 | start=pd.Timestamp("8 June 2016"), 242 | step_duration=pd.Timedelta("1h")) 243 | 244 | field_agents = circus.create_population( 245 | name="fa", 246 | size=100, 247 | ids_gen=SequencialGenerator(max_length=3, prefix="id_")) 248 | 249 | mobility_time_gen = WorkHoursTimerGenerator( 250 | clock=circus.clock, seed=next(circus.seeder)) 251 | 252 | five_per_day = mobility_time_gen.activity( 253 | n=5, per=pd.Timedelta("1day")) 254 | 255 | std_per_day = mobility_time_gen.activity( 256 | n=.5, per=pd.Timedelta("1day")) 257 | 258 | gaussian_activity = NumpyRandomGenerator( 259 | method="normal", loc=five_per_day, 260 | scale=std_per_day, seed=1) 261 | mobility_activity_gen = gaussian_activity.map(bound_value(lb=1)) 262 | 263 | # just a dummy operation to produce some logs 264 | story = circus.create_story( 265 | name="test_story", 266 | initiating_population=field_agents, 267 | member_id_field="some_id", 268 | timer_gen=mobility_time_gen, 269 | activity_gen=mobility_activity_gen) 270 | 271 | story.set_operations( 272 | circus.clock.ops.timestamp(named_as="TIME"), 273 | FieldLogger(log_id="the_logs") 274 | ) 275 | 276 | circus.run(duration=pd.Timedelta("30 days"), log_output_folder=log_folder) 277 | 278 | logging.info("loading produced logs") 279 | logs = load_all_logs(log_folder)["the_logs"] 280 | 281 | logging.info("number of produced 
def test_population_constructor_should_refuse_duplicated_ids():
    """Population must reject an explicit id list containing duplicates."""

    duplicated_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9, 9]  # 9 appears twice

    with pytest.raises(ValueError):
        Population(circus=None, size=10, ids=duplicated_ids)
def test_lookup_values_by_scalar_should_return_correct_values():
    """Looking up attributes through a scalar id field must append one
    column per selected attribute, aligned with the pointed-to members."""

    lookup_op = dummy_population.ops.lookup(
        id_field="NEIGHBOUR",
        select={"age": "neighbour_age", "city": "neighbour_city"})

    result, logs = lookup_op(story_data)

    # a lookup emits no logs
    assert logs == {}

    assert sorted(result.columns) == [
        "A", "B", "COUSINS", "NEIGHBOUR", "neighbour_age", "neighbour_city"]

    # ages of the members pointed to by NEIGHBOUR (id_2, id_4, id_7, id_2)
    assert result["neighbour_age"].tolist() == [40, 100, 39, 40]

    # cities of those same members
    assert result["neighbour_city"].tolist() == ["b", "d", "a", "b"]
def test_insert_population_value_for_existing_populations_should_update_all_values():
    """update() on ids that already exist must overwrite their attribute
    values without changing the number of members.

    (Also fixes the "poppulation" typo in the original test name.)
    """

    # copy of dummy population that will be updated
    tested_population = Population(
        circus=None,
        size=10,
        ids_gen=SequencialGenerator(max_length=1, prefix="a_")
    )
    ages = [10, 20, 40, 10, 100, 98, 12, 39, 76, 23]
    tested_population.create_attribute("age", init_values=ages)
    city = ["a", "b", "b", "a", "d", "e", "r", "a", "z", "c"]
    tested_population.create_attribute("city", init_values=city)

    current = tested_population.get_attribute_values("age", ["a_0", "a_7", "a_9"])
    assert current.tolist() == [10, 39, 23]

    update = pd.DataFrame(
        {
            "age": [139, 123],
            "city": ["city_7", "city_9"]
        },
        index=["a_7", "a_9"]
    )

    tested_population.update(update)

    # we should still have the same number of members
    assert tested_population.ids.shape[0] == 10

    updated_age = tested_population.get_attribute_values("age", ["a_0", "a_7", "a_9"])
    updated_city = tested_population.get_attribute_values("city", ["a_0", "a_7", "a_9"])

    # a_0 untouched, a_7 and a_9 overwritten
    assert updated_age.tolist() == [10, 139, 123]
    assert updated_city.tolist() == ["a", "city_7", "city_9"]
population that will be updated 181 | tested_population = Population( 182 | circus=None, size=10, 183 | ids_gen=SequencialGenerator(max_length=1, prefix="a_")) 184 | ages = [10, 20, 40, 10, 100, 98, 12, 39, 76, 23] 185 | tested_population.create_attribute("age", init_values=ages) 186 | city = ["a", "b", "b", "a", "d", "e", "r", "a", "z", "c"] 187 | tested_population.create_attribute("city", init_values=city) 188 | 189 | current = tested_population.get_attribute_values("age", ["a_0", "a_7", "a_9"]) 190 | assert current.tolist() == [10, 39, 23] 191 | 192 | update = pd.DataFrame( 193 | { 194 | "age": [139, 123, 54, 25], 195 | "city": ["city_7", "city_9", "city_11", "city_10"] 196 | }, 197 | index=["a_7", "a_9", "a_11", "a_10"] 198 | ) 199 | 200 | tested_population.update(update) 201 | 202 | # we should have 2 new populations 203 | assert tested_population.ids.shape[0] == 12 204 | 205 | updated_age = tested_population.get_attribute_values("age", ["a_0", "a_7", "a_9", "a_10", "a_11"]) 206 | updated_city = tested_population.get_attribute_values("city", ["a_0", "a_7", "a_9", "a_10", "a_11"]) 207 | 208 | assert updated_age.tolist() == [10, 139, 123, 25, 54] 209 | assert updated_city.tolist() == ["a", "city_7", "city_9", "city_10", "city_11"] 210 | 211 | 212 | def test_insert_op_population_value_for_existing_populations_should_update_all_values(): 213 | # same as test above but triggered as an Operation on story data 214 | 215 | # copy of dummy population that will be updated 216 | tested_population = Population( 217 | circus=None, size=10, 218 | ids_gen=SequencialGenerator(max_length=1, prefix="a_")) 219 | ages = [10, 20, 40, 10, 100, 98, 12, 39, 76, 23] 220 | tested_population.create_attribute("age", init_values=ages) 221 | city = ["a", "b", "b", "a", "d", "e", "r", "a", "z", "c"] 222 | tested_population.create_attribute("city", init_values=city) 223 | 224 | story_data = pd.DataFrame( 225 | { 226 | "the_new_age": [139, 123, 1, 2], 227 | "location": ["city_7", "city_9", 
def test_creating_an_empty_population_and_adding_attributes_later_should_be_possible():
    """A size-0 population with empty attributes must accept members and
    attribute values injected later through update()."""

    empty_pop = Population(circus=None, size=0)
    assert empty_pop.ids.shape[0] == 0

    # declare the attributes up-front, without any values
    empty_pop.create_attribute("att1")
    empty_pop.create_attribute("att2")

    new_member_ids = ["ac1", "ac2", "ac3"]
    new_members = pd.DataFrame(
        {
            "att1": [1, 2, 3],
            "att2": [11, 12, 13],
        },
        index=new_member_ids)

    empty_pop.update(new_members)

    assert empty_pop.ids.tolist() == new_member_ids
    assert empty_pop.get_attribute_values("att1", new_member_ids).tolist() == [1, 2, 3]
    assert empty_pop.get_attribute_values("att2", new_member_ids).tolist() == [11, 12, 13]
dummy_population.save_to(population_path) 286 | retrieved = Population.load_from(circus=None, folder=population_path) 287 | 288 | assert dummy_population.size == retrieved.size 289 | assert dummy_population.ids.tolist() == retrieved.ids.tolist() 290 | 291 | ids = dummy_population.ids.tolist() 292 | 293 | for att_name in dummy_population.attribute_names(): 294 | assert dummy_population.get_attribute_values(att_name, ids).equals( 295 | retrieved.get_attribute_values(att_name, ids) 296 | ) 297 | 298 | for rel_name in dummy_population.relationship_names(): 299 | assert dummy_population.get_relationship(rel_name)._table.equals( 300 | retrieved.get_relationship(rel_name)._table 301 | ) 302 | -------------------------------------------------------------------------------- /trumania/core/circus.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import json 4 | import pandas as pd 5 | 6 | from trumania.core import population 7 | from trumania.components import db 8 | from trumania.core.random_generators import seed_provider 9 | from trumania.core.util_functions import ensure_non_existing_dir 10 | from trumania.core.clock import Clock 11 | from trumania.core.story import Story 12 | 13 | 14 | class Circus(object): 15 | """ 16 | A Circus is just a container of a lot of objects that are required to make the simulation 17 | It is also the object that will execute the stories required for 1 iteration 18 | """ 19 | 20 | def __init__(self, name, master_seed, **clock_params): 21 | """Create a new Circus object 22 | 23 | :param master_seed: seed used to initialized random generatof of 24 | other seeds 25 | :type master_seed: int 26 | 27 | :rtype: Circus 28 | :return: a new Circus object, with the clock, is created 29 | """ 30 | self.name = name 31 | 32 | self.master_seed = master_seed 33 | self.clock_params = clock_params 34 | 35 | self.seeder = seed_provider(master_seed=master_seed) 36 | self.clock = 
Clock(seed=next(self.seeder), **clock_params) 37 | self.stories = [] 38 | self.populations = {} 39 | self.generators = {} 40 | 41 | def create_population(self, name, **population_params): 42 | """ 43 | Creates a population with the specifed parameters and attach it to this 44 | circus. 45 | """ 46 | if name in self.populations: 47 | raise ValueError("refusing to overwrite existing population: {} " 48 | "".format(name)) 49 | 50 | self.populations[name] = population.Population(circus=self, **population_params) 51 | return self.populations[name] 52 | 53 | def load_population(self, population_id, namespace=None): 54 | """ 55 | Load this population definition add attach it to this circus 56 | """ 57 | 58 | # Defaulting to the namespace associated to this circus if none 59 | # specified 60 | if namespace is None: 61 | namespace = self.name 62 | 63 | loaded = db.load_population(namespace=namespace, 64 | population_id=population_id, circus=self) 65 | self.populations[population_id] = loaded 66 | return loaded 67 | 68 | def create_story(self, name, **story_params): 69 | """ 70 | Creates a story with the provided parameters and attach it to this 71 | circus. 72 | """ 73 | 74 | existing = self.get_story(name) 75 | 76 | if existing is None: 77 | story = Story(name=name, **story_params) 78 | self.stories.append(story) 79 | return story 80 | 81 | else: 82 | raise ValueError("Cannot add story {}: another story with " 83 | "identical name is already in the circus".format(name)) 84 | 85 | def get_story(self, story_name): 86 | """ 87 | Looks up and story by name in this circus and returns it. Returns none 88 | if not found. 
89 | """ 90 | remaining_stories = filter(lambda a: a.name == story_name, self.stories) 91 | try: 92 | return next(remaining_stories) 93 | except StopIteration: 94 | logging.warn("story not found: {}".format(story_name)) 95 | return None 96 | 97 | def get_population_of(self, story_name): 98 | """ 99 | Looks up the initiating population associated to this story 100 | """ 101 | return self.get_story(story_name).triggering_population 102 | 103 | def attach_generator(self, gen_id, generator): 104 | """ 105 | "attach" a random generator to this circus, s.t. it gets persisted 106 | with the rest 107 | """ 108 | if gen_id in self.generators: 109 | raise ValueError("refusing to replace existing generator: {} " 110 | "".format(gen_id)) 111 | 112 | self.generators[gen_id] = generator 113 | 114 | def load_generator(self, gen_type, gen_id): 115 | """ 116 | Load this generator definition add attach it to this circus 117 | """ 118 | gen = db.load_generator( 119 | namespace=self.name, gen_type=gen_type, gen_id=gen_id) 120 | 121 | self.attach_generator(gen_id, gen) 122 | return gen 123 | 124 | @staticmethod 125 | def save_logs(log_id, logs, log_output_folder): 126 | """ 127 | Appends those logs to the corresponding output file, creating it if 128 | it does not exist or appending lines to it otherwise. 129 | """ 130 | 131 | output_file = os.path.join(log_output_folder, "{}.csv".format(log_id)) 132 | 133 | if not os.path.exists(log_output_folder): 134 | os.makedirs(log_output_folder) 135 | 136 | if logs.shape[0] > 0: 137 | logging.info("appending {} rows to {}".format(logs.shape[0], output_file)) 138 | 139 | if not os.path.exists(output_file): 140 | # If these are this first persisted logs, we create the file 141 | # and include the field names as column header. 142 | logs.to_csv(output_file, index=False, header=True) 143 | 144 | else: 145 | # Otherwise, open the existing log file in append mode and add 146 | # the new logs at the end, this time without columns headers. 
    def run(self, duration, log_output_folder, delete_existing_logs=False):
        """
        Executes all stories in the circus for as long as requested.

        :param duration: duration of the desired simulation (start date is
        dictated by the clock)
        :type duration: pd.TimeDelta

        :param log_output_folder: folder where to write the logs.
        :type log_output_folder: string

        :param delete_existing_logs: if True, an existing log folder is
            wiped before the run; if False (default), an existing folder
            makes the run fail fast with EnvironmentError.
        """

        # the clock translates the requested duration into a whole number
        # of simulation steps
        n_iterations = self.clock.n_iterations(duration)
        logging.info("Starting circus for {} iterations of {} for a "
                     "total duration of {}".format(
                         n_iterations, self.clock.step_duration, duration
                     ))

        if os.path.exists(log_output_folder):
            if delete_existing_logs:
                ensure_non_existing_dir(log_output_folder)
            else:
                # refuse to silently append to previous results
                raise EnvironmentError("{} exists and delete_existing_logs is "
                                       "False => refusing to start and "
                                       "overwrite logs".format(log_output_folder))

        for step_number in range(n_iterations):
            logging.info("step : {}".format(step_number))

            # each story may emit several log collections per step; each
            # log_id is appended to its own csv file
            for story in self.stories:
                for log_id, logs in story.execute().items():
                    self.save_logs(log_id, logs, log_output_folder)

            self.clock.increment()
master_seed=config["master_seed"],
205 | **clock_config)
206 | 
207 | for population_id in db.list_populations(namespace=circus_name):
208 | circus.load_population(population_id)
209 | 
210 | for gen_type, gen_id in db.list_generators(namespace=circus_name):
211 | circus.load_generator(gen_type=gen_type, gen_id=gen_id)
212 | 
213 | return circus
214 | 
215 | def save_to_db(self, overwrite=False):
216 | """
217 | Creates a db namespace named after this circus and saves all the
218 | populations there.
219 | 
220 | Only static data is saved, not the stories.
221 | """
222 | 
223 | logging.info("saving circus {}".format(self.name))
224 | 
225 | if db.is_namespace_existing(namespace=self.name):
226 | if overwrite:
227 | logging.warning(
228 | "overwriting existing circus {}".format(self.name))
229 | db.remove_namespace(namespace=self.name)
230 | 
231 | else:
232 | raise IOError("refusing to remove existing {} namespace since "
233 | "overwrite parameter is False".format(self.name))
234 | 
235 | namespace_folder = db.create_namespace(namespace=self.name)
236 | config_file = os.path.join(namespace_folder, "circus_config.json")
237 | with open(config_file, "w") as o:
238 | config = {"master_seed": self.master_seed,
239 | "clock_config": {
240 | "start": self.clock_params["start"].isoformat(),
241 | "step_duration": str(self.clock_params["step_duration"])}
242 | }
243 | json.dump(config, o, indent=4)
244 | 
245 | logging.info("saving all populations")
246 | for population_id, ac in self.populations.items():
247 | db.save_population(ac, namespace=self.name,
248 | population_id=population_id)
249 | 
250 | logging.info("saving all generators")
251 | for gen_id, generator in self.generators.items():
252 | db.save_generator(generator, namespace=self.name, gen_id=gen_id)
253 | 
254 | logging.info("circus saved")
255 | 
256 | def save_params_to_db(self, params_type, params):
257 | """
258 | Saves the params object to the circus folder in the DB for future reference
259 | :param params_type:
"build", "run" or "target" 260 | :param params: the params object 261 | """ 262 | target_file = os.path.join(db.namespace_folder(self.name), 263 | "params_{}.json".format(params_type)) 264 | 265 | with open(target_file, "w") as outfile: 266 | json.dump(params, outfile) 267 | 268 | def description(self): 269 | 270 | return { 271 | "circus_name": self.name, 272 | "master_seed": self.master_seed, 273 | "populations": {id: population.description() 274 | for id, population in self.populations.items() 275 | }, 276 | "generators": {gen_id: gen.description() 277 | for gen_id, gen in self.generators.items() 278 | }, 279 | } 280 | 281 | def __str__(self): 282 | return json.dumps(self.description(), indent=4) 283 | -------------------------------------------------------------------------------- /examples/tutorial/example4.py: -------------------------------------------------------------------------------- 1 | from trumania.core import circus 2 | import trumania.core.population as population 3 | import trumania.core.random_generators as gen 4 | import trumania.core.operations as ops 5 | import trumania.core.story as story 6 | import trumania.components.time_patterns.profilers as profilers 7 | import trumania.core.util_functions as util_functions 8 | import trumania.components.db as DB 9 | import pandas as pd 10 | 11 | # each step?() function below implement one step of the fourth example of the 12 | # tutorial documented at 13 | # https://realimpactanalytics.atlassian.net/wiki/display/LM/Data+generator+tutorial 14 | # this is essentially a modification of example3, with some supplementary 15 | # features demonstrating persistence 16 | 17 | 18 | def build_music_repo(): 19 | 20 | # this time we create a "detached" population, not connected to a circus 21 | repo = population.Population( 22 | circus=None, 23 | size=5, 24 | ids_gen=gen.SequencialGenerator(prefix="GENRE_")) 25 | 26 | repo.create_attribute( 27 | name="genre_name", 28 | init_values=["blues", "jazz", "electro", "pop", 
"rock"]) 29 | 30 | repo.create_relationship(name="songs", seed=18) 31 | 32 | return repo 33 | 34 | 35 | def add_song_to_repo(repo_population): 36 | 37 | songs = population.Population( 38 | circus=None, 39 | size=0, 40 | ids_gen=gen.SequencialGenerator(prefix="SONG_")) 41 | 42 | # since the size of the population is 0, we can create attribute without 43 | # providing any initialization 44 | songs.create_attribute(name="artist_name") 45 | songs.create_attribute(name="song_genre") 46 | songs.create_attribute(name="title") 47 | songs.create_attribute(name="duration_seconds") 48 | songs.create_attribute(name="recording_year") 49 | 50 | song_id_gen = gen.SequencialGenerator(prefix="S_") 51 | 52 | # generate artist names from a list of randomly generated ones, so we have 53 | # some redundancy in the generated dataset 54 | artist_name_gen = gen.NumpyRandomGenerator( 55 | method="choice", 56 | a=gen.FakerGenerator( 57 | method="name", 58 | seed=1234).generate(size=200), 59 | seed=5678) 60 | 61 | title_gen = gen.FakerGenerator(method="sentence", 62 | seed=78961, 63 | nb_words=4, 64 | variable_nb_words=True) 65 | 66 | # generates recording years within a desired date range 67 | year_gen = gen.FakerGenerator( 68 | method="date_time_between_dates", 69 | seed=184, 70 | datetime_start=pd.Timestamp("1910-10-20"), 71 | datetime_end=pd.Timestamp("2016-12-02")) \ 72 | .map(f=lambda d: d.year) 73 | 74 | duration_gen = gen.ParetoGenerator(xmin=60, 75 | seed=9874, 76 | force_int=True, 77 | a=1.2) 78 | 79 | repo_genre_rel = repo_population.get_attribute("genre_name") 80 | for genre_id, genre_name in repo_genre_rel.get_values().items(): 81 | 82 | # an operation capable of creating songs of that genre 83 | init_attribute = ops.Chain( 84 | artist_name_gen.ops.generate(named_as="artist_name"), 85 | title_gen.ops.generate(named_as="title"), 86 | year_gen.ops.generate(named_as="recording_year"), 87 | duration_gen.ops.generate(named_as="duration_seconds"), 88 | 
gen.ConstantGenerator(value=genre_name).ops.generate(named_as="song_genre")
89 | )
90 | 
91 | # dataframe of empty songs: just with one SONG_ID column for now
92 | song_ids = song_id_gen.generate(size=1000)
93 | emtpy_songs = story.Story.init_story_data(
94 | member_id_field_name="SONG_ID",
95 | active_ids=song_ids
96 | )
97 | 
98 | # we can already add the generated songs to the music repo relationship
99 | repo_population.get_relationship("songs").add_grouped_relations(
100 | from_ids=[genre_id],
101 | grouped_ids=[song_ids]
102 | )
103 | 
104 | # here we generate all desired columns in the dataframe
105 | initialized_songs, _ = init_attribute(emtpy_songs)
106 | initialized_songs.drop(["SONG_ID"], axis=1, inplace=True)
107 | 
108 | # this works because the columns of init_attribute match exactly the
109 | # ones of the attributes of the populations
110 | songs.update(initialized_songs)
111 | 
112 | # makes sure year and duration are handled as integers
113 | songs.get_attribute("recording_year").transform_inplace(int)
114 | songs.get_attribute("duration_seconds").transform_inplace(int)
115 | 
116 | return songs
117 | 
118 | 
119 | def build_circus(name):
120 | return circus.Circus(
121 | name=name,
122 | master_seed=12345,
123 | start=pd.Timestamp("1 Jan 2017 00:00"),
124 | step_duration=pd.Timedelta("1h"))
125 | 
126 | 
127 | def add_listener(the_circus):
128 | 
129 | users = the_circus.create_population(
130 | name="user", size=5,
131 | ids_gen=gen.SequencialGenerator(prefix="user_"))
132 | 
133 | users.create_attribute(
134 | name="FIRST_NAME",
135 | init_gen=gen.FakerGenerator(method="first_name",
136 | seed=next(the_circus.seeder)))
137 | users.create_attribute(
138 | name="LAST_NAME",
139 | init_gen=gen.FakerGenerator(method="last_name",
140 | seed=next(the_circus.seeder)))
141 | 
142 | 
143 | def add_listen_and_share_stories_with_details(the_circus):
144 | 
145 | users = the_circus.populations["user"]
146 | 
147 | # using this timer means POS are more likely to trigger a
re-stock during
148 | # day hours rather than at night.
149 | timer_gen = profilers.HighWeekDaysTimerGenerator(
150 | clock=the_circus.clock, seed=next(the_circus.seeder))
151 | 
152 | # this generates activity levels distributed as a "truncated normal
153 | # distribution", i.e. very high and low activities are prevented.
154 | bounded_gaussian_activity_gen = gen.NumpyRandomGenerator(
155 | method="normal",
156 | seed=next(the_circus.seeder),
157 | loc=timer_gen.activity(n=20, per=pd.Timedelta("1 day")),
158 | scale=5
159 | ).map(ops.bound_value(lb=10, ub=30))
160 | 
161 | listen = the_circus.create_story(
162 | name="listen_events",
163 | initiating_population=users,
164 | member_id_field="UID",
165 | 
166 | timer_gen=timer_gen,
167 | activity_gen=bounded_gaussian_activity_gen
168 | )
169 | 
170 | share = the_circus.create_story(
171 | name="share_events",
172 | initiating_population=users,
173 | member_id_field="UID",
174 | 
175 | timer_gen=timer_gen,
176 | activity_gen=bounded_gaussian_activity_gen
177 | )
178 | 
179 | repo = the_circus.populations["music_repository"]
180 | songs = the_circus.populations["songs"]
181 | 
182 | select_genre_and_song = ops.Chain(
183 | 
184 | users.ops.lookup(
185 | id_field="UID",
186 | select={
187 | "FIRST_NAME": "USER_FIRST_NAME",
188 | "LAST_NAME": "USER_LAST_NAME",
189 | }
190 | ),
191 | 
192 | # picks a genre at random
193 | repo.ops.select_one(named_as="GENRE"),
194 | 
195 | # picks a song at random for that genre
196 | repo.get_relationship("songs").ops.select_one(
197 | from_field="GENRE",
198 | named_as="SONG_ID"),
199 | 
200 | # now also reporting details of listened or shared songs
201 | songs.ops.lookup(
202 | id_field="SONG_ID",
203 | select={
204 | "artist_name": "SONG_ARTIST",
205 | "title": "SONG_TITLE",
206 | "recording_year": "SONG_YEAR",
207 | "duration_seconds": "SONG_DURATION",
208 | }
209 | ),
210 | )
211 | 
212 | listen.set_operations(
213 | select_genre_and_song,
214 | ops.FieldLogger("listen_events")
215 | )
216 | 
217
| share.set_operations(
218 | select_genre_and_song,
219 | 
220 | # picks a user this song is shared to
221 | users.ops.select_one(named_as="SHARED_TO_UID"),
222 | 
223 | # note we could post-check when user shared a song to their own uid
224 | # here, in which case we can use DropRow to discard that share event
225 | 
226 | ops.FieldLogger("share_events")
227 | )
228 | 
229 | 
230 | def step1():
231 | 
232 | # this creates 2 populations: music_repo and songs
233 | music_repo = build_music_repo()
234 | songs = add_song_to_repo(music_repo)
235 | 
236 | # saves them to persistence
237 | DB.remove_namespace(namespace="tutorial_example4")
238 | DB.save_population(music_repo, namespace="tutorial_example4",
239 | population_id="music_repository")
240 | DB.save_population(songs, namespace="tutorial_example4",
241 | population_id="songs")
242 | 
243 | # build a new circus, then load and attach the persisted populations to it
244 | example4_circus = build_circus(name="example4_circus")
245 | example4_circus.load_population(namespace="tutorial_example4",
246 | population_id="music_repository")
247 | example4_circus.load_population(namespace="tutorial_example4",
248 | population_id="songs")
249 | 
250 | add_listener(example4_circus)
251 | 
252 | 
253 | def step2():
254 | 
255 | # this creates 2 populations: music_repo and songs
256 | music_repo = build_music_repo()
257 | songs = add_song_to_repo(music_repo)
258 | 
259 | # saves them to persistence
260 | DB.remove_namespace(namespace="tutorial_example4")
261 | DB.save_population(music_repo, namespace="tutorial_example4",
262 | population_id="music_repository")
263 | DB.save_population(songs, namespace="tutorial_example4",
264 | population_id="songs")
265 | 
266 | # build a new circus, then load and attach the persisted populations to it
267 | example4_circus = build_circus(name="example4_circus")
268 | example4_circus.load_population(namespace="tutorial_example4",
269 | population_id="music_repository")
270 | 
example4_circus.load_population(namespace="tutorial_example4",
271 | population_id="songs")
272 | 
273 | add_listener(example4_circus)
274 | 
275 | # This saves the whole circus to persistence, with all its populations,
276 | # relationships, generators,...
277 | # This is independent from the 2 populations saved above: this time we no longer
278 | # have direct control on the namespace: the persistence mechanism uses the
279 | # circus name as namespace
280 | example4_circus.save_to_db(overwrite=True)
281 | 
282 | # example4bis should be an exact deep copy of example4_circus
283 | example4bis = circus.Circus.load_from_db(circus_name="example4_circus")
284 | 
285 | # Stories are not serialized to CSV but rather serialized in code,
286 | # using humans as transducers
287 | add_listen_and_share_stories_with_details(example4bis)
288 | 
289 | example4bis.run(
290 | duration=pd.Timedelta("5 days"),
291 | log_output_folder="output/example4",
292 | delete_existing_logs=True)
293 | 
294 | 
295 | if __name__ == "__main__":
296 | util_functions.setup_logging()
297 | step2()
298 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 | 
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 | 1. Definitions.
8 | 
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 | 
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 | 
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity.
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 | 
189 | Copyright 2020 Riaktr
190 | 
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 | 
195 | http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 | 
--------------------------------------------------------------------------------
/trumania/core/operations.py:
--------------------------------------------------------------------------------
1 | from scipy import stats
2 | from abc import ABCMeta, abstractmethod
3 | import pandas as pd
4 | import numpy as np
5 | from trumania.core.util_functions import merge_dicts, df_concat
6 | import functools
7 | 
8 | 
9 | class Operation(object):
10 | """
11 | An Operation is able to transform input into an output and
12 | produce logs.
13 | """
14 | 
15 | def transform(self, story_data):
16 | """
17 | :param story_data: dataframe as produced by the previous operation
18 | :return: a dataframe that replaces the previous one in the pipeline
19 | """
20 | 
21 | return story_data
22 | 
23 | def emit_logs(self, story_data):
24 | """
25 | This method is used to produce logs (e.g. CDRs, mobility, topus...)
26 | 
27 | :param story_data: output of this operation, as produced by transform()
28 | :return: emitted logs, as a dictionary of {"log_id": some_data_frame}
29 | """
30 | 
31 | return {}
32 | 
33 | def __call__(self, story_data):
34 | 
35 | output = self.transform(story_data)
36 | logs = self.emit_logs(output)
37 | 
38 | return output, logs
39 | 
40 | 
41 | class Chain(Operation):
42 | """
43 | A chain is a list of operations to be executed sequentially
44 | """
45 | 
46 | def __init__(self, *operations):
47 | self.operations = list(operations)
48 | 
49 | def append(self, *operations):
50 | """
51 | adds operations to be executed at the end of this chain
52 | """
53 | self.operations += list(operations)
54 | 
55 | @staticmethod
56 | def _execute_operation(story_data__prev_logs, operation):
57 | """
58 | 
59 | executes this operation and merges its logs with the previous one
60 | :param operation: the operation to call
61 | :return: the merged story data and logs
62 | """
63 | 
64 | (story_data, prev_logs) = story_data__prev_logs
65 | 
66 | output, supp_logs = operation(story_data)
67 | # merging the logs of each operation of this story.
68 | return output, merge_dicts([prev_logs, supp_logs], df_concat)
69 | 
70 | def __call__(self, story_data):
71 | init = [(story_data, {})]
72 | return functools.reduce(self._execute_operation, init + self.operations)
73 | 
74 | 
75 | class FieldLogger(Operation):
76 | """
77 | Log creator that simply selects a set of columns and creates a logged
78 | dataframe from it
79 | """
80 | 
81 | def __init__(self, log_id, cols=None, exploded_cols=None):
82 | """
83 | :param log_id: the id of the logs in the dictionary of logs returned
84 | by the Circus, at the end of the simulation
85 | :param cols: sub-sets of fields from the story data that will be
86 | selected in order to build the logs
87 | :param exploded_cols: name of one or several columns containing list of
88 | values.
If provided, we explode the story_data dataframe and log one per value 89 | in that list (which is more that one line per row in story_data). 90 | In each row, all lists must have the same length 91 | """ 92 | self.log_id = log_id 93 | 94 | if type(exploded_cols) == str: 95 | self.exploded_cols = [exploded_cols] 96 | else: 97 | self.exploded_cols = [] if exploded_cols is None else exploded_cols 98 | 99 | if type(cols) == str: 100 | self.cols = [cols] 101 | else: 102 | self.cols = [] if cols is None else cols 103 | 104 | self.cols += self.exploded_cols 105 | 106 | def emit_logs(self, story_data): 107 | 108 | # explode lists, cf constructor documentation 109 | if self.exploded_cols: 110 | 111 | def explo(df): 112 | explosion_len = len(df[self.exploded_cols[0]]) 113 | df2 = pd.DataFrame( 114 | [df.drop(self.exploded_cols) for _ in range(explosion_len)]) 115 | for col in self.exploded_cols: 116 | df2[col] = df[col] 117 | 118 | return df2 119 | 120 | logged_data = pd.concat(explo(row) 121 | for _, row in story_data.iterrows()) 122 | 123 | else: 124 | logged_data = story_data 125 | 126 | if not self.cols: 127 | return {self.log_id: logged_data} 128 | else: 129 | return {self.log_id: logged_data[self.cols]} 130 | 131 | 132 | class SideEffectOnly(Operation): 133 | """ 134 | Operation that does not produce logs nor supplementary columns: just have 135 | side effect 136 | """ 137 | __metaclass__ = ABCMeta 138 | 139 | def transform(self, story_data): 140 | self.side_effect(story_data) 141 | return story_data 142 | 143 | @abstractmethod 144 | def side_effect(self, story_data): 145 | """ 146 | :param story_data: 147 | :return: nothing 148 | """ 149 | pass 150 | 151 | 152 | class AddColumns(Operation): 153 | """ 154 | Very typical case of an operation that appends (i.e. 
joins) columns to
155 | the previous result
156 | """
157 | __metaclass__ = ABCMeta
158 | 
159 | def __init__(self, join_kind="left"):
160 | self.join_kind = join_kind
161 | 
162 | @abstractmethod
163 | def build_output(self, story_data):
164 | """
165 | Produces a dataframe with one or several columns and an index aligned
166 | with the one of input. The columns of this will be merged with input.
167 | 
168 | :param story_data: current dataframe
169 | :return: the column(s) to append to it, as a dataframe
170 | """
171 | pass
172 | 
173 | def transform(self, story_data):
174 | output = self.build_output(story_data)
175 | # logging.info(" adding column(s) {}".format(output.columns.tolist()))
176 | return pd.merge(left=story_data, right=output,
177 | left_index=True, right_index=True,
178 | how=self.join_kind)
179 | 
180 | 
181 | class DropRow(Operation):
182 | """
183 | Discards any row in the story data where the condition field is True.
184 | """
185 | 
186 | def __init__(self, condition_field):
187 | self.condition_field = condition_field
188 | 
189 | def transform(self, story_data):
190 | return story_data[~story_data[self.condition_field]]
191 | 
192 | 
193 | class Apply(AddColumns):
194 | """
195 | Custom operation adding one single column computed from a user-provided
196 | function.
197 | 198 | The length of the source_fields must match the number columns 199 | in the dataframe expected by the user f function 200 | 201 | """ 202 | 203 | def __init__(self, source_fields, named_as, f, f_args="dataframe"): 204 | """ 205 | :param source_fields: input field from the story data 206 | :param named_as: name of the resulting fields added to the story data 207 | :param f: transforming function 208 | :param f_args: "dataframe" or "columns", depending on the signature 209 | of f: 210 | 211 | - "dataframe": input and output of the function is a dataframe 212 | as many columns as there are values in "named_as" 213 | 214 | - "columns" input of f is a list of columns and output is 1 215 | column (like many numpy built-it function). In that case, 216 | "named_as" can obviously only contain one name 217 | """ 218 | 219 | AddColumns.__init__(self) 220 | if type(source_fields) == str: 221 | self.source_fields = [source_fields] 222 | else: 223 | self.source_fields = source_fields 224 | 225 | if type(named_as) == str: 226 | self.named_as = [named_as] 227 | else: 228 | self.named_as = named_as 229 | 230 | self.f = f 231 | if f_args not in ["dataframe", "series"]: 232 | raise ValueError("unrecognized f input type: {}".format(f_args)) 233 | 234 | if f_args == "series": 235 | assert len(self.named_as) == 1, \ 236 | "'series' functions can only return 1 column" 237 | 238 | self.f_input = f_args 239 | 240 | def build_output(self, story_data): 241 | if self.f_input == "dataframe": 242 | result = self.f(story_data[self.source_fields]) 243 | renamed = result.rename( 244 | columns=dict(zip(result.columns, self.named_as))) 245 | 246 | return renamed 247 | else: 248 | cols = [story_data[c] for c in self.source_fields] 249 | result = pd.DataFrame({self.named_as[0]: self.f(*cols)}) 250 | return result 251 | 252 | 253 | ##################### 254 | # Collection of functions directly usable in Apply 255 | 256 | def copy_if(story_data): 257 | """ 258 | Copies values from the source 
to the "named_as" if the condition is True,
    otherwise inserts NA

    usage:

        Apply(source_fields=["some_condition_field", "some_source_field"],
              named_as="some_result_field",
              f=copy_if)
    """

    # NOTE(review): the code below treats the FIRST column as the condition
    # and the SECOND as the source — callers must order source_fields
    # accordingly (the usage example above reflects that order)
    condition_field, source_field = story_data.columns
    # keep source values where the condition holds, NA elsewhere
    copied = story_data.where(story_data[condition_field])[[source_field]]
    return copied.rename(columns={source_field: "result"})


def bound_value(lb=None, ub=None):
    """
    builds a function that limits the range of a value

    :param lb: lower bound, or None for no lower bound
    :param ub: upper bound, or None for no upper bound
    :return: a function clipping its scalar argument to [lb, ub]
    """

    def _f(value):
        limited = value if lb is None else max(lb, value)
        if ub is not None:
            limited = min(ub, limited)
        return limited

    return _f


def scale(factor):
    """
    Returns a function multiplying its (scalar or vectorized) argument by
    the provided constant factor.
    """
    def _f_vect(value):
        return value * factor

    return _f_vect


def logistic(k, x0=0, L=1):
    """

    Returns a function, usable in an Apply operation, that transforms the
    specified field with a sigmoid with the provided parameters

    :param k: the steepness of the curve
    :param x0: the x-value of the sigmoid's midpoint (default: 0)
    :param L: maximum value of the logistic (default: 1)

    same parameter naming conventions as in:
    https://en.wikipedia.org/wiki/Logistic_function

    usage:
        Apply(source_fields=["some_source_field"],
              named_as="some_result_field",
              f=logistic(k=-0.01, x0=1000))
    """

    def _logistic(x):
        # clamp the exponent at 10 to avoid overflow in np.exp for
        # extreme inputs
        the_exp = np.minimum(-k * (x - x0), 10)
        return L / (1 + np.exp(the_exp))

    return _logistic


def bounded_sigmoid(x_min, x_max, shape, incrementing=True):
    """
    Builds an S-shaped curve that has y values evolving between 0 and 1 over
    the x domain [x_min, x_max]

    This is preferable to the logistic function for cases where we want to
    make sure that the curve actually reaches 0 and 1 at some point
(e.g.
    probability of triggering a "restock" story must be 1 if stock is as
    low as 1).

    See /tests/notebooks/bounded_sigmoid.ipynb for examples

    :param x_min: lower bound of the x domain
    :param x_max: upper bound of the x domain
    :param incrementing: if True, evolve from 0 to 1, or from 1 to 0 otherwise
    :param shape: strictly positive number controlling the shape of the
        resulting function
        * 1 corresponds to a linear transition
        * higher values yield a sharper and sharper, i.e. more
          vertical S shape, converging towards a step function
          transiting at (x_max-x_min)/2 for very large values of S (
          e.g. 10000)
        * values in ]0,1[ yield vertically shaped sigmoids, sharply
          rising/falling at the boundary of the x domain and
          transiting more smoothly in the middle of it.
    """

    # used to clip x to [x_min, x_max] before evaluating the beta cdf/sf
    bounded = bound_value(lb=x_min, ub=x_max)

    def f(x):
        # values outside the sigmoid are just the repetition of what's
        # happening at the boundaries
        x_b = bounded(x)

        # a symmetric beta(shape, shape) cdf over the normalized x gives the
        # desired S shape; sf is its mirror image for the decreasing case
        if incrementing:
            return stats.beta.cdf((x_b - x_min) / (x_max - x_min),
                                  a=shape,
                                  b=shape)
        else:
            return stats.beta.sf((x_b - x_min) / (x_max - x_min),
                                 a=shape,
                                 b=shape)

    return np.frompyfunc(f, 1, 1)


def identity(x):
    """Pass-through transform, usable as a no-op f in Apply."""
    return x
--------------------------------------------------------------------------------
/trumania/core/clock.py:
--------------------------------------------------------------------------------
from __future__ import division

import pandas as pd
import logging
import numpy as np
from numpy.random import RandomState

from trumania.core.operations import AddColumns
from trumania.core.random_generators import DependentGenerator
from trumania.core.util_functions import latest_date_before


class Clock(object):
    """
    A Clock is the central
object managing the evolution of time of the whole circus.
    It's generating timestamps on demand, and provides information for
    TimeProfiler objects.
    """

    def __init__(self, start, step_duration, seed):
        """Create a Clock object.

        :type start: pd.Timestamp
        :param start: instant of start of the generation

        :type step_duration: pd.Timedelta
        :param step_duration: duration of a clock step

        :type seed: int
        :param seed: seed for timestamp generator (if steps are more than
            1 sec)

        :return: a new Clock object, initialised
        """

        self.current_date = start
        self.step_duration = step_duration

        # private RNG dedicated to intra-step timestamp jitter
        self.__state = RandomState(seed)
        self.ops = self.ClockOps(self)

        # objects whose increment() must be called at every clock step
        self.__increment_listeners = []

    def register_increment_listener(self, listener):
        """Add an object to be incremented at each step (such as a
        TimeProfiler)
        """
        self.__increment_listeners.append(listener)

    def increment(self):
        """Increments the clock by 1 step and notifies all registered
        listeners.

        :rtype: NoneType
        :return: None
        """
        self.current_date += self.step_duration

        for listener in self.__increment_listeners:
            listener.increment()

    def get_timestamp(self, size=1, random=True, log_format=None):
        """
        Returns timestamps formatted as string

        :type size: int
        :param size: number of timestamps to generate, default 1

        :type random: boolean
        :param random: if True, the timestamps are randomly drawn at whole-
            second offsets in [self.current_date,
            self.current_date + self.step_duration); otherwise they all
            equal self.current_date

        :type log_format: string
        :param log_format: string format of the generated timestamps
            (default "%Y-%m-%d %H:%M:%S")

        :rtype: Pandas Series
        :return: random timestamps in the form of strings
        """

        if log_format is None:
            log_format = "%Y-%m-%d %H:%M:%S"

        def make_ts(delta_secs):
            # offset the current date by the drawn number of seconds
            date = self.current_date + pd.Timedelta(seconds=delta_secs)
            return date.strftime(log_format)

        if random:
            # draw integer second offsets in [0, step_secs)
            step_secs = int(self.step_duration.total_seconds())
            return pd.Series(self.__state.choice(step_secs, size)).apply(make_ts)
        else:
            return pd.Series([self.current_date.strftime(log_format)] * size)

    def n_iterations(self, duration):
        """
        :type duration: pd.Timedelta

        :return: the smallest number of iteration of this clock s.t. the
            corresponding duration is >= duration
        """
        step_secs = self.step_duration.total_seconds()
        return int(np.ceil(duration.total_seconds() / step_secs))

    class ClockOps(object):
        """Namespace of story operations backed by this clock."""
        def __init__(self, clock):
            self.clock = clock

        class Timestamp(AddColumns):
            """AddColumns operation inserting one timestamp column per row
            of the story data, generated by the clock."""
            def __init__(self, clock, named_as, random, log_format):
                AddColumns.__init__(self)
                self.clock = clock
                self.named_as = named_as
                self.random = random
                self.log_format = log_format

            def build_output(self, story_data):
                # one timestamp per story row, preserving the story index
                values = self.clock.get_timestamp(
                    size=story_data.shape[0], random=self.random,
                    log_format=self.log_format).values

                df = pd.DataFrame({self.named_as: values},
                                  index=story_data.index)
                return df

        def timestamp(self, named_as, random=True, log_format=None):
            """
            Generates a random timestamp within the current time slice
            """
            return self.Timestamp(self.clock, named_as, random, log_format)


class CyclicTimerGenerator(DependentGenerator):
    """A TimeProfiler contains an activity profile over a defined time range.
    It's mostly a super class, normally only its child classes should be used.

    The goal of a TimeProfiler is to keep a track of the expected level of
    activity of users over a cyclic time range.
    It will store a vector with probabilities of activity per time step, as
    well as a cumulative sum of the probabilities starting with the current
    time step.

    This allows to quickly produce random waiting times until the next event
    for the users

    """
    def __init__(self, clock, seed, config):
        """
        This should not be used, only child classes

        :type clock: Clock
        :param clock: the master clock driving this simulator

        :type seed: int
        :param seed: seed for random number generator, default None

        :param config: object exposing start_date, profile and
            profile_time_steps (presumably a CyclicTimerProfile — the only
            attributes read here are those three)

        :return: A new TimeProfiler is created
        """
        DependentGenerator.__init__(self)
        self._state = RandomState(seed)
        self.config = config
        self.clock = clock

        # "macro" time shift: we shift the whole profile n times in the future
        # or the past until it overlaps with the current clock date
        init_date = latest_date_before(
            starting_date=config.start_date,
            upper_bound=clock.current_date,
            time_step=pd.Timedelta(config.profile_time_steps) * len(
                config.profile))

        # Un-scaled weight profile. We artificially add a nan to force the
        # up-scaling to multiply the last element
        profile_idx = pd.date_range(start=init_date,
                                    freq=config.profile_time_steps,
                                    periods=len(config.profile) + 1)
        profile_ser = pd.Series(data=config.profile + [np.nan],
                                index=profile_idx)

        # scaled weight profile, s.t.
one clock step == one profile value
        # NOTE(review): .pad() is a legacy alias of .ffill() in pandas —
        # consider migrating when bumping the pandas version
        profile_ser = profile_ser.resample(rule=clock.step_duration).pad()[:-1]

        self.n_time_bin = profile_ser.shape[0]

        # cumulative distribution of activity over one full cycle
        profile_cdf = (profile_ser / profile_ser.sum()).cumsum()
        self.profile = pd.DataFrame({"cdf": profile_cdf,

                                     # for debugging
                                     "timeframe": np.arange(len(profile_cdf))})

        # "micro" time shift: we step forward along the profile until it is
        # aligned with the current date
        while self.profile.index[0] < clock.current_date:
            self.increment()

        # makes sure we'll get notified when the clock goes forward
        clock.register_increment_listener(self)

    def increment(self):
        """
        Increment the time generator by 1 step.

        This has as effect to move the cdf of one step to the left, decrease
        all values by the value of the original first entry, and placing the
        previous first entry at the end of the cdf, with value 1.
        """

        # shift the whole cdf down by the consumed probability mass
        self.profile["cdf"] -= self.profile["cdf"].iloc[0]

        # rotate: the consumed first bin moves to the end of the cycle
        self.profile = pd.concat([self.profile.iloc[1:], self.profile.iloc[:1]])
        self.profile.loc[self.profile.index[-1], "cdf"] = 1

    def generate(self, observations):
        """Generate random waiting times, based on some observed activity
        levels. The higher the level of activity, the shorter the waiting
        times will be

        :type observations: Pandas Series
        :param observations: contains an array of floats
        :return: Pandas Series
        """

        activities = observations

        # activities less often than once per cycle length
        low_activities = activities.where((activities <= 2) & (activities > 0)).dropna()
        if low_activities.shape[0] > 0:

            draw = self._state.uniform(size=low_activities.shape[0])

            # A uniform [0, 2/activity] yields an expected freqs == 1/activity
            # == average period between story.
            # => n_cycles is the number of full timer cycles from now until
            # next story. It's typically not an integer and possibly be > 1
            # since we have on average less than 1 activity per cycle of this
            # timer.
            n_cycles = 2 * draw / low_activities.values

            # split into the fractional position within a cycle and the
            # whole number of cycles to wait
            timer_slots = n_cycles % 1
            n_cycles_int = n_cycles - timer_slots

            # position within the cycle is mapped to a time bin through the
            # inverse cdf; whole cycles add n_time_bin steps each
            timers = self.profile["cdf"].searchsorted(timer_slots) + \
                self.n_time_bin * n_cycles_int

            low_activity_timer = pd.Series(timers, index=low_activities.index)

        else:
            # NOTE(review): dtype-less empty Series defaults to object dtype
            # and is deprecated in recent pandas — confirm before upgrading
            low_activity_timer = pd.Series()

        high_activities = activities.where(activities > 2).dropna()
        if high_activities.shape[0] > 0:

            # A beta(1, activity-1) will yield expected frequencies of
            # 1/(1+activity-1) == 1/activity == average period between story.
            # This just stops to work for activities < 1, or even close to one
            # => we use the uniform mechanism above for activities <= 2 and
            # rely on betas here for expected frequencies of 2 per cycle or
            # higher
            timer_slots = high_activities.apply(
                lambda activity: self._state.beta(1, activity - 1))

            timers = self.profile["cdf"].searchsorted(timer_slots, side="left")
            high_activity_timer = pd.Series(timers, index=high_activities.index)

        else:
            high_activity_timer = pd.Series()

        all_timers = pd.concat([low_activity_timer, high_activity_timer])

        # Not sure about that one, there seem to be a bias somewhere that
        # systematically generates too large timer. Maybe it's a rounding
        # effect of searchsorted() or so. Or a bug elsewhere ?
        all_timers = all_timers.apply(lambda d: max(0, d - 1))

        # makes sure all_timers is in the same order and with the same index
        # as input observations, even in case of duplicate index values
        all_timers = all_timers.reindex_like(observations)
        return all_timers

    def activity(self, n, per):
        """

        :param n: number of stories
        :param per: time period for that number of stories
        :type per: pd.Timedelta
        :return: the activity level corresponding to the specified number of n
            executions per time period
        """

        # activity is expressed per full profile cycle, so rescale the
        # requested rate from "per" to the profile duration
        scale = self.config.duration().total_seconds() / per.total_seconds()
        activity = n * scale

        # warn when the requested rate is finer than the clock resolution:
        # the clock then cannot emit stories fast enough
        requested_period = pd.Timedelta(seconds=per.total_seconds() / n)
        if requested_period < self.clock.step_duration:
            logging.warning(
                "Warning: Creating activity level for {} stories per "
                "{} => activity is {} but period is {}, which is "
                "shorter than the clock period ({}). This clock "
                "cannot keep up with such rate and less events will be"
                " produced".format(n, per, activity, requested_period,
                                   self.clock.step_duration)
            )

        return activity


class CyclicTimerProfile(object):
    """
    Static parameters of the Timer profile. Separated from the timer gen
    itself to facilitate persistence.

    :type profile: python array
    :param profile: Weight of each period

    :type profile_time_steps: string
    :param profile_time_steps: duration of the time-steps in the profile
        (e.g.
"15min")

    :type start_date: pd.Timestamp
    :param start_date: date of the origin of the specified profile =>
        this is used to align with the values of the clock

    """
    def __init__(self, profile, profile_time_steps, start_date):
        self.start_date = start_date
        self.profile = profile
        self.profile_time_steps = profile_time_steps

    def save_to(self, file_path):
        """
        Persists this profile as a CSV file: one row per profile value,
        plus one row each for start_date and profile_time_steps.
        """

        logging.info("saving timer generator to {}".format(file_path))

        # store the profile values as strings under a 2-level index
        # ("profile", position) so scalars can share the same file layout
        saved_df = pd.DataFrame({("value", "profile"): self.profile},
                                dtype=str).stack()
        saved_df.index = saved_df.index.reorder_levels([1, 0])
        saved_df.loc[("start_date", 0)] = self.start_date
        saved_df.loc[("profile_time_steps", 0)] = self.profile_time_steps
        saved_df.to_csv(file_path)

    @staticmethod
    def load_from(file_path):
        """
        Re-builds a CyclicTimerProfile from a CSV produced by save_to().
        """
        saved_df = pd.read_csv(file_path, index_col=[0, 1])

        # profile values were saved as strings => cast back to float
        profile = saved_df.loc[("profile", slice(None))]\
            .unstack()\
            .astype(float)\
            .tolist()

        profile_time_steps = saved_df.loc["profile_time_steps"].values[0][0]
        start_date = pd.Timestamp(saved_df.loc["start_date"].values[0][0])

        return CyclicTimerProfile(profile, profile_time_steps, start_date)

    def duration(self):
        """
        :return: the total duration corresponding to this time profile
        """

        return len(self.profile) * pd.Timedelta(self.profile_time_steps)
--------------------------------------------------------------------------------