├── .DS_Store
├── .gitattributes
├── .gitignore
├── LICENSE
├── README.md
├── common
│   ├── __init__.py
│   ├── operation.py
│   ├── schema.py
│   ├── util.py
│   ├── viz.py
│   ├── vizgraph.py
│   └── vizrequest.py
├── data
│   └── flights.zip
├── datagen.py
├── drivers
│   ├── __init__.py
│   ├── monetdb.py
│   ├── sample.py
│   └── xdb.py
├── idebench.py
├── logo.png
├── reports
│   └── .gitignore
├── results
│   └── .gitignore
├── runconfig_sample.json
├── workflowgen.py
└── workflowgen
    ├── __init__.py
    ├── baseaction.py
    ├── bulkgen.py
    ├── filteraction.py
    ├── linkaction.py
    ├── selectionaction.py
    └── vizaction.py
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IDEBench/IDEBench-public/67339d9b81d0bcbb7b41ce6dc2e55918cf1c498f/.DS_Store
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.csv filter=lfs diff=lfs merge=lfs -text
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | **/__pycache__/*
7 |
8 | _benchmark
9 | node_modules/
10 | _temp/*
11 |
12 | _creation
13 | _idea
14 | flights.csv
15 | flights.csv
16 | output.csv
17 |
18 | margin_errors
19 |
20 | # C extensions
21 | *.so
22 |
23 | # Distribution / packaging
24 | .Python
25 | build/
26 | develop-eggs/
27 | dist/
28 | downloads/
29 | eggs/
30 | .eggs/
31 | lib/
32 | lib64/
33 | parts/
34 | sdist/
35 | var/
36 | wheels/
37 | *.egg-info/
38 | .installed.cfg
39 | *.egg
40 |
41 | # PyInstaller
42 | # Usually these files are written by a python script from a template
43 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
44 | *.manifest
45 | *.spec
46 |
47 | # Installer logs
48 | pip-log.txt
49 | pip-delete-this-directory.txt
50 |
51 | # Unit test / coverage reports
52 | htmlcov/
53 | .tox/
54 | .coverage
55 | .coverage.*
56 | .cache
57 | nosetests.xml
58 | coverage.xml
59 | *.cover
60 | .hypothesis/
61 |
62 | # Translations
63 | *.mo
64 | *.pot
65 |
66 | # Django stuff:
67 | *.log
68 | local_settings.py
69 |
70 | # Flask stuff:
71 | instance/
72 | .webassets-cache
73 |
74 | # Scrapy stuff:
75 | .scrapy
76 |
77 | # Sphinx documentation
78 | docs/_build/
79 |
80 | # PyBuilder
81 | target/
82 |
83 | # Jupyter Notebook
84 | .ipynb_checkpoints
85 |
86 | # pyenv
87 | .python-version
88 |
89 | # celery beat schedule file
90 | celerybeat-schedule
91 |
92 | # SageMath parsed files
93 | *.sage.py
94 |
95 | # Environments
96 | .env
97 | .venv
98 | env/
99 | venv/
100 | ENV/
101 |
102 | # Spyder project settings
103 | .spyderproject
104 | .spyproject
105 |
106 | # Rope project settings
107 | .ropeproject
108 |
109 | # mkdocs documentation
110 | /site
111 |
112 | # mypy
113 | .mypy_cache/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2018 Brown University
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 |
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 |
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | For information on how to use IDEBench, check out our Wiki: http://github.com/IDEBench/IDEBench-public/wiki
6 |
7 |
8 | ## License
9 |
10 | MIT
11 |
12 | ## Cite
13 | Users of IDEBench are requested to use the following BibTeX reference:
14 | ```
15 | @misc{1804.02593,
16 | Author = {Philipp Eichmann and Carsten Binnig and Tim Kraska and Emanuel Zgraggen},
17 | Title = {IDEBench: A Benchmark for Interactive Data Exploration},
18 | Year = {2018},
19 | Eprint = {arXiv:1804.02593},
20 | }
21 | ```
22 |
23 | ## Publications
24 |
25 | Eichmann, Philipp, Carsten Binnig, Tim Kraska, and Emanuel Zgraggen. "IDEBench: A Benchmark for Interactive Data Exploration." arXiv preprint arXiv:1804.02593 (2018).
26 | [PDF](https://arxiv.org/abs/1804.02593)
27 |
28 | Eichmann, Philipp, Emanuel Zgraggen, Zheguang Zhao, Carsten Binnig, and Tim Kraska. "Towards a Benchmark for Interactive Data Exploration." IEEE Data Eng. Bull. 39, no. 4 (2016): 50-61.
29 | [PDF](http://cs.brown.edu/~peichmann/downloads/bide_vision.pdf)
30 |
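31 | ## Quick start
32 | 
33 | A benchmark run is driven either by command-line flags (`--run`, `--driver-name`, `--settings-dataset`, `--settings-workflow`, ...) or by a run-config file, e.g. `python idebench.py --run-config runconfig_sample.json`; see `idebench.py` for the full set of options and the Wiki above for details.
34 | 
35 | Drivers live in the `drivers` package: `idebench.py` imports `drivers.<driver-name>` and instantiates its `IDEBenchDriver` class. Below is a minimal sketch of that interface, modeled on `drivers/sample.py` (the actual SQL execution in `process_request` is left as a placeholder):
36 | 
37 | ```
38 | from common import util
39 | 
40 | class IDEBenchDriver:
41 | 
42 |     def init(self, options, schema, driver_arg):
43 |         pass  # open connections, read driver arguments, etc.
44 | 
45 |     def workflow_start(self):
46 |         pass  # e.g. clear caches before each workflow run
47 | 
48 |     def workflow_end(self):
49 |         pass
50 | 
51 |     def process_request(self, viz_request, options, schema, result_queue):
52 |         # translate the requested visualization into SQL
53 |         sql = viz_request.viz.get_computed_filter_as_sql(schema)
54 |         viz_request.start_time = util.get_current_ms_time()
55 |         # ... execute `sql` against the backing engine and build a {bin_key: aggregates} dict ...
56 |         viz_request.result = {}
57 |         viz_request.end_time = util.get_current_ms_time()
58 |         # hand the (possibly empty) result back to IDEBench
59 |         result_queue.put(viz_request)
60 | ```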
--------------------------------------------------------------------------------
/common/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IDEBench/IDEBench-public/67339d9b81d0bcbb7b41ce6dc2e55918cf1c498f/common/__init__.py
--------------------------------------------------------------------------------
/common/operation.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | class Operation:
4 |
5 | def __init__(self, data):
6 | self.data = data
7 |
8 | def get_viz_name(self):
9 | if "name" in self.data:
10 | return self.data["name"]
11 | return None
12 |
13 | def get_source(self):
14 | return self.data["source"]
15 |
16 | def has_source(self):
17 | return "source" in self.data
18 |
19 | def has_selection(self):
20 | return "selection" in self.data and len(self.data["selection"]) > 0
21 |
22 | def get_selection(self):
23 | return self.data["selection"]
24 |
25 | def has_filter(self):
26 | return "filter" in self.data and len(self.data["filter"]) > 0
27 |
28 | def get_filter(self):
29 | return self.data["filter"]
30 |
31 | def get_source_vizs(self):
32 | sources = self.get_source().replace("(", "").replace(")", "").replace("and", "").replace("or", "").split(" ")
33 | return set([s for s in sources if not s == "" ])
34 |
35 |
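36 | 
37 | # Illustrative sketch: each interaction in a workflow file is a plain dict that
38 | # idebench.py wraps in an Operation. A hypothetical filter interaction on a
39 | # visualization named "viz_0" could look like
40 | #
41 | #     Operation({"name": "viz_0", "filter": "DISTANCE > 500"})
42 | #
43 | # so that has_filter() is True and get_filter() returns the raw filter string;
44 | # "source" and "selection" entries are accessed the same way for linked views.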
--------------------------------------------------------------------------------
/common/schema.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | class Schema:
4 |
5 | def __init__(self, schema_json, is_normalized=False):
6 | self.is_normalized = is_normalized
7 | self.schema_json = schema_json
8 |
9 | def get_fact_table(self):
10 | return self.schema_json["tables"]["fact"]
11 |
12 | def get_fact_table_name(self):
13 | return self.schema_json["tables"]["fact"]["name"]
14 |
15 | def translate_field(self, field_name):
16 |
17 | if not self.is_normalized:
18 | return field_name, None, None
19 |
20 |
21 | for dim_tbl in self.schema_json["tables"]["dimension"]:
22 | for m_idx, mapping in enumerate(dim_tbl["mapping"]):
23 | for f_idx, field in enumerate(mapping["fromFields"]):
24 | if field == field_name:
25 | tbl_alias = "%s%s" % (dim_tbl["name"], m_idx)
26 | tbl_join = "%s.ID = %s.%s" % (tbl_alias, self.get_fact_table_name(), mapping["fk"])
27 | tbl_as = "%s AS %s" % (dim_tbl["name"], tbl_alias)
28 | return ("%s.%s" % (tbl_alias, dim_tbl["columns"][f_idx])), tbl_as, tbl_join
29 | return field_name, self.get_fact_table_name(), None
30 |
31 | def get_tables_for(self, field_name):
32 | if not self.is_normalized:
33 | return ""
34 |
35 | for dim_tbl in self.schema_json["tables"]["dimension"]:
36 | for m_idx, mapping in enumerate(dim_tbl["mapping"]):
37 | for f_idx, field in enumerate(mapping["fromFields"]):
38 | if field == field_name:
39 | tbl_alias = "%s%s" % (dim_tbl["name"], m_idx)
40 | tbl_as = "%s AS %s" % (dim_tbl["name"], tbl_alias)
43 | return ("%s.%s" % (tbl_alias, dim_tbl["columns"][f_idx])), tbl_as
44 |
45 |
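46 | 
47 | # Illustrative sketch: given a normalized schema (is_normalized=True) whose fact table
48 | # is named "flights" and which maps a hypothetical field "ORIGIN" into a dimension
49 | # table, e.g.
50 | #
51 | #     {"tables": {"fact": {"name": "flights"},
52 | #                 "dimension": [{"name": "tbl_airports", "columns": ["code"],
53 | #                                "mapping": [{"fromFields": ["ORIGIN"], "fk": "fk_origin"}]}]}}
54 | #
55 | # translate_field("ORIGIN") returns
56 | #     ("tbl_airports0.code", "tbl_airports AS tbl_airports0", "tbl_airports0.ID = flights.fk_origin")
57 | # i.e. the rewritten column reference, the aliased dimension table for the FROM clause,
58 | # and the join condition against the fact table.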
--------------------------------------------------------------------------------
/common/util.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 | def get_current_ms_time():
4 | return int(round(time.time() * 1000))
--------------------------------------------------------------------------------
/common/viz.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 |
4 | class Viz:
5 |
6 | @staticmethod
7 | def createFromDict(obj):
8 | viz = Viz()
9 | viz.name = "" if "name" not in obj else obj["name"]
10 | viz.source = "" if "source" not in obj else obj["source"]
11 | viz.selection = "" if "selection" not in obj else obj["selection"]
12 | viz.filter = "" if "filter" not in obj else obj["filter"]
13 | viz.computed_filter = ""
14 | viz.binning = [] if "binning" not in obj else obj["binning"]
15 | viz.per_bin_aggregates = [] if "perBinAggregates" not in obj else obj["perBinAggregates"]
16 | return viz
17 |
18 | def __init__(self):
19 | self.name = ""
20 | self.source = ""
21 | self.selection = ""
22 | self.filter = ""
23 | self.computed_filter = ""
24 | self.binning = []
25 | self.per_bin_aggregates = []
26 |
27 | def apply_interaction(self, operation):
28 | self.source = operation.get_source() if operation.has_source() else ""
29 | self.selection = operation.get_selection() if operation.has_selection() else ""
30 | self.filter = operation.get_filter() if operation.has_filter() else ""
31 |
32 | def has_filter(self):
33 | return len(self.filter) > 0
34 |
35 | def get_filter(self):
36 | return self.filter
37 |
38 | def get_computed_filter(self):
39 | return self.computed_filter
40 |
41 | def set_computed_filter(self, filter_str):
42 | self.computed_filter = filter_str
43 |
44 | def get_source(self):
45 | return self.source
46 |
47 | def has_source(self):
48 | return self.source != ""
49 |
50 | def has_selection(self):
51 | return self.selection != ""
52 |
53 | def get_selection(self):
54 | return self.selection
55 |
56 | def get_source_vizs(self):
57 | sources = self.get_source().replace("(", "").replace(")", "").replace("and", "").replace("or", "").split(" ")
58 | return set([s for s in sources if not s == "" ])
59 |
60 | def get_computed_filter_as_sql(self, schema):
61 | bins = []
62 | bin_str = ""
63 | tables = set()
64 | tables.add(schema.get_fact_table_name())
65 | joins = set()
66 | for bin_desc in self.binning:
67 | dimension = bin_desc["dimension"]
68 | bins.append("bin_" + dimension)
69 | if "width" in bin_desc:
70 | bin_width = bin_desc["width"]
71 | bin_str += "FLOOR(%s/%s) AS bin_%s, " % (dimension, bin_width, dimension)
72 | else:
73 | dd, tblas, tjoins = schema.translate_field(dimension)
74 | joins.add(tjoins)
75 | tables.add(tblas)
76 |
77 | bin_str += "%s AS bin_%s, " % (dd, dimension)
78 |
79 | agg_str = ""
80 | for per_bin_aggregate_desc in self.per_bin_aggregates:
81 |
82 | if "dimension" in per_bin_aggregate_desc:
83 | aggregate_dimension = per_bin_aggregate_desc["dimension"]
84 | if per_bin_aggregate_desc["type"] == "count":
85 | agg_str += "COUNT(*) as count, "
86 | elif per_bin_aggregate_desc["type"] == "avg":
87 | aggregate_dimension = per_bin_aggregate_desc["dimension"]
88 | aggregate_dimension2 = aggregate_dimension
89 |
90 | #if storage.normalized:
91 | # aggregate_dimension2 = storage.get_all_tables()[0]["name"] + "." + aggregate_dimension
92 |
93 | agg_str += "AVG(%s) as average_%s, " % (aggregate_dimension2, aggregate_dimension)
94 |
95 | agg_str = agg_str.rstrip(", ")
96 | bins_str = ", ".join(bins)
97 |
98 | sql_statement = "SELECT %s %s " % (bin_str, agg_str)
99 | if schema.is_normalized:
100 |
101 | computed_filter = self.get_computed_filter()
102 | fields = [f["field"] for f in schema.get_fact_table()["fields"]]
103 |
104 | for field in fields:
105 | if field not in computed_filter:
106 | continue
107 |
108 | translation, tblas, tjoins = schema.translate_field(field)
109 | joins.add(tjoins)
110 | tables.add(tblas)
111 |
112 | computed_filter = computed_filter.replace(field + " ", translation + " ")
113 |
114 |
115 | sql_statement += "FROM %s " % ", ".join(tables)
116 | joins = list(filter(None, joins))
117 | if len(joins) > 0:
118 | sql_statement += ("WHERE (%s) AND " % computed_filter) if computed_filter else "WHERE "
119 | sql_statement += "("
120 | sql_statement += " AND ".join(joins) + ") "
121 | else:
122 | sql_statement += " "
123 |
124 | sql_statement += "GROUP BY %s" % bins_str
125 | else:
126 | sql_statement = "SELECT %s %s " % (bin_str, agg_str)
127 | sql_statement += "FROM %s " % schema.get_fact_table_name()
128 | sql_statement += "WHERE (%s) " % self.get_computed_filter() if self.get_computed_filter() else ""
129 | sql_statement += "GROUP BY %s" % bins_str
130 |
131 | return sql_statement
132 |
133 |
134 | def get_computed_filter_as_sql2(self, storage):
135 | bins = []
136 | bin_str = ""
137 | for bin_desc in self.binning:
138 | dimension = bin_desc["dimension"]
139 |
140 | if "width" in bin_desc:
141 | bin_width = bin_desc["width"]
142 | bin_str += "FLOOR(%s/%s), " % (dimension, bin_width)
143 | bins.append("FLOOR(%s/%s)" % (dimension, bin_width))
144 | else:
145 | dd = dimension
146 | bin_str += "%s, " % (dd)
147 | bins.append("%s" % (dd))
148 |
149 | agg_str = ""
150 | for per_bin_aggregate_desc in self.per_bin_aggregates:
151 |
152 | if "dimension" in per_bin_aggregate_desc:
153 | aggregate_dimension = per_bin_aggregate_desc["dimension"]
154 | if per_bin_aggregate_desc["type"] == "count":
155 | agg_str += "COUNT(*) as count_, relative_error(count_),"
156 | elif per_bin_aggregate_desc["type"] == "avg":
157 | aggregate_dimension = per_bin_aggregate_desc["dimension"]
158 | agg_str += "AVG(%s) as average_%s, relative_error(average_%s)" % (aggregate_dimension, aggregate_dimension, aggregate_dimension)
159 |
160 | agg_str = agg_str.rstrip(", ")
161 | bins_str = ", ".join(bins)
162 | sql_statement = "SELECT %s %s " % (bin_str, agg_str)
163 | sql_statement += "FROM %s " % storage.get_fact_table_name()
164 | sql_statement += "WHERE (%s) " % self.get_computed_filter() if self.get_computed_filter() else ""
165 | sql_statement += "GROUP BY %s" % bins_str
166 | return sql_statement + " WITH ERROR 0.1 BEHAVIOR 'do_nothing'"
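167 | 
168 | # Illustrative sketch: for a Viz whose (hypothetical) spec is
169 | #     binning            = [{"dimension": "DEP_DELAY", "width": 10}]
170 | #     per_bin_aggregates = [{"dimension": "DEP_DELAY", "type": "count"}]
171 | # and a non-normalized schema with fact table "flights", get_computed_filter_as_sql()
172 | # produces a statement of the form
173 | #     SELECT FLOOR(DEP_DELAY/10) AS bin_DEP_DELAY, COUNT(*) as count FROM flights GROUP BY bin_DEP_DELAY
174 | # with a WHERE clause added once a computed filter has been set on the viz.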
--------------------------------------------------------------------------------
/common/vizgraph.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | from collections import deque
3 | from collections import OrderedDict
4 | from common.viz import Viz
5 |
6 |
7 | class VizGraph(object):
8 |
9 | def __init__(self):
10 | self._graph = OrderedDict()
11 | self.nodes = set()
12 |
13 | def apply_interaction(self, operation):
14 | # initialize a set of vizs that are affected by this operation
15 | vizs_to_request = OrderedDict()
16 |
17 | if operation.get_viz_name() not in self.get_nodes_dict():
18 | viz = Viz.createFromDict(operation.data)
19 | self.nodes.add(viz)
20 | vizs_to_request[viz] = True
21 |
22 | viz_dict = self.get_nodes_dict()
23 | current_viz = viz_dict[operation.get_viz_name()]
24 |
25 | if operation.has_filter():
26 | current_viz.filter = operation.get_filter()
27 | #vizs_to_request.add(current_viz)
28 | vizs_to_request[current_viz] = True
29 |
30 | # parse source attribute
31 | if operation.has_source():
32 | source = operation.get_source()
33 | if len(source) > 0:
34 | # find all current sources add check which ones have been added/removed
35 | old_sources = current_viz.get_source_vizs()
36 | new_sources = operation.get_source_vizs()
37 | sources_removed = old_sources - new_sources
38 | sources_remained = new_sources.intersection(old_sources)
39 | sources_added = new_sources - sources_remained - sources_removed
40 |
41 | for src in sources_removed:
42 | self.remove_connection(viz_dict[src], current_viz)
43 | #if "remove_link" in self.interface.remove_link:
44 | # self.interface.remove_link(current_viz, viz_dict[src])
45 |
46 | for src in sources_added:
47 | self.add_connection(viz_dict[src], current_viz)
48 | #add_link_method = getattr(self.interface, "add_link", None)
49 | #if callable(add_link_method):
50 | # self.interface.add_link(current_viz, viz_dict[src])
51 | # return
52 |
53 | # update the source of the current viz
54 | current_viz.source = operation.get_source()
55 | #vizs_to_request.add(current_viz)
56 | vizs_to_request[current_viz] = True
57 | #self.process_next_interaction()
58 | return vizs_to_request
59 |
60 | # parse selection
61 | if operation.has_selection():
62 | current_viz.selection = operation.get_selection()
63 |
64 | # find other vizs affected by this selection
65 | vizs_to_request.update(self.update_affected_vizs(current_viz, viz_dict))
66 |
67 | current_viz.set_computed_filter(self.compute_filter(current_viz, viz_dict))
68 |
69 |
70 | # set the parent id of each viz request
71 | #for viz_to_request in vizs_to_request.keys():
72 | # self.parent_operations[viz_to_request] = index
73 |
74 | return vizs_to_request.keys()
75 | #self.viz_requests = []
76 | #for viz in vizs_to_request.keys():
77 | # self.operation_count += 1
78 | # self.viz_requests.append(VizRequest(self.operation_count, index, viz))
79 | #
80 | # self.interface.request_vizs(self.viz_requests)
81 |
82 | def update_affected_vizs(self, current_viz, viz_dict):
83 | dependent_vizs = self.find_dependencies_top_down(current_viz)
84 | vizs_to_request = OrderedDict()
85 | for viz in dependent_vizs:
86 | computed_filter = self.compute_filter(viz, viz_dict)
87 | viz.set_computed_filter(computed_filter)
88 | vizs_to_request[viz] = True
89 |
90 | return vizs_to_request
91 |
92 | def compute_filter(self, viz, viz_dict):
93 |
94 | def compute_filter_inner(start, selections, filter_strs, source_strs):
95 |
96 | if start.has_filter():
97 | filter_strs.append(start.filter)
98 |
99 | if start.selection and not start.selection == "":
100 | selections[start.name] = start.selection
101 |
102 | if not start.source or start.source == "":
103 | return
104 |
105 |
106 | source_strs.extend(start.get_source_vizs())
107 | sources = start.get_source_vizs()
108 |
109 | for src in sources:
110 | compute_filter_inner(viz_dict[src], selections, filter_strs, source_strs)
111 |
112 | source_strs_list = []
113 | selections = {}
114 | filters = []
115 | compute_filter_inner(viz, selections, filters, source_strs_list)
116 |
117 | source_strs = " and ".join(source_strs_list)
118 |
119 | for src in source_strs_list:
120 | if src in selections:
121 | source_strs = source_strs.replace(src, selections[src])
122 | else:
123 | source_strs = source_strs.replace(src, "NULL")
124 | source_strs = source_strs.replace("and NULL", "").replace("NULL and", "").replace("or NULL", "").replace("NULL or", "").replace("NULL", "").strip()
125 | source_strs = source_strs[len("and "):].lstrip() if source_strs.startswith("and ") else source_strs
126 | if len(source_strs) > 0 and len(filters) > 0: source_strs += " AND "
127 | return source_strs + " AND ".join(filters)
128 |
129 | def get_nodes(self):
130 | return self.nodes
131 |
132 | def get_nodes_dict(self):
133 | d = {}
134 | for node in self.get_nodes():
135 | d[node.name] = node
136 | return d
137 |
138 | def remove_connection(self, node1, node2):
139 | self._graph[node1].remove(node2)
140 |
141 | def add_connection(self, node1, node2):
142 | """ Add connection between node1 and node2 """
143 | self.nodes.add(node1)
144 | self.nodes.add(node2)
145 |
146 | if node1 not in self._graph:
147 | self._graph[node1] = OrderedDict()
148 | self._graph[node1][node2] = True
149 |
150 |
151 | def remove(self, node):
152 | """ Remove all references to node """
153 | self.nodes.remove(node)
154 | for cxns in self._graph.keys():
155 | try:
156 | del cxns[node]
157 | except KeyError:
158 | pass
159 | try:
160 | del self._graph[node]
161 | except KeyError:
162 | pass
163 |
164 | def find_dependencies_top_down(self, start):
165 |
166 | if not start in self._graph.keys():
167 | return []
168 |
169 | queue = deque()
170 | queue.append(start)
171 | result = []
172 | while queue:
173 | node = queue.popleft()
174 | result.append(node)
175 | if node in self._graph:
176 | for n in self._graph[node].keys():
177 | queue.append(n)
178 | return result[1:]
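179 | 
180 | # Illustrative sketch: if an earlier interaction linked a viz "viz_1" to "viz_0" via its
181 | # source attribute, a later hypothetical selection "(DISTANCE > 500)" on viz_0 makes
182 | # apply_interaction() return the affected downstream vizs (here viz_1), and compute_filter()
183 | # resolves viz_1's source string to that selection, so viz_1's computed filter becomes
184 | # "(DISTANCE > 500)".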
179 |
--------------------------------------------------------------------------------
/common/vizrequest.py:
--------------------------------------------------------------------------------
1 | import json
2 | from common import util
3 |
4 | class VizRequest:
5 |
6 | def __init__(self, operation_id, parent_operation_id, viz):
7 | self.operation_id = operation_id
8 | self.parent_operation_id = parent_operation_id
9 | self.viz = viz
10 | self.start_time = util.get_current_ms_time()
11 | self.end_time = None
12 | self.result = None
13 | self.margins = None
14 | self.delivered = False
15 | self.bins = None
16 | self.timedout = False
17 | self.t_start = 0
18 | self.t_pause = 0
19 | self.progress = 0
20 |
21 | def toJSON(self):
22 | return json.dumps(self, default=lambda o: o.__dict__, sort_keys=True, indent=4)
--------------------------------------------------------------------------------
/data/flights.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IDEBench/IDEBench-public/67339d9b81d0bcbb7b41ce6dc2e55918cf1c498f/data/flights.zip
--------------------------------------------------------------------------------
/datagen.py:
--------------------------------------------------------------------------------
1 | import json
2 | import math
3 | import random
4 | import itertools
5 | import numpy as np
6 | import pandas as pd
7 | import scipy as sp
8 | import collections
9 | import threading
10 | from scipy.linalg import eigh, cholesky
11 | from scipy.stats import norm
12 | import scipy.interpolate as interpolate
13 | from optparse import OptionParser
14 | import time
15 | import os
16 | from collections import OrderedDict
17 |
18 | current_milli_time = lambda: int(round(time.time() * 1000))
19 |
20 | class DataGen:
21 |
22 | def __init__(self):
23 |
24 | parser = OptionParser()
25 | parser.add_option("-r", "--random-seed", dest="seed", action="store", type=int, help="Random seed", default=41001)
26 | parser.add_option("--normalize", dest="normalize", action="store", help="Path to a CSV file to normalize into fact and dimension tables")
27 | parser.add_option("--prevent-zero", dest="prevent_zero", action="store_true", help="Shift quantitative columns so that their minimum value is zero", default=True)
28 | parser.add_option("-s", "--size", dest="size", action="store", type=int, help="Number of samples to generate", default=50000)
29 | parser.add_option("-b", "--batchsize", dest="batchsize", action="store", type=int, help="Number of samples to process in a single batch", default=100000)
30 | parser.add_option("-x", "--sample-file", dest="samplefile", action="store", help="Path to the sample file", default="data/flights/sample.csv")
31 | parser.add_option("-y", "--sample-descriptor", dest="samplejsonfile", action="store", help="Path to the sample descriptor (JSON) file", default="data/flights/sample.json")
32 | parser.add_option("-n","--num-samples", dest="numsamples", action="store", type=int, help="Number of samples to draw from the original dataset", default=5000000)
33 | parser.add_option("-o", "--output-file", dest="output", action="store", help="The name of the output file", default="dataset.csv")
34 | #parser.add_option("--output-normalized-file", dest="output_normalized", action="store", help="The name of the output file", default="datagen/output_normalized.csv")
35 | (options, args) = parser.parse_args()
36 |
37 | self.options = options
38 |
39 | # read sample json
40 | self.sample_json = None
41 | with open(self.options.samplejsonfile, "r") as f:
42 | self.sample_json = json.load(f)
43 |
44 | if self.options.normalize:
45 | self.normalize()
46 | else:
47 | self.generate_data()
48 |
49 | def normalize(self):
50 |
51 | if "tables" not in self.sample_json:
52 | raise Exception("no tables defined in sample json")
53 |
54 | tables = self.sample_json["tables"]["dimension"]
55 | table_dfs = {}
56 | for tbl in tables:
57 | table_dfs[tbl["name"]] = pd.DataFrame(columns=tbl["columns"])
58 |
59 | for chunk_id, chunk in enumerate(pd.read_csv(self.options.normalize, chunksize=100000, header=0)):
60 | all_from_fields = []
61 | for tbl in tables:
62 |
63 | table_df = table_dfs[tbl["name"]]
64 | for mapping in tbl["mapping"]:
65 | xx = chunk[mapping["fromFields"]]
66 | xx.columns = tbl["columns"]
67 | table_df = table_df.append(xx)
68 | table_df = table_df.drop_duplicates(subset=tbl["columns"])
69 |
70 |
71 | table_df = table_df.reset_index(drop=True)
72 | table_df.index.name = "ID"
73 | table_dfs[tbl["name"]] = table_df
74 | if "tmp_ID" in table_df.columns:
75 | del table_df["tmp_ID"]
76 | table_df.to_csv(tbl["name"], index=True, mode="w")
77 |
78 |
79 | table_df["tmp_ID"] = table_df.index
80 | count = 0
81 | for mapping in tbl["mapping"]:
82 | all_from_fields.extend(mapping["fromFields"])
83 | boo = len(chunk)
84 | beforechunk = chunk
85 | chunk = pd.merge(chunk, table_df, how="left", left_on=mapping["fromFields"], right_on=tbl["columns"])
86 |
87 |
88 | chunk = chunk.rename(columns={'tmp_ID': mapping["fk"]})
89 |
90 | for c in tbl["columns"]:
91 | del chunk[c]
92 |
93 |
94 | for c in all_from_fields:
95 | del chunk[c]
96 |
97 | if chunk_id == 0:
98 | chunk.to_csv(self.options.output, index=False, mode="w")
99 | else:
100 | chunk.to_csv(self.options.output, index=False, header=False, mode="a")
101 |
102 |
103 | def generate_data(self):
104 | # load sample data
105 | self.df = pd.read_csv(self.options.samplefile, nrows=self.options.numsamples, header=0)
106 |
107 | if self.options.prevent_zero:
108 | self.quant_col_names = [ col["field"] for col in self.sample_json["tables"]["fact"]["fields"] if col["type"] == "quantitative" ]
109 | for quant_col_name in self.quant_col_names:
110 | self.df[quant_col_name] = self.df[quant_col_name] - self.df[quant_col_name].min()
111 |
112 | self.cat_col_names = [ col["field"] for col in self.sample_json["tables"]["fact"]["fields"] if col["type"] == "categorical" ]
113 | for cat_col_name in self.cat_col_names:
114 | self.df[cat_col_name] = self.df[cat_col_name].astype("category")
115 |
116 | self.derived_cols = [ col for col in self.sample_json["tables"]["fact"]["fields"] if "deriveFrom" in col ]
117 | self.derivates = {}
118 | for derived_col in self.derived_cols:
119 | kk = self.df.groupby(derived_col["deriveFrom"])[derived_col["field"]].first().to_dict()
120 | self.derivates[derived_col["field"]] = kk
121 |
122 |
123 | self.orgdf = self.df.copy()
124 | self.cat_cols = list(self.orgdf.select_dtypes(include=["category"]).columns)
125 | self.cat_hists = {}
126 | self.cat_hists_keys = {}
127 | self.cat_hists_values = {}
128 |
129 | for cat_col in self.cat_cols:
130 | self.cat_hists[cat_col] = self.df[cat_col].value_counts(normalize=True).to_dict()
131 | self.cat_hists[cat_col] = OrderedDict(sorted(self.cat_hists[cat_col].items(), key=lambda x:x[0]))
132 | self.cat_hists_keys[cat_col] = list(self.cat_hists[cat_col].keys())
133 | self.cat_hists_values[cat_col] = list(self.cat_hists[cat_col].values())
134 | del self.df[cat_col]
135 |
136 | self.means = self.df.mean()
137 | self.stdevs = self.df.std()
138 | np.set_printoptions(suppress=True)
139 |
140 | # z-normalize all data
141 | for idx, col in enumerate(self.df.columns):
142 | self.df[col] = (self.df[col] - self.means[col])/self.stdevs[col]
143 |
144 | self.inv_cdfs = self.get_inverse_cdfs(self.orgdf, self.df)
145 |
146 | # apply a gaussian copula
147 | covariance = self.df.cov()
148 | self.decomposition = cholesky(covariance, lower=True)
149 |
150 | # calculate how many batches we need to compute
151 | num_batches = int(math.ceil(self.options.size / self.options.batchsize))
152 |
153 | st = current_milli_time()
154 | # process all batches
155 | for batch_i in range(num_batches):
156 | print(" %i/%i batches processed." % (batch_i, num_batches))
157 | self.process_batch(batch_i)
158 |
159 | print("done.")
160 | print( (current_milli_time()-st) )
161 |
162 | def process_batch(self, batch_number):
163 |
164 | # Calculate how many samples we need to generate for this batch
165 | if (batch_number+1) * self.options.batchsize > self.options.size:
166 | num_samples_to_generate = self.options.size - (batch_number) * self.options.batchsize
167 | else:
168 | num_samples_to_generate = self.options.batchsize
169 |
170 | # adjust the random seed based on the batch number
171 | np.random.seed(seed=self.options.seed + batch_number)
172 |
173 | data_rnormal = sp.stats.norm.rvs(size=(len(self.df.columns), num_samples_to_generate))
174 | data_rnormal_correlated = np.dot(self.decomposition, data_rnormal)
175 |
176 | # convert each value to the its corresponding value of the normal CDF. We could find the CDF empirically,
177 | # but since our samples follow a normal distribution we know their exact CDF.
178 | stdnormcdf = sp.stats.norm.cdf(data_rnormal_correlated)
179 |
180 | global cc
181 | cc = 0
182 | def apply_inverse(xx):
183 | global cc
184 | res = self.inv_cdfs[cc](xx)
185 | cc += 1
186 | return res
187 |
188 | # apply the inverse (empirical) CDF to the correlated random data
189 | res = np.apply_along_axis(apply_inverse, 1, stdnormcdf)
190 |
191 | # de-normalize
192 | for i, col in enumerate(self.df.columns):
193 | res[i,:] = res[i,:] * self.stdevs[col] + self.means[col]
194 |
195 | # reconstruct categorical values
196 | result_frame = pd.DataFrame(res.T)
197 | result_frame.columns = self.df.columns
198 |
199 | for cat_col in self.cat_cols:
200 | ix = self.orgdf.columns.get_loc(cat_col)
201 | keys = self.cat_hists_keys[cat_col]
202 | values = self.cat_hists_values[cat_col]
203 | xx = np.random.choice(keys, num_samples_to_generate, p=values)
204 | result_frame.insert(self.orgdf.columns.get_loc(cat_col), cat_col, xx)
205 |
206 | for quant_col in self.quant_col_names:
207 | result_frame[quant_col] = result_frame[quant_col].round(decimals=1)
208 |
209 | cast_cols = [ col for col in self.sample_json["tables"]["fact"]["fields"] if "cast" in col ]
210 | for cast_col in cast_cols:
211 | if cast_col["cast"] == "int":
212 | result_frame[cast_col["field"]] = result_frame[cast_col["field"]].astype(int)
213 | else:
214 | raise Exception("unsupported cast")
215 |
216 | for derived_col in self.derived_cols:
217 | new_col = result_frame[derived_col["deriveFrom"]].map(self.derivates[derived_col["field"]])
218 | result_frame[derived_col["field"]] = new_col
219 |
220 |
221 |
222 | if batch_number == 0:
223 | result_frame.to_csv(self.options.output, index=False, mode="w")
224 | else:
225 | result_frame.to_csv(self.options.output, index=False, header=False, mode="a")
226 |
227 | def get_inverse_cdfs(self, orgdf, df):
228 | cdfs = []
229 | for col in df.columns:
230 | num_bins = 1000
231 | hist, bin_edges = np.histogram(df[col], bins=num_bins, density=True)
232 | cdf = np.zeros(bin_edges.shape)
233 | cdf[1:] = np.cumsum(hist*np.diff(bin_edges))
234 | inv_cdf = interpolate.interp1d(cdf, bin_edges)
235 | cdfs.append(inv_cdf)
236 | return cdfs
237 |
238 | DataGen()
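239 | 
240 | # Illustrative invocation (the -x/-y paths are this script's defaults; the output name is arbitrary):
241 | #     python datagen.py -s 1000000 -o dataset_1m.csv -x data/flights/sample.csv -y data/flights/sample.json
242 | # draws a correlated synthetic dataset of one million rows that follows the joint
243 | # distribution of the flights sample via the Gaussian copula fitted in generate_data().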
--------------------------------------------------------------------------------
/drivers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IDEBench/IDEBench-public/67339d9b81d0bcbb7b41ce6dc2e55918cf1c498f/drivers/__init__.py
--------------------------------------------------------------------------------
/drivers/monetdb.py:
--------------------------------------------------------------------------------
1 | import pymonetdb
2 | import datetime, time
3 | import itertools
4 | import csv
5 | import json
6 | import os
7 | import multiprocessing
8 | from subprocess import call
9 | from common import util
10 |
11 | class IDEBenchDriver:
12 |
13 | def init(self, options, schema, driver_arg):
14 | pass
15 |
16 | def create_connection(self):
17 | connection = pymonetdb.connect(username="monetdb", password="monetdb", hostname="localhost", port=50000, database="demo")
18 | cursor = connection.cursor()
19 | return connection, cursor
20 |
21 | def process_request(self, viz_request, options, schema, result_queue):
22 | print("processing..." + str(viz_request.operation_id))
23 | viz = viz_request.viz
24 | sql_statement = viz.get_computed_filter_as_sql(schema)
25 | connection, cursor = self.create_connection()
26 | viz_request.start_time = util.get_current_ms_time()
27 | cursor.execute(sql_statement)
28 | data = cursor.fetchall()
29 | viz_request.end_time = util.get_current_ms_time()
30 | connection.close()
31 |
32 | results = {}
33 | for row in data:
34 | keys = []
35 | for i, bin_desc in enumerate(viz.binning):
36 |
37 | if "width" in bin_desc:
38 | bin_width = bin_desc["width"]
39 | keys.append(str(int(row[i])))
40 | else:
41 | keys.append(str(row[i]))
42 |
43 | key = ",".join(keys)
44 | results[key] = row[len(viz.binning):]
45 |
46 | viz_request.result = results
47 | result_queue.put(viz_request)
48 |
49 | def workflow_start(self):
50 | # clear cache here
51 | pass
52 |
53 | def workflow_end(self):
54 | pass
55 |
--------------------------------------------------------------------------------
/drivers/sample.py:
--------------------------------------------------------------------------------
1 | import time
2 | from common import util
3 |
4 | class IDEBenchDriver:
5 |
6 | def init(self, options, schema, driver_arg):
7 | print("init")
8 | print("table name: %s" %schema.get_fact_table_name())
9 | print("driver arg0: %s" % driver_arg[0])
10 | print("driver arg1: %s" % driver_arg[1])
11 |
12 | def workflow_start(self):
13 | print("workflow start")
14 | pass
15 |
16 | def workflow_end(self):
17 | print("workflow end")
18 | pass
19 |
20 | def process_request(self, viz_request, options, schema, result_queue):
21 | print("processing...")
22 |
23 | # record start time
24 | viz_request.start_time = util.get_current_ms_time()
25 |
26 | # print SQL translation of request and simulate query execution
27 | print(viz_request.viz.get_computed_filter_as_sql(schema))
28 | time.sleep(1)
29 |
30 | # record end time
31 | viz_request.end_time = util.get_current_ms_time()
32 |
33 | # write an empty result to the viz_request
34 | viz_request.result = {}
35 |
36 | # notify IDEBench that processing is done by writing it to the result buffer
37 | result_queue.put(viz_request)
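38 | 
39 | # This driver is selected with --driver-name sample: idebench.py imports
40 | # drivers.<driver-name> and instantiates its IDEBenchDriver class, so a new driver only
41 | # needs to live in this package and implement the same methods with these signatures.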
--------------------------------------------------------------------------------
/drivers/xdb.py:
--------------------------------------------------------------------------------
1 | import datetime, time
2 | import itertools
3 | import psycopg2
4 | import decimal
5 | import os
6 | import multiprocessing
7 | from multiprocessing import Queue
8 | from common import util
9 |
10 | class IDEBenchDriver:
11 |
12 | def init(self, options, schema, driver_arg):
13 | pass
14 |
15 | def workflow_start(self):
16 | print("workflow start")
17 | pass
18 |
19 | def workflow_end(self):
20 | os.system("/usr/local/pgsql/bin/pg_ctl stop -D ~/xdb_data")
21 | os.system('sudo -b bash -c "echo 1 > /proc/sys/vm/drop_caches"')
22 | os.system("/usr/local/pgsql/bin/pg_ctl start -D ~/xdb_data")
23 | pass
24 |
25 | def can_execute_online(self, sql_statement):
26 | return (not " or " in sql_statement.lower()) and (not " AVG(" in sql_statement)
27 |
28 | def create_connection(self, timeout=None):
29 | ip = "localhost"
30 | if timeout is None:
31 | connection = psycopg2.connect("dbname='idebench' port=45001 user='test' host='%s' password='test'" % (ip))
32 | else:
33 | connection = psycopg2.connect("dbname='idebench' port=45001 user='test' host='%s' password='test' options='-c statement_timeout=%i'" % (ip, timeout))
34 | cursor = connection.cursor()
35 | return connection, cursor
36 |
37 | def process_request(self, viz_request, options, schema, out_q):
38 | print("processing..." + str(viz_request.operation_id))
39 | if viz_request.viz.binning:
40 | sql_statement = viz_request.viz.get_computed_filter_as_sql(schema)
41 | sql_statement = sql_statement.replace(schema.get_fact_table_name(), "%s_%s%s" % (schema.get_fact_table_name(), options.settings_size, "n" if options.settings_normalized else "") )
42 | if self.can_execute_online(sql_statement):
43 | sql_statement = sql_statement.replace("SELECT ", "SELECT ONLINE ")
44 | sql_statement += " WITHTIME %s CONFIDENCE 95" % options.settings_time_requirement
45 | sql_statement += " REPORTINTERVAL %s;" % options.settings_time_requirement
46 | connection, cursor = self.create_connection(options.settings_time_requirement + 20)
47 | else:
48 | connection, cursor = self.create_connection(options.settings_time_requirement)
49 | viz_request.start_time = util.get_current_ms_time()
50 | try:
51 | cursor.execute(sql_statement)
52 | except psycopg2.extensions.QueryCanceledError as qce:
53 | viz_request.result = {}
54 | viz_request.margins = {}
55 | viz_request.timedout = True
56 | viz_request.end_time = util.get_current_ms_time()
57 | out_q.put(viz_request)
58 | return
59 |
60 | data = cursor.fetchall()
61 | viz_request.end_time = util.get_current_ms_time()
62 | connection.close()
63 |
64 | results = {}
65 | margins = {}
66 | for row in data:
67 | keys = []
68 |
69 | if row[0] is None:
70 | continue
71 |
72 | startindex = 3 if self.can_execute_online(sql_statement) else 0
73 |
74 | for i, bin_desc in enumerate(viz_request.viz.binning):
75 | if "width" in bin_desc:
76 | bin_width = bin_desc["width"]
77 | keys.append(str(int(row[i+startindex])))
78 | else:
79 | keys.append(str(row[startindex+i]).strip())
80 |
81 | key = ",".join(keys)
82 |
83 | row = list(row)
84 | for i,r in enumerate(row):
85 | if isinstance(r, decimal.Decimal):
86 | row[i] = float(r)
87 | if startindex == 3:
88 | results[key] = row[len(viz_request.viz.binning)+startindex:-1]
89 | else:
90 | results[key] = row[len(viz_request.viz.binning)+startindex:]
91 |
92 | if self.can_execute_online(sql_statement) and startindex == 3:
93 | margins[key] = row[len(row)-1:]
94 |
95 | viz_request.result = results
96 | viz_request.margins = margins
97 | out_q.put(viz_request)
98 | print("delivering...")
99 |
--------------------------------------------------------------------------------
/idebench.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | import json
3 | import csv
4 | import time
5 | import hashlib
6 | import multiprocessing
7 | import statistics
8 | import numpy as np
9 | import os
10 | from common.schema import Schema
11 | from common.vizgraph import VizGraph
12 | from common.vizrequest import VizRequest
13 | from common.operation import Operation
14 | from optparse import OptionParser
15 | from scipy import spatial
16 | import glob
17 | from os.path import basename
18 |
19 | class IDEBench:
20 |
21 | result_queue = multiprocessing.Queue()
22 | def __init__(self):
23 |
24 | parser = OptionParser()
25 | parser.add_option("--driver-name", dest="driver_name", action="store", help="Driver name")
26 | parser.add_option("--driver-create-storage", dest="driver_create_storage", action="store_true", help="Calls create_storage on driver", default=False)
27 | parser.add_option("--driver-clear-storage", dest="driver_clear_storage", action="store_true", help="Calls clear_storage on driver", default=False)
28 | parser.add_option("--driver-clear-cache", dest="driver_clear_cache", action="store_true", help="Calls clear_cache on driver", default=False)
29 | parser.add_option("--driver-args", dest="driver_args", action="store", help="Arguments to pass to the driver", default="")
30 |
31 | parser.add_option("--settings-normalized", dest="settings_normalized", action="store_true", help="Whether joins should be used", default=False)
32 | parser.add_option("--settings-dataset", dest="settings_dataset", action="store", help="Name of the dataset")
33 | parser.add_option("--settings-size", dest="settings_size", default="", action="store", help="Number of rows in the dataset")
34 | parser.add_option("--settings-thinktime", dest="settings_thinktime", type="int", action="store", help="Think-time in seconds between two executions", default=1000)
35 | parser.add_option("--settings-time-requirement", dest="settings_time_requirement", action="store", help="The Time requirement to be used", default=1000)
36 | parser.add_option("--settings-confidence-level", dest="settings_confidence_level", action="store", help="The confidence level to be used", default=95)
37 | parser.add_option("--settings-workflow", dest="settings_workflow", action="store", help="The workflow file to be used")
38 |
39 | parser.add_option("--evaluate", dest="evaluate", action="store_true", help="Evaluates the results of a previous run against the stored ground-truth", default=False)
40 | parser.add_option("--create-full-report", dest="create_report", action="store_true", help="Merges all reports in the reports directory into a single file", default=False)
41 | parser.add_option("--run", dest="run", action="store_true", help="Flag to run the benchmark without config file", default=False)
42 | parser.add_option("--run-config", dest="config", action="store", help="Flag to run the benchmark with the specified config file")
43 | parser.add_option("--groundtruth", dest="groundtruth", action="store_true", help="If set computes the ground-truth for the specified workflow", default=False)
44 |
45 | (self.options, args) = parser.parse_args()
46 |
47 | if not self.options.config:
48 |
49 | if self.options.create_report:
50 | self.create_report()
51 | return
52 |
53 | if not self.options.driver_name:
54 | parser.error("No driver name specified.")
55 |
56 | if not self.options.settings_dataset:
57 | parser.error("No dataset specified.")
58 |
59 | if not self.options.settings_size:
60 | print("Warning: No dataset size specified.")
61 |
62 | if self.options.groundtruth or self.options.run:
63 | self.setup()
64 |
65 | if self.options.groundtruth:
66 |
67 | self.options.settings_thinktime = 1
68 | self.options.settings_time_requirement = 999999
69 |
70 | workflow_files = glob.glob("data/" + self.options.settings_dataset + "/workflows/*.json")
71 |
72 | for workflow_file in workflow_files:
73 | self.options.settings_workflow = basename(workflow_file).split(".")[0]
74 | self.run()
75 |
76 | elif self.options.run:
77 |
78 | if not self.options.settings_workflow:
79 | parser.error("No workflow specified.")
80 |
81 | self.run()
82 | elif self.options.evaluate:
83 | self.evaluate(self.get_config_hash())
84 | else:
85 |
86 | with open(self.options.config) as f:
87 | config = json.load(f)
88 | assure_path_exists("./results")
89 | for d in config["settings-datasets"]:
90 | assure_path_exists("./data/%s/groundtruths" % d)
91 |
92 | # TODO: create pairs instead
93 | for dataset in config["settings-datasets"]:
94 | self.options.settings_dataset = dataset
95 |
96 | for driver_name in config["driver-names"]:
97 | for driver_arg in config["driver-args"]:
98 |
99 | self.options.driver_name = driver_name
100 | self.setup(driver_arg)
101 |
102 | for size in config["settings-sizes"]:
103 | for workflow in config["settings-workflows"]:
104 | for thinktime in config["settings-thinktimes"]:
105 | for time_requirement in config["settings-time-requirements"]:
106 | for confidence_level in config["settings-confidence-levels"]:
107 |
108 | self.options.driver_name = driver_name
109 |
110 | self.options.settings_size = size
111 | self.options.settings_workflow = workflow
112 | self.options.settings_thinktime = thinktime
113 | self.options.settings_time_requirement = time_requirement
114 | self.options.settings_confidence_level = confidence_level
115 | self.options.settings_normalized = config["settings-normalized"]
116 | self.options.groundtruth = config["groundtruth"] if "groundtruth" in config else False
117 | self.options.run = config["run"] if "run" in config else True
118 | self.options.evaluate = config["evaluate"] if "evaluate" in config else True
119 |
120 | if self.options.run:
121 | self.run()
122 |
123 | if self.options.evaluate:
124 | self.evaluate(self.get_config_hash())
125 |
126 | def setup(self, driver_arg = None):
127 | with open(self.get_schema_path()) as f:
128 | self.schema = Schema(json.load(f), self.options.settings_normalized)
129 |
130 | module = importlib.import_module("drivers." + self.options.driver_name)
131 | self.driver = getattr(module, "IDEBenchDriver")()
132 |
133 | try:
134 | self.driver.init(self.options, self.schema, driver_arg)
135 | except AttributeError:
136 | pass
137 |
138 | def run(self):
139 |
140 | try:
141 | self.driver.workflow_start()
142 | except AttributeError:
143 | pass
144 |
145 |
146 | with open(self.get_workflow_path()) as f:
147 | self.workflow_interactions = json.load(f)["interactions"]
148 |
149 | self.vizgraph = VizGraph()
150 | self.operation_results = { "args": vars(self.options), "results": {} }
151 | self.current_interaction_index = 0
152 | self.current_vizrequest_index = 0
153 | self.process_interaction(0)
154 |
155 | def end_run(self):
156 |
157 | try:
158 | self.driver.workflow_end()
159 | except AttributeError:
160 | pass
161 |
162 | path = "results/%s.json" % (self.get_config_hash())
163 |
164 | if not self.options.groundtruth:
165 | with open(path, "w") as fp:
166 | json.dump(self.operation_results, fp)
167 |
168 | if self.options.groundtruth:
169 | path = "data/%s/groundtruths/%s_%s.json" % (self.options.settings_dataset, self.options.settings_size, self.options.settings_workflow)
170 | with open(path, "w") as fp:
171 | json.dump(self.operation_results, fp)
172 |
173 | def process_interaction(self, interaction_index):
174 | print("processing!")
175 | if interaction_index < 0 or interaction_index >= len(self.workflow_interactions):
176 | print("reached end of interactions")
177 | self.end_run()
178 | return
179 |
180 | print("thinking...")
181 | time.sleep(self.options.settings_thinktime / 1000)
182 |
183 | interaction = self.workflow_interactions[interaction_index]
184 | vizs_to_request = self.vizgraph.apply_interaction(Operation(interaction))
185 |
186 | viz_requests = []
187 | for viz in vizs_to_request:
188 | viz_requests.append(VizRequest(self.current_vizrequest_index, self.current_interaction_index, viz))
189 | self.current_vizrequest_index += 1
190 |
191 | #if interaction_index == 0:
192 | # self.result_queue = multiprocessing.Queue()
193 |
194 | # TODO: document this feature
195 | try:
196 | self.driver.before_requests(self.options, self.schema, IDEBench.result_queue)
197 | except AttributeError:
198 | pass
199 |
200 | procs = []
201 | nprocs = len(viz_requests)
202 | if hasattr(self.driver, "use_single_process") and self.driver.use_single_process:
203 | for viz_request in viz_requests:
204 | self.driver.process_request(viz_request, self.options, self.schema, IDEBench.result_queue)
205 | else:
206 | for viz_request in viz_requests:
207 | proc = multiprocessing.Process(target=self.driver.process_request, args=(viz_request, self.options, self.schema, IDEBench.result_queue))
208 | procs.append(proc)
209 | proc.start()
210 |
211 | resultlist = []
212 | for i in range(nprocs):
213 | resultlist.append(IDEBench.result_queue.get())
214 |
215 | for proc in procs:
216 | proc.join()
217 |
218 | self.deliver_viz_request(resultlist)
219 | self.current_interaction_index += 1
220 | self.process_interaction(self.current_interaction_index)
221 |
222 |
223 | def deliver_viz_request(self, viz_requests):
224 |
225 | for viz_request in viz_requests:
226 | if len(viz_request.result.keys()) == 0:
227 | pass
228 |
229 | operation_result = {}
230 | operation_result["id"] = viz_request.operation_id
231 | operation_result["sql"] = viz_request.viz.get_computed_filter_as_sql(self.schema)
232 | operation_result["viz_name"] = viz_request.viz.name
233 | operation_result["parent_operation_id"] = viz_request.parent_operation_id
234 | operation_result["start_time"] = viz_request.start_time
235 | operation_result["end_time"] = viz_request.end_time
236 | operation_result["time_violated"] = viz_request.timedout
237 | operation_result["t_pause"] = viz_request.t_pause
238 | operation_result["t_start"] = viz_request.t_start
239 | operation_result["progress"] = viz_request.progress
240 | operation_result["output"] = viz_request.result
241 | operation_result["margins"] = viz_request.margins
242 | operation_result["num_binning_dimensions"] = len(viz_request.viz.binning)
243 | operation_result["num_aggregates_per_bin"] = len(viz_request.viz.per_bin_aggregates)
244 |
245 | bin_types = []
246 | for viz_bin in viz_request.viz.binning:
247 | if "width" in viz_bin:
248 | bin_types.append("quantitative")
249 | else:
250 | bin_types.append("nominal")
251 | operation_result["binning_type"] = "_".join(sorted(bin_types))
252 |
253 | agg_types = []
254 | for viz_agg in viz_request.viz.per_bin_aggregates:
255 | if viz_agg["type"] == "count":
256 | agg_types.append("count")
257 | elif viz_agg["type"] == "avg":
258 | agg_types.append("avg")
259 | else:
260 | raise Exception()
261 | operation_result["aggregate_type"] = "_".join(sorted(agg_types))
262 |
263 | if viz_request.operation_id not in self.operation_results["results"]:
264 | self.operation_results["results"][viz_request.operation_id] = operation_result
265 |
266 | viz_request.delivered = True
267 |
268 | #self.driver.request_vizs(self.viz_requests)
269 |
270 | def get_config_hash(self):
271 | o = self.options
272 | h = (o.driver_name, o.settings_dataset, o.settings_workflow, o.settings_size, o.settings_normalized, o.settings_confidence_level, o.settings_thinktime, o.settings_thinktime, o.settings_time_requirement)
273 | return hashlib.md5(str(h).encode('utf-8')).hexdigest()
274 |
275 | def get_schema_path(self):
276 | return "data/%s/sample.json" % (self.options.settings_dataset)
277 |
278 | def get_workflow_path(self):
279 | return "data/%s/workflows/%s.json" % (self.options.settings_dataset, self.options.settings_workflow)
280 |
281 | def compute_viz_similarity(self, viz_gt, viz):
282 |
283 | if len(viz.keys()) == 0 and len(viz_gt.keys()) == 0:
284 | return 1
285 |
286 | if len(viz_gt.keys()) == 0 and len(viz.keys()) > 0:
287 | raise Exception()
288 |
289 | if len(viz_gt.keys()) > 0 and len(viz.keys()) == 0:
290 | return 0
291 |
292 | for gt_key in viz_gt.keys():
293 | if gt_key not in viz:
294 | viz[gt_key] = 0
295 |
296 | viz_gt_vals = []
297 | viz_vals = []
298 | for gt_key in viz_gt.keys():
299 | if isinstance(viz_gt[gt_key], list):
300 | viz_gt_vals.append(viz_gt[gt_key][0])
301 | else:
302 | viz_gt_vals.append(viz_gt[gt_key])
303 |
304 | if isinstance(viz[gt_key], list):
305 | viz_vals.append(viz[gt_key][0])
306 | else:
307 | viz_vals.append(viz[gt_key])
308 |
309 | viz_gt_vals = np.array(viz_gt_vals).astype(float)
310 | viz_vals = np.array(viz_vals).astype(float)
311 |
312 |
313 |
314 | #viz_gt_vals = self.normalize(viz_gt_vals)
315 | #viz_vals = self.normalize(viz_vals)
316 |
317 | if np.isnan(viz_gt_vals).any():
318 | raise Exception()
319 |
320 | if np.isnan(viz_vals).any():
321 | raise Exception()
322 |
323 |
324 | #score = np.dot(viz_gt_vals, viz_vals)/ ( np.sqrt(np.sum(np.square(viz_gt_vals))) * np.sqrt(np.sum(np.square(viz_vals))) )
325 | np.seterr(all='raise')
326 | try:
327 | score = 1 - spatial.distance.cosine(viz_gt_vals, viz_vals)
328 | except:
329 | return 0
330 | return score if not np.isnan(score) else 0
331 |
332 | def normalize(self, v):
333 | norm=np.linalg.norm(v, ord=1)
334 | if norm==0:
335 | norm=np.finfo(v.dtype).eps
336 | return v/norm
337 |
338 | def evaluate(self, config_hash):
339 | print("evaluate")
340 | result_json = None
341 | try:
342 | with open("results/%s.json" % config_hash, "r") as json_data:
343 | result_json = json.load(json_data)
344 | except:
345 | print("couldn't load file %s" % ("results/%s.json" % config_hash))
346 | return
347 |
348 | workflow = result_json["args"]["settings_workflow"]
349 | dataset = result_json["args"]["settings_dataset"]
350 | size = result_json["args"]["settings_size"]
351 | time_requirement = result_json["args"]["settings_time_requirement"]
352 |
353 | with open("data/%s/groundtruths/%s_%s.json" % (dataset, size, workflow), "r") as json_data:
354 | groundtruths = json.load(json_data)["results"]
355 |
356 | with open("reports/%s.csv" % config_hash, 'w') as fp:
357 | w = csv.DictWriter(fp, [
358 | "operation_id",
359 | "config_hash",
360 | "interaction_id",
361 | "dataset",
362 | "size",
363 | "viz_name",
364 | "interface",
365 | "think_time",
366 | "time_requirement",
367 | "t_start",
368 | "t_pause",
369 | "workflow",
370 | "start_time",
371 | "end_time",
372 | "duration",
373 | "progress",
374 | "time_violated",
375 | "num_binning_dimensions",
376 | "binning_type",
377 | "has_invalid_bins",
378 | "num_bins_out_of_margin",
379 | "num_bins_delivered",
380 | "num_bins_in_gt",
381 | "missing_bins",
382 | "dissimilarity",
383 | "num_aggregates_per_bin",
384 | "aggregate_type",
385 | "bias",
386 | "rel_error_avg",
387 | "rel_error_stdev",
388 | "rel_error_min",
389 | "rel_error_max",
390 | "margin_avg",
391 | "margin_stdev",
392 | "margin_min",
393 | "margin_max",
394 | "margin_ratio"], delimiter=",", lineterminator="\n")
395 | w.writeheader()
396 |
397 | operations = result_json["results"]
398 |
399 |
400 | for op_number in operations.keys():
401 |
402 | gt_output = groundtruths[op_number]["output"]
403 | operation = operations[op_number]
404 |
405 | margins = []
406 | rel_errors = []
407 | forecast_values = []
408 | actual_values = []
409 | out_of_margin_count = 0
410 |
411 | for gt_bin_identifier, gt_aggregate_results in gt_output.items():
412 |
413 | if gt_bin_identifier in operation["output"]:
414 |
415 | for agg_bin_result_index, agg_bin_result in enumerate(operation["output"][gt_bin_identifier]):
416 | rel_error = None
417 | op_result = operation["output"][gt_bin_identifier][agg_bin_result_index]
418 | gt_result = gt_aggregate_results[agg_bin_result_index]
419 |
420 | if abs(gt_result) > 0:
421 | rel_error = abs(op_result - gt_result)/abs(gt_result)
422 | if rel_error > 1e-5:
423 | pass
424 | rel_errors.append(rel_error)
425 | else:
426 | print("ignoring zero in groundtruth")
427 |
428 | forecast_values.append(op_result)
429 | actual_values.append(gt_result)
430 |
431 | if operation["margins"] and gt_bin_identifier in operation["margins"]:
432 | op_margin = float(operation["margins"][gt_bin_identifier][agg_bin_result_index])
433 |
434 | if np.isnan(op_margin) or np.isinf(op_margin) or abs(op_margin) > 1000000:
435 | if os.path.exists("./margin_errors"):
436 | append_write = 'a' # append if already exists
437 | else:
438 | append_write = 'w' # make a new file if not
439 | with open("./margin_errors", append_write) as ffff:
440 | ffff.writelines(self.options.settings_workflow + "\n" + str(operation["margins"][gt_bin_identifier][agg_bin_result_index]) + "\n")
441 |
442 | elif gt_result + 1e-6 < op_result - abs(op_result * op_margin) or gt_result - 1e-6 > op_result + abs(op_result * op_margin):
443 | out_of_margin_count += 1
444 | margins.append(abs(op_margin))
445 | else:
446 | margins.append(abs(op_margin))
447 |
448 |
449 | else:
450 | pass
451 | # add error as many times as a bin was expected!
452 | #rel_errors.extend( [ 1 for n in range(len(gt_aggregate_results)) ] )
453 |
454 | # invalid bins test
455 | has_invalid_bins = False
456 | num_invalid = 0
457 | inv = []
458 |
459 | for kk in operation["output"].keys():
460 | if kk not in gt_output:
461 | has_invalid_bins = True
462 | num_invalid += 1
463 | inv.append(kk)
464 |
465 | print(self.options.settings_workflow)
466 | print(str(operation["id"]))
467 | print("invalid key:" + kk)
468 | print(operation["sql"])
469 | print(operation["output"])
470 | os._exit(0)
471 |
472 | args = result_json["args"]
473 |
474 | missing_bins = 1 - len(operation["output"].keys()) / len(gt_output.keys()) if len(gt_output.keys()) > 0 else 0
475 | op_eval_result = {}
476 | op_eval_result["operation_id"] = operation["id"]
477 | op_eval_result["config_hash"] = self.get_config_hash()
478 | op_eval_result["interaction_id"] = operation["parent_operation_id"]
479 | op_eval_result["dataset"] = args["settings_dataset"]
480 | op_eval_result["size"] = args["settings_size"]
481 | op_eval_result["viz_name"] = operation["viz_name"]
482 | op_eval_result["think_time"] = args["settings_thinktime"]
483 | op_eval_result["time_requirement"] = args["settings_time_requirement"]
484 | op_eval_result["interface"] = args["driver_name"]
485 | op_eval_result["workflow"] = args["settings_workflow"]
486 | op_eval_result["start_time"] = operation["start_time"]
487 | op_eval_result["end_time"] = operation["end_time"]
488 | op_eval_result["t_pause"] = operation["t_pause"] if "t_pause" in operation else 0
489 | op_eval_result["t_start"] = operation["t_start"] if "t_start" in operation else 0
490 | op_eval_result["duration"] = operation["end_time"] - operation["start_time"]
491 |
492 | if "time_violated" in operation:
493 | op_eval_result["time_violated"] = operation["time_violated"]
494 | elif "timedout" in operation:
495 | op_eval_result["time_violated"] = operation["timedout"]
496 | else:
497 |                     raise Exception("operation is missing both 'time_violated' and 'timedout'")
498 |
499 | op_eval_result["has_invalid_bins"] = has_invalid_bins
500 | op_eval_result["binning_type"] = operation["binning_type"]
501 | op_eval_result["aggregate_type"] = operation["aggregate_type"]
502 | op_eval_result["num_bins_delivered"] = len(operation["output"].keys())
503 | op_eval_result["num_bins_in_gt"] = len(gt_output.items())
504 | op_eval_result["missing_bins"] = "%.5f" % missing_bins
505 |
506 |                 op_eval_result["dissimilarity"] = "%.5f" % (1 - self.compute_viz_similarity(gt_output, operation["output"]))
507 |
508 | op_eval_result["num_bins_out_of_margin"] = "%i" % out_of_margin_count
509 | op_eval_result["num_aggregates_per_bin"] = operation["num_aggregates_per_bin"]
510 | op_eval_result["num_binning_dimensions"] = operation["num_binning_dimensions"]
511 | op_eval_result["progress"] = "%.5f" % operation["progress"]
512 |                 op_eval_result["bias"] = "%.5f" % (sum(forecast_values) / sum(actual_values) - 1) if sum(actual_values) != 0 else 0
513 | op_eval_result["rel_error_stdev"] = "%.5f" % statistics.stdev(rel_errors) if len(rel_errors) > 1 else 0.0
514 | op_eval_result["rel_error_min"] = "%.5f" % min(rel_errors) if len(rel_errors) > 0 else 0
515 | op_eval_result["rel_error_max"] = "%.5f" % max(rel_errors) if len(rel_errors) > 0 else 0
516 | op_eval_result["rel_error_avg"] = "%.5f" % float(sum(rel_errors) / float(len(rel_errors))) if len(rel_errors) > 0 else 0
517 | op_eval_result["margin_stdev"] = "%.5f" % statistics.stdev(margins) if len(margins) > 1 else 0.0
518 | op_eval_result["margin_min"] = "%.5f" % min(margins) if len(margins) > 0 else 0.0
519 | op_eval_result["margin_max"] = "%.5f" % max(margins) if len(margins) > 0 else 0.0
520 | op_eval_result["margin_avg"] = "%.5f" % float(sum(margins) / float(len(margins))) if len(margins) > 0 else 0.0
521 | op_eval_result["margin_ratio"] = "%.5f" % float(len(operation["margins"]) / len(operation["output"])) if operation["margins"] and len(operation["output"]) > 0 else 1
522 | w.writerow(op_eval_result)
523 |
524 | def create_report(self):
525 | header_saved = False
526 | interesting_files = glob.glob("reports/*.csv")
527 | with open('./full_report.csv','w') as fout:
528 | for filename in interesting_files:
529 | print(filename)
530 | with open(filename) as fin:
531 | header = next(fin)
532 | if not header_saved:
533 | print(header)
534 | fout.write(header)
535 | header_saved = True
536 | for line in fin:
537 | fout.write(line)
538 | print("saved report")
539 |
540 |
541 | def assure_path_exists(path):
542 | d = os.path.dirname(path)
543 | if not os.path.exists(d):
544 | os.makedirs(d)
545 |
546 | if __name__ == '__main__':
547 | IDEBench()
548 |
549 |
--------------------------------------------------------------------------------
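The evaluation loop in idebench.py above boils down to a few per-operation formulas over matched bins: the relative error per aggregate is |delivered - ground truth| / |ground truth| (bins whose ground-truth value is zero are skipped), bias is sum(delivered) / sum(ground truth) - 1, and missing_bins is the fraction of ground-truth bins the driver never returned. A minimal, self-contained sketch of those three calculations; the bin identifiers and values below are invented purely for illustration:

# Sketch of the error metrics computed by evaluate(); toy data, not repo output.
gt_output = {"0,1": [100.0, 4.2], "0,2": [80.0, 3.9], "1,2": [0.0, 5.0]}
op_output = {"0,1": [90.0, 4.0], "0,2": [88.0, 4.1]}   # one ground-truth bin missing

rel_errors, forecast, actual = [], [], []
for bin_id, gt_vals in gt_output.items():
    if bin_id not in op_output:
        continue
    for i, gt_val in enumerate(gt_vals):
        op_val = op_output[bin_id][i]
        if abs(gt_val) > 0:                 # zeros in the ground truth are skipped
            rel_errors.append(abs(op_val - gt_val) / abs(gt_val))
        forecast.append(op_val)
        actual.append(gt_val)

rel_error_avg = sum(rel_errors) / len(rel_errors) if rel_errors else 0.0
bias = sum(forecast) / sum(actual) - 1 if actual else 0.0
missing_bins = 1 - len(op_output) / len(gt_output) if gt_output else 0.0
print(rel_error_avg, bias, missing_bins)

With these toy numbers the sketch reports an average relative error of roughly 7.5%, a small negative bias, and one missing bin out of three.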
/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IDEBench/IDEBench-public/67339d9b81d0bcbb7b41ce6dc2e55918cf1c498f/logo.png
--------------------------------------------------------------------------------
/reports/.gitignore:
--------------------------------------------------------------------------------
1 | !.gitignore
--------------------------------------------------------------------------------
/results/.gitignore:
--------------------------------------------------------------------------------
1 | !.gitignore
--------------------------------------------------------------------------------
/runconfig_sample.json:
--------------------------------------------------------------------------------
1 | {
2 | "driver-names": ["sample"],
3 | "driver-args": [[100, "arg2"]],
4 | "settings-datasets": ["flights"],
5 | "settings-workflows": ["test"],
6 | "settings-sizes": ["500k"],
7 | "settings-thinktimes": [1000],
8 | "settings-time-requirements": [1000],
9 | "settings-confidence-levels": [95],
10 | "settings-normalized": false
11 | }
--------------------------------------------------------------------------------
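The plural keys in runconfig_sample.json (driver-names, settings-datasets, settings-workflows, settings-sizes, and so on) each hold a list of values to benchmark. One plausible way to read such a config is to expand it into one run per combination of values; the actual expansion is performed inside idebench.py and may differ in detail, so treat this as an illustrative sketch only:

# Illustrative only: expand a run config like runconfig_sample.json into one
# (driver, dataset, workflow, size, thinktime, time requirement) tuple per run.
import itertools
import json

with open("runconfig_sample.json") as f:
    cfg = json.load(f)

runs = itertools.product(cfg["driver-names"], cfg["settings-datasets"],
                         cfg["settings-workflows"], cfg["settings-sizes"],
                         cfg["settings-thinktimes"], cfg["settings-time-requirements"])
for driver, dataset, workflow, size, thinktime, time_req in runs:
    print(driver, dataset, workflow, size, thinktime, time_req)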
/workflowgen.py:
--------------------------------------------------------------------------------
1 | import random
2 | import numpy as np
3 | import pprint
4 | import json
5 | from workflowgen.vizaction import VizAction
6 | from workflowgen.linkaction import LinkAction
7 | from optparse import OptionParser
8 | import pandas as pd
9 | from common.schema import Schema
10 | from common.vizgraph import VizGraph
11 | #from common.storage import Storage
12 | import pandasql
13 |
14 |
15 | class WorkflowGenerator:
16 |
17 | def __init__(self):
18 |
19 | parser = OptionParser()
20 | parser.add_option("-r", "--seed", dest="seed", action="store", type=int, help="Random seed", default=25000)
21 |         parser.add_option("-d", "--dataset", dest="data_folder", action="store", help="dataset folder under data/", default="flights")
22 | parser.add_option("--debug", dest="debug", action="store_true", help="creates a debug file", default=False)
23 | parser.add_option("-n", "--num-operations", dest="num_operations", action="store", type=int, help="Number of operations to generate", default=20)
24 |         parser.add_option("-c", "--workflow-type", dest="config", action="store", help="workflow-type config file name (resolved under data/<dataset>/workflowtypes/)", default="sequential.json")
25 | parser.add_option("-p", "--output", dest="path", action="store", help="path to save the file", default="workflow.json")
26 | parser.add_option("-s", "--num-samples", dest="numsamples", action="store", type=int, help="Number of samples to draw from the original dataset", default=10000)
27 | (options, args) = parser.parse_args()
28 | self.options = options
29 |
30 | random.seed(options.seed)
31 | np.random.seed(seed=options.seed)
32 |
33 |         print("data/" + options.data_folder + "/workflowtypes/" + options.config)
34 | with open("data/" + options.data_folder + "/workflowtypes/" + options.config, "r") as fp:
35 | self.config = json.load(fp)
36 |
37 | schema = None
38 | with open(self.get_schema_path()) as f:
39 | schema = Schema(json.load(f))
40 |
41 | print("reading csv...")
42 | # load sample data
43 | df = pd.read_csv("data/" + options.data_folder + "/sample.csv", nrows=options.numsamples, header=0)
44 |
45 | #schema = {"tables": [{ "name": "df", "dimensions": []}]}
46 | sample_json = None
47 | with open("data/" + options.data_folder + "/sample.json", "r") as f:
48 | sample_json = json.load(f)
49 | # for field in sample_json["tables"]["fact"]["fields"]:
50 | # schema["tables"][0]["dimensions"].append({"name": field["field"]})
51 |
52 |
53 | #storage = Storage(schema)
54 |
55 | zero_qs_ratio = 100
56 |
57 | tries = -1
58 | while zero_qs_ratio > 0.15:
59 | tries += 1
60 | num_zeros_qs = 0
61 | num_qs = 0
62 | VizAction.VIZ_COUNTER = -1
63 | LinkAction.FIRST_LINK = None
64 | LinkAction.LATEST_LINK = None
65 | LinkAction.LINKS = set()
66 |
67 | vizgraph = VizGraph()
68 | random.seed(options.seed + tries)
69 | root = VizAction(self.config, df, vizgraph, schema, sample_json)
70 | current = root
71 | states = []
72 |
73 | num_ops = 0
74 |
75 | debug_states = []
76 | while num_ops < options.num_operations:
77 | res = current.get_states()
78 | if res:
79 | affected_vizs = vizgraph.apply_interaction(res)
80 | if options.debug:
81 | nodes_dict = vizgraph.get_nodes_dict()
82 | states_dict = {}
83 | for n in nodes_dict.keys():
84 | states_dict[n] = {
85 | "name":n,
86 | "source" : nodes_dict[n].get_source(),
87 | "binning": nodes_dict[n].binning,
88 | "agg": nodes_dict[n].per_bin_aggregates,
89 | "selection": nodes_dict[n].get_selection(),
90 | "filter": nodes_dict[n].get_filter(),
91 | "computed_filter": nodes_dict[n].get_computed_filter_as_sql(schema),
92 | }
93 | debug_states.append(states_dict)
94 |
95 | for x in affected_vizs:
96 | sql = x.get_computed_filter_as_sql(schema).replace("FLOOR", "ROUND").replace(schema.get_fact_table_name(), "df")
97 | r = pandasql.sqldf(sql, locals())
98 | num_qs += 1
99 | if len(r.index) == 0:
100 | num_zeros_qs += 1
101 |
102 | states.append(res.data)
103 | #if "source" not in res:
104 | num_ops += 1
105 |
106 | current = current.get_next()
107 | if current is None:
108 | zero_qs_ratio = num_zeros_qs/num_qs
109 | break
110 | zero_qs_ratio = num_zeros_qs/num_qs
111 |
112 |
113 | with open("data/" + options.data_folder + "/workflows/" + options.path + ".json", "w") as fp:
114 | fp.write(json.dumps({"name": "generated", "dataset": options.data_folder, "seed": options.seed, "config": options.config, "interactions": states}))
115 |
116 | print("done.")
117 | #with open("workflowviewer/public/workflow.json", "w") as fp:
118 | # fp.write(json.dumps({"name": "generated", "dataset": options.data_folder, "seed": options.seed, "config": options.config, "interactions": states}))
119 |
120 | #with open("workflowviewer/public/workflow_debug.json", "w") as fp:
121 | # fp.write(json.dumps(debug_states))
122 |
123 | #if options.debug:
124 | # import webbrowser
125 | # url = "http://localhost:3000"
126 | # webbrowser.open(url)
127 |
128 | def get_schema_path(self):
129 | return "data/%s/sample.json" % (self.options.data_folder)
130 |
131 | def get_viz_name(self):
132 | return "viz_%i" % self.config["viz_counter"]
133 |
134 | WorkflowGenerator()
--------------------------------------------------------------------------------
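For reference, workflowgen.py can be driven the same way bulkgen.py drives it, by shelling out with the options its OptionParser defines (-d dataset folder, -c workflow-type file, -n operation count, -r seed, -s sample count, -p output name). A minimal sketch; the concrete values are placeholders, not files guaranteed to ship with the repository:

# Sketch of invoking workflowgen.py with the options defined above; the dataset,
# workflow-type file and output name are placeholder values.
import os

os.system("python workflowgen.py -d flights -c sequential.json "
          "-n 20 -r 25000 -s 10000 -p my_workflow")
# If it succeeds, the generated workflow lands in data/flights/workflows/my_workflow.json.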
/workflowgen/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IDEBench/IDEBench-public/67339d9b81d0bcbb7b41ce6dc2e55918cf1c498f/workflowgen/__init__.py
--------------------------------------------------------------------------------
/workflowgen/baseaction.py:
--------------------------------------------------------------------------------
1 | import random
2 | import importlib
3 | import numpy as np
4 |
5 | class BaseAction:
6 |
7 | def __init__(self, config, df, vizgraph, storage, sample_json):
8 | self.config = config
9 | self.df = df
10 | self.vizgraph = vizgraph
11 | self.storage = storage
12 | self.sample_json = sample_json
13 |
14 | def get_next(self):
15 | pick = self.pick(self.config["nextAction"]["values"], self.config["nextAction"]["pd"])
16 | pick_split = pick.split(".")
17 | module = importlib.import_module(pick_split[0] + "." + pick_split[1])
18 | return getattr(module, pick_split[2])(self.config, self.df, self.vizgraph, self.storage, self.sample_json)
19 |
20 | def get_states(self):
21 | return []
22 |
23 | def pick(self, choices, pd=None):
24 | if pd is None:
25 | return random.choice(choices)
26 |
27 | total = sum(pd)
28 | r = random.uniform(0, total)
29 | upto = 0
30 | for i, c in enumerate(choices):
31 | if upto + pd[i] >= r:
32 | return c
33 | upto += pd[i]
34 | assert False, "Shouldn't get here"
35 |
36 | def pick_range(self, val_min, val_max):
37 | delta = val_max - val_min
38 | selectionrange = max(0, min(np.random.normal(loc=0.5, scale=0.25),1))
39 | selectionstart = random.uniform(0, 1 - selectionrange)
40 | return val_min + delta * selectionstart, (1 - selectionrange) * delta
41 |
--------------------------------------------------------------------------------
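BaseAction.pick implements roulette-wheel selection: given a weight vector pd it draws r uniformly from [0, sum(pd)) and returns the first choice whose cumulative weight reaches r; without pd it falls back to a uniform random.choice. A standalone sketch of the same selection logic, with made-up action names and weights:

# Standalone sketch of the weighted pick used by BaseAction.pick; the choice
# labels and weights below are illustrative only.
import random

def pick(choices, pd=None):
    if pd is None:
        return random.choice(choices)
    total = sum(pd)
    r = random.uniform(0, total)
    upto = 0
    for choice, weight in zip(choices, pd):
        if upto + weight >= r:
            return choice
        upto += weight
    assert False, "Shouldn't get here"

# Over many draws the counts approach a 70/20/10 split.
counts = {name: 0 for name in ["filter", "selection", "link"]}
for _ in range(10000):
    counts[pick(["filter", "selection", "link"], [0.7, 0.2, 0.1])] += 1
print(counts)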
/workflowgen/bulkgen.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | t = "inverse_bushy"
4 | config = "%s.json" % t  # resolved by workflowgen.py under data/<dataset>/workflowtypes/
5 | sizes = [25]
6 | for size in sizes:
7 | for n in range(10):
8 | seed = 1000 * size + n
9 |         os.system("python workflowgen.py -n %i -r %i -c %s -p %s" % (size, seed, config, ("%s_%i_r%i" % (t, size, seed))))
--------------------------------------------------------------------------------
/workflowgen/filteraction.py:
--------------------------------------------------------------------------------
1 | import random
2 | import math
3 | import numpy as np
4 | import pandasql
5 | from collections import OrderedDict
6 | from workflowgen.baseaction import BaseAction
7 | from workflowgen.vizaction import VizAction
8 | from common.operation import Operation
9 |
10 | class FilterAction(BaseAction):
11 |
12 | def get_states(self):
13 |
14 | src_viz_num = random.randint(0, VizAction.VIZ_COUNTER)
15 | src_viz = list(self.vizgraph.get_nodes())[src_viz_num]
16 | computed_filter = src_viz.get_computed_filter()
17 | df = self.df
18 | sql_statement = "SELECT * FROM df "
19 | if len(computed_filter) > 0:
20 | sql_statement += "WHERE " + computed_filter
21 |
22 | df_result = pandasql.sqldf(sql_statement, locals())
23 |
24 | if df_result.empty:
25 | return None
26 |
27 | filter_per_dim = []
28 |
29 | for bin_dim in range(len(src_viz.binning)):
30 | filters = []
31 | dim = src_viz.binning[bin_dim]["dimension"]
32 | field = list(filter(lambda x: x["field"] == dim, self.sample_json["tables"]["fact"]["fields"]))[0]
33 | if field["type"] == "quantitative":
34 | bin_width = float(src_viz.binning[bin_dim]["width"])
35 | min_val = df_result[dim].min()
36 | max_val = df_result[dim].max()
37 |
38 | min_index = math.floor(min_val / bin_width)
39 | max_index = math.floor(max_val / bin_width)
40 | num_bins = 0
41 | if np.random.rand() < 0.4:
42 | num_bins = 1
43 | else:
44 | num_bins = random.randint(1, max_index-min_index) if max_index > min_index else 1
45 | selected_bins = np.random.choice(np.arange(min_index, max_index + 1), size=num_bins, replace=False)
46 |
47 | for selected_bin in selected_bins:
48 | range_min = selected_bin * bin_width
49 | range_max = (selected_bin + 1) * bin_width
50 | filt = "(%s >= %s and %s < %s)" % (dim, '{:.1f}'.format(range_min), dim, '{:.1f}'.format(range_max))
51 | filters.append(filt)
52 | else:
53 | all_bins = df_result[dim].unique().tolist()
54 | num_bins = random.randint(1, len(all_bins))
55 | selected_bins = np.random.choice(all_bins, size=num_bins, replace=False)
56 | for selected_bin in list(selected_bins):
57 | filt = "(%s = '%s')" % (dim, selected_bin)
58 | filters.append(filt)
59 | filter_per_dim.append(" or ".join(filters))
60 | filter_per_dim = ["(%s)" % f for f in filter_per_dim]
61 |
62 | return Operation(OrderedDict({"name": ("viz_%s" % src_viz_num), "filter": " and ".join(filter_per_dim)}))
--------------------------------------------------------------------------------
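For quantitative dimensions, FilterAction maps each selected bin index back to a half-open value range: bin k of width w covers [k*w, (k+1)*w), and the predicate string is built from those two endpoints (selectionaction.py below repeats the same arithmetic). A short sketch of that mapping in isolation; the dimension name, bin width and values are invented:

# Sketch of turning a bin index into a filter predicate for a quantitative
# dimension; "DISTANCE" and the numbers are illustrative values only.
import math

dim, bin_width = "DISTANCE", 250.0
min_val, max_val = 80.0, 2400.0

min_index = math.floor(min_val / bin_width)   # 0
max_index = math.floor(max_val / bin_width)   # 9

selected_bin = 3
range_min = selected_bin * bin_width          # 750.0
range_max = (selected_bin + 1) * bin_width    # 1000.0
predicate = "(%s >= %s and %s < %s)" % (dim, '{:.1f}'.format(range_min),
                                        dim, '{:.1f}'.format(range_max))
print(predicate)   # (DISTANCE >= 750.0 and DISTANCE < 1000.0)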
/workflowgen/linkaction.py:
--------------------------------------------------------------------------------
1 | import random
2 | from collections import OrderedDict
3 | from common.operation import Operation
4 | from workflowgen.baseaction import BaseAction
5 | from workflowgen.vizaction import VizAction
6 |
7 | class LinkAction(BaseAction):
8 |
9 | def get_states(self):
10 |
11 | if VizAction.VIZ_COUNTER < 1:
12 | return None
13 |
14 | pick_from = -1
15 | pick_to = -1
16 | link_type = None
17 | while (pick_from == -1 or pick_to == -1):
18 | from_candidate = random.randint(0, VizAction.VIZ_COUNTER)
19 | to_candidate = random.randint(0, VizAction.VIZ_COUNTER)
20 |
21 |
22 | link_type_names = [l["name"] for l in self.config["linkType"]]
23 | link_type_pds = [l["p"] for l in self.config["linkType"]]
24 |
25 | link_type = self.pick(link_type_names, link_type_pds)
26 | print(link_type)
27 |
28 | if link_type == "sequential" and LinkAction.LATEST_LINK:
29 | from_candidate = LinkAction.LATEST_LINK[1]
30 | elif link_type == "1n" and LinkAction.LATEST_LINK:
31 | from_candidate = LinkAction.LATEST_LINK[0]
32 | elif link_type == "n1" and LinkAction.FIRST_LINK:
33 | to_candidate = LinkAction.FIRST_LINK[1]
34 |
35 | num_tries = 10
36 | giveup = False
37 | g = {}
38 | for i in range(num_tries+1):
39 |
40 | g = {}
41 | for l in LinkAction.LINKS:
42 | if l[0] not in g:
43 | g[l[0]] = []
44 | g[l[0]].append(l[1])
45 |
46 | if from_candidate not in g:
47 | g[from_candidate] = []
48 | g[from_candidate].append(to_candidate)
49 |
50 | if self.cyclic(g):
51 | if link_type == "n1" and LinkAction.FIRST_LINK:
52 | to_candidate = LinkAction.FIRST_LINK[1]
53 | else:
54 | to_candidate = random.randint(0, VizAction.VIZ_COUNTER)
55 | if i == num_tries:
56 |
57 | giveup = True
58 | else:
59 | break
60 |
61 | if giveup:
62 | print("giving up!")
63 | break
64 |
65 | if from_candidate != to_candidate and ((to_candidate, from_candidate) not in LinkAction.LINKS) and ((from_candidate, to_candidate) not in LinkAction.LINKS):
66 |
67 | pick_from = from_candidate
68 | pick_to = to_candidate
69 |
70 | if not LinkAction.FIRST_LINK:
71 | LinkAction.FIRST_LINK = (pick_from, pick_to)
72 | LinkAction.LATEST_LINK = (pick_from, pick_to)
73 | LinkAction.LINKS.add(LinkAction.LATEST_LINK)
74 | break
75 |
76 | if len(LinkAction.LINKS) >= VizAction.VIZ_COUNTER-1:
77 | break
78 |
79 | if (pick_from == -1 or pick_to == -1):
80 | return None
81 |
82 | incoming_links = [ "viz_" + str(l[0]) for l in filter(lambda x: x[1] == pick_to, LinkAction.LINKS)]
83 | combined_filters = Operation(OrderedDict({"name": "viz_" + str(pick_to), "source": ( " and ".join(incoming_links))}))
84 | return combined_filters
85 |
86 | def cyclic(self, g):
87 | path = set()
88 | def visit(vertex):
89 | path.add(vertex)
90 | for neighbour in g.get(vertex, ()):
91 | if neighbour in path or visit(neighbour):
92 | return True
93 | path.remove(vertex)
94 | return False
95 |
96 | return any(visit(v) for v in g)
97 |
98 | LinkAction.FIRST_LINK = None
99 | LinkAction.LATEST_LINK = None
100 | LinkAction.LINKS = set()
101 |
--------------------------------------------------------------------------------
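LinkAction.cyclic runs a depth-first search over the adjacency dict g (viz index -> list of downstream viz indices) and returns True as soon as a path revisits a vertex already on the current DFS stack; a candidate link that would introduce such a cycle is retried or dropped. A minimal demonstration of the same check on toy graphs:

# Toy demonstration of the DFS cycle check used by LinkAction.cyclic; the
# adjacency dicts below are made up.
def cyclic(g):
    path = set()
    def visit(vertex):
        path.add(vertex)
        for neighbour in g.get(vertex, ()):
            if neighbour in path or visit(neighbour):
                return True
        path.remove(vertex)
        return False
    return any(visit(v) for v in g)

print(cyclic({0: [1], 1: [2]}))           # False: 0 -> 1 -> 2 is acyclic
print(cyclic({0: [1], 1: [2], 2: [0]}))   # True: 0 -> 1 -> 2 -> 0 closes a cycle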
/workflowgen/selectionaction.py:
--------------------------------------------------------------------------------
1 | import random
2 | from collections import OrderedDict
3 | from common.operation import Operation
4 | from workflowgen.baseaction import BaseAction
5 | from workflowgen.linkaction import LinkAction
6 | import pandasql
7 | import math
8 | import numpy as np
9 |
10 | class SelectionAction(BaseAction):
11 |
12 | def get_states(self):
13 |
14 | if len(LinkAction.LINKS) == 0:
15 | return
16 |
17 | rand_link = self.pick(list(LinkAction.LINKS))
18 | rand_link_src = rand_link[0]
19 | nodes_dict = self.vizgraph.get_nodes_dict()
20 | src_viz = nodes_dict["viz_" + str(rand_link_src)]
21 | computed_filter = src_viz.get_computed_filter()
22 | df = self.df
23 | sql_statement = "SELECT * FROM df "
24 | if len(computed_filter) > 0:
25 | sql_statement += "WHERE " + computed_filter
26 |
27 | df_result = pandasql.sqldf(sql_statement, locals())
28 |
29 | if df_result.empty:
30 | return None
31 |
32 | filter_per_dim = []
33 |
34 | for bin_dim in range(len(src_viz.binning)):
35 | filters = []
36 | dim = src_viz.binning[bin_dim]["dimension"]
37 | field = list(filter(lambda x: x["field"] == dim, self.sample_json["tables"]["fact"]["fields"]))[0]
38 | if field["type"] == "quantitative":
39 | bin_width = float(src_viz.binning[bin_dim]["width"])
40 | min_val = df_result[dim].min()
41 | max_val = df_result[dim].max()
42 |
43 | min_index = math.floor(min_val / bin_width)
44 | max_index = math.floor(max_val / bin_width)
45 | num_bins = 0
46 | if np.random.rand() < 0.4:
47 | num_bins = 1
48 | else:
49 | num_bins = random.randint(1, max_index-min_index) if max_index > min_index else 1
50 | selected_bins = np.random.choice(np.arange(min_index, max_index + 1), size=num_bins, replace=False)
51 |
52 | for selected_bin in selected_bins:
53 | range_min = selected_bin * bin_width
54 | range_max = (selected_bin + 1) * bin_width
55 | filt = "(%s >= %s and %s < %s)" % (dim, '{:.1f}'.format(range_min), dim, '{:.1f}'.format(range_max))
56 | filters.append(filt)
57 | else:
58 | all_bins = df_result[dim].unique().tolist()
59 | num_bins = random.randint(1, len(all_bins))
60 | selected_bins = np.random.choice(all_bins, size=num_bins, replace=False)
61 | for selected_bin in list(selected_bins):
62 | filt = "(%s = '%s')" % (dim, selected_bin)
63 | filters.append(filt)
64 | filter_per_dim.append(" or ".join(filters))
65 | filter_per_dim = ["(%s)" % f for f in filter_per_dim]
66 |
67 | return Operation(OrderedDict({"name": ("viz_%s" % rand_link_src), "selection": " and ".join(filter_per_dim)}))
68 |
--------------------------------------------------------------------------------
/workflowgen/vizaction.py:
--------------------------------------------------------------------------------
1 | import random
2 | from workflowgen.baseaction import BaseAction
3 | from common.operation import Operation
4 | from collections import OrderedDict
5 | import pandasql
6 | import pandas as pd
7 |
8 | class VizAction(BaseAction):
9 |
10 | def __init__(self, config, df, vizgraph, storage, sample_json):
11 | super().__init__(config, df, vizgraph, storage, sample_json)
12 |
13 | self.dim_to_type = {}
14 | for field in sample_json["tables"]["fact"]["fields"]:
15 | self.dim_to_type[field["field"]] = field["type"]
16 |
17 | def get_states(self):
18 | num_bins = self.pick(self.config["numBinDimensionsPerViz"]["values"], self.config["numBinDimensionsPerViz"]["pd"] )
19 | bins = []
20 | picks = []
21 |
22 | while len(bins) < num_bins:
23 | dimensions_p = [dim["p"] for dim in self.config["dimensions"]]
24 | dimensions_p = [p/sum(dimensions_p) for p in dimensions_p]
25 | dimension = self.pick(self.config["dimensions"], dimensions_p)
26 |
27 | if dimension in picks:
28 | continue
29 |
30 | picks.append(dimension)
31 |
32 | sql_statement = "SELECT * FROM df "
33 | df = self.df
34 | df = pandasql.sqldf(sql_statement, locals())
35 |
36 | d_bin = {"dimension": dimension["name"] }
37 | if self.dim_to_type[dimension["name"]] == "quantitative":
38 | dim_max_val = df[dimension["name"]].max()
39 | dim_min_val = df[dimension["name"]].min()
40 | #d_bin["width"] = round(random.uniform(0.025, 0.1) * (dim_max_val - dim_min_val))
41 | d_bin["width"] = round(random.uniform(0.025, 0.1) * (dim_max_val - dim_min_val))
42 | elif self.dim_to_type[dimension["name"]] == "categorical":
43 | try:
44 | pd.to_numeric(df[dimension["name"]])
45 | d_bin["width"] = 1
46 |                 except (ValueError, TypeError):  # non-numeric categorical: leave width unset
47 | pass
48 |
49 | bins.append(d_bin)
50 |
51 | per_bin_aggregate_type = self.pick(self.config["perBinAggregates"]["values"], self.config["perBinAggregates"]["pd"] )
52 | per_bin_aggregate = {"type": per_bin_aggregate_type}
53 | if per_bin_aggregate_type == "avg":
54 | avg_dimension = self.pick([d for d in self.sample_json["tables"]["fact"]["fields"] if (d["type"] == "quantitative")])
55 | per_bin_aggregate["dimension"] = avg_dimension["field"]
56 |
57 | VizAction.VIZ_COUNTER += 1
58 | self.viz_name = "viz_%s" % VizAction.VIZ_COUNTER
59 | self.binning = bins
60 | self.perBinAggregates = [per_bin_aggregate]
61 | return Operation(OrderedDict({"name": self.viz_name, "binning": self.binning, "perBinAggregates": self.perBinAggregates}))
62 |
63 | VizAction.VIZ_COUNTER = -1
--------------------------------------------------------------------------------
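Each VizAction thus emits an Operation whose binning list holds one entry per chosen dimension (with a width for quantitative dimensions, or width 1 for numeric categorical ones) and whose perBinAggregates holds a single aggregate, carrying a target dimension when the type is avg. A hypothetical example of the resulting payload; the dimension names and the bin width are invented for illustration:

# Hypothetical shape of an operation produced by VizAction.get_states(); only the
# keys ("name", "binning", "perBinAggregates") come from the code above, the
# dimension names and numbers are made up.
example_viz_operation = {
    "name": "viz_0",
    "binning": [{"dimension": "DISTANCE", "width": 120}],
    "perBinAggregates": [{"type": "avg", "dimension": "DEP_DELAY"}],
}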