├── .DS_Store ├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── common ├── __init__.py ├── operation.py ├── schema.py ├── util.py ├── viz.py ├── vizgraph.py └── vizrequest.py ├── data └── flights.zip ├── datagen.py ├── drivers ├── __init__.py ├── monetdb.py ├── sample.py └── xdb.py ├── idebench.py ├── logo.png ├── reports └── .gitignore ├── results └── .gitignore ├── runconfig_sample.json ├── workflowgen.py └── workflowgen ├── __init__.py ├── baseaction.py ├── bulkgen.py ├── filteraction.py ├── linkaction.py ├── selectionaction.py └── vizaction.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEBench/IDEBench-public/67339d9b81d0bcbb7b41ce6dc2e55918cf1c498f/.DS_Store -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.csv filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | **/__pycache__/* 7 | 8 | _benchmark 9 | node_modules/ 10 | _temp/* 11 | 12 | _creation 13 | _idea 14 | flights.csv 15 | flights.csv 16 | output.csv 17 | 18 | margin_errors 19 | 20 | # C extensions 21 | *.so 22 | 23 | # Distribution / packaging 24 | .Python 25 | build/ 26 | develop-eggs/ 27 | dist/ 28 | downloads/ 29 | eggs/ 30 | .eggs/ 31 | lib/ 32 | lib64/ 33 | parts/ 34 | sdist/ 35 | var/ 36 | wheels/ 37 | *.egg-info/ 38 | .installed.cfg 39 | *.egg 40 | 41 | # PyInstaller 42 | # Usually these files are written by a python script from a template 43 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
44 | *.manifest 45 | *.spec 46 | 47 | # Installer logs 48 | pip-log.txt 49 | pip-delete-this-directory.txt 50 | 51 | # Unit test / coverage reports 52 | htmlcov/ 53 | .tox/ 54 | .coverage 55 | .coverage.* 56 | .cache 57 | nosetests.xml 58 | coverage.xml 59 | *.cover 60 | .hypothesis/ 61 | 62 | # Translations 63 | *.mo 64 | *.pot 65 | 66 | # Django stuff: 67 | *.log 68 | local_settings.py 69 | 70 | # Flask stuff: 71 | instance/ 72 | .webassets-cache 73 | 74 | # Scrapy stuff: 75 | .scrapy 76 | 77 | # Sphinx documentation 78 | docs/_build/ 79 | 80 | # PyBuilder 81 | target/ 82 | 83 | # Jupyter Notebook 84 | .ipynb_checkpoints 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # celery beat schedule file 90 | celerybeat-schedule 91 | 92 | # SageMath parsed files 93 | *.sage.py 94 | 95 | # Environments 96 | .env 97 | .venv 98 | env/ 99 | venv/ 100 | ENV/ 101 | 102 | # Spyder project settings 103 | .spyderproject 104 | .spyproject 105 | 106 | # Rope project settings 107 | .ropeproject 108 | 109 | # mkdocs documentation 110 | /site 111 | 112 | # mypy 113 | .mypy_cache/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2018 Brown University 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

5 | For information on how to use IDEBench, check out our Wiki: http://github.com/IDEBench/IDEBench-public/wiki 6 |
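As a quick orientation (the Wiki remains the authoritative guide), the benchmark is driven from `idebench.py`. The sketch below shows two common invocations; the option names are taken from `idebench.py`, the dataset/size/workflow values from `runconfig_sample.json`, and it assumes the flights data, a generated `test` workflow, and (for the second command) a locally running MonetDB instance are set up as described in the Wiki:

```
# Sweep every combination of drivers, datasets, sizes, workflows, etc. declared in a run-config file
python idebench.py --run-config runconfig_sample.json

# Run a single workflow against one driver directly
python idebench.py --run --driver-name monetdb --settings-dataset flights --settings-size 500k --settings-workflow test
```

Raw results are written to `results/`, and a subsequent `--evaluate` pass writes per-operation report CSVs to `reports/`.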

7 | 8 | ## License 9 | 10 | MIT 11 | 12 | ## Cite 13 | Users of IDEBench are requested to use the following BibTeX reference: 14 | ``` 15 | @misc{1804.02593, 16 | Author = {Philipp Eichmann and Carsten Binnig and Tim Kraska and Emanuel Zgraggen}, 17 | Title = {IDEBench: A Benchmark for Interactive Data Exploration}, 18 | Year = {2018}, 19 | Eprint = {arXiv:1804.02593}, 20 | } 21 | ``` 22 | 23 | ## Publications 24 | 25 | Eichmann, Philipp, Carsten Binnig, Tim Kraska and Emanuel Zgraggen. "IDEBench: A Benchmark for Interactive Data Exploration". 26 | [PDF](https://arxiv.org/abs/1804.02593) 27 | 28 | Eichmann, Philipp, Emanuel Zgraggen, Zheguang Zhao, Carsten Binnig, and Tim Kraska. "Towards a Benchmark for Interactive Data Exploration." IEEE Data Eng. Bull. 39, no. 4 (2016): 50-61. 29 | [PDF](http://cs.brown.edu/~peichmann/downloads/bide_vision.pdf) 30 | -------------------------------------------------------------------------------- /common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEBench/IDEBench-public/67339d9b81d0bcbb7b41ce6dc2e55918cf1c498f/common/__init__.py -------------------------------------------------------------------------------- /common/operation.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | class Operation: 4 | 5 | def __init__(self, data): 6 | self.data = data 7 | 8 | def get_viz_name(self): 9 | if "name" in self.data: 10 | return self.data["name"] 11 | return None 12 | 13 | def get_source(self): 14 | return self.data["source"] 15 | 16 | def has_source(self): 17 | return "source" in self.data 18 | 19 | def has_selection(self): 20 | return "selection" in self.data and len(self.data["selection"]) > 0 21 | 22 | def get_selection(self): 23 | return self.data["selection"] 24 | 25 | def has_filter(self): 26 | return "filter" in self.data and len(self.data["filter"]) > 0 27 | 28 | def get_filter(self): 29 | return self.data["filter"] 30 | 31 | def get_source_vizs(self): 32 | sources = self.get_source().replace("(", "").replace(")", "").replace("and", "").replace("or", "").split(" ") 33 | return set([s for s in sources if not s == "" ]) 34 | 35 | -------------------------------------------------------------------------------- /common/schema.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | class Schema: 4 | 5 | def __init__(self, schema_json, is_normalized=False): 6 | self.is_normalized = is_normalized 7 | self.schema_json = schema_json 8 | 9 | def get_fact_table(self): 10 | return self.schema_json["tables"]["fact"] 11 | 12 | def get_fact_table_name(self): 13 | return self.schema_json["tables"]["fact"]["name"] 14 | 15 | def translate_field(self, field_name): 16 | 17 | if not self.is_normalized: 18 | return field_name, None, None 19 | 20 | 21 | for dim_tbl in self.schema_json["tables"]["dimension"]: 22 | for m_idx, mapping in enumerate(dim_tbl["mapping"]): 23 | for f_idx, field in enumerate(mapping["fromFields"]): 24 | if field == field_name: 25 | tbl_alias = "%s%s" % (dim_tbl["name"], m_idx) 26 | tbl_join = "%s.ID = %s.%s" % (tbl_alias, self.get_fact_table_name(), mapping["fk"]) 27 | tbl_as = "%s AS %s" % (dim_tbl["name"], tbl_alias) 28 | return ("%s.%s" % (tbl_alias, dim_tbl["columns"][f_idx])), tbl_as, tbl_join 29 | return field_name, self.get_fact_table_name(), None 30 | 31 | def get_tables_for(self, field_name): 32 | if not self.is_normalized: 33 | return "" 34 | 35 | for 
dim_tbl in self.schema_json["tables"]["dimension"]: 36 | for m_idx, mapping in enumerate(dim_tbl["mapping"]): 37 | for f_idx, field in enumerate(mapping["fromFields"]): 38 | if field == field_name: 39 | tbl_alias = "%s%s" % (dim_tbl["name"], m_idx) 40 | tbl_as = "%s AS %s" % (dim_tbl["name"], tbl_alias) 41 | if dim_tbl["name"] == "tbl_carriers": 42 | os._exit() 43 | return ("%s.%s" % (tbl_alias, dim_tbl["columns"][f_idx])), tbl_as 44 | 45 | -------------------------------------------------------------------------------- /common/util.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | def get_current_ms_time(): 4 | return int(round(time.time() * 1000)) -------------------------------------------------------------------------------- /common/viz.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | 4 | class Viz: 5 | 6 | @staticmethod 7 | def createFromDict(obj): 8 | viz = Viz() 9 | viz.name = "" if "name" not in obj else obj["name"] 10 | viz.source = "" if "source" not in obj else obj["source"] 11 | viz.selection = "" if "selection" not in obj else obj["selection"] 12 | viz.filter = "" if "filter" not in obj else obj["filter"] 13 | viz.computed_filter = "" 14 | viz.binning = [] if "binning" not in obj else obj["binning"] 15 | viz.per_bin_aggregates = [] if "perBinAggregates" not in obj else obj["perBinAggregates"] 16 | return viz 17 | 18 | def __init__(self): 19 | self.name = "" 20 | self.source = "" 21 | self.selection = "" 22 | self.filter = "" 23 | self.computed_filter = "" 24 | self.binning = [] 25 | self.per_bin_aggregates = [] 26 | 27 | def apply_interaction(self, operation): 28 | self.source = operation.get_source() if operation.has_source() else "" 29 | self.selection = operation.get_selection() if operation.has_selection() else "" 30 | self.filter = operation.get_filter() if operation.has_filter() else "" 31 | 32 | def has_filter(self): 33 | return len(self.filter) > 0 34 | 35 | def get_filter(self): 36 | return self.filter 37 | 38 | def get_computed_filter(self): 39 | return self.computed_filter 40 | 41 | def set_computed_filter(self, filter_str): 42 | self.computed_filter = filter_str 43 | 44 | def get_source(self): 45 | return self.source 46 | 47 | def has_source(self): 48 | return True if not self.source == "" else False 49 | 50 | def has_selection(self): 51 | return True if not self.selection == "" else False 52 | 53 | def get_selection(self): 54 | return self.selection 55 | 56 | def get_source_vizs(self): 57 | sources = self.get_source().replace("(", "").replace(")", "").replace("and", "").replace("or", "").split(" ") 58 | return set([s for s in sources if not s == "" ]) 59 | 60 | def get_computed_filter_as_sql(self, schema): 61 | bins = [] 62 | bin_str = "" 63 | tables = set() 64 | tables.add(schema.get_fact_table_name()) 65 | joins = set() 66 | for bin_desc in self.binning: 67 | dimension = bin_desc["dimension"] 68 | bins.append("bin_" + dimension) 69 | if "width" in bin_desc: 70 | bin_width = bin_desc["width"] 71 | bin_str += "FLOOR(%s/%s) AS bin_%s, " % (dimension, bin_width, dimension) 72 | else: 73 | dd, tblas, tjoins = schema.translate_field(dimension) 74 | joins.add(tjoins) 75 | tables.add(tblas) 76 | 77 | bin_str += "%s AS bin_%s, " % (dd, dimension) 78 | 79 | agg_str = "" 80 | for per_bin_aggregate_desc in self.per_bin_aggregates: 81 | 82 | if "dimension" in per_bin_aggregate_desc: 83 | aggregate_dimension = per_bin_aggregate_desc["dimension"] 84 
| if per_bin_aggregate_desc["type"] == "count": 85 | agg_str += "COUNT(*) as count, " 86 | elif per_bin_aggregate_desc["type"] == "avg": 87 | aggregate_dimension = per_bin_aggregate_desc["dimension"] 88 | aggregate_dimension2 = aggregate_dimension 89 | 90 | #if storage.normalized: 91 | # aggregate_dimension2 = storage.get_all_tables()[0]["name"] + "." + aggregate_dimension 92 | 93 | agg_str += "AVG(%s) as average_%s, " % (aggregate_dimension2, aggregate_dimension) 94 | 95 | agg_str = agg_str.rstrip(", ") 96 | bins_str = ", ".join(bins) 97 | 98 | sql_statement = "SELECT %s %s " % (bin_str, agg_str) 99 | if schema.is_normalized: 100 | 101 | computed_filter = self.get_computed_filter() 102 | fields = [f["field"] for f in schema.get_fact_table()["fields"]] 103 | 104 | for field in fields: 105 | if field not in computed_filter: 106 | continue 107 | 108 | translation, tblas, tjoins = schema.translate_field(field) 109 | joins.add(tjoins) 110 | tables.add(tblas) 111 | 112 | computed_filter = computed_filter.replace(field + " ", translation + " ") 113 | 114 | 115 | sql_statement += "FROM %s " % ", ".join(tables) 116 | joins = list(filter(None, joins)) 117 | if len(joins) > 0: 118 | sql_statement += "WHERE (%s) " % computed_filter + " AND " if computed_filter else "WHERE " 119 | sql_statement += "(" 120 | sql_statement += " AND ".join(joins) + ") " 121 | else: 122 | sql_statement += " " 123 | 124 | sql_statement += "GROUP BY %s" % bins_str 125 | else: 126 | sql_statement = "SELECT %s %s " % (bin_str, agg_str) 127 | sql_statement += "FROM %s " % schema.get_fact_table_name() 128 | sql_statement += "WHERE (%s) " % self.get_computed_filter() if self.get_computed_filter() else "" 129 | sql_statement += "GROUP BY %s" % bins_str 130 | 131 | return sql_statement 132 | 133 | 134 | def get_computed_filter_as_sql2(self, storage): 135 | bins = [] 136 | bin_str = "" 137 | for bin_desc in self.binning: 138 | dimension = bin_desc["dimension"] 139 | 140 | if "width" in bin_desc: 141 | bin_width = bin_desc["width"] 142 | bin_str += "FLOOR(%s/%s), " % (dimension, bin_width) 143 | bins.append("FLOOR(%s/%s)" % (dimension, bin_width)) 144 | else: 145 | dd = dimension 146 | bin_str += "%s, " % (dd) 147 | bins.append("%s" % (dd)) 148 | 149 | agg_str = "" 150 | for per_bin_aggregate_desc in self.per_bin_aggregates: 151 | 152 | if "dimension" in per_bin_aggregate_desc: 153 | aggregate_dimension = per_bin_aggregate_desc["dimension"] 154 | if per_bin_aggregate_desc["type"] == "count": 155 | agg_str += "COUNT(*) as count_, relative_error(count_)," 156 | elif per_bin_aggregate_desc["type"] == "avg": 157 | aggregate_dimension = per_bin_aggregate_desc["dimension"] 158 | agg_str += "AVG(%s) as average_%s, relative_error(average_%s)" % (aggregate_dimension, aggregate_dimension, aggregate_dimension) 159 | 160 | agg_str = agg_str.rstrip(", ") 161 | bins_str = ", ".join(bins) 162 | sql_statement = "SELECT %s %s " % (bin_str, agg_str) 163 | sql_statement += "FROM %s " % storage.get_fact_table_name() 164 | sql_statement += "WHERE (%s) " % self.get_computed_filter() if self.get_computed_filter() else "" 165 | sql_statement += "GROUP BY %s" % bins_str 166 | return sql_statement + " WITH ERROR 0.1 BEHAVIOR 'do_nothing'" -------------------------------------------------------------------------------- /common/vizgraph.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from collections import deque 3 | from collections import OrderedDict 4 | from common.viz import 
Viz 5 | 6 | 7 | class VizGraph(object): 8 | 9 | def __init__(self): 10 | self._graph = OrderedDict() 11 | self.nodes = set() 12 | 13 | def apply_interaction(self, operation): 14 | # initialize a set of vizs that are affected by this operation 15 | vizs_to_request = OrderedDict() 16 | 17 | if operation.get_viz_name() not in self.get_nodes_dict(): 18 | viz = Viz.createFromDict(operation.data) 19 | self.nodes.add(viz) 20 | vizs_to_request[viz] = True 21 | 22 | viz_dict = self.get_nodes_dict() 23 | current_viz = viz_dict[operation.get_viz_name()] 24 | 25 | if operation.has_filter(): 26 | current_viz.filter = operation.get_filter() 27 | #vizs_to_request.add(current_viz) 28 | vizs_to_request[current_viz] = True 29 | 30 | # parse source attribute 31 | if operation.has_source(): 32 | source = operation.get_source() 33 | if len(source) > 0: 34 | # find all current sources add check which ones have been added/removed 35 | old_sources = current_viz.get_source_vizs() 36 | new_sources = operation.get_source_vizs() 37 | sources_removed = old_sources - new_sources 38 | sources_remained = new_sources.intersection(old_sources) 39 | sources_added = new_sources - sources_remained - sources_removed 40 | 41 | for src in sources_removed: 42 | self.remove_connection(viz_dict[src], current_viz) 43 | #if "remove_link" in self.interface.remove_link: 44 | # self.interface.remove_link(current_viz, viz_dict[src]) 45 | 46 | for src in sources_added: 47 | self.add_connection(viz_dict[src], current_viz) 48 | #add_link_method = getattr(self.interface, "add_link", None) 49 | #if callable(add_link_method): 50 | # self.interface.add_link(current_viz, viz_dict[src]) 51 | # return 52 | 53 | # update the source of the current viz 54 | current_viz.source = operation.get_source() 55 | #vizs_to_request.add(current_viz) 56 | vizs_to_request[current_viz] = True 57 | #self.process_next_interaction() 58 | return vizs_to_request 59 | 60 | # parse selection 61 | if operation.has_selection(): 62 | current_viz.selection = operation.get_selection() 63 | 64 | # find other vizs affected by this selection 65 | vizs_to_request.update(self.update_affected_vizs(current_viz, viz_dict)) 66 | 67 | current_viz.set_computed_filter(self.compute_filter(current_viz, viz_dict)) 68 | 69 | 70 | # set the parent id of each viz request 71 | #for viz_to_request in vizs_to_request.keys(): 72 | # self.parent_operations[viz_to_request] = index 73 | 74 | return vizs_to_request.keys() 75 | #self.viz_requests = [] 76 | #for viz in vizs_to_request.keys(): 77 | # self.operation_count += 1 78 | # self.viz_requests.append(VizRequest(self.operation_count, index, viz)) 79 | # 80 | # self.interface.request_vizs(self.viz_requests) 81 | 82 | def update_affected_vizs(self, current_viz, viz_dict): 83 | dependent_vizs = self.find_dependencies_top_down(current_viz) 84 | vizs_to_request = OrderedDict() 85 | for viz in dependent_vizs: 86 | computed_filter = self.compute_filter(viz, viz_dict) 87 | viz.set_computed_filter(computed_filter) 88 | vizs_to_request[viz] = True 89 | 90 | return vizs_to_request 91 | 92 | def compute_filter(self, viz, viz_dict): 93 | 94 | def compute_filter_inner(start, selections, filter_strs, source_strs): 95 | 96 | if start.has_filter(): 97 | filter_strs.append(start.filter) 98 | 99 | if start.selection and not start.selection == "": 100 | selections[start.name] = start.selection 101 | 102 | if not start.source or start.source == "": 103 | return 104 | 105 | 106 | source_strs.extend(start.get_source_vizs()) 107 | sources = start.get_source_vizs() 108 | 
109 | for src in sources: 110 | compute_filter_inner(viz_dict[src], selections, filter_strs, source_strs) 111 | 112 | source_strs_list = [] 113 | selections = {} 114 | filters = [] 115 | compute_filter_inner(viz, selections, filters, source_strs_list) 116 | 117 | source_strs = " and ".join(source_strs_list) 118 | 119 | for src in source_strs_list: 120 | if src in selections: 121 | source_strs = source_strs.replace(src, selections[src]) 122 | else: 123 | source_strs = source_strs.replace(src, "NULL") 124 | source_strs = source_strs.replace("and NULL", "").replace("NULL and", "").replace("or NULL", "").replace("NULL or", "").replace("NULL", "").strip().lstrip("and ") 125 | if len(source_strs) > 0 and len(filters) > 0: 126 | source_strs += " AND " 127 | return source_strs + " AND ".join(filters) 128 | 129 | def get_nodes(self): 130 | return self.nodes 131 | 132 | def get_nodes_dict(self): 133 | d = {} 134 | for node in self.get_nodes(): 135 | d[node.name] = node 136 | return d 137 | 138 | def remove_connection(self, node1, node2): 139 | self._graph[node1].remove(node2) 140 | 141 | def add_connection(self, node1, node2): 142 | """ Add connection between node1 and node2 """ 143 | self.nodes.add(node1) 144 | self.nodes.add(node2) 145 | 146 | if node1 not in self._graph: 147 | self._graph[node1] = OrderedDict() 148 | self._graph[node1][node2] = True 149 | 150 | 151 | def remove(self, node): 152 | """ Remove all references to node """ 153 | self.nodes.remove(node) 154 | for cxns in self._graph.keys(): 155 | try: 156 | del cxns[node] 157 | except KeyError: 158 | pass 159 | try: 160 | del self._graph[node] 161 | except KeyError: 162 | pass 163 | 164 | def find_dependencies_top_down(self, start): 165 | 166 | if not start in self._graph.keys(): 167 | return [] 168 | 169 | queue = deque() 170 | queue.append(start) 171 | result = [] 172 | while queue: 173 | node = queue.popleft() 174 | result.append(node) 175 | if node in self._graph: 176 | for n in self._graph[node].keys(): 177 | queue.append(n) 178 | return result[1:] 179 | -------------------------------------------------------------------------------- /common/vizrequest.py: -------------------------------------------------------------------------------- 1 | import json 2 | from common import util 3 | 4 | class VizRequest: 5 | 6 | def __init__(self, operation_id, parent_operation_id, viz): 7 | self.operation_id = operation_id 8 | self.parent_operation_id = parent_operation_id 9 | self.viz = viz 10 | self.start_time = util.get_current_ms_time() 11 | self.end_time = None 12 | self.result = None 13 | self.margins = None 14 | self.delivered = False 15 | self.bins = None 16 | self.timedout = False 17 | self.t_start = 0 18 | self.t_pause = 0 19 | self.progress = 0 20 | 21 | def toJSON(self): 22 | return json.dumps(self, default=lambda o: o.__dict__, sort_keys=True, indent=4) -------------------------------------------------------------------------------- /data/flights.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEBench/IDEBench-public/67339d9b81d0bcbb7b41ce6dc2e55918cf1c498f/data/flights.zip -------------------------------------------------------------------------------- /datagen.py: -------------------------------------------------------------------------------- 1 | import json 2 | import math 3 | import random 4 | import itertools 5 | import numpy as np 6 | import pandas as pd 7 | import scipy as sp 8 | import collections 9 | import threading 10 | from scipy.linalg import eigh, 
cholesky 11 | from scipy.stats import norm 12 | import scipy.interpolate as interpolate 13 | from optparse import OptionParser 14 | import time 15 | import os 16 | from collections import OrderedDict 17 | 18 | current_milli_time = lambda: int(round(time.time() * 1000)) 19 | 20 | class DataGen: 21 | 22 | def __init__(self): 23 | 24 | parser = OptionParser() 25 | parser.add_option("-r", "--random-seed", dest="seed", action="store", type=int, help="Random seed", default=41001) 26 | parser.add_option("--normalize", dest="normalize", action="store", help="Size of the dataset in MB") 27 | parser.add_option("--prevent-zero", dest="prevent_zero", action="store_true", help="Size of the dataset in MB", default=True) 28 | parser.add_option("-s", "--size", dest="size", action="store", type=int, help="Number of samples to generate", default=50000) 29 | parser.add_option("-b", "--batchsize", dest="batchsize", action="store", type=int, help="Number of samples to process in a single batch", default=100000) 30 | parser.add_option("-x", "--sample-file", dest="samplefile", action="store", help="Path to the sample file", default="data/flights/sample.csv") 31 | parser.add_option("-y", "--sample-descriptor", dest="samplejsonfile", action="store", help="Path to the sample file", default="data/flights/sample.json") 32 | parser.add_option("-n","--num-samples", dest="numsamples", action="store", type=int, help="Number of samples to draw from the original dataset", default=5000000) 33 | parser.add_option("-o", "--output-file", dest="output", action="store", help="The name of the output file", default="dataset.csv") 34 | #parser.add_option("--output-normalized-file", dest="output_normalized", action="store", help="The name of the output file", default="datagen/output_normalized.csv") 35 | (options, args) = parser.parse_args() 36 | 37 | self.options = options 38 | 39 | # read sample json 40 | self.sample_json = None 41 | with open(self.options.samplejsonfile, "r") as f: 42 | self.sample_json = json.load(f) 43 | 44 | if self.options.normalize: 45 | self.normalize() 46 | else: 47 | self.generate_data() 48 | 49 | def normalize(self): 50 | 51 | if "tables" not in self.sample_json: 52 | raise Exception("no tables defined in sample json") 53 | 54 | tables = self.sample_json["tables"]["dimension"] 55 | table_dfs = {} 56 | for tbl in tables: 57 | table_dfs[tbl["name"]] = pd.DataFrame(columns=tbl["columns"]) 58 | 59 | for chunk_id, chunk in enumerate(pd.read_csv(self.options.normalize, chunksize=100000, header=0)): 60 | all_from_fields = [] 61 | for tbl in tables: 62 | 63 | table_df = table_dfs[tbl["name"]] 64 | for mapping in tbl["mapping"]: 65 | xx = chunk[mapping["fromFields"]] 66 | xx.columns = tbl["columns"] 67 | table_df = table_df.append(xx) 68 | table_df = table_df.drop_duplicates(subset=tbl["columns"]) 69 | 70 | 71 | table_df = table_df.reset_index(drop=True) 72 | table_df.index.name = "ID" 73 | table_dfs[tbl["name"]] = table_df 74 | if "tmp_ID" in table_df.columns: 75 | del table_df["tmp_ID"] 76 | table_df.to_csv(tbl["name"], index=True, mode="w") 77 | 78 | 79 | table_df["tmp_ID"] = table_df.index 80 | count = 0 81 | for mapping in tbl["mapping"]: 82 | all_from_fields.extend(mapping["fromFields"]) 83 | boo = len(chunk) 84 | beforechunk = chunk 85 | chunk = pd.merge(chunk, table_df, how="left", left_on=mapping["fromFields"], right_on=tbl["columns"]) 86 | 87 | 88 | chunk = chunk.rename(columns={'tmp_ID': mapping["fk"]}) 89 | 90 | for c in tbl["columns"]: 91 | del chunk[c] 92 | 93 | 94 | for c in all_from_fields: 95 | 
del chunk[c] 96 | 97 | if chunk_id == 0: 98 | chunk.to_csv(self.options.output, index=False, mode="w") 99 | else: 100 | chunk.to_csv(self.options.output, index=False, header=False, mode="a") 101 | 102 | 103 | def generate_data(self): 104 | # load sample data 105 | self.df = pd.read_csv(self.options.samplefile, nrows=self.options.numsamples, header=0) 106 | 107 | if self.options.prevent_zero: 108 | self.quant_col_names = [ col["field"] for col in self.sample_json["tables"]["fact"]["fields"] if col["type"] == "quantitative" ] 109 | for quant_col_name in self.quant_col_names: 110 | self.df[quant_col_name] = self.df[quant_col_name] - self.df[quant_col_name].min() 111 | 112 | self.cat_col_names = [ col["field"] for col in self.sample_json["tables"]["fact"]["fields"] if col["type"] == "categorical" ] 113 | for cat_col_name in self.cat_col_names: 114 | self.df[cat_col_name] = self.df[cat_col_name].astype("category") 115 | 116 | self.derived_cols = [ col for col in self.sample_json["tables"]["fact"]["fields"] if "deriveFrom" in col ] 117 | self.derivates = {} 118 | for derived_col in self.derived_cols: 119 | kk = self.df.groupby(derived_col["deriveFrom"])[derived_col["field"]].first().to_dict() 120 | self.derivates[derived_col["field"]] = kk 121 | 122 | 123 | self.orgdf = self.df.copy() 124 | self.cat_cols = list(self.orgdf.select_dtypes(include=["category"]).columns) 125 | self.cat_hists = {} 126 | self.cat_hists_keys = {} 127 | self.cat_hists_values = {} 128 | 129 | for cat_col in self.cat_cols: 130 | self.cat_hists[cat_col] = self.df[cat_col].value_counts(normalize=True).to_dict() 131 | self.cat_hists[cat_col] = OrderedDict(sorted(self.cat_hists[cat_col].items(), key=lambda x:x[0])) 132 | self.cat_hists_keys[cat_col] = list(self.cat_hists[cat_col].keys()) 133 | self.cat_hists_values[cat_col] = list(self.cat_hists[cat_col].values()) 134 | del self.df[cat_col] 135 | 136 | self.means = self.df.mean() 137 | self.stdevs = self.df.std() 138 | np.set_printoptions(suppress=True) 139 | 140 | # z-normalize all data 141 | for idx, col in enumerate(self.df.columns): 142 | self.df[col] = (self.df[col] - self.means[col])/self.stdevs[col] 143 | 144 | self.inv_cdfs = self.get_inverse_cdfs(self.orgdf, self.df) 145 | 146 | # apply a gaussian copula 147 | covariance = self.df.cov() 148 | self.decomposition = cholesky(covariance, lower=True) 149 | 150 | # calculate how many batches we need to compute 151 | num_batches = int(math.ceil(self.options.size / self.options.batchsize)) 152 | 153 | st = current_milli_time() 154 | # process all batches 155 | for batch_i in range(num_batches): 156 | print(" %i/%i batches processed." % (batch_i, num_batches)) 157 | self.process_batch(batch_i) 158 | 159 | print("done.") 160 | print( (current_milli_time()-st) ) 161 | 162 | def process_batch(self, batch_number): 163 | 164 | # Calculate how many samples we need to generate for this batch 165 | if (batch_number+1) * self.options.batchsize > self.options.size: 166 | num_samples_to_generate = self.options.size - (batch_number) * self.options.batchsize 167 | else: 168 | num_samples_to_generate = self.options.batchsize 169 | 170 | # adjust the random seed based on the batch number 171 | np.random.seed(seed=self.options.seed + batch_number) 172 | 173 | data_rnormal = sp.stats.norm.rvs(size=(len(self.df.columns), num_samples_to_generate)) 174 | data_rnormal_correlated = np.dot(self.decomposition, data_rnormal) 175 | 176 | # convert each value to the its corresponding value of the normal CDF. 
We could find the CDF empirically, 177 | # but since our samples follow a normal distribution we know their exact CDF. 178 | stdnormcdf = sp.stats.norm.cdf(data_rnormal_correlated) 179 | 180 | global cc 181 | cc = 0 182 | def apply_inverse(xx): 183 | global cc 184 | res = self.inv_cdfs[cc](xx) 185 | cc += 1 186 | return res 187 | 188 | # apply the inverse (empirical) CDF to the correlated random data 189 | res = np.apply_along_axis(apply_inverse, 1, stdnormcdf) 190 | 191 | # de-normalize 192 | for i, col in enumerate(self.df.columns): 193 | res[i,:] = res[i,:] * self.stdevs[col] + self.means[col] 194 | 195 | # reconstruct categorical values 196 | result_frame = pd.DataFrame(res.T) 197 | result_frame.columns = self.df.columns 198 | 199 | for cat_col in self.cat_cols: 200 | ix = self.orgdf.columns.get_loc(cat_col) 201 | keys = self.cat_hists_keys[cat_col] 202 | values = self.cat_hists_values[cat_col] 203 | xx = np.random.choice(keys, num_samples_to_generate, p=values) 204 | result_frame.insert(self.orgdf.columns.get_loc(cat_col), cat_col, xx) 205 | 206 | for quant_col in self.quant_col_names: 207 | result_frame[quant_col] = result_frame[quant_col].round(decimals=1) 208 | 209 | cast_cols = [ col for col in self.sample_json["tables"]["fact"]["fields"] if "cast" in col ] 210 | for cast_col in cast_cols: 211 | if cast_col["cast"] == "int": 212 | result_frame[cast_col["field"]] = result_frame[cast_col["field"]].astype(int) 213 | else: 214 | raise Exception("unsupported cast") 215 | 216 | for derived_col in self.derived_cols: 217 | new_col = result_frame[derived_col["deriveFrom"]].map(self.derivates[derived_col["field"]]) 218 | result_frame[derived_col["field"]] = new_col 219 | 220 | 221 | 222 | if batch_number == 0: 223 | result_frame.to_csv(self.options.output, index=False, mode="w") 224 | else: 225 | result_frame.to_csv(self.options.output, index=False, header=False, mode="a") 226 | 227 | def get_inverse_cdfs(self, orgdf, df): 228 | cdfs = [] 229 | for col in df.columns: 230 | num_bins = 1000 231 | hist, bin_edges = np.histogram(df[col], bins=num_bins, density=True) 232 | cdf = np.zeros(bin_edges.shape) 233 | cdf[1:] = np.cumsum(hist*np.diff(bin_edges)) 234 | inv_cdf = interpolate.interp1d(cdf, bin_edges) 235 | cdfs.append(inv_cdf) 236 | return cdfs 237 | 238 | DataGen() -------------------------------------------------------------------------------- /drivers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEBench/IDEBench-public/67339d9b81d0bcbb7b41ce6dc2e55918cf1c498f/drivers/__init__.py -------------------------------------------------------------------------------- /drivers/monetdb.py: -------------------------------------------------------------------------------- 1 | import pymonetdb 2 | import datetime, time 3 | import itertools 4 | import csv 5 | import json 6 | import os 7 | import multiprocessing 8 | from subprocess import call 9 | from common import util 10 | 11 | class IDEBenchDriver: 12 | 13 | def init(self, options, schema, driver_arg): 14 | pass 15 | 16 | def create_connection(self): 17 | connection = pymonetdb.connect(username="monetdb", password="monetdb", hostname="localhost", port=50000, database="demo") 18 | cursor = connection.cursor() 19 | return connection, cursor 20 | 21 | def process_request(self, viz_request, options, schema, result_queue): 22 | print("processsing..." 
+ str(viz_request.operation_id)) 23 | viz = viz_request.viz 24 | sql_statement = viz.get_computed_filter_as_sql(schema) 25 | connection, cursor = self.create_connection() 26 | viz_request.start_time = util.get_current_ms_time() 27 | cursor.execute(sql_statement) 28 | data = cursor.fetchall() 29 | viz_request.end_time = util.get_current_ms_time() 30 | connection.close() 31 | 32 | results = {} 33 | for row in data: 34 | keys = [] 35 | for i, bin_desc in enumerate(viz.binning): 36 | 37 | if "width" in bin_desc: 38 | bin_width = bin_desc["width"] 39 | keys.append(str(int(row[i]))) 40 | else: 41 | keys.append(str(row[i])) 42 | 43 | key = ",".join(keys) 44 | results[key] = row[len(viz.binning):] 45 | 46 | viz_request.result = results 47 | result_queue.put(viz_request) 48 | 49 | def workflow_start(self): 50 | # clear cache here 51 | pass 52 | 53 | def workflow_end(self): 54 | pass 55 | -------------------------------------------------------------------------------- /drivers/sample.py: -------------------------------------------------------------------------------- 1 | import time 2 | from common import util 3 | 4 | class IDEBenchDriver: 5 | 6 | def init(self, options, schema, driver_arg): 7 | print("init") 8 | print("table name: %s" %schema.get_fact_table_name()) 9 | print("driver arg0: %s" % driver_arg[0]) 10 | print("driver arg1: %s" % driver_arg[1]) 11 | 12 | def workflow_start(self): 13 | print("workflow start") 14 | pass 15 | 16 | def workflow_end(self): 17 | print("workflow end") 18 | pass 19 | 20 | def process_request(self, viz_request, options, schema, result_queue): 21 | print("processsing...") 22 | 23 | # record start time 24 | viz_request.start_time = util.get_current_ms_time() 25 | 26 | # print SQL translation of request and simulate query execution 27 | print(viz_request.viz.get_computed_filter_as_sql(schema)) 28 | time.sleep(1) 29 | 30 | # record end time 31 | viz_request.end_time = util.get_current_ms_time() 32 | 33 | # write an empty result to the viz_request 34 | viz_request.result = {} 35 | 36 | # notify IDEBench that processing is done by writing it to the result buffer 37 | result_queue.put(viz_request) -------------------------------------------------------------------------------- /drivers/xdb.py: -------------------------------------------------------------------------------- 1 | import datetime, time 2 | import itertools 3 | import psycopg2 4 | import decimal 5 | import os 6 | import multiprocessing 7 | from multiprocessing import Queue 8 | from common import util 9 | 10 | class IDEBenchDriver: 11 | 12 | def init(self, options, schema, driver_arg): 13 | pass 14 | 15 | def workflow_start(self): 16 | print("workflow start") 17 | pass 18 | 19 | def workflow_end(self): 20 | os.system("/usr/local/pgsql/bin/pg_ctl stop -D ~/xdb_data") 21 | os.system('sudo -b bash -c "echo 1 > /proc/sys/vm/drop_caches"') 22 | os.system("/usr/local/pgsql/bin/pg_ctl start -D ~/xdb_data") 23 | pass 24 | 25 | def can_execute_online(self, sql_statement): 26 | return (not " or " in sql_statement.lower()) and (not " AVG(" in sql_statement) 27 | 28 | def create_connection(self, timeout=None): 29 | ip = "localhost" 30 | if timeout is None: 31 | connection = psycopg2.connect("dbname='idebench' port=45001 user='test' host='%s' password='test'" % (ip)) 32 | else: 33 | connection = psycopg2.connect("dbname='idebench' port=45001 user='test' host='%s' password='test' options='-c statement_timeout=%i'" % (ip, timeout)) 34 | cursor = connection.cursor() 35 | return connection, cursor 36 | 37 | def 
process_request(self, viz_request, options, schema, out_q): 38 | print("processsing..." + str(viz_request.operation_id)) 39 | if viz_request.viz.binning: 40 | sql_statement = viz_request.viz.get_computed_filter_as_sql(schema) 41 | sql_statement = sql_statement.replace(schema.get_fact_table_name(), "%s_%s%s" % (schema.get_fact_table_name(), options.settings_size, "n" if options.settings_normalized else "") ) 42 | if self.can_execute_online(sql_statement): 43 | sql_statement = sql_statement.replace("SELECT ", "SELECT ONLINE ") 44 | sql_statement += " WITHTIME %s CONFIDENCE 95" % options.settings_time_requirement 45 | sql_statement += " REPORTINTERVAL %s;" % options.settings_time_requirement 46 | connection, cursor = self.create_connection(options.settings_time_requirement + 20) 47 | else: 48 | connection, cursor = self.create_connection(options.settings_time_requirement) 49 | viz_request.start_time = util.get_current_ms_time() 50 | try: 51 | cursor.execute(sql_statement) 52 | except psycopg2.extensions.QueryCanceledError as qce: 53 | viz_request.result = {} 54 | viz_request.margins = {} 55 | viz_request.timedout = True 56 | viz_request.end_time = util.get_current_ms_time() 57 | out_q.put(viz_request) 58 | return 59 | 60 | data = cursor.fetchall() 61 | viz_request.end_time = util.get_current_ms_time() 62 | connection.close() 63 | 64 | results = {} 65 | margins = {} 66 | for row in data: 67 | keys = [] 68 | 69 | if row[0] is None: 70 | continue 71 | 72 | startindex = 3 if self.can_execute_online(sql_statement) else 0 73 | 74 | for i, bin_desc in enumerate(viz_request.viz.binning): 75 | if "width" in bin_desc: 76 | bin_width = bin_desc["width"] 77 | keys.append(str(int(row[i+startindex]))) 78 | else: 79 | keys.append(str(row[startindex+i]).strip()) 80 | 81 | key = ",".join(keys) 82 | 83 | row = list(row) 84 | for i,r in enumerate(row): 85 | if isinstance(r, decimal.Decimal): 86 | row[i] = float(r) 87 | if startindex == 3: 88 | results[key] = row[len(viz_request.viz.binning)+startindex:-1] 89 | else: 90 | results[key] = row[len(viz_request.viz.binning)+startindex:] 91 | 92 | if self.can_execute_online(sql_statement) and startindex == 3: 93 | margins[key] = row[len(row)-1:] 94 | 95 | viz_request.result = results 96 | viz_request.margins = margins 97 | out_q.put(viz_request) 98 | print("delivering...") 99 | -------------------------------------------------------------------------------- /idebench.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import json 3 | import csv 4 | import time 5 | import hashlib 6 | import multiprocessing 7 | import statistics 8 | import numpy as np 9 | import os 10 | from common.schema import Schema 11 | from common.vizgraph import VizGraph 12 | from common.vizrequest import VizRequest 13 | from common.operation import Operation 14 | from optparse import OptionParser 15 | from scipy import spatial 16 | import glob 17 | from os.path import basename 18 | 19 | class IDEBench: 20 | 21 | result_queue = multiprocessing.Queue() 22 | def __init__(self): 23 | 24 | parser = OptionParser() 25 | parser.add_option("--driver-name", dest="driver_name", action="store", help="Driver name") 26 | parser.add_option("--driver-create-storage", dest="driver_create_storage", action="store_true", help="Calls create_storage on driver", default=False) 27 | parser.add_option("--driver-clear-storage", dest="driver_clear_storage", action="store_true", help="Calls clear_storage on driver", default=False) 28 | 
parser.add_option("--driver-clear-cache", dest="driver_clear_cache", action="store_true", help="Calls clear_cache on driver", default=False) 29 | parser.add_option("--driver-args", dest="driver_args", action="store", help="Arguments to pass to the driver", default="") 30 | 31 | parser.add_option("--settings-normalized", dest="settings_normalized", action="store_true", help="Whether joins should be used", default=False) 32 | parser.add_option("--settings-dataset", dest="settings_dataset", action="store", help="Name of the dataset") 33 | parser.add_option("--settings-size", dest="settings_size", default="", action="store", help="Number of rows in the dataset") 34 | parser.add_option("--settings-thinktime", dest="settings_thinktime", type="int", action="store", help="Think-time in seconds between two executions", default=1000) 35 | parser.add_option("--settings-time-requirement", dest="settings_time_requirement", action="store", help="The Time requirement to be used", default=1000) 36 | parser.add_option("--settings-confidence-level", dest="settings_confidence_level", action="store", help="The confidence level to be used", default=95) 37 | parser.add_option("--settings-workflow", dest="settings_workflow", action="store", help="The workflow file to be used") 38 | 39 | parser.add_option("--evaluate", dest="evaluate", action="store_true", help="Size of the dataset in MB", default=False) 40 | parser.add_option("--create--full-report", dest="create_report", action="store_true", help="Merges all reports in the reports directory into a single file", default=False) 41 | parser.add_option("--run", dest="run", action="store_true", help="Flag to run the benchmark without config file", default=False) 42 | parser.add_option("--run-config", dest="config", action="store", help="Flag to run the benchmark with the specified config file") 43 | parser.add_option("--groundtruth", dest="groundtruth", action="store_true", help="If set computes the ground-truth for the specified workflow", default=False) 44 | 45 | (self.options, args) = parser.parse_args() 46 | 47 | if not self.options.config: 48 | 49 | if self.options.create_report: 50 | self.create_report() 51 | return 52 | 53 | if not self.options.driver_name: 54 | parser.error("No driver name specified.") 55 | 56 | if not self.options.settings_dataset: 57 | parser.error("No dataset specified.") 58 | 59 | if not self.options.settings_size: 60 | print("Warning: No dataset size specified.") 61 | 62 | if self.options.groundtruth or self.options.run: 63 | self.setup() 64 | 65 | if self.options.groundtruth: 66 | 67 | self.options.think_time = 1 68 | self.options.time_requirement = 999999 69 | 70 | workflow_files = glob.glob("data/" + self.options.settings_dataset + "/workflows/*.json") 71 | 72 | for workflow_file in workflow_files: 73 | self.options.settings_workflow = basename(workflow_file).split(".")[0] 74 | self.run() 75 | 76 | elif self.options.run: 77 | 78 | if not self.options.settings_workflow: 79 | parser.error("No workflow specified.") 80 | 81 | self.run() 82 | elif self.options.evaluate: 83 | self.evaluate(self.get_config_hash()) 84 | else: 85 | 86 | with open(self.options.config) as f: 87 | config = json.load(f) 88 | assure_path_exists("./results") 89 | for d in config["settings-datasets"]: 90 | assure_path_exists("./data/%s/groundtruths" % d) 91 | 92 | # TODO: create pairs instead 93 | for dataset in config["settings-datasets"]: 94 | self.options.settings_dataset = dataset 95 | 96 | for driver_name in config["driver-names"]: 97 | for driver_arg in 
config["driver-args"]: 98 | 99 | self.options.driver_name = driver_name 100 | self.setup(driver_arg) 101 | 102 | for size in config["settings-sizes"]: 103 | for workflow in config["settings-workflows"]: 104 | for thinktime in config["settings-thinktimes"]: 105 | for time_requirement in config["settings-time-requirements"]: 106 | for confidence_level in config["settings-confidence-levels"]: 107 | 108 | self.options.driver_name = driver_name 109 | 110 | self.options.settings_size = size 111 | self.options.settings_workflow = workflow 112 | self.options.settings_thinktime = thinktime 113 | self.options.settings_time_requirement = time_requirement 114 | self.options.settings_confidence_level = confidence_level 115 | self.options.settings_normalized = config["settings-normalized"] 116 | self.options.groundtruth = config["groundtruth"] if "groundtruth" in config else False 117 | self.options.run = config["run"] if "run" in config else True 118 | self.options.evaluate = config["evaluate"] if "evaluate" in config else True 119 | 120 | if self.options.run: 121 | self.run() 122 | 123 | if self.options.evaluate: 124 | self.evaluate(self.get_config_hash()) 125 | 126 | def setup(self, driver_arg = None): 127 | with open(self.get_schema_path()) as f: 128 | self.schema = Schema(json.load(f), self.options.settings_normalized) 129 | 130 | module = importlib.import_module("drivers." + self.options.driver_name) 131 | self.driver = getattr(module, "IDEBenchDriver")() 132 | 133 | try: 134 | self.driver.init(self.options, self.schema, driver_arg) 135 | except AttributeError: 136 | pass 137 | 138 | def run(self): 139 | 140 | try: 141 | self.driver.workflow_start() 142 | except AttributeError: 143 | pass 144 | 145 | 146 | with open(self.get_workflow_path()) as f: 147 | self.workflow_interactions = json.load(f)["interactions"] 148 | 149 | self.vizgraph = VizGraph() 150 | self.operation_results = { "args": vars(self.options), "results": {} } 151 | self.current_interaction_index = 0 152 | self.current_vizrequest_index = 0 153 | self.process_interaction(0) 154 | 155 | def end_run(self): 156 | 157 | try: 158 | self.driver.workflow_end() 159 | except AttributeError: 160 | pass 161 | 162 | path = "results/%s.json" % (self.get_config_hash()) 163 | 164 | if not self.options.groundtruth: 165 | with open(path, "w") as fp: 166 | json.dump(self.operation_results, fp) 167 | 168 | if self.options.groundtruth: 169 | path = "data/%s/groundtruths/%s_%s.json" % (self.options.settings_dataset, self.options.settings_size, self.options.settings_workflow) 170 | with open(path, "w") as fp: 171 | json.dump(self.operation_results, fp) 172 | 173 | def process_interaction(self, interaction_index): 174 | print("processing!") 175 | if interaction_index < 0 or interaction_index >= len(self.workflow_interactions): 176 | print("reached end of interactions") 177 | self.end_run() 178 | return 179 | 180 | print("thinking...") 181 | time.sleep(self.options.settings_thinktime / 1000) 182 | 183 | interaction = self.workflow_interactions[interaction_index] 184 | vizs_to_request = self.vizgraph.apply_interaction(Operation(interaction)) 185 | 186 | viz_requests = [] 187 | for viz in vizs_to_request: 188 | viz_requests.append(VizRequest(self.current_vizrequest_index, self.current_interaction_index, viz)) 189 | self.current_vizrequest_index += 1 190 | 191 | #if interaction_index == 0: 192 | # self.result_queue = multiprocessing.Queue() 193 | 194 | # TODO: document this feature 195 | try: 196 | self.driver.before_requests(self.options, self.schema, 
IDEBench.result_queue) 197 | except AttributeError: 198 | pass 199 | 200 | procs = [] 201 | nprocs = len(viz_requests) 202 | if hasattr(self.driver, "use_single_process") and self.driver.use_single_process: 203 | for viz_request in viz_requests: 204 | self.driver.process_request(viz_request, self.options, self.schema, IDEBench.result_queue) 205 | else: 206 | for viz_request in viz_requests: 207 | proc = multiprocessing.Process(target=self.driver.process_request, args=(viz_request, self.options, self.schema, IDEBench.result_queue)) 208 | procs.append(proc) 209 | proc.start() 210 | 211 | resultlist = [] 212 | for i in range(nprocs): 213 | resultlist.append(IDEBench.result_queue.get()) 214 | 215 | for proc in procs: 216 | proc.join() 217 | 218 | self.deliver_viz_request(resultlist) 219 | self.current_interaction_index += 1 220 | self.process_interaction(self.current_interaction_index) 221 | 222 | 223 | def deliver_viz_request(self, viz_requests): 224 | 225 | for viz_request in viz_requests: 226 | if len(viz_request.result.keys()) == 0: 227 | pass 228 | 229 | operation_result = {} 230 | operation_result["id"] = viz_request.operation_id 231 | operation_result["sql"] = viz_request.viz.get_computed_filter_as_sql(self.schema) 232 | operation_result["viz_name"] = viz_request.viz.name 233 | operation_result["parent_operation_id"] = viz_request.parent_operation_id 234 | operation_result["start_time"] = viz_request.start_time 235 | operation_result["end_time"] = viz_request.end_time 236 | operation_result["time_violated"] = viz_request.timedout 237 | operation_result["t_pause"] = viz_request.t_pause 238 | operation_result["t_start"] = viz_request.t_start 239 | operation_result["progress"] = viz_request.progress 240 | operation_result["output"] = viz_request.result 241 | operation_result["margins"] = viz_request.margins 242 | operation_result["num_binning_dimensions"] = len(viz_request.viz.binning) 243 | operation_result["num_aggregates_per_bin"] = len(viz_request.viz.per_bin_aggregates) 244 | 245 | bin_types = [] 246 | for viz_bin in viz_request.viz.binning: 247 | if "width" in viz_bin: 248 | bin_types.append("quantitative") 249 | else: 250 | bin_types.append("nominal") 251 | operation_result["binning_type"] = "_".join(sorted(bin_types)) 252 | 253 | agg_types = [] 254 | for viz_agg in viz_request.viz.per_bin_aggregates: 255 | if viz_agg["type"] == "count": 256 | agg_types.append("count") 257 | elif viz_agg["type"] == "avg": 258 | agg_types.append("avg") 259 | else: 260 | raise Exception() 261 | operation_result["aggregate_type"] = "_".join(sorted(agg_types)) 262 | 263 | if not viz_request.operation_id in self.operation_results: 264 | self.operation_results["results"][viz_request.operation_id] = operation_result 265 | 266 | viz_request.delivered = True 267 | 268 | #self.driver.request_vizs(self.viz_requests) 269 | 270 | def get_config_hash(self): 271 | o = self.options 272 | h = (o.driver_name, o.settings_dataset, o.settings_workflow, o.settings_size, o.settings_normalized, o.settings_confidence_level, o.settings_thinktime, o.settings_thinktime, o.settings_time_requirement) 273 | return hashlib.md5(str(h).encode('utf-8')).hexdigest() 274 | 275 | def get_schema_path(self): 276 | return "data/%s/sample.json" % (self.options.settings_dataset) 277 | 278 | def get_workflow_path(self): 279 | return "data/%s/workflows/%s.json" % (self.options.settings_dataset, self.options.settings_workflow) 280 | 281 | def compute_viz_similarity(self, viz_gt, viz): 282 | 283 | if len(viz.keys()) == 0 and len(viz_gt.keys()) 
== 0: 284 | return 1 285 | 286 | if len(viz_gt.keys()) == 0 and len(viz.keys()) > 0: 287 | raise Exception() 288 | 289 | if len(viz_gt.keys()) > 0 and len(viz.keys()) == 0: 290 | return 0 291 | 292 | for gt_key in viz_gt.keys(): 293 | if gt_key not in viz: 294 | viz[gt_key] = 0 295 | 296 | viz_gt_vals = [] 297 | viz_vals = [] 298 | for gt_key in viz_gt.keys(): 299 | if isinstance(viz_gt[gt_key], list): 300 | viz_gt_vals.append(viz_gt[gt_key][0]) 301 | else: 302 | viz_gt_vals.append(viz_gt[gt_key]) 303 | 304 | if isinstance(viz[gt_key], list): 305 | viz_vals.append(viz[gt_key][0]) 306 | else: 307 | viz_vals.append(viz[gt_key]) 308 | 309 | viz_gt_vals = np.array(viz_gt_vals).astype(float) 310 | viz_vals = np.array(viz_vals).astype(float) 311 | 312 | 313 | 314 | #viz_gt_vals = self.normalize(viz_gt_vals) 315 | #viz_vals = self.normalize(viz_vals) 316 | 317 | if np.isnan(viz_gt_vals).any(): 318 | raise Exception() 319 | 320 | if np.isnan(viz_vals).any(): 321 | raise Exception() 322 | 323 | 324 | #score = np.dot(viz_gt_vals, viz_vals)/ ( np.sqrt(np.sum(np.square(viz_gt_vals))) * np.sqrt(np.sum(np.square(viz_vals))) ) 325 | np.seterr(all='raise') 326 | try: 327 | score = 1 - spatial.distance.cosine(viz_gt_vals, viz_vals) 328 | except: 329 | return 0 330 | return score if not np.isnan(score) else 0 331 | 332 | def normalize(self, v): 333 | norm=np.linalg.norm(v, ord=1) 334 | if norm==0: 335 | norm=np.finfo(v.dtype).eps 336 | return v/norm 337 | 338 | def evaluate(self, config_hash): 339 | print("evaluate") 340 | result_json = None 341 | try: 342 | with open("results/%s.json" % config_hash, "r") as json_data: 343 | result_json = json.load(json_data) 344 | except: 345 | print("couldn't load file %s" % ("results/%s.json" % config_hash)) 346 | return 347 | 348 | workflow = result_json["args"]["settings_workflow"] 349 | dataset = result_json["args"]["settings_dataset"] 350 | size = result_json["args"]["settings_size"] 351 | time_requirement = result_json["args"]["settings_time_requirement"] 352 | 353 | with open("data/%s/groundtruths/%s_%s.json" % (dataset, size, workflow), "r") as json_data: 354 | groundtruths = json.load(json_data)["results"] 355 | 356 | with open("reports/%s.csv" % config_hash, 'w') as fp: 357 | w = csv.DictWriter(fp, [ 358 | "operation_id", 359 | "config_hash", 360 | "interaction_id", 361 | "dataset", 362 | "size", 363 | "viz_name", 364 | "interface", 365 | "think_time", 366 | "time_requirement", 367 | "t_start", 368 | "t_pause", 369 | "workflow", 370 | "start_time", 371 | "end_time", 372 | "duration", 373 | "progress", 374 | "time_violated", 375 | "num_binning_dimensions", 376 | "binning_type", 377 | "has_invalid_bins", 378 | "num_bins_out_of_margin", 379 | "num_bins_delivered", 380 | "num_bins_in_gt", 381 | "missing_bins", 382 | "dissimilarity", 383 | "num_aggregates_per_bin", 384 | "aggregate_type", 385 | "bias", 386 | "rel_error_avg", 387 | "rel_error_stdev", 388 | "rel_error_min", 389 | "rel_error_max", 390 | "margin_avg", 391 | "margin_stdev", 392 | "margin_min", 393 | "margin_max", 394 | "margin_ratio"], delimiter=",", lineterminator="\n") 395 | w.writeheader() 396 | 397 | operations = result_json["results"] 398 | 399 | 400 | for op_number in operations.keys(): 401 | 402 | gt_output = groundtruths[op_number]["output"] 403 | operation = operations[op_number] 404 | 405 | margins = [] 406 | rel_errors = [] 407 | forecast_values = [] 408 | actual_values = [] 409 | out_of_margin_count = 0 410 | 411 | for gt_bin_identifier, gt_aggregate_results in gt_output.items(): 412 | 413 | 
if gt_bin_identifier in operation["output"]: 414 | 415 | for agg_bin_result_index, agg_bin_result in enumerate(operation["output"][gt_bin_identifier]): 416 | rel_error = None 417 | op_result = operation["output"][gt_bin_identifier][agg_bin_result_index] 418 | gt_result = gt_aggregate_results[agg_bin_result_index] 419 | 420 | if abs(gt_result) > 0: 421 | rel_error = abs(op_result - gt_result)/abs(gt_result) 422 | if rel_error > 1e-5: 423 | pass 424 | rel_errors.append(rel_error) 425 | else: 426 | print("ignoring zero in groundtruth") 427 | 428 | forecast_values.append(op_result) 429 | actual_values.append(gt_result) 430 | 431 | if operation["margins"] and gt_bin_identifier in operation["margins"]: 432 | op_margin = float(operation["margins"][gt_bin_identifier][agg_bin_result_index]) 433 | 434 | if np.isnan(op_margin) or np.isinf(op_margin) or abs(op_margin) > 1000000: 435 | if os.path.exists("./margin_errors"): 436 | append_write = 'a' # append if already exists 437 | else: 438 | append_write = 'w' # make a new file if not 439 | with open("./margin_errors", append_write) as ffff: 440 | ffff.writelines(self.options.settings_workflow + "\n" + str(operation["margins"][gt_bin_identifier][agg_bin_result_index]) + "\n") 441 | 442 | elif gt_result + 1e-6 < op_result - abs(op_result * op_margin) or gt_result - 1e-6 > op_result + abs(op_result * op_margin): 443 | out_of_margin_count += 1 444 | margins.append(abs(op_margin)) 445 | else: 446 | margins.append(abs(op_margin)) 447 | 448 | 449 | else: 450 | pass 451 | # add error as many times as a bin was expected! 452 | #rel_errors.extend( [ 1 for n in range(len(gt_aggregate_results)) ] ) 453 | 454 | # invalid bins test 455 | has_invalid_bins = False 456 | num_invalid = 0 457 | inv = [] 458 | 459 | for kk in operation["output"].keys(): 460 | if kk not in gt_output: 461 | has_invalid_bins = True 462 | num_invalid += 1 463 | inv.append(kk) 464 | 465 | print(self.options.settings_workflow) 466 | print(str(operation["id"])) 467 | print("invalid key:" + kk) 468 | print(operation["sql"]) 469 | print(operation["output"]) 470 | os._exit(0) 471 | 472 | args = result_json["args"] 473 | 474 | missing_bins = 1 - len(operation["output"].keys()) / len(gt_output.keys()) if len(gt_output.keys()) > 0 else 0 475 | op_eval_result = {} 476 | op_eval_result["operation_id"] = operation["id"] 477 | op_eval_result["config_hash"] = self.get_config_hash() 478 | op_eval_result["interaction_id"] = operation["parent_operation_id"] 479 | op_eval_result["dataset"] = args["settings_dataset"] 480 | op_eval_result["size"] = args["settings_size"] 481 | op_eval_result["viz_name"] = operation["viz_name"] 482 | op_eval_result["think_time"] = args["settings_thinktime"] 483 | op_eval_result["time_requirement"] = args["settings_time_requirement"] 484 | op_eval_result["interface"] = args["driver_name"] 485 | op_eval_result["workflow"] = args["settings_workflow"] 486 | op_eval_result["start_time"] = operation["start_time"] 487 | op_eval_result["end_time"] = operation["end_time"] 488 | op_eval_result["t_pause"] = operation["t_pause"] if "t_pause" in operation else 0 489 | op_eval_result["t_start"] = operation["t_start"] if "t_start" in operation else 0 490 | op_eval_result["duration"] = operation["end_time"] - operation["start_time"] 491 | 492 | if "time_violated" in operation: 493 | op_eval_result["time_violated"] = operation["time_violated"] 494 | elif "timedout" in operation: 495 | op_eval_result["time_violated"] = operation["timedout"] 496 | else: 497 | raise Exception() 498 | 499 | 
op_eval_result["has_invalid_bins"] = has_invalid_bins 500 | op_eval_result["binning_type"] = operation["binning_type"] 501 | op_eval_result["aggregate_type"] = operation["aggregate_type"] 502 | op_eval_result["num_bins_delivered"] = len(operation["output"].keys()) 503 | op_eval_result["num_bins_in_gt"] = len(gt_output.items()) 504 | op_eval_result["missing_bins"] = "%.5f" % missing_bins 505 | 506 | op_eval_result["dissimilarity"] = "%.5f" % (1- self.compute_viz_similarity(gt_output, operation["output"])) 507 | 508 | op_eval_result["num_bins_out_of_margin"] = "%i" % out_of_margin_count 509 | op_eval_result["num_aggregates_per_bin"] = operation["num_aggregates_per_bin"] 510 | op_eval_result["num_binning_dimensions"] = operation["num_binning_dimensions"] 511 | op_eval_result["progress"] = "%.5f" % operation["progress"] 512 | op_eval_result["bias"] = "%.5f" % (sum(forecast_values) / sum(actual_values) - 1)if len(actual_values) > 0 else 0 513 | op_eval_result["rel_error_stdev"] = "%.5f" % statistics.stdev(rel_errors) if len(rel_errors) > 1 else 0.0 514 | op_eval_result["rel_error_min"] = "%.5f" % min(rel_errors) if len(rel_errors) > 0 else 0 515 | op_eval_result["rel_error_max"] = "%.5f" % max(rel_errors) if len(rel_errors) > 0 else 0 516 | op_eval_result["rel_error_avg"] = "%.5f" % float(sum(rel_errors) / float(len(rel_errors))) if len(rel_errors) > 0 else 0 517 | op_eval_result["margin_stdev"] = "%.5f" % statistics.stdev(margins) if len(margins) > 1 else 0.0 518 | op_eval_result["margin_min"] = "%.5f" % min(margins) if len(margins) > 0 else 0.0 519 | op_eval_result["margin_max"] = "%.5f" % max(margins) if len(margins) > 0 else 0.0 520 | op_eval_result["margin_avg"] = "%.5f" % float(sum(margins) / float(len(margins))) if len(margins) > 0 else 0.0 521 | op_eval_result["margin_ratio"] = "%.5f" % float(len(operation["margins"]) / len(operation["output"])) if operation["margins"] and len(operation["output"]) > 0 else 1 522 | w.writerow(op_eval_result) 523 | 524 | def create_report(self): 525 | header_saved = False 526 | interesting_files = glob.glob("reports/*.csv") 527 | with open('./full_report.csv','w') as fout: 528 | for filename in interesting_files: 529 | print(filename) 530 | with open(filename) as fin: 531 | header = next(fin) 532 | if not header_saved: 533 | print(header) 534 | fout.write(header) 535 | header_saved = True 536 | for line in fin: 537 | fout.write(line) 538 | print("saved report") 539 | 540 | 541 | def assure_path_exists(path): 542 | d = os.path.dirname(path) 543 | if not os.path.exists(d): 544 | os.makedirs(d) 545 | 546 | if __name__ == '__main__': 547 | IDEBench() 548 | 549 | -------------------------------------------------------------------------------- /logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEBench/IDEBench-public/67339d9b81d0bcbb7b41ce6dc2e55918cf1c498f/logo.png -------------------------------------------------------------------------------- /reports/.gitignore: -------------------------------------------------------------------------------- 1 | !.gitignore -------------------------------------------------------------------------------- /results/.gitignore: -------------------------------------------------------------------------------- 1 | !.gitignore -------------------------------------------------------------------------------- /runconfig_sample.json: -------------------------------------------------------------------------------- 1 | { 2 | "driver-names": ["sample"], 3 | 
"driver-args": [[100, "arg2"]], 4 | "settings-datasets": ["flights"], 5 | "settings-workflows": ["test"], 6 | "settings-sizes": ["500k"], 7 | "settings-thinktimes": [1000], 8 | "settings-time-requirements": [1000], 9 | "settings-confidence-levels": [95], 10 | "settings-normalized": false 11 | } -------------------------------------------------------------------------------- /workflowgen.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import pprint 4 | import json 5 | from workflowgen.vizaction import VizAction 6 | from workflowgen.linkaction import LinkAction 7 | from optparse import OptionParser 8 | import pandas as pd 9 | from common.schema import Schema 10 | from common.vizgraph import VizGraph 11 | #from common.storage import Storage 12 | import pandasql 13 | 14 | 15 | class WorkflowGenerator: 16 | 17 | def __init__(self): 18 | 19 | parser = OptionParser() 20 | parser.add_option("-r", "--seed", dest="seed", action="store", type=int, help="Random seed", default=25000) 21 | parser.add_option("-d", "--dataset", dest="data_folder", action="store", help="path to save the file", default="flights") 22 | parser.add_option("--debug", dest="debug", action="store_true", help="creates a debug file", default=False) 23 | parser.add_option("-n", "--num-operations", dest="num_operations", action="store", type=int, help="Number of operations to generate", default=20) 24 | parser.add_option("-c", "--workflow-type", dest="config", action="store", help="path to config file", default="data/flights/workflowtypes/sequential.json") 25 | parser.add_option("-p", "--output", dest="path", action="store", help="path to save the file", default="workflow.json") 26 | parser.add_option("-s", "--num-samples", dest="numsamples", action="store", type=int, help="Number of samples to draw from the original dataset", default=10000) 27 | (options, args) = parser.parse_args() 28 | self.options = options 29 | 30 | random.seed(options.seed) 31 | np.random.seed(seed=options.seed) 32 | 33 | print("data/" + options.data_folder + "/" + options.config) 34 | with open("data/" + options.data_folder + "/workflowtypes/" + options.config, "r") as fp: 35 | self.config = json.load(fp) 36 | 37 | schema = None 38 | with open(self.get_schema_path()) as f: 39 | schema = Schema(json.load(f)) 40 | 41 | print("reading csv...") 42 | # load sample data 43 | df = pd.read_csv("data/" + options.data_folder + "/sample.csv", nrows=options.numsamples, header=0) 44 | 45 | #schema = {"tables": [{ "name": "df", "dimensions": []}]} 46 | sample_json = None 47 | with open("data/" + options.data_folder + "/sample.json", "r") as f: 48 | sample_json = json.load(f) 49 | # for field in sample_json["tables"]["fact"]["fields"]: 50 | # schema["tables"][0]["dimensions"].append({"name": field["field"]}) 51 | 52 | 53 | #storage = Storage(schema) 54 | 55 | zero_qs_ratio = 100 56 | 57 | tries = -1 58 | while zero_qs_ratio > 0.15: 59 | tries += 1 60 | num_zeros_qs = 0 61 | num_qs = 0 62 | VizAction.VIZ_COUNTER = -1 63 | LinkAction.FIRST_LINK = None 64 | LinkAction.LATEST_LINK = None 65 | LinkAction.LINKS = set() 66 | 67 | vizgraph = VizGraph() 68 | random.seed(options.seed + tries) 69 | root = VizAction(self.config, df, vizgraph, schema, sample_json) 70 | current = root 71 | states = [] 72 | 73 | num_ops = 0 74 | 75 | debug_states = [] 76 | while num_ops < options.num_operations: 77 | res = current.get_states() 78 | if res: 79 | affected_vizs = vizgraph.apply_interaction(res) 80 | if 
options.debug: 81 | nodes_dict = vizgraph.get_nodes_dict() 82 | states_dict = {} 83 | for n in nodes_dict.keys(): 84 | states_dict[n] = { 85 | "name":n, 86 | "source" : nodes_dict[n].get_source(), 87 | "binning": nodes_dict[n].binning, 88 | "agg": nodes_dict[n].per_bin_aggregates, 89 | "selection": nodes_dict[n].get_selection(), 90 | "filter": nodes_dict[n].get_filter(), 91 | "computed_filter": nodes_dict[n].get_computed_filter_as_sql(schema), 92 | } 93 | debug_states.append(states_dict) 94 | 95 | for x in affected_vizs: 96 | sql = x.get_computed_filter_as_sql(schema).replace("FLOOR", "ROUND").replace(schema.get_fact_table_name(), "df") 97 | r = pandasql.sqldf(sql, locals()) 98 | num_qs += 1 99 | if len(r.index) == 0: 100 | num_zeros_qs += 1 101 | 102 | states.append(res.data) 103 | #if "source" not in res: 104 | num_ops += 1 105 | 106 | current = current.get_next() 107 | if current is None: 108 | zero_qs_ratio = num_zeros_qs/num_qs 109 | break 110 | zero_qs_ratio = num_zeros_qs/num_qs 111 | 112 | 113 | with open("data/" + options.data_folder + "/workflows/" + options.path + ".json", "w") as fp: 114 | fp.write(json.dumps({"name": "generated", "dataset": options.data_folder, "seed": options.seed, "config": options.config, "interactions": states})) 115 | 116 | print("done.") 117 | #with open("workflowviewer/public/workflow.json", "w") as fp: 118 | # fp.write(json.dumps({"name": "generated", "dataset": options.data_folder, "seed": options.seed, "config": options.config, "interactions": states})) 119 | 120 | #with open("workflowviewer/public/workflow_debug.json", "w") as fp: 121 | # fp.write(json.dumps(debug_states)) 122 | 123 | #if options.debug: 124 | # import webbrowser 125 | # url = "http://localhost:3000" 126 | # webbrowser.open(url) 127 | 128 | def get_schema_path(self): 129 | return "data/%s/sample.json" % (self.options.data_folder) 130 | 131 | def get_viz_name(self): 132 | return "viz_%i" % self.config["viz_counter"] 133 | 134 | WorkflowGenerator() -------------------------------------------------------------------------------- /workflowgen/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IDEBench/IDEBench-public/67339d9b81d0bcbb7b41ce6dc2e55918cf1c498f/workflowgen/__init__.py -------------------------------------------------------------------------------- /workflowgen/baseaction.py: -------------------------------------------------------------------------------- 1 | import random 2 | import importlib 3 | import numpy as np 4 | 5 | class BaseAction: 6 | 7 | def __init__(self, config, df, vizgraph, storage, sample_json): 8 | self.config = config 9 | self.df = df 10 | self.vizgraph = vizgraph 11 | self.storage = storage 12 | self.sample_json = sample_json 13 | 14 | def get_next(self): 15 | pick = self.pick(self.config["nextAction"]["values"], self.config["nextAction"]["pd"]) 16 | pick_split = pick.split(".") 17 | module = importlib.import_module(pick_split[0] + "." 
+ pick_split[1]) 18 | return getattr(module, pick_split[2])(self.config, self.df, self.vizgraph, self.storage, self.sample_json) 19 | 20 | def get_states(self): 21 | return [] 22 | 23 | def pick(self, choices, pd=None): 24 | if pd is None: 25 | return random.choice(choices) 26 | 27 | total = sum(pd) 28 | r = random.uniform(0, total) 29 | upto = 0 30 | for i, c in enumerate(choices): 31 | if upto + pd[i] >= r: 32 | return c 33 | upto += pd[i] 34 | assert False, "Shouldn't get here" 35 | 36 | def pick_range(self, val_min, val_max): 37 | delta = val_max - val_min 38 | selectionrange = max(0, min(np.random.normal(loc=0.5, scale=0.25),1)) 39 | selectionstart = random.uniform(0, 1 - selectionrange) 40 | return val_min + delta * selectionstart, (1 - selectionrange) * delta 41 | -------------------------------------------------------------------------------- /workflowgen/bulkgen.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | t = "inverse_bushy" 4 | config = "%s.json" % t 5 | sizes = [25] 6 | for size in sizes: 7 | for n in range(10): 8 | seed = 1000 * size + n 9 | os.system("python workflowgen.py -n %i -r %i -c %s -p %s" % (size, seed, config, ("%s_%i_r%i" % (t, size, seed)))) -------------------------------------------------------------------------------- /workflowgen/filteraction.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | import numpy as np 4 | import pandasql 5 | from collections import OrderedDict 6 | from workflowgen.baseaction import BaseAction 7 | from workflowgen.vizaction import VizAction 8 | from common.operation import Operation 9 | 10 | class FilterAction(BaseAction): 11 | 12 | def get_states(self): 13 | 14 | src_viz_num = random.randint(0, VizAction.VIZ_COUNTER) 15 | src_viz = list(self.vizgraph.get_nodes())[src_viz_num] 16 | computed_filter = src_viz.get_computed_filter() 17 | df = self.df 18 | sql_statement = "SELECT * FROM df " 19 | if len(computed_filter) > 0: 20 | sql_statement += "WHERE " + computed_filter 21 | 22 | df_result = pandasql.sqldf(sql_statement, locals()) 23 | 24 | if df_result.empty: 25 | return None 26 | 27 | filter_per_dim = [] 28 | 29 | for bin_dim in range(len(src_viz.binning)): 30 | filters = [] 31 | dim = src_viz.binning[bin_dim]["dimension"] 32 | field = list(filter(lambda x: x["field"] == dim, self.sample_json["tables"]["fact"]["fields"]))[0] 33 | if field["type"] == "quantitative": 34 | bin_width = float(src_viz.binning[bin_dim]["width"]) 35 | min_val = df_result[dim].min() 36 | max_val = df_result[dim].max() 37 | 38 | min_index = math.floor(min_val / bin_width) 39 | max_index = math.floor(max_val / bin_width) 40 | num_bins = 0 41 | if np.random.rand() < 0.4: 42 | num_bins = 1 43 | else: 44 | num_bins = random.randint(1, max_index-min_index) if max_index > min_index else 1 45 | selected_bins = np.random.choice(np.arange(min_index, max_index + 1), size=num_bins, replace=False) 46 | 47 | for selected_bin in selected_bins: 48 | range_min = selected_bin * bin_width 49 | range_max = (selected_bin + 1) * bin_width 50 | filt = "(%s >= %s and %s < %s)" % (dim, '{:.1f}'.format(range_min), dim, '{:.1f}'.format(range_max)) 51 | filters.append(filt) 52 | else: 53 | all_bins = df_result[dim].unique().tolist() 54 | num_bins = random.randint(1, len(all_bins)) 55 | selected_bins = np.random.choice(all_bins, size=num_bins, replace=False) 56 | for selected_bin in list(selected_bins): 57 | filt = "(%s = 
'%s')" % (dim, selected_bin) 58 | filters.append(filt) 59 | filter_per_dim.append(" or ".join(filters)) 60 | filter_per_dim = ["(%s)" % f for f in filter_per_dim] 61 | 62 | return Operation(OrderedDict({"name": ("viz_%s" % src_viz_num), "filter": " and ".join(filter_per_dim)})) -------------------------------------------------------------------------------- /workflowgen/linkaction.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import OrderedDict 3 | from common.operation import Operation 4 | from workflowgen.baseaction import BaseAction 5 | from workflowgen.vizaction import VizAction 6 | 7 | class LinkAction(BaseAction): 8 | 9 | def get_states(self): 10 | 11 | if VizAction.VIZ_COUNTER < 1: 12 | return None 13 | 14 | pick_from = -1 15 | pick_to = -1 16 | link_type = None 17 | while (pick_from == -1 or pick_to == -1): 18 | from_candidate = random.randint(0, VizAction.VIZ_COUNTER) 19 | to_candidate = random.randint(0, VizAction.VIZ_COUNTER) 20 | 21 | 22 | link_type_names = [l["name"] for l in self.config["linkType"]] 23 | link_type_pds = [l["p"] for l in self.config["linkType"]] 24 | 25 | link_type = self.pick(link_type_names, link_type_pds) 26 | print(link_type) 27 | 28 | if link_type == "sequential" and LinkAction.LATEST_LINK: 29 | from_candidate = LinkAction.LATEST_LINK[1] 30 | elif link_type == "1n" and LinkAction.LATEST_LINK: 31 | from_candidate = LinkAction.LATEST_LINK[0] 32 | elif link_type == "n1" and LinkAction.FIRST_LINK: 33 | to_candidate = LinkAction.FIRST_LINK[1] 34 | 35 | num_tries = 10 36 | giveup = False 37 | g = {} 38 | for i in range(num_tries+1): 39 | 40 | g = {} 41 | for l in LinkAction.LINKS: 42 | if l[0] not in g: 43 | g[l[0]] = [] 44 | g[l[0]].append(l[1]) 45 | 46 | if from_candidate not in g: 47 | g[from_candidate] = [] 48 | g[from_candidate].append(to_candidate) 49 | 50 | if self.cyclic(g): 51 | if link_type == "n1" and LinkAction.FIRST_LINK: 52 | to_candidate = LinkAction.FIRST_LINK[1] 53 | else: 54 | to_candidate = random.randint(0, VizAction.VIZ_COUNTER) 55 | if i == num_tries: 56 | 57 | giveup = True 58 | else: 59 | break 60 | 61 | if giveup: 62 | print("giving up!") 63 | break 64 | 65 | if from_candidate != to_candidate and ((to_candidate, from_candidate) not in LinkAction.LINKS) and ((from_candidate, to_candidate) not in LinkAction.LINKS): 66 | 67 | pick_from = from_candidate 68 | pick_to = to_candidate 69 | 70 | if not LinkAction.FIRST_LINK: 71 | LinkAction.FIRST_LINK = (pick_from, pick_to) 72 | LinkAction.LATEST_LINK = (pick_from, pick_to) 73 | LinkAction.LINKS.add(LinkAction.LATEST_LINK) 74 | break 75 | 76 | if len(LinkAction.LINKS) >= VizAction.VIZ_COUNTER-1: 77 | break 78 | 79 | if (pick_from == -1 or pick_to == -1): 80 | return None 81 | 82 | incoming_links = [ "viz_" + str(l[0]) for l in filter(lambda x: x[1] == pick_to, LinkAction.LINKS)] 83 | combined_filters = Operation(OrderedDict({"name": "viz_" + str(pick_to), "source": ( " and ".join(incoming_links))})) 84 | return combined_filters 85 | 86 | def cyclic(self, g): 87 | path = set() 88 | def visit(vertex): 89 | path.add(vertex) 90 | for neighbour in g.get(vertex, ()): 91 | if neighbour in path or visit(neighbour): 92 | return True 93 | path.remove(vertex) 94 | return False 95 | 96 | return any(visit(v) for v in g) 97 | 98 | LinkAction.FIRST_LINK = None 99 | LinkAction.LATEST_LINK = None 100 | LinkAction.LINKS = set() 101 | -------------------------------------------------------------------------------- 
/workflowgen/selectionaction.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import OrderedDict 3 | from common.operation import Operation 4 | from workflowgen.baseaction import BaseAction 5 | from workflowgen.linkaction import LinkAction 6 | import pandasql 7 | import math 8 | import numpy as np 9 | 10 | class SelectionAction(BaseAction): 11 | 12 | def get_states(self): 13 | 14 | if len(LinkAction.LINKS) == 0: 15 | return 16 | 17 | rand_link = self.pick(list(LinkAction.LINKS)) 18 | rand_link_src = rand_link[0] 19 | nodes_dict = self.vizgraph.get_nodes_dict() 20 | src_viz = nodes_dict["viz_" + str(rand_link_src)] 21 | computed_filter = src_viz.get_computed_filter() 22 | df = self.df 23 | sql_statement = "SELECT * FROM df " 24 | if len(computed_filter) > 0: 25 | sql_statement += "WHERE " + computed_filter 26 | 27 | df_result = pandasql.sqldf(sql_statement, locals()) 28 | 29 | if df_result.empty: 30 | return None 31 | 32 | filter_per_dim = [] 33 | 34 | for bin_dim in range(len(src_viz.binning)): 35 | filters = [] 36 | dim = src_viz.binning[bin_dim]["dimension"] 37 | field = list(filter(lambda x: x["field"] == dim, self.sample_json["tables"]["fact"]["fields"]))[0] 38 | if field["type"] == "quantitative": 39 | bin_width = float(src_viz.binning[bin_dim]["width"]) 40 | min_val = df_result[dim].min() 41 | max_val = df_result[dim].max() 42 | 43 | min_index = math.floor(min_val / bin_width) 44 | max_index = math.floor(max_val / bin_width) 45 | num_bins = 0 46 | if np.random.rand() < 0.4: 47 | num_bins = 1 48 | else: 49 | num_bins = random.randint(1, max_index-min_index) if max_index > min_index else 1 50 | selected_bins = np.random.choice(np.arange(min_index, max_index + 1), size=num_bins, replace=False) 51 | 52 | for selected_bin in selected_bins: 53 | range_min = selected_bin * bin_width 54 | range_max = (selected_bin + 1) * bin_width 55 | filt = "(%s >= %s and %s < %s)" % (dim, '{:.1f}'.format(range_min), dim, '{:.1f}'.format(range_max)) 56 | filters.append(filt) 57 | else: 58 | all_bins = df_result[dim].unique().tolist() 59 | num_bins = random.randint(1, len(all_bins)) 60 | selected_bins = np.random.choice(all_bins, size=num_bins, replace=False) 61 | for selected_bin in list(selected_bins): 62 | filt = "(%s = '%s')" % (dim, selected_bin) 63 | filters.append(filt) 64 | filter_per_dim.append(" or ".join(filters)) 65 | filter_per_dim = ["(%s)" % f for f in filter_per_dim] 66 | 67 | return Operation(OrderedDict({"name": ("viz_%s" % rand_link_src), "selection": " and ".join(filter_per_dim)})) 68 | -------------------------------------------------------------------------------- /workflowgen/vizaction.py: -------------------------------------------------------------------------------- 1 | import random 2 | from workflowgen.baseaction import BaseAction 3 | from common.operation import Operation 4 | from collections import OrderedDict 5 | import pandasql 6 | import pandas as pd 7 | 8 | class VizAction(BaseAction): 9 | 10 | def __init__(self, config, df, vizgraph, storage, sample_json): 11 | super().__init__(config, df, vizgraph, storage, sample_json) 12 | 13 | self.dim_to_type = {} 14 | for field in sample_json["tables"]["fact"]["fields"]: 15 | self.dim_to_type[field["field"]] = field["type"] 16 | 17 | def get_states(self): 18 | num_bins = self.pick(self.config["numBinDimensionsPerViz"]["values"], self.config["numBinDimensionsPerViz"]["pd"] ) 19 | bins = [] 20 | picks = [] 21 | 22 | while len(bins) < num_bins: 23 | dimensions_p 
= [dim["p"] for dim in self.config["dimensions"]] 24 | dimensions_p = [p/sum(dimensions_p) for p in dimensions_p] 25 | dimension = self.pick(self.config["dimensions"], dimensions_p) 26 | 27 | if dimension in picks: 28 | continue 29 | 30 | picks.append(dimension) 31 | 32 | sql_statement = "SELECT * FROM df " 33 | df = self.df 34 | df = pandasql.sqldf(sql_statement, locals()) 35 | 36 | d_bin = {"dimension": dimension["name"] } 37 | if self.dim_to_type[dimension["name"]] == "quantitative": 38 | dim_max_val = df[dimension["name"]].max() 39 | dim_min_val = df[dimension["name"]].min() 40 | #d_bin["width"] = round(random.uniform(0.025, 0.1) * (dim_max_val - dim_min_val)) 41 | d_bin["width"] = round(random.uniform(0.025, 0.1) * (dim_max_val - dim_min_val)) 42 | elif self.dim_to_type[dimension["name"]] == "categorical": 43 | try: 44 | pd.to_numeric(df[dimension["name"]]) 45 | d_bin["width"] = 1 46 | except: 47 | pass 48 | 49 | bins.append(d_bin) 50 | 51 | per_bin_aggregate_type = self.pick(self.config["perBinAggregates"]["values"], self.config["perBinAggregates"]["pd"] ) 52 | per_bin_aggregate = {"type": per_bin_aggregate_type} 53 | if per_bin_aggregate_type == "avg": 54 | avg_dimension = self.pick([d for d in self.sample_json["tables"]["fact"]["fields"] if (d["type"] == "quantitative")]) 55 | per_bin_aggregate["dimension"] = avg_dimension["field"] 56 | 57 | VizAction.VIZ_COUNTER += 1 58 | self.viz_name = "viz_%s" % VizAction.VIZ_COUNTER 59 | self.binning = bins 60 | self.perBinAggregates = [per_bin_aggregate] 61 | return Operation(OrderedDict({"name": self.viz_name, "binning": self.binning, "perBinAggregates": self.perBinAggregates})) 62 | 63 | VizAction.VIZ_COUNTER = -1 --------------------------------------------------------------------------------