4 |
5 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="VcsDirectoryMappings">
4 |     <mapping directory="" vcs="Git" />
5 |   </component>
6 | </project>
7 | 
8 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2014, Martin Grund
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions are met:
6 |
7 | * Redistributions of source code must retain the above copyright notice, this
8 | list of conditions and the following disclaimer.
9 |
10 | * Redistributions in binary form must reproduce the above copyright notice,
11 | this list of conditions and the following disclaimer in the documentation
12 | and/or other materials provided with the distribution.
13 |
14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
22 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 |
--------------------------------------------------------------------------------
/MANIFEST:
--------------------------------------------------------------------------------
1 | # file GENERATED by distutils, do NOT edit
2 | LICENSE
3 | README.md
4 | setup.py
5 | pyxplorer/__init__.py
6 | pyxplorer/helper.py
7 | pyxplorer/loader.py
8 | pyxplorer/manager.py
9 | pyxplorer/types.py
10 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.md
2 | include LICENSE
3 |
4 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | pyxplorer -- Easy Interactive Data Profiling for Big Data (and Small Data)
2 | --------------------------------------------------------------------------
3 |
4 | The goal of pyxplorer is to provide a simple tool that allows interactive
5 | profiling of datasets that are accessible via a SQL-like interface. The only
6 | requirement for running data profiling is that you are able to provide a Python
7 | DBAPI-like interface to your data source and that the data source is able to
8 | understand simplistic SQL queries.
9 |
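The DBAPI surface pyxplorer relies on is small: a connection exposing ``cursor()``, and cursors exposing ``execute()`` and ``fetchall()``. The sketch below uses the standard library's ``sqlite3`` purely to illustrate that interface shape; SQLite is not a tested backend, and pyxplorer itself issues engine-specific statements such as ``show tables``:

```python
import sqlite3

# Minimal DBAPI surface pyxplorer expects from a connection object:
# connection.cursor(), cursor.execute(), cursor.fetchall().
conn = sqlite3.connect(":memory:")
cur = conn.cursor()
cur.execute("CREATE TABLE t (x INTEGER)")
cur.executemany("INSERT INTO t VALUES (?)", [(1,), (2,), (3,)])
cur.execute("SELECT count(*) FROM t")
rows = cur.fetchall()
print(rows)  # [(3,)]
```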
10 | I built this piece of software while trying to get a better understanding of
11 | the data distribution in a massive dataset of several hundred million records.
12 | Depending on the size of the dataset and the query engine the response time
13 | can range from seconds (Impala) to minutes (Hive) or even hours (MySQL).
14 |
15 | The typical use case is to use ``pyxplorer`` interactively from an iPython
16 | Notebook or iPython shell to incrementally extract information about your data.
17 |
18 | Usage
19 | ------
20 |
21 | Imagine that you are provided with access to a huge Hive/Impala database on
22 | your very own Hadoop cluster and you're asked to profile the data to get a
23 | better understanding before performing more specific data science later on.::
24 |
25 | import pyxplorer as pxp
26 | from impala.dbapi import connect
27 | conn = connect(host='impala_server', port=21050)
28 |
29 | db = pxp.Database("default", conn)
30 | db.tables()
31 |
32 | This simple code gives you access to all the tables in this database. So let's
33 | assume the result shows a ``sales_orders`` table, what can we do now?::
34 |
35 | orders = db["sales_orders"]
36 | orders.size() # 100M
37 | orders.columns() # [ol_w_id, ol_d_id, ol_o_id, ol_number, ol_i_id, ...]
38 |
39 | Ok, if we have so many columns, what can we find out about a single column?::
40 |
41 | orders.ol_d_id.min() # 1
42 | orders.ol_d_id.max() # 9999
43 | orders.ol_d_id.dcount() # 1000
44 |
45 | And like this there are some more key-figures about the data like uniqueness,
46 | constancy, most and least frequent values and distribution.
47 |
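These key figures are simple ratios. As a self-contained illustration (computed locally on a toy column rather than through pyxplorer's SQL aggregates), uniqueness and constancy are:

```python
from collections import Counter

values = [1, 1, 2, 3, 3, 3, 4]  # a toy column
counts = Counter(values)
rows = float(len(values))

uniqueness = len(counts) / rows                 # distinct values / rows
constancy = counts.most_common(1)[0][1] / rows  # most frequent count / rows

print(uniqueness, constancy)
```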
48 | In some cases, where it makes sense, the output of a method call will not be a
49 | simple array or list but directly a Pandas dataframe to facilitate plotting
50 | and further analysis.
51 |
52 | You will find an easier-to-digest tutorial here:
53 |
54 | * http://nbviewer.ipython.org/github/grundprinzip/pyxplorer/blob/master/pyxplorer_stuff.ipynb
55 |
56 |
57 | Supported Features
58 | -------------------
59 |
60 | * Column Count (Database / Table)
61 | * Table Count
62 | * Tuple Count (Database / Table)
63 | * Min / Max
64 | * Most Frequent / Least Frequent
65 | * Top-K Most Frequent / Top-K Least Frequent
66 | * Top-K Value Distribution (Database / Table )
67 | * Uniqueness
68 | * Constancy
69 | * Distinct Value Count
70 |
71 |
72 | Supported Platforms
73 | --------------------
74 |
75 | The following platforms are typically tested while using ``pyxplorer``:
76 |
77 | * Hive
78 | * Impala
79 | * MySQL
80 |
81 |
82 | Dependencies
83 | -------------
84 |
85 | * pandas
86 | * pyhs2 for Hive-based loading of data sets
87 | * pympala for connecting to Impala
88 | * snakebite for loading data from HDFS to Hive
89 |
--------------------------------------------------------------------------------
/dependencies.txt:
--------------------------------------------------------------------------------
1 | pympala
2 | snakebite
3 | pandas
4 | pyhs2
5 |
--------------------------------------------------------------------------------
/pyxplorer/__init__.py:
--------------------------------------------------------------------------------
1 |
2 | # The database helper
3 | from manager import Database
4 |
5 | # Relevant for getting Table information
6 | from types import Column, Table
7 |
8 | # The HDFS Loader
9 | from loader import Loader
10 |
--------------------------------------------------------------------------------
/pyxplorer/helper.py:
--------------------------------------------------------------------------------
1 | import functools
2 | from StringIO import StringIO
3 |
4 |
5 | def car(data):
6 | return [x[0] for x in data]
7 |
8 |
9 | def render_table(head, rows, limit=10):
10 |     buf = StringIO()
11 |     buf.write("<table><tr>")
12 |     for h in head:
13 |         buf.write("<th>{0}</th>".format(h))
14 |     buf.write("</tr>")
15 | 
16 |     # Build the slices we need
17 |     if limit is None or len(rows) <= limit:
18 |         data = rows
19 |         footer = None
20 |     else:
21 |         data = rows[:limit - 1]
22 |         footer = rows[-1:]
23 | 
24 |     for r in data:
25 |         buf.write("<tr>")
26 |         for c in r:
27 |             buf.write("<td>{0}</td>".format(c))
28 |         buf.write("</tr>")
29 | 
30 |     if footer:
31 |         for r in footer:
32 |             buf.write("<tr>")
33 |             for c in r:
34 |                 buf.write("<td>{0}</td>".format(c))
35 |             buf.write("</tr>")
36 |     buf.write("</table>")
37 |     buf.write("<p>Rows: %d / Columns: %d</p>" % (len(rows), len(head)))
38 |     return buf.getvalue()
39 |
40 |
41 | def memoize(obj):
42 | cache = obj.cache = {}
43 |
44 | @functools.wraps(obj)
45 | def memoizer(*args, **kwargs):
46 | key = str(args) + str(kwargs)
47 | if key not in cache:
48 | cache[key] = obj(*args, **kwargs)
49 | return cache[key]
50 |
51 |
52 | return memoizer
53 |
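A quick illustration of how the memoize decorator above behaves, restated in a self-contained form so it can run on its own: repeated calls with the same arguments are served from the cache instead of re-invoking the wrapped function.

```python
import functools

def memoize(obj):
    # Same caching scheme as above: results keyed by stringified arguments.
    cache = obj.cache = {}

    @functools.wraps(obj)
    def memoizer(*args, **kwargs):
        key = str(args) + str(kwargs)
        if key not in cache:
            cache[key] = obj(*args, **kwargs)
        return cache[key]

    return memoizer

calls = []

@memoize
def square(x):
    calls.append(x)  # record every real invocation
    return x * x

print(square(4), square(4), len(calls))  # 16 16 1 -- second call is cached
```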
--------------------------------------------------------------------------------
/pyxplorer/loader.py:
--------------------------------------------------------------------------------
1 | __author__ = 'grund'
2 | import re
3 |
4 | from snakebite.client import Client
5 | import pyhs2
6 |
7 |
8 | class Loader:
9 | """
10 | The idea of the loader is to provide a convenient interface to create a new table
11 | based on some input files
12 | """
13 |
14 | def __init__(self, path, name_node, hive_server,
15 | user="root", hive_db="default", password=None, nn_port=8020, hive_port=10000):
16 |
17 | # HDFS Connection
18 | self._client = Client(name_node, nn_port)
19 |
20 | self._db = hive_db
21 |
22 | # Hive Connection
23 | self._hive = pyhs2.connect(host=hive_server,
24 | port=hive_port,
25 | authMechanism="PLAIN",
26 | database=hive_db,
27 | user=user,
28 | password=password)
29 | self._path = path
30 |
31 |
32 | def load(self):
33 | # Check data to see which kind it is
34 | files = self._client.ls([self._path])
35 |
36 | files = [f for f in files if f['file_type'] == 'f']
37 | if len(files) == 0:
38 | raise Exception("Cannot load empty directory")
39 |
40 | # Pick the first file and assume that it has the same content as the others
41 | data = self.head(files[0]['path'])
42 | res = self.check_separator(data)
43 |         if res is None:
44 |             # We cannot load the data and better abort here
45 |             print("cannot load data: no separator found")
46 |             return
47 |
48 | sep = res[0]
49 | num_cols = res[1]
50 |
51 | # Build table statement
52 | table_statement, table_name = self._create_table(self._path, sep, num_cols)
53 | cursor = self._hive.cursor()
54 | cursor.execute(table_statement)
55 |
56 | return self._db, table_name
57 |
58 |
59 | def _create_table(self, path, sep, count):
60 | buf = """CREATE EXTERNAL TABLE pyxplorer_data (
61 | %s
62 | )ROW FORMAT DELIMITED FIELDS TERMINATED BY '%s'
63 | STORED AS TEXTFILE LOCATION '%s'
64 | """ % (",".join(["col_%d string" % x for x in range(count)]), sep, path)
65 | return buf, "pyxplorer_data"
66 |
67 | def check_separator(self, data):
68 | """
69 |         This method evaluates a list of separators on the input data to check which one
70 | is correct. This is done by first splitting the input by newline and then
71 | checking if the split by separator is equal for each input row except the last
72 | that might be incomplete due to the limited input data
73 |
74 | :param data: input data to check
75 | :return:
76 | """
77 |
78 |         sep_list = [r'\t', r';', r',', r'\|', r'\s+']
79 |
80 | data_copy = data
81 | for sep in sep_list:
82 | # Check if the count matches each line
83 | splitted = data_copy.split("\n")
84 | parts = [len(re.split(sep, line)) for line in splitted]
85 |
86 | # If we did not split anything continue
87 | if sum(parts) == len(splitted):
88 | continue
89 |
90 | diff = 0
91 |
92 | for i in range(len(parts[1:-1])):
93 | diff += abs(parts[i] - parts[i + 1])
94 |
95 | if diff == 0:
96 | return sep, parts[0]
97 |
98 | # If we reach this point we did not find a separator
99 | return None
100 |
101 |
102 | def head(self, file_path):
103 | """
104 |         Only read the first packets that arrive, trying to max out at about 1 MB
105 | 
106 |         :return: up to 1 MB of data from the first block of the file
107 | """
108 | processor = lambda path, node, tail_only=True, append=False: self._handle_head(
109 | path, node)
110 |
111 | # Find items and go
112 | for item in self._client._find_items([file_path], processor,
113 | include_toplevel=True,
114 | include_children=False, recurse=False):
115 | if item:
116 | return item
117 |
118 | def _handle_head(self, path, node, upper=1024 * 1024):
119 | data = ''
120 | for load in self._client._read_file(path, node, tail_only=False,
121 | check_crc=False):
122 | data += load
123 |             if len(data) > upper:
124 | return data
125 |
126 | return data
127 |
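The separator detection in `check_separator` can be illustrated with a small self-contained sketch (a restatement of the idea, not the class method itself): each candidate separator is applied to every line, and the one producing a constant field count across all complete lines wins; the last line is ignored because it may be truncated mid-record.

```python
import re

def detect_separator(sample, candidates=(r'\t', r';', r',', r'\|', r'\s+')):
    lines = sample.split("\n")
    for sep in candidates:
        counts = [len(re.split(sep, line)) for line in lines]
        if sum(counts) == len(lines):
            continue  # this separator never split anything
        # Ignore the last line, which may be truncated mid-record.
        if len(set(counts[:-1])) == 1:
            return sep, counts[0]
    return None  # no consistent separator found

print(detect_separator("a;b;c\n1;2;3\n4;5"))  # (';', 3)
```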
--------------------------------------------------------------------------------
/pyxplorer/manager.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | import sys
3 |
4 | import pandas as pd
5 |
6 | import types as t
7 | import helper as h
8 |
9 |
10 | class Database:
11 | def __init__(self, db, conn):
12 | self.db = db
13 | self.connection = conn
14 |
15 | def __getitem__(self, item):
16 | for x in self.tables():
17 | if x.name() == item:
18 | return x
19 | raise KeyError(item)
20 |
21 | @h.memoize
22 | def tables(self):
23 | """
24 | :return: all tables stored in this database
25 | """
26 | cursor = self.connection.cursor()
27 | cursor.execute("show tables in %s" % self.db)
28 | self._tables = [t.Table(r[0], con=self.connection, db=self.db) for r in cursor.fetchall()]
29 | return self._tables
30 |
31 | def __len__(self):
32 | return len(self.tables())
33 |
34 | @h.memoize
35 | def tcounts(self):
36 | """
37 | :return: a data frame containing the names and sizes for all tables
38 | """
39 | df = pd.DataFrame([[t.name(), t.size()] for t in self.tables()], columns=["name", "size"])
40 | df.index = df.name
41 | return df
42 |
43 | @h.memoize
44 | def dcounts(self):
45 | """
46 | :return: a data frame with names and distinct counts and fractions for all columns in the database
47 | """
48 | print("WARNING: Distinct value count for all tables can take a long time...", file=sys.stderr)
49 | sys.stderr.flush()
50 |
51 | data = []
52 | for t in self.tables():
53 | for c in t.columns():
54 | data.append([t.name(), c.name(), c.dcount(), t.size(), c.dcount() / float(t.size())])
55 | df = pd.DataFrame(data, columns=["table", "column", "distinct", "size", "fraction"])
56 | return df
57 |
58 |
59 | def _repr_html_(self):
60 | return h.render_table(["Name", "Size"], [[x.name(), x.size()] for x in self.tables()])
61 |
62 |
63 | def num_tables(self):
64 | return len(self)
65 |
66 | def num_columns(self):
67 | return sum([len(x.columns()) for x in self.tables()])
68 |
69 | def num_tuples(self):
70 | return sum([x.size() for x in self.tables()])
71 |
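For reference, the frame `tcounts()` returns has the shape sketched below, built here from hypothetical table names and sizes rather than a live connection:

```python
import pandas as pd

# Hypothetical names/sizes standing in for [[t.name(), t.size()] ...].
df = pd.DataFrame([["orders", 100], ["items", 20]], columns=["name", "size"])
df.index = df.name  # index by table name, as tcounts() does

print(df.loc["orders", "size"])  # 100
```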
--------------------------------------------------------------------------------
/pyxplorer/types.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 |
3 | import pandas as pd
4 | import helper as h
5 | import sys
6 |
7 |
8 | class Column:
9 | """
10 | Representation of a column and the profiling information
11 | """
12 |
13 |     def _qexec(self, fld, group=None, order=None):
14 |         c = self._con.cursor()
15 |         if group is not None:
16 |             group = " group by %s" % group
17 |         else:
18 |             group = ""
19 | 
20 |         if order is not None:
21 |             order = " order by %s" % order
22 |         else:
23 |             order = ""
24 |
25 | query = "select %s from `%s`.`%s` %s %s" % (fld, self._table.db(), self._table.name(), group, order)
26 | c.execute(query)
27 | return c.fetchall()
28 |
29 | def __init__(self, name, type_name, con, table):
30 | self._name = name
31 | self._type_name = type_name
32 | self._con = con
33 | self._table = table
34 | self._distribution = None
35 | self._min = None
36 | self._max = None
37 | self._dcount = None
38 | self._most_frequent = None
39 | self._most_frequent_count = None
40 | self._least_frequent = None
41 | self._least_frequent_count = None
42 |
43 | def __repr__(self):
44 | return self.name()
45 |
46 | def __str__(self):
47 | buf = "%s\n" % self.name()
48 | funs = [self.min, self.max, self.dcount, self.most_frequent, self.least_frequent]
49 | for x in funs:
50 | buf += "%s:\t%s\n" % (x.__name__, x())
51 | return buf
52 |
53 | def name(self):
54 | return self._name
55 |
56 | @classmethod
57 | def build(cls, data, con, table):
58 | return Column(data[0], data[1], con, table)
59 |
60 | def __eq__(self, other):
61 | return self._name == other._name and self._type_name == other._type_name
62 |
63 | @h.memoize
64 | def min(self):
65 | """
66 | :returns the minimum of the column
67 | """
68 | res = self._qexec("min(%s)" % self._name)
69 | if len(res) > 0:
70 | self._min = res[0][0]
71 | return self._min
72 |
73 | @h.memoize
74 | def max(self):
75 | """
76 | :returns the maximum of the column
77 | """
78 | res = self._qexec("max(%s)" % self._name)
79 | if len(res) > 0:
80 | self._max = res[0][0]
81 | return self._max
82 |
83 | @h.memoize
84 | def dcount(self):
85 | res = self._qexec("count(distinct %s)" % self._name)
86 | if len(res) > 0:
87 | self._dcount = res[0][0]
88 | return self._dcount
89 |
90 | @h.memoize
91 | def distribution(self, limit=1024):
92 | """
93 | Build the distribution of distinct values
94 | """
95 | res = self._qexec("%s, count(*) as __cnt" % self.name(), group="%s" % self.name(),
96 | order="__cnt DESC LIMIT %d" % limit)
97 | dist = []
98 | cnt = self._table.size()
99 | for i, r in enumerate(res):
100 | dist.append(list(r) + [i, r[1] / float(cnt)])
101 |
102 | self._distribution = pd.DataFrame(dist, columns=["value", "cnt", "r", "fraction"])
103 | self._distribution.index = self._distribution.r
104 |
105 | return self._distribution
106 |
107 | @h.memoize
108 | def most_frequent(self):
109 | res = self.n_most_frequent(1)
110 | self._most_frequent = res[0][0]
111 | self._most_frequent_count = res[0][1]
112 | return self._most_frequent, self._most_frequent_count
113 |
114 | @h.memoize
115 | def least_frequent(self):
116 | res = self.n_least_frequent(1)
117 | self._least_frequent = res[0][0]
118 | self._least_frequent_count = res[0][1]
119 | return self._least_frequent, self._least_frequent_count
120 |
121 | @h.memoize
122 | def n_most_frequent(self, limit=10):
123 | res = self._qexec("%s, count(*) as __cnt" % self.name(), group="%s" % self.name(),
124 | order="__cnt DESC LIMIT %d" % limit)
125 | return res
126 |
127 | @h.memoize
128 | def n_least_frequent(self, limit=10):
129 | res = self._qexec("%s, count(*) as cnt" % self.name(), group="%s" % self.name(),
130 | order="cnt ASC LIMIT %d" % limit)
131 | return res
132 |
133 | def size(self):
134 | return self._table.size()
135 |
136 | def uniqueness(self):
137 | return self.dcount() / float(self.size())
138 |
139 | def constancy(self):
140 | tup = self.most_frequent()
141 | return tup[1] / float(self.size())
142 |
143 | def _repr_html_(self):
144 |
145 | funs = [("Min", self.min), ("Max", self.max), ("#Distinct Values", self.dcount),
146 | ("Most Frequent", lambda: "{0} ({1})".format(*self.most_frequent())),
147 | ("Least Frequent", lambda: "{0} ({1})".format(*self.least_frequent())),
148 | ("Top 10 MF", lambda: ",".join(map(str, h.car(self.n_most_frequent())))),
149 | ("Top 10 LF", lambda: ", ".join(map(str, h.car(self.n_least_frequent())))),
150 | ("Uniqueness", self.uniqueness),
151 | ("Constancy", self.constancy),
152 | ]
153 | return h.render_table(["Name", "Value"], [[x[0], x[1]()] for x in funs])
154 |
155 |
156 | class Table:
157 | """
158 | Generic Table Object
159 |
160 | This class provides simple access to the columns of the table. Most of the methods that perform actual data access
161 | are cached to avoid costly lookups.
162 |
163 |
164 | """
165 |
166 | def __init__(self, name, con, db="default"):
167 | self._cols = []
168 | self._db = db
169 | self._name = name
170 | self._connection = con
171 |
172 | def name(self):
173 | """
174 | :return: name of the table
175 | """
176 | return self._name
177 |
178 | def db(self):
179 | """
180 | :return: name of the database used
181 | """
182 | return self._db
183 |
184 |     def column(self, col):
185 |         """
186 |         Given either a column index or name return the column structure
187 |         :param col: either index or name
188 |         :return: column data structure
189 |         """
190 |         if type(col) is str:
191 |             for c in self.columns():
192 |                 if c.name() == col:
193 |                     return c
194 |         else:
195 |             return self.columns()[col]
196 |
197 | @h.memoize
198 | def __len__(self):
199 | """
200 | :return: number of rows in the table
201 | """
202 | c = self._connection.cursor()
203 | c.execute("select count(*) from `%s`.`%s`" % (self._db, self._name))
204 | self._count = c.fetchall()[0][0]
205 | return self._count
206 |
207 | def size(self):
208 | """
209 | alias to __len__()
210 | :return:
211 | """
212 | return len(self)
213 |
214 | @h.memoize
215 | def columns(self):
216 | """
217 | :return: the list of column in this table
218 | """
219 | c = self._connection.cursor()
220 | c.execute("describe `%s`.`%s`" % (self._db, self._name))
221 | self._cols = []
222 | for col in c.fetchall():
223 | self._cols.append(Column.build(col, table=self, con=self._connection))
224 | return self._cols
225 |
226 | def __getitem__(self, item):
227 | """
228 | Subscript access to the tables by name
229 | :param item:
230 | :return:
231 | """
232 | for x in self.columns():
233 | if x.name() == item:
234 | return x
235 | raise KeyError(item)
236 |
237 | def __dir__(self):
238 | """
239 | :return: an array of custom attributes, for code-completion in ipython
240 | """
241 | return [x.name() for x in self.columns()]
242 |
243 | def __repr__(self):
244 |         return "<%s.%s>" % (self._db, self._name)
245 |
246 | def __getattr__(self, item):
247 | """
248 | :param item: name of the column
249 | :return: column object for attribute-like access to the column
250 | """
251 | for x in self.columns():
252 | if x.name() == item:
253 | return x
254 | raise AttributeError("'%s' object has no attribute '%s'" % (type(self).__name__, item))
255 |
256 | def num_columns(self):
257 | """
258 | :return: number of columns of the table
259 | """
260 | return len(self.columns())
261 |
262 | def distinct_value_fractions(self):
263 | """
264 | :return: returns a data frame of name distinct value fractions
265 | """
266 | return pd.DataFrame([c.dcount() / float(self.size()) for c in self.columns()],
267 | index=[c.name() for c in self.columns()], columns=["fraction"])
268 |
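The frame `distribution()` builds can be reproduced locally from toy `(value, count)` pairs, mirroring the construction in the method above (the pairs and the total are hypothetical stand-ins for the SQL result and `table.size()`):

```python
import pandas as pd

res = [("a", 50), ("b", 30), ("c", 20)]  # toy (value, count) result rows
cnt = float(sum(c for _, c in res))      # stands in for self._table.size()

# Same construction as Column.distribution(): value, count, rank, fraction.
dist = pd.DataFrame(
    [list(r) + [i, r[1] / cnt] for i, r in enumerate(res)],
    columns=["value", "cnt", "r", "fraction"],
)
dist.index = dist.r

print(list(dist["fraction"]))  # [0.5, 0.3, 0.2]
```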
--------------------------------------------------------------------------------
/pyxplorer_stuff.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "metadata": {
3 | "name": "",
4 | "signature": "sha256:4a51afdcc72d6c3700e33a0f3fa5a5570e51f803ccfcaeea43ab06c775c03da7"
5 | },
6 | "nbformat": 3,
7 | "nbformat_minor": 0,
8 | "worksheets": [
9 | {
10 | "cells": [
11 | {
12 | "cell_type": "markdown",
13 | "metadata": {},
14 | "source": [
15 | "#Pyxplorer - interactive data set exploration\n",
16 | "\n",
17 | "The goal of pyxplorer is to provide a simple tool that allows interactive\n",
18 | "profiling of datasets that are accessible via a SQL-like interface. The only\n",
19 | "requirement for running data profiling is that you are able to provide a Python\n",
20 | "DBAPI-like interface to your data source and that the data source is able to\n",
21 | "understand simplistic SQL queries.\n",
22 | "\n",
23 | "I built this piece of software while trying to get a better understanding of\n",
24 | "the data distribution in a massive dataset of several hundred million records.\n",
25 | "Depending on the size of the dataset and the query engine the response time\n",
26 | "can range from seconds (Impala) to minutes (Hive) or even hours (MySQL).\n",
27 | "\n",
28 | "The typical use case is to use `pyxplorer` interactively from an iPython\n",
29 | "Notebook or iPython shell to incrementally extract information about your data."
30 | ]
31 | },
32 | {
33 | "cell_type": "markdown",
34 | "metadata": {},
35 | "source": [
36 | " $> pip install pyxplorer pympala"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {},
42 | "source": [
43 | "Questions, Ideas, Comments:\n",
44 | "\n",
45 | "https://github.com/grundprinzip/pyxplorer"
46 | ]
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "metadata": {},
51 | "source": [
52 | "## Example using Impala\n",
53 | "\n",
54 | "Basically `pyxplorer` works with all DBAPI-like interfaces, but to show the advantages of running high-performance data analysis on large amounts of data we will use Impala to store our data."
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "collapsed": false,
60 | "input": [
61 | "from impala.dbapi import connect\n",
62 | "conn = connect(host='diufpc57', port=21050)"
63 | ],
64 | "language": "python",
65 | "metadata": {},
66 | "outputs": [],
67 | "prompt_number": 2
68 | },
69 | {
70 | "cell_type": "markdown",
71 | "metadata": {},
72 | "source": [
73 | "## Database Operations"
74 | ]
75 | },
76 | {
77 | "cell_type": "markdown",
78 | "metadata": {},
79 | "source": [
80 | "Imagine that you are provided with access to a huge Hive/Impala database on\n",
81 | "your very own Hadoop cluster and you're asked to profile the data to get a\n",
82 | "better understanding before performing more specific data science later on. \n",
83 | "Based on this connection, we can now instantiate a new explorer object."
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "collapsed": false,
89 | "input": [
90 | "import pyxplorer as pxp\n",
91 | "data = pxp.Database(\"tpcc3\", conn)\n",
92 | "data"
93 | ],
94 | "language": "python",
95 | "metadata": {},
96 | "outputs": [
97 | {
98 | "html": [
99 | "<table><tr><th>Name</th><th>Size</th></tr><tr><td>customerp</td><td>30000000</td></tr><tr><td>districtp</td><td>10000</td></tr><tr><td>historyp</td><td>30000000</td></tr><tr><td>itemp</td><td>100000</td></tr><tr><td>new_orderp</td><td>9000000</td></tr><tr><td>oorderp</td><td>30000000</td></tr><tr><td>order_linep</td><td>299991280</td></tr><tr><td>stockp</td><td>100000000</td></tr><tr><td>warehousep</td><td>1000</td></tr></table><p>Rows: 9 / Columns: 2</p>"
100 | ],
101 | "metadata": {},
102 | "output_type": "pyout",
103 | "prompt_number": 4,
104 | "text": [
105 | ""
106 | ]
107 | }
108 | ],
109 | "prompt_number": 4
110 | },
111 | {
112 | "cell_type": "markdown",
113 | "metadata": {},
114 | "source": [
115 | "This simple code gives you access to all the tables in this database. Let's further investigate how many tables and columns exist in the database."
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "collapsed": false,
121 | "input": [
122 | "len(data)"
123 | ],
124 | "language": "python",
125 | "metadata": {},
126 | "outputs": [
127 | {
128 | "metadata": {},
129 | "output_type": "pyout",
130 | "prompt_number": 12,
131 | "text": [
132 | "9"
133 | ]
134 | }
135 | ],
136 | "prompt_number": 12
137 | },
138 | {
139 | "cell_type": "markdown",
140 | "metadata": {},
141 | "source": [
142 | "The above is the idiomatic Python way, but sometimes it might not be as easy to grasp what is meant, so you can use the explicit method as well"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "collapsed": false,
148 | "input": [
149 | "data.num_tables()"
150 | ],
151 | "language": "python",
152 | "metadata": {},
153 | "outputs": [
154 | {
155 | "metadata": {},
156 | "output_type": "pyout",
157 | "prompt_number": 13,
158 | "text": [
159 | "9"
160 | ]
161 | }
162 | ],
163 | "prompt_number": 13
164 | },
165 | {
166 | "cell_type": "markdown",
167 | "metadata": {},
168 | "source": [
169 | "Get the total number of columns:"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "collapsed": false,
175 | "input": [
176 | "sum([len(x.columns()) for x in data.tables()])"
177 | ],
178 | "language": "python",
179 | "metadata": {},
180 | "outputs": [
181 | {
182 | "metadata": {},
183 | "output_type": "pyout",
184 | "prompt_number": 7,
185 | "text": [
186 | "92"
187 | ]
188 | }
189 | ],
190 | "prompt_number": 7
191 | },
192 | {
193 | "cell_type": "markdown",
194 | "metadata": {},
195 | "source": [
196 | "Or we can directly use the number of columns method on the database object"
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "collapsed": false,
202 | "input": [
203 | "data.num_columns()"
204 | ],
205 | "language": "python",
206 | "metadata": {},
207 | "outputs": [
208 | {
209 | "metadata": {},
210 | "output_type": "pyout",
211 | "prompt_number": 9,
212 | "text": [
213 | "92"
214 | ]
215 | }
216 | ],
217 | "prompt_number": 9
218 | },
219 | {
220 | "cell_type": "markdown",
221 | "metadata": {},
222 | "source": [
223 | "It seems like we have a better understanding of the dataset, but how many tuples are we talking about?"
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "collapsed": false,
229 | "input": [
230 | "data.num_tuples()"
231 | ],
232 | "language": "python",
233 | "metadata": {},
234 | "outputs": [
235 | {
236 | "metadata": {},
237 | "output_type": "pyout",
238 | "prompt_number": 15,
239 | "text": [
240 | "499102280"
241 | ]
242 | }
243 | ],
244 | "prompt_number": 15
245 | },
246 | {
247 | "cell_type": "markdown",
248 | "metadata": {},
249 | "source": [
250 | "## Single Table Operations"
251 | ]
252 | },
253 | {
254 | "cell_type": "markdown",
255 | "metadata": {},
256 | "source": [
257 | "Using the above operations, we can perform simple operations on all tables, but let's have a further look at single table operations to extract more information from instances.\n",
258 | "\n",
259 | "In this example, we want to investigate the `order_line` table."
260 | ]
261 | },
262 | {
263 | "cell_type": "code",
264 | "collapsed": false,
265 | "input": [
266 | "tab = data['order_linep']\n",
267 | "tab"
268 | ],
269 | "language": "python",
270 | "metadata": {},
271 | "outputs": [
272 | {
273 | "metadata": {},
274 | "output_type": "pyout",
275 | "prompt_number": 14,
276 | "text": [
277 | ""
278 | ]
279 | }
280 | ],
281 | "prompt_number": 14
282 | },
283 | {
284 | "cell_type": "markdown",
285 | "metadata": {},
286 | "source": [
287 | "Let's start by doing some basic inspection of the table, like extracting the number of rows and the number of columns"
288 | ]
289 | },
290 | {
291 | "cell_type": "code",
292 | "collapsed": false,
293 | "input": [
294 | "tab.size()"
295 | ],
296 | "language": "python",
297 | "metadata": {},
298 | "outputs": [
299 | {
300 | "metadata": {},
301 | "output_type": "pyout",
302 | "prompt_number": 17,
303 | "text": [
304 | "299991280"
305 | ]
306 | }
307 | ],
308 | "prompt_number": 17
309 | },
310 | {
311 | "cell_type": "code",
312 | "collapsed": false,
313 | "input": [
314 | "len(tab.columns())"
315 | ],
316 | "language": "python",
317 | "metadata": {},
318 | "outputs": [
319 | {
320 | "metadata": {},
321 | "output_type": "pyout",
322 | "prompt_number": 18,
323 | "text": [
324 | "10"
325 | ]
326 | }
327 | ],
328 | "prompt_number": 18
329 | },
330 | {
331 | "cell_type": "code",
332 | "collapsed": false,
333 | "input": [
334 | "tab.columns()"
335 | ],
336 | "language": "python",
337 | "metadata": {},
338 | "outputs": [
339 | {
340 | "metadata": {},
341 | "output_type": "pyout",
342 | "prompt_number": 19,
343 | "text": [
344 | "[ol_w_id,\n",
345 | " ol_d_id,\n",
346 | " ol_o_id,\n",
347 | " ol_number,\n",
348 | " ol_i_id,\n",
349 | " ol_delivery_d,\n",
350 | " ol_amount,\n",
351 | " ol_supply_w_id,\n",
352 | " ol_quantity,\n",
353 | " ol_dist_info]"
354 | ]
355 | }
356 | ],
357 | "prompt_number": 19
358 | },
359 | {
360 | "cell_type": "markdown",
361 | "metadata": {},
362 | "source": [
363 | "Columns are special objects that can be easily and interactively inspected in iPython Notebooks. The default information per column is the `min` and `max` value, the most and least frequent value, and the total number of distinct values. Based on these measures we provide information about the column.\n",
364 | "\n",
365 | "$uniqueness = \\frac{distinct}{rows}$\n",
366 | "\n",
367 | "$constancy = \\frac{count_{mf}}{rows}$"
368 | ]
369 | },
370 | {
371 | "cell_type": "code",
372 | "collapsed": false,
373 | "input": [
374 | "tab['ol_w_id']"
375 | ],
376 | "language": "python",
377 | "metadata": {},
378 | "outputs": [
379 | {
380 | "html": [
381 | "