├── Data_Cortex_Nuclear.xls
├── Exploratory Data Analysis in Python.ipynb
└── helpers.py


/Data_Cortex_Nuclear.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nathan-rice/exploratory-data-analysis/75585339d117ae37891f124f07d6c6b75b504220/Data_Cortex_Nuclear.xls


--------------------------------------------------------------------------------
/helpers.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | from matplotlib import pyplot as plt
  4 | from bokeh.plotting import figure, ColumnDataSource
  5 | from bokeh.models import HoverTool, LinearColorMapper, ColorBar, FuncTickFormatter, FixedTicker, AdaptiveTicker
  6 | from itertools import combinations, product, zip_longest
  7 | from scipy.stats import skew, kurtosis, gaussian_kde
  8 | from collections import Counter
  9 | 
# Fill colour used for the bars drawn by the matplotlib helpers below
# (plot_histogram and counter_histogram).
bar_color = "cornflowerblue"
# Nine-step palette running from light blue to purple; block_heatmap builds an
# identical list locally for its colour mapper.
colors = ["#ADD8E6", "#9AC7E7", "#88B6E9", "#76A5EB", "#6495ED", "#647CD8", "#6564C3", "#654BAE", "#663399"]
 12 | 
 13 | def scatter_with_hover(df, x, y,
 14 |                        fig=None, cols=None, name=None, marker='x',
 15 |                        fig_width=500, fig_height=500, **kwargs):
 16 |     """
 17 |     Plots an interactive scatter plot of `x` vs `y` using bokeh, with automatic
 18 |     tooltips showing columns from `df`.
 19 | 
 20 |     Parameters
 21 |     ----------
 22 |     df : pandas.DataFrame
 23 |         DataFrame containing the data to be plotted
 24 |     x : str
 25 |         Name of the column to use for the x-axis values
 26 |     y : str
 27 |         Name of the column to use for the y-axis values
 28 |     fig : bokeh.plotting.Figure, optional
 29 |         Figure on which to plot (if not given then a new figure will be created)
 30 |     cols : list of str
 31 |         Columns to show in the hover tooltip (default is to show all)
 32 |     name : str
 33 |         Bokeh series name to give to the scattered data
 34 |     marker : str
 35 |         Name of marker to use for scatter plot
 36 |     **kwargs
 37 |         Any further arguments to be passed to fig.scatter
 38 | 
 39 |     Returns
 40 |     -------
 41 |     bokeh.plotting.Figure
 42 |         Figure (the same as given, or the newly created figure)
 43 | 
 44 |     Example
 45 |     -------
 46 |     fig = scatter_with_hover(df, 'A', 'B')
 47 |     show(fig)
 48 | 
 49 |     fig = scatter_with_hover(df, 'A', 'B', cols=['C', 'D', 'E'], marker='x', color='red')
 50 |     show(fig)
 51 | 
 52 |     Author
 53 |     ------
 54 |     Robin Wilson <robin@rtwilson.com>
 55 |     with thanks to Max Albert for original code example
 56 |     """
 57 | 
 58 |     # If we haven't been given a Figure obj then create it with default
 59 |     # size etc.
 60 |     if fig is None:
 61 |         fig = figure(width=fig_width, height=fig_height, tools=['box_zoom', 'reset', 'save'])
 62 | 
 63 |     # We're getting data from the given dataframe
 64 |     source = ColumnDataSource(data=df)
 65 | 
 66 |     # We need a name so that we can restrict hover tools to just this
 67 |     # particular 'series' on the plot. You can specify it (in case it
 68 |     # needs to be something specific for other reasons), otherwise
 69 |     # we just use 'main'
 70 |     if name is None:
 71 |         name = 'main'
 72 | 
 73 |     # Actually do the scatter plot - the easy bit
 74 |     # (other keyword arguments will be passed to this function)
 75 |     fig.scatter(x=x, y=y, source=source, name=name, marker=marker, **kwargs)
 76 | 
 77 |     # Now we create the hover tool, and make sure it is only active with
 78 |     # the series we plotted in the previous line
 79 |     hover = HoverTool(names=[name])
 80 | 
 81 |     if cols is None:
 82 |         # Display *all* columns in the tooltips
 83 |         hover.tooltips = [(c, '@' + c) for c in df.columns]
 84 |     else:
 85 |         # Display just the given columns in the tooltips
 86 |         hover.tooltips = [(c, '@' + c) for c in cols]
 87 | 
 88 |     # Finally add/enable the tool
 89 |     fig.add_tools(hover)
 90 | 
 91 |     return fig
 92 | 
 93 | 
 94 | def block_heatmap(df, height=600, width=900):
 95 |     """
 96 |     Generates a
 97 | 
 98 | 
 99 | 
100 |     :param df:
101 |         The Pandas DataFrame to render in block-heatmap style.
102 |     :return:
103 |         A Bokeh block heatmap figure modeled after example code.  The figure has additional properties, df for
104 |         the plot data, and rect for the plot object.
105 |     """
106 |     # this colormap blatantly copied from the New York Times.
107 |     colors = ["#ADD8E6", "#9AC7E7", "#88B6E9", "#76A5EB", "#6495ED", "#647CD8", "#6564C3", "#654BAE", "#663399"]
108 |     mapper = LinearColorMapper(palette=colors, low=0, high=1)
109 |     cols = {i: c for (i, c) in enumerate(df.columns)}
110 |     index = {i: r for (i, r) in enumerate(df.index)}
111 |     cols_by_rows = product(enumerate(df.columns), enumerate(df.index))
112 |     data = np.array([[x, y, c, r, df.loc[r, c]] for ((x, c), (y, r)) in cols_by_rows])
113 |     combination_df = pd.DataFrame(data, columns=["gene_id", "sample_id", "gene", "sample", "value"])
114 |     source = ColumnDataSource(combination_df)
115 | 
116 |     fig = figure(title="Clustered Heatmap", toolbar_location="below", x_range=(0, len(df.columns)),
117 |                  y_range=(0, len(df.index)), tools=["box_zoom", "pan", "reset", "save"], name="heatmap",
118 |                  x_axis_location="above", plot_width=width, plot_height=height, active_drag="box_zoom")
119 |     fig.rect(x="gene_id", y="sample_id", source=source, width=1, height=1,
120 |              fill_color={'field': 'value', 'transform': mapper}, line_color=None)
121 | 
122 |     fig.grid.grid_line_color = None
123 |     fig.axis.axis_line_color = None
124 |     fig.axis.major_tick_line_color = None
125 |     fig.axis.major_label_text_font_size = "7pt"
126 |     fig.axis.major_label_standoff = 0
127 |     fig.xaxis.major_label_orientation = np.pi / 3
128 | 
129 |     fig.yaxis.formatter = FuncTickFormatter(code="""
130 |         var labels = %s;
131 |         return labels[tick] || '';
132 |     """ % index)
133 | 
134 |     fig.xaxis.formatter = FuncTickFormatter(code="""
135 |         var labels = %s;
136 |         return labels[tick] || '';
137 |     """ % cols)
138 | 
139 |     fig.yaxis.ticker = FixedTicker(ticks=list(index.keys()))
140 |     fig.xaxis.ticker = AdaptiveTicker(mantissas=list(range(10)), min_interval=1, max_interval=5)
141 | 
142 |     hover = HoverTool(names=["heatmap"])
143 |     hover.tooltips = [
144 |         ('gene', '@gene'),
145 |         ('sample', '@sample'),
146 |         ('percentile', '@value%')
147 |     ]
148 |     fig.add_tools(hover)
149 | 
150 |     return fig
151 | 
152 | 
def plot_histogram(*data, title=None, columns=3):
    """
    Plot a density histogram with overlaid KDE curves for each dataset.

    Every dataset gets its own subplot containing a normalised histogram, a
    Gaussian-KDE estimate of the PDF, and the KDE's cumulative integral
    rescaled to the height of the PDF peak so both curves share one axis.

    :param data:
        One or more 1-D datasets. Each must provide ``min()`` and ``max()``
        (for example a pandas Series or numpy array). A ``None`` entry leaves
        its subplot blank.
    :param title:
        For a single dataset, the title string of the plot. For several
        datasets, an iterable of titles matched to the subplots in order.
    :param columns:
        Maximum number of subplot columns; reduced to the number of datasets
        when fewer are given.
    :return:
        The matplotlib figure holding the subplot(s).
    :raises ValueError:
        If no dataset is given.
    """
    if not data:
        raise ValueError("plot_histogram requires at least one dataset")

    def plot_data(d, a):
        # A missing dataset only blanks out its (padding) subplot.
        if d is None:
            a.axis("off")
            return
        # `density` replaces the `normed` keyword, which newer matplotlib
        # releases no longer accept.
        a.hist(d, density=True, color=bar_color, label=None)
        de = gaussian_kde(d)
        lower = d.min()
        x = pd.Series(np.linspace(lower, d.max(), 100))
        interpolated_y = de(x)
        # CDF from the KDE, scaled to the PDF peak so both fit the same y-axis.
        cumulative = x.apply(lambda v: de.integrate_box_1d(lower, v)) * interpolated_y.max()
        a.plot(x, interpolated_y, linestyle='--', color="rebeccapurple", label="PDF")
        a.plot(x, cumulative, linestyle='--', color="dimgray", label="CDF")
        a.fill_between(x, interpolated_y, interpolate=True, color="rebeccapurple", alpha=0.35, zorder=10)
        a.fill_between(x, cumulative, interpolate=True, color="dimgray", alpha=0.125, zorder=15)
        a.set_xlim([x.min(), x.max()])

        # The y scale is not meaningful once the CDF has been rescaled.
        a.yaxis.set_ticks_position('none')
        a.yaxis.set_ticklabels([])

    # Never use more columns than datasets, and never fewer than one.
    columns = max(1, min(columns, len(data)))
    rows = int(np.ceil(len(data) / columns))

    # squeeze=False always yields a 2-D array of axes, so a single column
    # holding several rows is handled like any other grid.
    fig, axes = plt.subplots(rows, columns, squeeze=False)
    flat_axes = axes.flatten()

    if len(data) == 1:
        only_axes = flat_axes[0]
        plot_data(data[0], only_axes)
        if title:
            only_axes.set_title(title)
        only_axes.set_ylabel("Density")
        only_axes.legend()
    else:
        # zip_longest pads the datasets with None so leftover grid cells are hidden.
        for d, a in zip_longest(data, flat_axes):
            plot_data(d, a)
        if title:
            for t, a in zip(title, flat_axes):
                a.set_title(t)

    fig.tight_layout()
    return fig
195 | 
196 | 
def counter_histogram(labels):
    """
    Draw a bar chart of how often each integer label occurs.

    Each label is converted with ``int()`` before counting, so labels that
    are numeric strings or numpy integers are positioned consistently. The
    count is written just above each bar.

    :param labels:
        Iterable of labels, each convertible to ``int``.
    :return:
        The matplotlib figure containing the bar chart. An empty figure is
        returned when `labels` is empty.
    """
    # Count on the int-converted labels so the bar positions, range and
    # annotation offsets all use the same numeric keys.
    counts = Counter(int(label) for label in labels)
    fig, ax = plt.subplots()
    if not counts:
        return fig

    keys = sorted(counts)
    heights = [counts[k] for k in keys]
    ax.bar(keys, heights, color=bar_color)
    ax.set_xticks(keys)

    k_range = keys[-1] - keys[0]
    max_v = max(heights)

    # Nudge each annotation slightly left of the bar centre and just above its top.
    for k, v in zip(keys, heights):
        ax.annotate(str(v), (k - k_range * 0.0125, v + max_v * 0.01))

    return fig
212 | 
213 | 
def add_dummy(dataframe, column_name):
    """
    Return a copy of `dataframe` extended with indicator columns for one column.

    The indicator columns come from ``pd.get_dummies`` and are named with the
    prefix ``"dummy_" + column_name``. The input frame is not modified and the
    source column is kept.

    :param dataframe:
        The Pandas DataFrame to extend.
    :param column_name:
        Name of the column to encode as indicator columns.
    :return:
        A new DataFrame: the original columns followed by the indicator columns.
    """
    indicator_prefix = "dummy_" + column_name
    indicators = pd.get_dummies(dataframe[column_name], prefix=indicator_prefix)
    return pd.concat([dataframe, indicators], axis=1)
217 | 
218 | 
def filtered_combinations(columns, include_dummies=True, combine_dummies=False):
    """
    Lazily yield the 2-element combinations of `columns`, filtering dummy columns.

    A column counts as a dummy when its name is a string starting with
    ``"dummy_"`` (the prefix used by ``add_dummy``).

    :param columns:
        Iterable of column names.
    :param include_dummies:
        When False, every pair containing a dummy column is dropped.
    :param combine_dummies:
        Controls pairs in which *both* columns are dummies. When False
        (default) such pairs are always dropped. When True they are kept only
        if the token after ``"dummy_"`` differs, i.e. the two dummies appear
        to come from different source columns.
        NOTE(review): only the first underscore-separated token of the source
        column name is compared, so source names that themselves contain
        underscores may be treated as the same column.
    :return:
        An iterator of ``(a, b)`` tuples in ``itertools.combinations`` order.
    """
    def is_dummy(column):
        # Non-string column names (e.g. integers) can never be dummies.
        return isinstance(column, str) and column.startswith("dummy_")

    def filter_if_dummies(pair):
        a, b = pair
        a_dummy = is_dummy(a)
        b_dummy = is_dummy(b)
        if not include_dummies and (a_dummy or b_dummy):
            return False
        if a_dummy and b_dummy:
            # Two dummies are only paired across different source columns,
            # and only when the caller opted in.
            return bool(combine_dummies) and a.split("_")[1] != b.split("_")[1]
        return True

    # combinations() requires the group size; pairs are what the filter unpacks.
    return filter(filter_if_dummies, combinations(columns, 2))
236 | 
237 | 
def generate_moment_statistics(data):
    """
    Compute the skewness and kurtosis of `data`.

    Both statistics are computed with the scipy.stats defaults; for
    ``kurtosis`` that means the Fisher (excess) definition, under which a
    normal distribution scores 0.

    :param data:
        Array-like of numeric observations.
    :return:
        A ``(skew, kurtosis)`` tuple.
    """
    data_skew = skew(data)
    data_kurtosis = kurtosis(data)
    # Return the results instead of silently discarding them.
    return data_skew, data_kurtosis
241 | 


--------------------------------------------------------------------------------