"""Plotting and summary-statistic helpers for the exploratory data analysis notebook."""
# Repository layout this module belongs to:
#   ├── Data_Cortex_Nuclear.xls
#   ├── Exploratory Data Analysis in Python.ipynb
#   └── helpers.py
# Data_Cortex_Nuclear.xls is available at:
#   https://raw.githubusercontent.com/nathan-rice/exploratory-data-analysis/75585339d117ae37891f124f07d6c6b75b504220/Data_Cortex_Nuclear.xls

import json
from collections import Counter
from itertools import combinations, product, zip_longest

import numpy as np
import pandas as pd
from bokeh.models import (
    AdaptiveTicker,
    ColorBar,
    FixedTicker,
    FuncTickFormatter,
    HoverTool,
    LinearColorMapper,
)
from bokeh.plotting import ColumnDataSource, figure
from matplotlib import pyplot as plt
from scipy.stats import gaussian_kde, kurtosis, skew

bar_color = "cornflowerblue"
colors = ["#ADD8E6", "#9AC7E7", "#88B6E9", "#76A5EB", "#6495ED", "#647CD8", "#6564C3", "#654BAE", "#663399"]


def scatter_with_hover(df, x, y,
                       fig=None, cols=None, name=None, marker='x',
                       fig_width=500, fig_height=500, **kwargs):
    """
    Plots an interactive scatter plot of `x` vs `y` using bokeh, with automatic
    tooltips showing columns from `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame containing the data to be plotted
    x : str
        Name of the column to use for the x-axis values
    y : str
        Name of the column to use for the y-axis values
    fig : bokeh.plotting.Figure, optional
        Figure on which to plot (if not given then a new figure will be created)
    cols : list of str
        Columns to show in the hover tooltip (default is to show all)
    name : str
        Bokeh series name to give to the scattered data (default is 'main')
    marker : str
        Name of marker to use for scatter plot
    fig_width : int
        Width of the figure; only used when `fig` is not given
    fig_height : int
        Height of the figure; only used when `fig` is not given
    **kwargs
        Any further arguments to be passed to fig.scatter

    Returns
    -------
    bokeh.plotting.Figure
        Figure (the same as given, or the newly created figure)

    Example
    -------
    fig = scatter_with_hover(df, 'A', 'B')
    show(fig)

    fig = scatter_with_hover(df, 'A', 'B', cols=['C', 'D', 'E'], marker='x', color='red')
    show(fig)

    Author
    ------
    Robin Wilson
    with thanks to Max Albert for original code example
    """
    # If we haven't been given a Figure obj then create it with default size etc.
    if fig is None:
        fig = figure(width=fig_width, height=fig_height, tools=['box_zoom', 'reset', 'save'])

    # We're getting data from the given dataframe
    source = ColumnDataSource(data=df)

    # We need a name so that we can restrict hover tools to just this
    # particular 'series' on the plot.
    if name is None:
        name = 'main'

    # Actually do the scatter plot (other keyword arguments are forwarded)
    fig.scatter(x=x, y=y, source=source, name=name, marker=marker, **kwargs)

    # The hover tool is only active on the series plotted above
    hover = HoverTool(names=[name])

    # Braces keep the field reference valid for column names that contain
    # spaces or other characters, and formatting (rather than concatenation)
    # also copes with non-string column names.
    tooltip_columns = df.columns if cols is None else cols
    hover.tooltips = [(str(c), "@{%s}" % (c,)) for c in tooltip_columns]

    fig.add_tools(hover)

    return fig


def block_heatmap(df, height=600, width=900):
    """
    Generates a block heatmap with one coloured cell per (column, row) pair of `df`.

    Cell colours come from a linear colour mapper spanning 0 to 1 over the
    module-level `colors` palette.

    :param df:
        The Pandas DataFrame to render in block-heatmap style. Columns are drawn
        along the x axis ("gene") and index entries along the y axis ("sample").
    :param height:
        Height of the figure.
    :param width:
        Width of the figure.
    :return:
        A Bokeh block heatmap figure modeled after example code.
    """
    mapper = LinearColorMapper(palette=colors, low=0, high=1)
    column_labels = {i: str(c) for (i, c) in enumerate(df.columns)}
    row_labels = {i: str(r) for (i, r) in enumerate(df.index)}

    # Build the long-form table from records rather than through np.array:
    # a single array of mixed ints, labels and floats is coerced to strings,
    # which breaks both the numeric coordinates and the colour mapping.
    values = df.values
    records = [
        (x, y, c, r, values[y, x])
        for ((x, c), (y, r)) in product(enumerate(df.columns), enumerate(df.index))
    ]
    combination_df = pd.DataFrame(records, columns=["gene_id", "sample_id", "gene", "sample", "value"])
    source = ColumnDataSource(combination_df)

    fig = figure(title="Clustered Heatmap", toolbar_location="below", x_range=(0, len(df.columns)),
                 y_range=(0, len(df.index)), tools=["box_zoom", "pan", "reset", "save"], name="heatmap",
                 x_axis_location="above", width=width, height=height, active_drag="box_zoom")
    # The renderer carries the name so that the hover tool below, which filters
    # renderers by name, actually attaches to the cells.
    fig.rect(x="gene_id", y="sample_id", source=source, width=1, height=1, name="heatmap",
             fill_color={'field': 'value', 'transform': mapper}, line_color=None)

    fig.grid.grid_line_color = None
    fig.axis.axis_line_color = None
    fig.axis.major_tick_line_color = None
    fig.axis.major_label_text_font_size = "7pt"
    fig.axis.major_label_standoff = 0
    fig.xaxis.major_label_orientation = np.pi / 3

    # Tick positions are integer offsets; translate them back to labels in JS.
    # json.dumps guarantees a valid JavaScript object literal for any label.
    tick_formatter_code = """
        var labels = %s;
        return labels[tick] || '';
    """
    fig.yaxis.formatter = FuncTickFormatter(code=tick_formatter_code % json.dumps(row_labels))
    fig.xaxis.formatter = FuncTickFormatter(code=tick_formatter_code % json.dumps(column_labels))

    fig.yaxis.ticker = FixedTicker(ticks=list(row_labels.keys()))
    fig.xaxis.ticker = AdaptiveTicker(mantissas=list(range(10)), min_interval=1, max_interval=5)

    hover = HoverTool(names=["heatmap"])
    hover.tooltips = [
        ('gene', '@gene'),
        ('sample', '@sample'),
        ('percentile', '@value%')
    ]
    fig.add_tools(hover)

    return fig


def _plot_density(d, a):
    """Draw a normalized histogram of `d` on axes `a`, overlaid with KDE-based PDF and CDF curves."""
    if d is None:
        # No data for this grid cell: leave it blank.
        a.axis("off")
        return
    a.hist(d, density=True, color=bar_color, label=None)
    de = gaussian_kde(d)
    lower, upper = d.min(), d.max()
    x = pd.Series(np.linspace(lower, upper, 100))
    interpolated_y = de(x)
    # The CDF is scaled by the peak of the PDF so both curves share one y axis.
    cumulative = x.apply(lambda v: de.integrate_box_1d(lower, v)) * interpolated_y.max()
    a.plot(x, interpolated_y, linestyle='--', color="rebeccapurple", label="PDF")
    a.plot(x, cumulative, linestyle='--', color="dimgray", label="CDF")
    a.fill_between(x, interpolated_y, interpolate=True, color="rebeccapurple", alpha=0.35, zorder=10)
    a.fill_between(x, cumulative, interpolate=True, color="dimgray", alpha=0.125, zorder=15)
    a.set_xlim([x.min(), x.max()])

    a.yaxis.set_ticks_position('none')
    a.yaxis.set_ticklabels([])


def plot_histogram(*data, title=None, columns=3):
    """
    Plots one density histogram per data set, arranged in a grid.

    :param data:
        One or more data sets; each must provide `min()` and `max()` (for
        example a pandas Series or a numpy array). A `None` entry leaves its
        grid cell blank.
    :param title:
        For a single data set, the title of the plot. For several data sets,
        either an iterable with one title per plot or a single string used as
        the figure-level title.
    :param columns:
        Maximum number of plots per row.
    :return:
        The matplotlib figure.
    :raises ValueError:
        If no data set is given.
    """
    if not data:
        raise ValueError("plot_histogram requires at least one data set")

    columns = max(1, min(columns, len(data)))
    rows = int(np.ceil(len(data) / columns))

    # squeeze=False always yields a 2-D array of axes, so a single column of
    # several plots is handled the same way as any other grid.
    fig, axes = plt.subplots(rows, columns, squeeze=False)
    flat_axes = axes.flatten()

    if len(data) == 1:
        ax = flat_axes[0]
        _plot_density(data[0], ax)
        if title:
            ax.set_title(title)
        ax.set_ylabel("Density")
        ax.legend()
    else:
        # zip_longest pads the data with None so unused grid cells are blanked.
        for d, a in zip_longest(data, flat_axes):
            _plot_density(d, a)
        if title:
            if isinstance(title, str):
                # A lone string would otherwise be split into characters.
                fig.suptitle(title)
            else:
                for t, a in zip(title, flat_axes):
                    a.set_title(t)

    fig.tight_layout()
    return fig


def counter_histogram(labels):
    """
    Plots a bar chart of how often each label occurs, annotating each bar with its count.

    :param labels:
        Iterable of hashable labels; each distinct label must be convertible
        with `int()`, which determines the x position of its bar.
    :return:
        The matplotlib figure (empty if `labels` is empty).
    """
    counts = Counter(labels)
    fig, ax = plt.subplots()
    if not counts:
        return fig

    # Use the integer positions consistently for bars, ticks and annotations.
    positions = {key: int(key) for key in counts}
    ax.bar(list(positions.values()), list(counts.values()), color=bar_color)
    ax.set_xticks(sorted(set(positions.values())))

    k_range = max(positions.values()) - min(positions.values())
    max_v = max(counts.values())

    for key, v in counts.items():
        k = positions[key]
        # Nudge the text slightly left of and above the top of the bar.
        ax.annotate(str(v), (k - k_range * 0.0125, v + max_v * 0.01))

    return fig


def add_dummy(dataframe, column_name):
    """
    Returns a copy of `dataframe` with indicator columns for `column_name` appended.

    The new columns are produced by `pandas.get_dummies` with the prefix
    "dummy_" + column_name.

    :param dataframe:
        The source DataFrame (not modified).
    :param column_name:
        Name of the column to expand into indicator columns.
    :return:
        A new DataFrame holding the original columns followed by the dummies.
    """
    dummies = pd.get_dummies(dataframe[column_name], prefix="dummy_" + column_name)
    return pd.concat([dataframe, dummies], axis=1)


def filtered_combinations(columns, include_dummies=True, combine_dummies=False):
    """
    Yields the pairs of column names worth comparing against each other.

    Columns whose name starts with "dummy_" are treated as dummy columns.

    :param columns:
        Iterable of column names (strings).
    :param include_dummies:
        When False, any pair involving a dummy column is dropped.
    :param combine_dummies:
        When True, a pair of two dummy columns is kept if the second
        underscore-separated token of their names differs. When False, every
        dummy/dummy pair is dropped.
        NOTE(review): the token comparison presumably identifies the source
        column, which only holds if that column's name has no underscore.
    :return:
        A lazy iterator over 2-tuples of column names.
    """
    def keep_pair(pair):
        a, b = pair
        a_dummy = a.startswith("dummy_")
        b_dummy = b.startswith("dummy_")
        if not include_dummies and (a_dummy or b_dummy):
            return False
        if a_dummy and b_dummy:
            return bool(combine_dummies) and a.split("_")[1] != b.split("_")[1]
        return True

    # combinations() requires the group size; pairs are what keep_pair unpacks.
    return filter(keep_pair, combinations(columns, 2))


def generate_moment_statistics(data):
    """
    Computes the skewness and kurtosis of `data` using scipy.stats.

    :param data:
        Array-like data accepted by `scipy.stats.skew` and `scipy.stats.kurtosis`.
    :return:
        A tuple `(skew, kurtosis)`, both computed with scipy's default arguments.
    """
    data_skew = skew(data)
    data_kurtosis = kurtosis(data)
    return data_skew, data_kurtosis