56 | Please activate JavaScript to enable the search
57 | functionality.
58 |
59 |
60 |
61 | From here you can search these documents. Enter your search
62 | words into the box below and click "search". Note that the search
63 | function will automatically search for all of the words. Pages
64 | containing fewer words won't appear in the result list.
65 |
')
207 | .appendTo($('#searchbox'));
208 | }
209 | },
210 |
211 | /**
212 | * init the domain index toggle buttons
213 | */
214 | initIndexTable : function() {
215 | var togglers = $('img.toggler').click(function() {
216 | var src = $(this).attr('src');
217 | var idnum = $(this).attr('id').substr(7);
218 | $('tr.cg-' + idnum).toggle();
219 | if (src.substr(-9) == 'minus.png')
220 | $(this).attr('src', src.substr(0, src.length-9) + 'plus.png');
221 | else
222 | $(this).attr('src', src.substr(0, src.length-8) + 'minus.png');
223 | }).css('display', '');
224 | if (DOCUMENTATION_OPTIONS.COLLAPSE_INDEX) {
225 | togglers.click();
226 | }
227 | },
228 |
229 | /**
230 | * helper function to hide the search marks again
231 | */
232 | hideSearchWords : function() {
233 | $('#searchbox .highlight-link').fadeOut(300);
234 | $('span.highlighted').removeClass('highlighted');
235 | },
236 |
237 | /**
238 | * make the url absolute
239 | */
240 | makeURL : function(relativeURL) {
241 | return DOCUMENTATION_OPTIONS.URL_ROOT + '/' + relativeURL;
242 | },
243 |
244 | /**
245 | * get the current relative url
246 | */
247 | getCurrentURL : function() {
248 | var path = document.location.pathname;
249 | var parts = path.split(/\//);
250 | $.each(DOCUMENTATION_OPTIONS.URL_ROOT.split(/\//), function() {
251 | if (this == '..')
252 | parts.pop();
253 | });
254 | var url = parts.join('/');
255 | return path.substring(url.lastIndexOf('/') + 1, path.length - 1);
256 | },
257 |
258 | initOnKeyListeners: function() {
259 | $(document).keyup(function(event) {
260 | var activeElementType = document.activeElement.tagName;
261 | // don't navigate when in search box or textarea
262 | if (activeElementType !== 'TEXTAREA' && activeElementType !== 'INPUT' && activeElementType !== 'SELECT') {
263 | switch (event.keyCode) {
264 | case 37: // left
265 | var prevHref = $('link[rel="prev"]').prop('href');
266 | if (prevHref) {
267 | window.location.href = prevHref;
268 | return false;
269 | }
270 | case 39: // right
271 | var nextHref = $('link[rel="next"]').prop('href');
272 | if (nextHref) {
273 | window.location.href = nextHref;
274 | return false;
275 | }
276 | }
277 | }
278 | });
279 | }
280 | };
281 |
// Shorthand used throughout the generated pages for gettext lookups.
_ = Documentation.gettext;

// Boot the documentation helpers once the DOM is ready
// ($(fn) is jQuery's shorthand for $(document).ready(fn)).
$(function() {
  Documentation.init();
});
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # pyspark_dist_explore
3 | ______________________________
4 |
5 | ## PySpark Dataframe Distribution Explorer
6 |
7 | Pyspark_dist_explore is a plotting library to get quick insights on data in Spark DataFrames through histograms and density plots, where the heavy lifting is done in Spark.
8 |
9 | Pyspark_dist_explore has two ways of working: there are 3 functions to create matplotlib graphs or pandas dataframes easily, and a class (Histogram) to do more advanced explorations while minimizing the amount of computation needed.
10 |
11 | ## Functions:
12 | * **hist(ax, x, \*\*kwargs)**. The *hist* function is almost exactly the same as the matplotlib hist function. See [here](https://matplotlib.org/examples/statistics/histogram_demo_multihist.html) for examples. The only two differences are:
13 | * Instead of being a function of an Axes object, an Axes object is needed as input.
14 | * Instead of having an numpy array, list of arrays, or matrix as input, the function works on Spark DataFrames with a single column, a list of single-column Spark DataFrames, or a SparkDataframe with multiple columns. All other keyword arguments of the [Matplotlib hist](https://matplotlib.org/api/_as_gen/matplotlib.axes.Axes.hist.html) function can be used.
15 |
16 |
17 | * **distplot(ax, x, \*\*kwargs)**. Combines a normalized histogram of each column in x with a density plot of the same column.
18 |
19 | * **pandas_histogram(x, bins=None, range=None)**. Creates histograms for all columns in x and converts this to a Pandas DataFrame
20 |
21 | ## Installing:
22 | Install from PyPi:
23 |
24 | ```pip install pyspark_dist_explore```
25 |
26 | Or directly from github:
27 |
28 | ```
29 | git clone https://github.com/Bergvca/pyspark_dist_explore.git
30 | cd pyspark_dist_explore
31 | pip install .
32 | ```
33 | ### Examples
34 |
35 |
36 |
37 | ```python
38 | import pyspark
39 | import pandas as pd
40 | import numpy as np
41 | import pyspark.sql.functions as F
42 | import matplotlib.pyplot as plt
43 | import seaborn as sns
44 |
45 | from IPython.display import display, HTML, display_html #useful to display wide tables
46 | from pyspark_dist_explore import Histogram, hist, distplot, pandas_histogram
47 | from pyspark.sql import Row
48 |
49 | sc = pyspark.SparkContext()
50 | sqlContext = pyspark.SQLContext(sc)
51 | %matplotlib inline
52 | ```
53 |
54 | ```python
55 | # Create some data in a Spark DataFrame:
56 | n_observations = 200
57 |
58 | random_dist_1 = np.random.logistic(100, 1000, n_observations)
59 | random_dist_2 = np.random.logistic(400, 500, n_observations)
60 | age_dist_1 = 20 * np.random.randn(n_observations) + 40
61 | age_dist_2 = 15 * np.random.randn(n_observations) + 30
62 |
63 | list_male = [('M', rand_value, age_dist_1[i]) for i, rand_value in enumerate(random_dist_1)]
64 | list_female = [('F', rand_value, age_dist_2[i]) for i, rand_value in enumerate(random_dist_2)]
65 |
66 | list_male_female = list_male + list_female
67 |
68 | rdd = sc.parallelize(list_male_female)
69 | transactions = rdd.map(lambda x: Row(gender=x[0], amount=float(x[1]), age=float(x[2])))
70 | transactions_df = sqlContext.createDataFrame(transactions)
71 |
72 | ```
73 |
74 |
75 | ```python
76 | # Create some selections on this data
77 |
78 | filtered_by_gender_m = transactions_df.filter(F.col('gender') == 'M').select(F.col('amount').alias('amount_m'))
79 | filtered_by_gender_f = transactions_df.filter(F.col('gender') == 'F').select(F.col('amount').alias('amount_f') )
80 | filtered_by_age_50_plus = transactions_df.filter(F.col('age') > 50).select(F.col('amount').alias('amount_50_plus'))
81 | filtered_by_age_50_minus = transactions_df.filter(F.col('age') <= 50).select(F.col('amount').alias('amount_50_minus'))
82 |
83 | # Create the plots
84 |
85 | fig, axes = plt.subplots(nrows=2, ncols=2)
86 | fig.set_size_inches(20, 20)
87 |
88 | # Use the hist function to plot histograms on the Axes
89 | hist(axes[0, 0], [filtered_by_gender_m, filtered_by_gender_f], bins = 20, color=['red', 'tan'])
90 | axes[0, 0].set_title('01. Compare Genders')
91 | axes[0, 0].legend()
92 |
93 | hist(axes[0, 1], [filtered_by_age_50_plus, filtered_by_age_50_minus], overlapping=True)
94 | axes[0, 1].set_title('02. Compare Age')
95 | axes[0, 1].legend()
96 |
97 | # Use the distplot function to plot (scaled) histograms + density plots on the Axes
98 | distplot(axes[1, 0], [filtered_by_gender_m, filtered_by_gender_f], bins=20)
99 | axes[1, 0].set_title('03. Compare distribution per gender')
100 | axes[1, 0].legend()
101 |
102 | distplot(axes[1, 1], [filtered_by_age_50_plus, filtered_by_age_50_minus], bins=20, color=['orange', 'green'])
103 | axes[1, 1].set_title('04. Compare distribution per age group')
104 | _ = axes[1, 1].legend()
105 |
106 | ```
107 |
108 |
109 | 
110 |
111 |
112 |
113 | ```python
114 | # Convert Histograms of the 4 datasets to a pandas dataframe
115 |
116 | # Put the outliers in separate bins:
117 | bins = [-6000, -3000] + [bin_range for bin_range in range(-2500, 4000, 500)] + [6000]
118 |
119 |
120 | compare_all_df = pandas_histogram([filtered_by_gender_m,
121 | filtered_by_gender_f,
122 | filtered_by_age_50_plus,
123 | filtered_by_age_50_minus],
124 | bins=bins, range=(-4000, 4000))
125 | display(compare_all_df)
126 | ```
127 |
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
amount_50_minus
136 |
amount_50_plus
137 |
amount_f
138 |
amount_m
139 |
140 |
141 |
142 |
143 |
-6000.00 - -3000.00
144 |
4
145 |
2
146 |
1
147 |
5
148 |
149 |
150 |
-3000.00 - -2500.00
151 |
4
152 |
2
153 |
0
154 |
6
155 |
156 |
157 |
-2500.00 - -2000.00
158 |
11
159 |
3
160 |
1
161 |
13
162 |
163 |
164 |
-2000.00 - -1500.00
165 |
10
166 |
5
167 |
4
168 |
11
169 |
170 |
171 |
-1500.00 - -1000.00
172 |
21
173 |
6
174 |
9
175 |
18
176 |
177 |
178 |
-1000.00 - -500.00
179 |
32
180 |
9
181 |
16
182 |
25
183 |
184 |
185 |
-500.00 - 0.00
186 |
39
187 |
8
188 |
30
189 |
17
190 |
191 |
192 |
0.00 - 500.00
193 |
68
194 |
13
195 |
52
196 |
29
197 |
198 |
199 |
500.00 - 1000.00
200 |
46
201 |
17
202 |
43
203 |
20
204 |
205 |
206 |
1000.00 - 1500.00
207 |
29
208 |
4
209 |
22
210 |
11
211 |
212 |
213 |
1500.00 - 2000.00
214 |
24
215 |
9
216 |
13
217 |
20
218 |
219 |
220 |
2000.00 - 2500.00
221 |
10
222 |
5
223 |
6
224 |
9
225 |
226 |
227 |
2500.00 - 3000.00
228 |
4
229 |
3
230 |
2
231 |
5
232 |
233 |
234 |
3000.00 - 3500.00
235 |
5
236 |
0
237 |
1
238 |
4
239 |
240 |
241 |
3500.00 - 6000.00
242 |
2
243 |
3
244 |
0
245 |
5
246 |
247 |
248 |
249 |
250 |
251 |
252 | ## The Histogram Class
253 |
254 | Next to running the functions as above to get results quickly, the pyspark_dist_explore library contains a Histogram class. The advantage of using this class is that it retains state, so if the histogram is built once, multiple actions can be done without recalculating the bin values.
255 |
256 | ### Examples
257 |
258 |
259 | ```python
260 | age_hist = Histogram(range=(-4000, 4000), bins=15)
261 |
262 | # Create a histogram for different age groups
263 | for age in range(0, 90, 10):
264 | age_hist.add_data(
265 | transactions_df.
266 | filter((F.col('age') > age) & (F.col('age') <= age+10)).
267 | select(F.col('amount').alias('amount_%d_%d' % (age, age+10)))
268 | )
269 |
270 | fig, axes = plt.subplots(nrows=2)
271 | fig.set_size_inches(20, 10)
272 |
273 | age_hist.plot_hist(axes[0], histtype='step', linewidth=2.0, fill=False, cumulative=True) # The Histogram is built here
274 | age_hist.plot_density(axes[1]) # The density plot is created from the already built histogram
275 |
276 | # Set the legends
277 | axes[0].legend(loc = 'upper left' )
278 | axes[0].set_title('Cumulative Histogram')
279 | axes[1].legend()
280 | axes[1].set_title('Kernel Density Plot')
281 |
282 | age_hist_pd_df = age_hist.to_pandas() # Again the histograms don't need to be recalculated.
283 |
284 | # Create a heatmap from the Pandas Dataframe
285 |
286 | fig, axes = plt.subplots()
287 | fig.set_size_inches(10, 10)
288 | ax = sns.heatmap(age_hist_pd_df, annot=True, ax=axes)
289 | _ = ax.set_title('Heatmap')
290 | ```
291 |
292 |
--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
1 |
2 | # pyspark_dist_explore
3 | ______________________________
4 |
5 | ## PySpark Dataframe Distribution Explorer
6 |
7 | Pyspark_dist_explore is a plotting library to get quick insights on data in Spark DataFrames through histograms and density plots, where the heavy lifting is done in Spark.
8 |
9 | Pyspark_dist_explore has two ways of working: there are 3 functions to create matplotlib graphs or pandas dataframes easily, and a class (Histogram) to do more advanced explorations while minimizing the amount of computation needed.
10 |
11 | ## Functions:
12 | * **hist(ax, x, \*\*kwargs)**. The *hist* function is almost exactly the same as the matplotlib hist function. See [here](https://matplotlib.org/examples/statistics/histogram_demo_multihist.html) for examples. The only two differences are:
13 | * Instead of being a function of an Axes object, an Axes object is needed as input.
14 | * Instead of having an numpy array, list of arrays, or matrix as input, the function works on Spark DataFrames with a single column, a list of single-column Spark DataFrames, or a SparkDataframe with multiple columns. All other keyword arguments of the [Matplotlib hist](https://matplotlib.org/api/_as_gen/matplotlib.axes.Axes.hist.html) function can be used.
15 |
16 |
17 | * **distplot(ax, x, \*\*kwargs)**. Combines a normalized histogram of each column in x with a density plot of the same column.
18 |
19 | * **pandas_histogram(x, bins=None, range=None)**. Creates histograms for all columns in x and converts this to a Pandas DataFrame
20 |
21 | ## Installing:
22 | Install from PyPi:
23 |
24 | ```pip install pyspark_dist_explore```
25 |
26 | Or directly from github:
27 |
28 | ```
29 | git clone https://github.com/Bergvca/pyspark_dist_explore.git
30 | cd pyspark_dist_explore
31 | pip install .
32 | ```
33 | ### Examples
34 |
35 |
36 |
37 | ```python
38 | import pyspark
39 | import pandas as pd
40 | import numpy as np
41 | import pyspark.sql.functions as F
42 | import matplotlib.pyplot as plt
43 | import seaborn as sns
44 |
45 | from IPython.display import display, HTML, display_html #useful to display wide tables
46 | from pyspark_dist_explore import Histogram, hist, distplot, pandas_histogram
47 | from pyspark.sql import Row
48 |
49 | sc = pyspark.SparkContext()
50 | sqlContext = pyspark.SQLContext(sc)
51 | %matplotlib inline
52 | ```
53 |
54 | ```python
55 | # Create some data in a Spark DataFrame:
56 | n_observations = 200
57 |
58 | random_dist_1 = np.random.logistic(100, 1000, n_observations)
59 | random_dist_2 = np.random.logistic(400, 500, n_observations)
60 | age_dist_1 = 20 * np.random.randn(n_observations) + 40
61 | age_dist_2 = 15 * np.random.randn(n_observations) + 30
62 |
63 | list_male = [('M', rand_value, age_dist_1[i]) for i, rand_value in enumerate(random_dist_1)]
64 | list_female = [('F', rand_value, age_dist_2[i]) for i, rand_value in enumerate(random_dist_2)]
65 |
66 | list_male_female = list_male + list_female
67 |
68 | rdd = sc.parallelize(list_male_female)
69 | transactions = rdd.map(lambda x: Row(gender=x[0], amount=float(x[1]), age=float(x[2])))
70 | transactions_df = sqlContext.createDataFrame(transactions)
71 |
72 | ```
73 |
74 |
75 | ```python
76 | # Create some selections on this data
77 |
78 | filtered_by_gender_m = transactions_df.filter(F.col('gender') == 'M').select(F.col('amount').alias('amount_m'))
79 | filtered_by_gender_f = transactions_df.filter(F.col('gender') == 'F').select(F.col('amount').alias('amount_f') )
80 | filtered_by_age_50_plus = transactions_df.filter(F.col('age') > 50).select(F.col('amount').alias('amount_50_plus'))
81 | filtered_by_age_50_minus = transactions_df.filter(F.col('age') <= 50).select(F.col('amount').alias('amount_50_minus'))
82 |
83 | # Create the plots
84 |
85 | fig, axes = plt.subplots(nrows=2, ncols=2)
86 | fig.set_size_inches(20, 20)
87 |
88 | # Use the hist function to plot histograms on the Axes
89 | hist(axes[0, 0], [filtered_by_gender_m, filtered_by_gender_f], bins = 20, color=['red', 'tan'])
90 | axes[0, 0].set_title('01. Compare Genders')
91 | axes[0, 0].legend()
92 |
93 | hist(axes[0, 1], [filtered_by_age_50_plus, filtered_by_age_50_minus], overlapping=True)
94 | axes[0, 1].set_title('02. Compare Age')
95 | axes[0, 1].legend()
96 |
97 | # Use the distplot function to plot (scaled) histograms + density plots on the Axes
98 | distplot(axes[1, 0], [filtered_by_gender_m, filtered_by_gender_f], bins=20)
99 | axes[1, 0].set_title('03. Compare distribution per gender')
100 | axes[1, 0].legend()
101 |
102 | distplot(axes[1, 1], [filtered_by_age_50_plus, filtered_by_age_50_minus], bins=20, color=['orange', 'green'])
103 | axes[1, 1].set_title('04. Compare distribution per age group')
104 | _ = axes[1, 1].legend()
105 |
106 | ```
107 |
108 |
109 | 
110 |
111 |
112 |
113 | ```python
114 | # Convert Histograms of the 4 datasets to a pandas dataframe
115 |
116 | # Put the outliers in separate bins:
117 | bins = [-6000, -3000] + [bin_range for bin_range in range(-2500, 4000, 500)] + [6000]
118 |
119 |
120 | compare_all_df = pandas_histogram([filtered_by_gender_m,
121 | filtered_by_gender_f,
122 | filtered_by_age_50_plus,
123 | filtered_by_age_50_minus],
124 | bins=bins, range=(-4000, 4000))
125 | display(compare_all_df)
126 | ```
127 |
128 |
129 |
130 |
143 |
144 |
145 |
146 |
147 |
amount_50_minus
148 |
amount_50_plus
149 |
amount_f
150 |
amount_m
151 |
152 |
153 |
154 |
155 |
-6000.00 - -3000.00
156 |
4
157 |
2
158 |
1
159 |
5
160 |
161 |
162 |
-3000.00 - -2500.00
163 |
4
164 |
2
165 |
0
166 |
6
167 |
168 |
169 |
-2500.00 - -2000.00
170 |
11
171 |
3
172 |
1
173 |
13
174 |
175 |
176 |
-2000.00 - -1500.00
177 |
10
178 |
5
179 |
4
180 |
11
181 |
182 |
183 |
-1500.00 - -1000.00
184 |
21
185 |
6
186 |
9
187 |
18
188 |
189 |
190 |
-1000.00 - -500.00
191 |
32
192 |
9
193 |
16
194 |
25
195 |
196 |
197 |
-500.00 - 0.00
198 |
39
199 |
8
200 |
30
201 |
17
202 |
203 |
204 |
0.00 - 500.00
205 |
68
206 |
13
207 |
52
208 |
29
209 |
210 |
211 |
500.00 - 1000.00
212 |
46
213 |
17
214 |
43
215 |
20
216 |
217 |
218 |
1000.00 - 1500.00
219 |
29
220 |
4
221 |
22
222 |
11
223 |
224 |
225 |
1500.00 - 2000.00
226 |
24
227 |
9
228 |
13
229 |
20
230 |
231 |
232 |
2000.00 - 2500.00
233 |
10
234 |
5
235 |
6
236 |
9
237 |
238 |
239 |
2500.00 - 3000.00
240 |
4
241 |
3
242 |
2
243 |
5
244 |
245 |
246 |
3000.00 - 3500.00
247 |
5
248 |
0
249 |
1
250 |
4
251 |
252 |
253 |
3500.00 - 6000.00
254 |
2
255 |
3
256 |
0
257 |
5
258 |
259 |
260 |
261 |
262 |
263 |
264 | ## The Histogram Class
265 |
266 | Next to running the functions as above to get results quickly, the pyspark_dist_explore library contains a Histogram class. The advantage of using this class is that it retains state, so if the histogram is built once, multiple actions can be done without recalculating the bin values.
267 |
268 | ### Examples
269 |
270 |
271 | ```python
272 | age_hist = Histogram(range=(-4000, 4000), bins=15)
273 |
274 | # Create a histogram for different age groups
275 | for age in range(0, 90, 10):
276 | age_hist.add_data(
277 | transactions_df.
278 | filter((F.col('age') > age) & (F.col('age') <= age+10)).
279 | select(F.col('amount').alias('amount_%d_%d' % (age, age+10)))
280 | )
281 |
282 | fig, axes = plt.subplots(nrows=2)
283 | fig.set_size_inches(20, 10)
284 |
285 | age_hist.plot_hist(axes[0], histtype='step', linewidth=2.0, fill=False, cumulative=True) # The Histogram is built here
286 | age_hist.plot_density(axes[1]) # The density plot is created from the already built histogram
287 |
288 | # Set the legends
289 | axes[0].legend(loc = 'upper left' )
290 | axes[0].set_title('Cumulative Histogram')
291 | axes[1].legend()
292 | axes[1].set_title('Kernel Density Plot')
293 |
294 | age_hist_pd_df = age_hist.to_pandas() # Again the histograms don't need to be recalculated.
295 |
296 | # Create a heatmap from the Pandas Dataframe
297 |
298 | fig, axes = plt.subplots()
299 | fig.set_size_inches(10, 10)
300 | ax = sns.heatmap(age_hist_pd_df, annot=True, ax=axes)
301 | _ = ax.set_title('Heatmap')
302 | ```
303 |
304 |
--------------------------------------------------------------------------------
/docs/build/html/_static/underscore.js:
--------------------------------------------------------------------------------
1 | // Underscore.js 1.3.1
2 | // (c) 2009-2012 Jeremy Ashkenas, DocumentCloud Inc.
3 | // Underscore is freely distributable under the MIT license.
4 | // Portions of Underscore are inspired or borrowed from Prototype,
5 | // Oliver Steele's Functional, and John Resig's Micro-Templating.
6 | // For all details and documentation:
7 | // http://documentcloud.github.com/underscore
8 | (function(){function q(a,c,d){if(a===c)return a!==0||1/a==1/c;if(a==null||c==null)return a===c;if(a._chain)a=a._wrapped;if(c._chain)c=c._wrapped;if(a.isEqual&&b.isFunction(a.isEqual))return a.isEqual(c);if(c.isEqual&&b.isFunction(c.isEqual))return c.isEqual(a);var e=l.call(a);if(e!=l.call(c))return false;switch(e){case "[object String]":return a==String(c);case "[object Number]":return a!=+a?c!=+c:a==0?1/a==1/c:a==+c;case "[object Date]":case "[object Boolean]":return+a==+c;case "[object RegExp]":return a.source==
9 | c.source&&a.global==c.global&&a.multiline==c.multiline&&a.ignoreCase==c.ignoreCase}if(typeof a!="object"||typeof c!="object")return false;for(var f=d.length;f--;)if(d[f]==a)return true;d.push(a);var f=0,g=true;if(e=="[object Array]"){if(f=a.length,g=f==c.length)for(;f--;)if(!(g=f in a==f in c&&q(a[f],c[f],d)))break}else{if("constructor"in a!="constructor"in c||a.constructor!=c.constructor)return false;for(var h in a)if(b.has(a,h)&&(f++,!(g=b.has(c,h)&&q(a[h],c[h],d))))break;if(g){for(h in c)if(b.has(c,
10 | h)&&!f--)break;g=!f}}d.pop();return g}var r=this,G=r._,n={},k=Array.prototype,o=Object.prototype,i=k.slice,H=k.unshift,l=o.toString,I=o.hasOwnProperty,w=k.forEach,x=k.map,y=k.reduce,z=k.reduceRight,A=k.filter,B=k.every,C=k.some,p=k.indexOf,D=k.lastIndexOf,o=Array.isArray,J=Object.keys,s=Function.prototype.bind,b=function(a){return new m(a)};if(typeof exports!=="undefined"){if(typeof module!=="undefined"&&module.exports)exports=module.exports=b;exports._=b}else r._=b;b.VERSION="1.3.1";var j=b.each=
11 | b.forEach=function(a,c,d){if(a!=null)if(w&&a.forEach===w)a.forEach(c,d);else if(a.length===+a.length)for(var e=0,f=a.length;e2;a==
12 | null&&(a=[]);if(y&&a.reduce===y)return e&&(c=b.bind(c,e)),f?a.reduce(c,d):a.reduce(c);j(a,function(a,b,i){f?d=c.call(e,d,a,b,i):(d=a,f=true)});if(!f)throw new TypeError("Reduce of empty array with no initial value");return d};b.reduceRight=b.foldr=function(a,c,d,e){var f=arguments.length>2;a==null&&(a=[]);if(z&&a.reduceRight===z)return e&&(c=b.bind(c,e)),f?a.reduceRight(c,d):a.reduceRight(c);var g=b.toArray(a).reverse();e&&!f&&(c=b.bind(c,e));return f?b.reduce(g,c,d,e):b.reduce(g,c)};b.find=b.detect=
13 | function(a,c,b){var e;E(a,function(a,g,h){if(c.call(b,a,g,h))return e=a,true});return e};b.filter=b.select=function(a,c,b){var e=[];if(a==null)return e;if(A&&a.filter===A)return a.filter(c,b);j(a,function(a,g,h){c.call(b,a,g,h)&&(e[e.length]=a)});return e};b.reject=function(a,c,b){var e=[];if(a==null)return e;j(a,function(a,g,h){c.call(b,a,g,h)||(e[e.length]=a)});return e};b.every=b.all=function(a,c,b){var e=true;if(a==null)return e;if(B&&a.every===B)return a.every(c,b);j(a,function(a,g,h){if(!(e=
14 | e&&c.call(b,a,g,h)))return n});return e};var E=b.some=b.any=function(a,c,d){c||(c=b.identity);var e=false;if(a==null)return e;if(C&&a.some===C)return a.some(c,d);j(a,function(a,b,h){if(e||(e=c.call(d,a,b,h)))return n});return!!e};b.include=b.contains=function(a,c){var b=false;if(a==null)return b;return p&&a.indexOf===p?a.indexOf(c)!=-1:b=E(a,function(a){return a===c})};b.invoke=function(a,c){var d=i.call(arguments,2);return b.map(a,function(a){return(b.isFunction(c)?c||a:a[c]).apply(a,d)})};b.pluck=
15 | function(a,c){return b.map(a,function(a){return a[c]})};b.max=function(a,c,d){if(!c&&b.isArray(a))return Math.max.apply(Math,a);if(!c&&b.isEmpty(a))return-Infinity;var e={computed:-Infinity};j(a,function(a,b,h){b=c?c.call(d,a,b,h):a;b>=e.computed&&(e={value:a,computed:b})});return e.value};b.min=function(a,c,d){if(!c&&b.isArray(a))return Math.min.apply(Math,a);if(!c&&b.isEmpty(a))return Infinity;var e={computed:Infinity};j(a,function(a,b,h){b=c?c.call(d,a,b,h):a;bd?1:0}),"value")};b.groupBy=function(a,c){var d={},e=b.isFunction(c)?c:function(a){return a[c]};j(a,function(a,b){var c=e(a,b);(d[c]||(d[c]=[])).push(a)});return d};b.sortedIndex=function(a,
17 | c,d){d||(d=b.identity);for(var e=0,f=a.length;e>1;d(a[g])=0})})};b.difference=function(a){var c=b.flatten(i.call(arguments,1));return b.filter(a,function(a){return!b.include(c,a)})};b.zip=function(){for(var a=i.call(arguments),c=b.max(b.pluck(a,"length")),d=Array(c),e=0;e=0;d--)b=[a[d].apply(this,b)];return b[0]}};
24 | b.after=function(a,b){return a<=0?b():function(){if(--a<1)return b.apply(this,arguments)}};b.keys=J||function(a){if(a!==Object(a))throw new TypeError("Invalid object");var c=[],d;for(d in a)b.has(a,d)&&(c[c.length]=d);return c};b.values=function(a){return b.map(a,b.identity)};b.functions=b.methods=function(a){var c=[],d;for(d in a)b.isFunction(a[d])&&c.push(d);return c.sort()};b.extend=function(a){j(i.call(arguments,1),function(b){for(var d in b)a[d]=b[d]});return a};b.defaults=function(a){j(i.call(arguments,
25 | 1),function(b){for(var d in b)a[d]==null&&(a[d]=b[d])});return a};b.clone=function(a){return!b.isObject(a)?a:b.isArray(a)?a.slice():b.extend({},a)};b.tap=function(a,b){b(a);return a};b.isEqual=function(a,b){return q(a,b,[])};b.isEmpty=function(a){if(b.isArray(a)||b.isString(a))return a.length===0;for(var c in a)if(b.has(a,c))return false;return true};b.isElement=function(a){return!!(a&&a.nodeType==1)};b.isArray=o||function(a){return l.call(a)=="[object Array]"};b.isObject=function(a){return a===Object(a)};
26 | b.isArguments=function(a){return l.call(a)=="[object Arguments]"};if(!b.isArguments(arguments))b.isArguments=function(a){return!(!a||!b.has(a,"callee"))};b.isFunction=function(a){return l.call(a)=="[object Function]"};b.isString=function(a){return l.call(a)=="[object String]"};b.isNumber=function(a){return l.call(a)=="[object Number]"};b.isNaN=function(a){return a!==a};b.isBoolean=function(a){return a===true||a===false||l.call(a)=="[object Boolean]"};b.isDate=function(a){return l.call(a)=="[object Date]"};
27 | b.isRegExp=function(a){return l.call(a)=="[object RegExp]"};b.isNull=function(a){return a===null};b.isUndefined=function(a){return a===void 0};b.has=function(a,b){return I.call(a,b)};b.noConflict=function(){r._=G;return this};b.identity=function(a){return a};b.times=function(a,b,d){for(var e=0;e/g,">").replace(/"/g,""").replace(/'/g,"'").replace(/\//g,"/")};b.mixin=function(a){j(b.functions(a),
28 | function(c){K(c,b[c]=a[c])})};var L=0;b.uniqueId=function(a){var b=L++;return a?a+b:b};b.templateSettings={evaluate:/<%([\s\S]+?)%>/g,interpolate:/<%=([\s\S]+?)%>/g,escape:/<%-([\s\S]+?)%>/g};var t=/.^/,u=function(a){return a.replace(/\\\\/g,"\\").replace(/\\'/g,"'")};b.template=function(a,c){var d=b.templateSettings,d="var __p=[],print=function(){__p.push.apply(__p,arguments);};with(obj||{}){__p.push('"+a.replace(/\\/g,"\\\\").replace(/'/g,"\\'").replace(d.escape||t,function(a,b){return"',_.escape("+
29 | u(b)+"),'"}).replace(d.interpolate||t,function(a,b){return"',"+u(b)+",'"}).replace(d.evaluate||t,function(a,b){return"');"+u(b).replace(/[\r\n\t]/g," ")+";__p.push('"}).replace(/\r/g,"\\r").replace(/\n/g,"\\n").replace(/\t/g,"\\t")+"');}return __p.join('');",e=new Function("obj","_",d);return c?e(c,b):function(a){return e.call(this,a,b)}};b.chain=function(a){return b(a).chain()};var m=function(a){this._wrapped=a};b.prototype=m.prototype;var v=function(a,c){return c?b(a).chain():a},K=function(a,c){m.prototype[a]=
30 | function(){var a=i.call(arguments);H.call(a,this._wrapped);return v(c.apply(b,a),this._chain)}};b.mixin(b);j("pop,push,reverse,shift,sort,splice,unshift".split(","),function(a){var b=k[a];m.prototype[a]=function(){var d=this._wrapped;b.apply(d,arguments);var e=d.length;(a=="shift"||a=="splice")&&e===0&&delete d[0];return v(d,this._chain)}});j(["concat","join","slice"],function(a){var b=k[a];m.prototype[a]=function(){return v(b.apply(this._wrapped,arguments),this._chain)}});m.prototype.chain=function(){this._chain=
31 | true;return this};m.prototype.value=function(){return this._wrapped}}).call(this);
32 |
--------------------------------------------------------------------------------
/docs/build/html/_static/basic.css:
--------------------------------------------------------------------------------
1 | /*
2 | * basic.css
3 | * ~~~~~~~~~
4 | *
5 | * Sphinx stylesheet -- basic theme.
6 | *
7 | * :copyright: Copyright 2007-2017 by the Sphinx team, see AUTHORS.
8 | * :license: BSD, see LICENSE for details.
9 | *
10 | */
11 |
12 | /* -- main layout ----------------------------------------------------------- */
13 |
14 | div.clearer {
15 | clear: both;
16 | }
17 |
18 | /* -- relbar ---------------------------------------------------------------- */
19 |
20 | div.related {
21 | width: 100%;
22 | font-size: 90%;
23 | }
24 |
25 | div.related h3 {
26 | display: none;
27 | }
28 |
29 | div.related ul {
30 | margin: 0;
31 | padding: 0 0 0 10px;
32 | list-style: none;
33 | }
34 |
35 | div.related li {
36 | display: inline;
37 | }
38 |
39 | div.related li.right {
40 | float: right;
41 | margin-right: 5px;
42 | }
43 |
44 | /* -- sidebar --------------------------------------------------------------- */
45 |
46 | div.sphinxsidebarwrapper {
47 | padding: 10px 5px 0 10px;
48 | }
49 |
50 | div.sphinxsidebar {
51 | float: left;
52 | width: 230px;
53 | margin-left: -100%;
54 | font-size: 90%;
55 | word-wrap: break-word;
56 | overflow-wrap : break-word;
57 | }
58 |
59 | div.sphinxsidebar ul {
60 | list-style: none;
61 | }
62 |
63 | div.sphinxsidebar ul ul,
64 | div.sphinxsidebar ul.want-points {
65 | margin-left: 20px;
66 | list-style: square;
67 | }
68 |
69 | div.sphinxsidebar ul ul {
70 | margin-top: 0;
71 | margin-bottom: 0;
72 | }
73 |
74 | div.sphinxsidebar form {
75 | margin-top: 10px;
76 | }
77 |
78 | div.sphinxsidebar input {
79 | border: 1px solid #98dbcc;
80 | font-family: sans-serif;
81 | font-size: 1em;
82 | }
83 |
84 | div.sphinxsidebar #searchbox input[type="text"] {
85 | width: 170px;
86 | }
87 |
88 | img {
89 | border: 0;
90 | max-width: 100%;
91 | }
92 |
93 | /* -- search page ----------------------------------------------------------- */
94 |
95 | ul.search {
96 | margin: 10px 0 0 20px;
97 | padding: 0;
98 | }
99 |
100 | ul.search li {
101 | padding: 5px 0 5px 20px;
102 | background-image: url(file.png);
103 | background-repeat: no-repeat;
104 | background-position: 0 7px;
105 | }
106 |
107 | ul.search li a {
108 | font-weight: bold;
109 | }
110 |
111 | ul.search li div.context {
112 | color: #888;
113 | margin: 2px 0 0 30px;
114 | text-align: left;
115 | }
116 |
117 | ul.keywordmatches li.goodmatch a {
118 | font-weight: bold;
119 | }
120 |
121 | /* -- index page ------------------------------------------------------------ */
122 |
123 | table.contentstable {
124 | width: 90%;
125 | margin-left: auto;
126 | margin-right: auto;
127 | }
128 |
129 | table.contentstable p.biglink {
130 | line-height: 150%;
131 | }
132 |
133 | a.biglink {
134 | font-size: 1.3em;
135 | }
136 |
137 | span.linkdescr {
138 | font-style: italic;
139 | padding-top: 5px;
140 | font-size: 90%;
141 | }
142 |
143 | /* -- general index --------------------------------------------------------- */
144 |
145 | table.indextable {
146 | width: 100%;
147 | }
148 |
149 | table.indextable td {
150 | text-align: left;
151 | vertical-align: top;
152 | }
153 |
154 | table.indextable ul {
155 | margin-top: 0;
156 | margin-bottom: 0;
157 | list-style-type: none;
158 | }
159 |
160 | table.indextable > tbody > tr > td > ul {
161 | padding-left: 0em;
162 | }
163 |
164 | table.indextable tr.pcap {
165 | height: 10px;
166 | }
167 |
168 | table.indextable tr.cap {
169 | margin-top: 10px;
170 | background-color: #f2f2f2;
171 | }
172 |
173 | img.toggler {
174 | margin-right: 3px;
175 | margin-top: 3px;
176 | cursor: pointer;
177 | }
178 |
179 | div.modindex-jumpbox {
180 | border-top: 1px solid #ddd;
181 | border-bottom: 1px solid #ddd;
182 | margin: 1em 0 1em 0;
183 | padding: 0.4em;
184 | }
185 |
186 | div.genindex-jumpbox {
187 | border-top: 1px solid #ddd;
188 | border-bottom: 1px solid #ddd;
189 | margin: 1em 0 1em 0;
190 | padding: 0.4em;
191 | }
192 |
193 | /* -- domain module index --------------------------------------------------- */
194 |
195 | table.modindextable td {
196 | padding: 2px;
197 | border-collapse: collapse;
198 | }
199 |
200 | /* -- general body styles --------------------------------------------------- */
201 |
202 | div.body p, div.body dd, div.body li, div.body blockquote {
203 | -moz-hyphens: auto;
204 | -ms-hyphens: auto;
205 | -webkit-hyphens: auto;
206 | hyphens: auto;
207 | }
208 |
209 | a.headerlink {
210 | visibility: hidden;
211 | }
212 |
213 | h1:hover > a.headerlink,
214 | h2:hover > a.headerlink,
215 | h3:hover > a.headerlink,
216 | h4:hover > a.headerlink,
217 | h5:hover > a.headerlink,
218 | h6:hover > a.headerlink,
219 | dt:hover > a.headerlink,
220 | caption:hover > a.headerlink,
221 | p.caption:hover > a.headerlink,
222 | div.code-block-caption:hover > a.headerlink {
223 | visibility: visible;
224 | }
225 |
226 | div.body p.caption {
227 | text-align: inherit;
228 | }
229 |
230 | div.body td {
231 | text-align: left;
232 | }
233 |
234 | .first {
235 | margin-top: 0 !important;
236 | }
237 |
238 | p.rubric {
239 | margin-top: 30px;
240 | font-weight: bold;
241 | }
242 |
243 | img.align-left, .figure.align-left, object.align-left {
244 | clear: left;
245 | float: left;
246 | margin-right: 1em;
247 | }
248 |
249 | img.align-right, .figure.align-right, object.align-right {
250 | clear: right;
251 | float: right;
252 | margin-left: 1em;
253 | }
254 |
255 | img.align-center, .figure.align-center, object.align-center {
256 | display: block;
257 | margin-left: auto;
258 | margin-right: auto;
259 | }
260 |
261 | .align-left {
262 | text-align: left;
263 | }
264 |
265 | .align-center {
266 | text-align: center;
267 | }
268 |
269 | .align-right {
270 | text-align: right;
271 | }
272 |
273 | /* -- sidebars -------------------------------------------------------------- */
274 |
275 | div.sidebar {
276 | margin: 0 0 0.5em 1em;
277 | border: 1px solid #ddb;
278 | padding: 7px 7px 0 7px;
279 | background-color: #ffe;
280 | width: 40%;
281 | float: right;
282 | }
283 |
284 | p.sidebar-title {
285 | font-weight: bold;
286 | }
287 |
288 | /* -- topics ---------------------------------------------------------------- */
289 |
290 | div.topic {
291 | border: 1px solid #ccc;
292 | padding: 7px 7px 0 7px;
293 | margin: 10px 0 10px 0;
294 | }
295 |
296 | p.topic-title {
297 | font-size: 1.1em;
298 | font-weight: bold;
299 | margin-top: 10px;
300 | }
301 |
302 | /* -- admonitions ----------------------------------------------------------- */
303 |
304 | div.admonition {
305 | margin-top: 10px;
306 | margin-bottom: 10px;
307 | padding: 7px;
308 | }
309 |
310 | div.admonition dt {
311 | font-weight: bold;
312 | }
313 |
314 | div.admonition dl {
315 | margin-bottom: 0;
316 | }
317 |
318 | p.admonition-title {
319 | margin: 0px 10px 5px 0px;
320 | font-weight: bold;
321 | }
322 |
323 | div.body p.centered {
324 | text-align: center;
325 | margin-top: 25px;
326 | }
327 |
328 | /* -- tables ---------------------------------------------------------------- */
329 |
330 | table.docutils {
331 | border: 0;
332 | border-collapse: collapse;
333 | }
334 |
335 | table caption span.caption-number {
336 | font-style: italic;
337 | }
338 |
339 | table caption span.caption-text {
340 | }
341 |
342 | table.docutils td, table.docutils th {
343 | padding: 1px 8px 1px 5px;
344 | border-top: 0;
345 | border-left: 0;
346 | border-right: 0;
347 | border-bottom: 1px solid #aaa;
348 | }
349 |
350 | table.footnote td, table.footnote th {
351 | border: 0 !important;
352 | }
353 |
354 | th {
355 | text-align: left;
356 | padding-right: 5px;
357 | }
358 |
359 | table.citation {
360 | border-left: solid 1px gray;
361 | margin-left: 1px;
362 | }
363 |
364 | table.citation td {
365 | border-bottom: none;
366 | }
367 |
368 | /* -- figures --------------------------------------------------------------- */
369 |
370 | div.figure {
371 | margin: 0.5em;
372 | padding: 0.5em;
373 | }
374 |
375 | div.figure p.caption {
376 | padding: 0.3em;
377 | }
378 |
379 | div.figure p.caption span.caption-number {
380 | font-style: italic;
381 | }
382 |
383 | div.figure p.caption span.caption-text {
384 | }
385 |
386 | /* -- field list styles ----------------------------------------------------- */
387 |
388 | table.field-list td, table.field-list th {
389 | border: 0 !important;
390 | }
391 |
392 | .field-list ul {
393 | margin: 0;
394 | padding-left: 1em;
395 | }
396 |
397 | .field-list p {
398 | margin: 0;
399 | }
400 |
401 | /* -- other body styles ----------------------------------------------------- */
402 |
403 | ol.arabic {
404 | list-style: decimal;
405 | }
406 |
407 | ol.loweralpha {
408 | list-style: lower-alpha;
409 | }
410 |
411 | ol.upperalpha {
412 | list-style: upper-alpha;
413 | }
414 |
415 | ol.lowerroman {
416 | list-style: lower-roman;
417 | }
418 |
419 | ol.upperroman {
420 | list-style: upper-roman;
421 | }
422 |
423 | dl {
424 | margin-bottom: 15px;
425 | }
426 |
427 | dd p {
428 | margin-top: 0px;
429 | }
430 |
431 | dd ul, dd table {
432 | margin-bottom: 10px;
433 | }
434 |
435 | dd {
436 | margin-top: 3px;
437 | margin-bottom: 10px;
438 | margin-left: 30px;
439 | }
440 |
441 | dt:target, .highlighted {
442 | background-color: #fbe54e;
443 | }
444 |
445 | dl.glossary dt {
446 | font-weight: bold;
447 | font-size: 1.1em;
448 | }
449 |
450 | .optional {
451 | font-size: 1.3em;
452 | }
453 |
454 | .sig-paren {
455 | font-size: larger;
456 | }
457 |
458 | .versionmodified {
459 | font-style: italic;
460 | }
461 |
462 | .system-message {
463 | background-color: #fda;
464 | padding: 5px;
465 | border: 3px solid red;
466 | }
467 |
468 | .footnote:target {
469 | background-color: #ffa;
470 | }
471 |
472 | .line-block {
473 | display: block;
474 | margin-top: 1em;
475 | margin-bottom: 1em;
476 | }
477 |
478 | .line-block .line-block {
479 | margin-top: 0;
480 | margin-bottom: 0;
481 | margin-left: 1.5em;
482 | }
483 |
484 | .guilabel, .menuselection {
485 | font-family: sans-serif;
486 | }
487 |
488 | .accelerator {
489 | text-decoration: underline;
490 | }
491 |
492 | .classifier {
493 | font-style: oblique;
494 | }
495 |
496 | abbr, acronym {
497 | border-bottom: dotted 1px;
498 | cursor: help;
499 | }
500 |
501 | /* -- code displays --------------------------------------------------------- */
502 |
503 | pre {
504 | overflow: auto;
505 | overflow-y: hidden; /* fixes display issues on Chrome browsers */
506 | }
507 |
508 | span.pre {
509 | -moz-hyphens: none;
510 | -ms-hyphens: none;
511 | -webkit-hyphens: none;
512 | hyphens: none;
513 | }
514 |
515 | td.linenos pre {
516 | padding: 5px 0px;
517 | border: 0;
518 | background-color: transparent;
519 | color: #aaa;
520 | }
521 |
522 | table.highlighttable {
523 | margin-left: 0.5em;
524 | }
525 |
526 | table.highlighttable td {
527 | padding: 0 0.5em 0 0.5em;
528 | }
529 |
530 | div.code-block-caption {
531 | padding: 2px 5px;
532 | font-size: small;
533 | }
534 |
535 | div.code-block-caption code {
536 | background-color: transparent;
537 | }
538 |
539 | div.code-block-caption + div > div.highlight > pre {
540 | margin-top: 0;
541 | }
542 |
543 | div.code-block-caption span.caption-number {
544 | padding: 0.1em 0.3em;
545 | font-style: italic;
546 | }
547 |
548 | div.code-block-caption span.caption-text {
549 | }
550 |
551 | div.literal-block-wrapper {
552 | padding: 1em 1em 0;
553 | }
554 |
555 | div.literal-block-wrapper div.highlight {
556 | margin: 0;
557 | }
558 |
559 | code.descname {
560 | background-color: transparent;
561 | font-weight: bold;
562 | font-size: 1.2em;
563 | }
564 |
565 | code.descclassname {
566 | background-color: transparent;
567 | }
568 |
569 | code.xref, a code {
570 | background-color: transparent;
571 | font-weight: bold;
572 | }
573 |
574 | h1 code, h2 code, h3 code, h4 code, h5 code, h6 code {
575 | background-color: transparent;
576 | }
577 |
578 | .viewcode-link {
579 | float: right;
580 | }
581 |
582 | .viewcode-back {
583 | float: right;
584 | font-family: sans-serif;
585 | }
586 |
587 | div.viewcode-block:target {
588 | margin: -1px -10px;
589 | padding: 0 10px;
590 | }
591 |
592 | /* -- math display ---------------------------------------------------------- */
593 |
594 | img.math {
595 | vertical-align: middle;
596 | }
597 |
598 | div.body div.math p {
599 | text-align: center;
600 | }
601 |
602 | span.eqno {
603 | float: right;
604 | }
605 |
606 | span.eqno a.headerlink {
607 | position: relative;
608 | left: 0px;
609 | z-index: 1;
610 | }
611 |
612 | div.math:hover a.headerlink {
613 | visibility: visible;
614 | }
615 |
616 | /* -- printout stylesheet --------------------------------------------------- */
617 |
618 | @media print {
619 | div.document,
620 | div.documentwrapper,
621 | div.bodywrapper {
622 | margin: 0 !important;
623 | width: 100%;
624 | }
625 |
626 | div.sphinxsidebar,
627 | div.related,
628 | div.footer,
629 | #top-link {
630 | display: none;
631 | }
632 | }
--------------------------------------------------------------------------------
/pyspark_dist_explore/tests/test_pyspark_dist_explore.py:
--------------------------------------------------------------------------------
import os

import findspark

# Locate the Spark installation before importing pyspark. Prefer the standard
# SPARK_HOME environment variable so the suite is not tied to one developer's
# machine; fall back to the original hard-coded path for backward compatibility.
findspark.init(os.environ.get('SPARK_HOME',
                              '/media/chris/data/spark-2.4.0-bin-hadoop2.7/'))

import pyspark.sql.functions as F
import sparktestingbase.sqltestcase
import pandas as pd
import unittest
import math
from pyspark.sql import Row
from unittest import mock

import sys
# Make the package under test importable when running from the tests directory.
sys.path.append('../')
from pyspark_dist_explore import Histogram
from pyspark_dist_explore.pyspark_dist_explore import create_histogram_object
16 |
17 |
class HistogramTest(sparktestingbase.sqltestcase.SQLTestCase):
    """Tests for the Histogram class, run against a local Spark SQL session
    (``self.sc`` and ``self.sqlCtx`` are provided by sparktestingbase's
    SQLTestCase), so these are integration tests rather than pure units."""

    def test_init_default(self):
        """Should set default settings when no arguments are given"""
        hist = Histogram()
        self.assertIsNone(hist.min_value)
        self.assertIsNone(hist.max_value)
        self.assertEqual(10, hist.nr_bins)
        self.assertEqual(0, len(hist.bin_boundaries))
        self.assertEqual(0, len(hist.hist_dict))
        self.assertEqual(0, len(hist.col_list))
        self.assertFalse(hist.is_build)

    def test_init_non_default(self):
        """Should set min bin, max bin, and number of bins"""
        hist = Histogram(bins=10, range=(5, 8))
        self.assertEqual(10, hist.nr_bins)
        self.assertEqual(5, hist.min_value)
        self.assertEqual(8, hist.max_value)
        self.assertEqual(0, len(hist.bin_boundaries))

    def test_init_bins_given(self):
        """Should set the list of bins when given in the constructor;
        bins are converted to float"""
        hist = Histogram(bins=[1, 2, '3'])
        # 1 == 1.0 in Python, so this assertion also holds for float boundaries.
        self.assertListEqual([1, 2, 3], hist.bin_boundaries)

    def create_test_df(self):
        # Helper: builds a two-column DataFrame where column 'value' holds
        # (1, 2, 3) and column 'value2' holds (2, 3, 4).
        test_list = [(1, 2), (2, 3), (3, 4)]
        rdd = self.sc.parallelize(test_list)
        rdd_f = rdd.map(lambda x: Row(value=x[0], value2=x[1]))
        return self.sqlCtx.createDataFrame(rdd_f)

    def test_add_column(self):
        """Should add a (column, column name) tuple to the col_list when a single-column data frame is given"""
        hist = Histogram(bins=10)
        test_df = self.create_test_df()
        hist.add_column(test_df.select(F.col('value')))
        self.assertEqual(1, len(hist.col_list))
        # col_list entries are (dataframe, column name) pairs.
        self.assertEqual('value', hist.col_list[0][1])
        self.assertDataFrameEqual(test_df.select(F.col('value')), hist.col_list[0][0])

    def test_add_column_more_then_1_column_in_dataframe(self):
        """Should throw an error when the input data frame contains more than one column"""
        hist = Histogram(bins=10)
        test_df = self.create_test_df()
        with self.assertRaises(ValueError):
            hist.add_column(test_df)

    def test_add_column_non_numeric(self):
        """Should raise a ValueError if a non-numeric column is added"""
        test_list = ['a', 'b']
        rdd = self.sc.parallelize(test_list)
        rdd_f = rdd.map(lambda x: Row(value=x))
        spark_df = self.sqlCtx.createDataFrame(rdd_f)
        hist = Histogram()
        with self.assertRaises(ValueError):
            hist.add_column(spark_df)

    def test_add_multiple_columns(self):
        """Adds new items to the col_list when new items are added"""
        hist = Histogram(bins=10)
        test_df = self.create_test_df()
        hist.add_column(test_df.select(F.col('value')))
        hist.add_column(test_df.select(F.col('value2')))
        self.assertEqual(2, len(hist.col_list))
        self.assertEqual('value', hist.col_list[0][1])
        self.assertDataFrameEqual(test_df.select(F.col('value')), hist.col_list[0][0])
        self.assertEqual('value2', hist.col_list[1][1])
        self.assertDataFrameEqual(test_df.select(F.col('value2')), hist.col_list[1][0])

    def test_get_min_value(self):
        """Should return the minimum value over all columns in a Histogram"""
        hist = Histogram(bins=10)
        test_df = self.create_test_df()
        hist.add_column(test_df.select(F.col('value')))
        hist.add_column(test_df.select(F.col('value2')))
        self.assertEqual(1, hist._get_min_value())

    def test_get_max_value(self):
        """Should return the maximum value over all columns in a Histogram"""
        hist = Histogram(bins=10)
        test_df = self.create_test_df()
        hist.add_column(test_df.select(F.col('value')))
        hist.add_column(test_df.select(F.col('value2')))
        self.assertEqual(4, hist._get_max_value())

    def test_calculate_bins(self):
        """Should return a list of evenly spaced bins between min and max bin if they are set"""
        hist = Histogram(range=(5, 10), bins=2)
        self.assertListEqual([5, 7.5, 10], hist._calculate_bins())

    def test_calculate_bins_bins_set(self):
        """Should just return the list of bin edges when this was set in the constructor"""
        hist = Histogram(bins=[1, 2, 3])
        self.assertListEqual([1, 2, 3], hist._calculate_bins())

    def test_calculate_bins_single_column(self):
        """Should return the number of bins when there is only a single column, and no min and max is set"""
        hist = Histogram(bins=5)
        test_df = self.create_test_df()
        hist.add_column(test_df.select(F.col('value')))
        self.assertEqual(5, hist._calculate_bins())

    def test_calculate_bins_multiple_columns(self):
        """Should return a list of evenly spaced bins between the smallest and highest value over all columns"""
        hist = Histogram(bins=3)
        test_df = self.create_test_df()  # The lowest value in this DF is 1, the highest is 4
        hist.add_column(test_df.select(F.col('value')))
        hist.add_column(test_df.select(F.col('value2')))
        self.assertListEqual([1, 2, 3, 4], hist._calculate_bins())

    def test_add_hist_single_column(self):
        """Should add a list of bin values (e.g. the number of values that fall in a bin) to the hist_dict, where
        the key is the column name. If multiple columns have the same name a number is appended"""
        hist = Histogram(bins=2)
        test_df = self.create_test_df()
        column_to_ad = test_df.select(F.col('value'))
        hist.add_column(column_to_ad)
        hist.bin_boundaries = hist._calculate_bins()
        hist._add_hist(column_to_ad, 'value')
        self.assertEqual(1, len(hist.hist_dict))
        self.assertListEqual([1, 2], hist.hist_dict['value'])

    def test_add_hist_single_column_sets_bin_list(self):
        """Should set the bin list if this is a single number"""
        hist = Histogram(bins=2)
        test_df = self.create_test_df()
        column_to_ad = test_df.select(F.col('value'))
        hist.add_column(column_to_ad)
        hist.bin_boundaries = hist._calculate_bins()
        hist._add_hist(column_to_ad, 'value')
        # 2 bins imply 3 bin borders.
        self.assertEqual(3, len(hist.bin_boundaries))

    def test_add_hist_multiple_column(self):
        """Should add a second list of bin values to the hist_dict"""
        hist = Histogram(bins=2)
        test_df = self.create_test_df()
        column_to_ad = test_df.select(F.col('value'))
        column_to_ad_2 = test_df.select(F.col('value2'))
        hist.add_column(column_to_ad)
        hist.add_column(column_to_ad_2)
        hist.bin_boundaries = hist._calculate_bins()
        hist._add_hist(column_to_ad, 'value')
        hist._add_hist(column_to_ad_2, 'value2')
        self.assertEqual(2, len(hist.hist_dict))
        self.assertListEqual([1, 2], hist.hist_dict['value2'])

    def test_add_hist_multiple_column_rename_column(self):
        """Should rename the column name if the same column name is added"""
        hist = Histogram(bins=2)
        test_df = self.create_test_df()
        column_to_ad = test_df.select(F.col('value'))
        column_to_ad_2 = test_df.select(F.col('value'))
        hist.add_column(column_to_ad)
        hist.add_column(column_to_ad_2)
        hist.bin_boundaries = hist._calculate_bins()
        hist._add_hist(column_to_ad, 'value')
        hist._add_hist(column_to_ad_2, 'value')
        self.assertEqual(2, len(hist.hist_dict))
        self.assertTrue('value (1)' in hist.hist_dict)

    def test_add_hist_single_value(self):
        """Should set the bin list to n (self.nr_bins) bins (n+1 bin borders) where the min bin border is the
        single value -0.5 and the max bin border is the single value +0.5 in case a column is input with only a
        single value"""
        single_column_value = 1
        nr_bins = 5
        column_values = [single_column_value] * 100
        test_df = self.sqlCtx.createDataFrame(pd.DataFrame({'foo': column_values}))
        hist = Histogram(bins=nr_bins)
        hist.add_column(test_df.select(F.col('foo')))
        hist.build()
        self.assertEqual(6, len(hist.bin_boundaries))
        self.assertEqual(single_column_value - 0.5, min(hist.bin_boundaries))
        self.assertEqual(single_column_value + 0.5, max(hist.bin_boundaries))
        # All 100 values land in the middle bin.
        self.assertEqual(len(column_values), hist.hist_dict['foo'][math.floor(nr_bins/2)])

    def test_build(self):
        """Should calculate the bin list, and hist values for each column in the Histogram, if the
        histogram hasn't been built before"""
        hist = Histogram(bins=2)
        test_df = self.create_test_df()
        column_to_ad = test_df.select(F.col('value'))
        column_to_ad_2 = test_df.select(F.col('value2'))
        hist.add_column(column_to_ad)
        hist.add_column(column_to_ad_2)
        hist.build()
        self.assertEqual(3, len(hist.bin_boundaries))
        self.assertEqual(2, len(hist.hist_dict))
        self.assertTrue(hist.is_build)

    # mock.patch decorators apply bottom-up, so _calculate_bins is the first
    # mock argument and _add_hist the second.
    @mock.patch('pyspark_dist_explore.Histogram._add_hist')
    @mock.patch('pyspark_dist_explore.Histogram._calculate_bins')
    def test_build_already_build(self, calculate_bins_func, add_hist_func):
        """Should not rebuild if Histogram was already built before"""
        hist = Histogram()
        hist.is_build = True
        hist.build()
        self.assertFalse(add_hist_func.called)
        self.assertFalse(calculate_bins_func.called)

    def test_to_pandas_default(self):
        """Should create a pandas dataframe from the Histogram object"""
        hist = Histogram(bins=2)
        test_df = self.create_test_df()
        column_to_ad = test_df.select(F.col('value'))
        column_to_ad_2 = test_df.select(F.col('value2'))
        hist.add_column(column_to_ad)
        hist.add_column(column_to_ad_2)
        # Index labels are the formatted bin ranges.
        expected_df = pd.DataFrame({'value': [2, 1],
                                    'value2': [1, 2]}).set_index([['1.00 - 2.50', '2.50 - 4.00']])
        self.assertTrue(expected_df.equals(hist.to_pandas()))

    def test_to_pandas_density(self):
        """Should create a pandas dataframe of a density plot of the histogram"""
        hist = Histogram(bins=2)
        test_df = self.create_test_df()
        column_to_ad = test_df.select(F.col('value'))
        column_to_ad_2 = test_df.select(F.col('value2'))
        hist.add_column(column_to_ad)
        hist.add_column(column_to_ad_2)
        # Index labels are the bin midpoints.
        expected_df = pd.DataFrame({'value': [1.0, 0.5], 'value2': [0.5, 1.0]}).set_index([[1.75, 3.25]])
        self.assertTrue(expected_df.equals(hist.to_pandas('density')))

    def test_add_data_single_column(self):
        """Should add a single column of data to the Histogram"""
        hist = Histogram()
        test_df = self.create_test_df()
        column_to_ad = test_df.select(F.col('value'))
        hist.add_data(column_to_ad)
        self.assertEqual(1, len(hist.col_list))

    def test_add_data_list_of_columns(self):
        """Should add all columns from the list of columns to the Histogram"""
        test_df = self.create_test_df()
        column_to_ad = test_df.select(F.col('value'))
        column_to_ad_2 = test_df.select(F.col('value2'))
        hist = Histogram()
        hist.add_data([column_to_ad, column_to_ad_2])
        self.assertEqual(2, len(hist.col_list))

    def test_add_data_entire_dataframe(self):
        """Should add all columns of a dataframe to the histogram"""
        test_df = self.create_test_df()
        hist = Histogram()
        hist.add_data(test_df)
        self.assertEqual(2, len(hist.col_list))
265 |
266 |
class FunctionsTest(unittest.TestCase):
    """Tests for the module-level create_histogram_object helper."""

    def test_create_histogram_object_default(self):
        """An empty kwargs dict should yield a histogram with default settings."""
        histogram = create_histogram_object({})
        self.assertEqual(10, histogram.nr_bins)
        self.assertIsNone(histogram.min_value)
        self.assertIsNone(histogram.max_value)

    def test_create_histogram_object_non_default(self):
        """A kwargs dict with 'bins' and 'range' should be applied to the histogram."""
        histogram = create_histogram_object({'bins': 11, 'range': (10, 20)})
        self.assertEqual(11, histogram.nr_bins)
        self.assertEqual(10, histogram.min_value)
        self.assertEqual(20, histogram.max_value)
282 |
283 |
# Allow the suite to be executed directly: ``python test_pyspark_dist_explore.py``.
if __name__ == "__main__":
    unittest.main()
286 |
--------------------------------------------------------------------------------
/docs/build/html/_static/alabaster.css:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 | @import url("basic.css");
54 |
55 | /* -- page layout ----------------------------------------------------------- */
56 |
57 | body {
58 | font-family: 'goudy old style', 'minion pro', 'bell mt', Georgia, 'Hiragino Mincho Pro', serif;
59 | font-size: 17px;
60 | background-color: #fff;
61 | color: #000;
62 | margin: 0;
63 | padding: 0;
64 | }
65 |
66 |
67 | div.document {
68 | width: 940px;
69 | margin: 30px auto 0 auto;
70 | }
71 |
72 | div.documentwrapper {
73 | float: left;
74 | width: 100%;
75 | }
76 |
77 | div.bodywrapper {
78 | margin: 0 0 0 220px;
79 | }
80 |
81 | div.sphinxsidebar {
82 | width: 220px;
83 | font-size: 14px;
84 | line-height: 1.5;
85 | }
86 |
87 | hr {
88 | border: 1px solid #B1B4B6;
89 | }
90 |
91 | div.body {
92 | background-color: #fff;
93 | color: #3E4349;
94 | padding: 0 30px 0 30px;
95 | }
96 |
97 | div.body > .section {
98 | text-align: left;
99 | }
100 |
101 | div.footer {
102 | width: 940px;
103 | margin: 20px auto 30px auto;
104 | font-size: 14px;
105 | color: #888;
106 | text-align: right;
107 | }
108 |
109 | div.footer a {
110 | color: #888;
111 | }
112 |
113 | p.caption {
114 | font-family: inherit;
115 | font-size: inherit;
116 | }
117 |
118 |
119 | div.relations {
120 | display: none;
121 | }
122 |
123 |
124 | div.sphinxsidebar a {
125 | color: #444;
126 | text-decoration: none;
127 | border-bottom: 1px dotted #999;
128 | }
129 |
130 | div.sphinxsidebar a:hover {
131 | border-bottom: 1px solid #999;
132 | }
133 |
134 | div.sphinxsidebarwrapper {
135 | padding: 18px 10px;
136 | }
137 |
138 | div.sphinxsidebarwrapper p.logo {
139 | padding: 0;
140 | margin: -10px 0 0 0px;
141 | text-align: center;
142 | }
143 |
144 | div.sphinxsidebarwrapper h1.logo {
145 | margin-top: -10px;
146 | text-align: center;
147 | margin-bottom: 5px;
148 | text-align: left;
149 | }
150 |
151 | div.sphinxsidebarwrapper h1.logo-name {
152 | margin-top: 0px;
153 | }
154 |
155 | div.sphinxsidebarwrapper p.blurb {
156 | margin-top: 0;
157 | font-style: normal;
158 | }
159 |
160 | div.sphinxsidebar h3,
161 | div.sphinxsidebar h4 {
162 | font-family: 'Garamond', 'Georgia', serif;
163 | color: #444;
164 | font-size: 24px;
165 | font-weight: normal;
166 | margin: 0 0 5px 0;
167 | padding: 0;
168 | }
169 |
170 | div.sphinxsidebar h4 {
171 | font-size: 20px;
172 | }
173 |
174 | div.sphinxsidebar h3 a {
175 | color: #444;
176 | }
177 |
178 | div.sphinxsidebar p.logo a,
179 | div.sphinxsidebar h3 a,
180 | div.sphinxsidebar p.logo a:hover,
181 | div.sphinxsidebar h3 a:hover {
182 | border: none;
183 | }
184 |
185 | div.sphinxsidebar p {
186 | color: #555;
187 | margin: 10px 0;
188 | }
189 |
190 | div.sphinxsidebar ul {
191 | margin: 10px 0;
192 | padding: 0;
193 | color: #000;
194 | }
195 |
196 | div.sphinxsidebar ul li.toctree-l1 > a {
197 | font-size: 120%;
198 | }
199 |
200 | div.sphinxsidebar ul li.toctree-l2 > a {
201 | font-size: 110%;
202 | }
203 |
204 | div.sphinxsidebar input {
205 | border: 1px solid #CCC;
206 | font-family: 'goudy old style', 'minion pro', 'bell mt', Georgia, 'Hiragino Mincho Pro', serif;
207 | font-size: 1em;
208 | }
209 |
210 | div.sphinxsidebar hr {
211 | border: none;
212 | height: 1px;
213 | color: #AAA;
214 | background: #AAA;
215 |
216 | text-align: left;
217 | margin-left: 0;
218 | width: 50%;
219 | }
220 |
221 | /* -- body styles ----------------------------------------------------------- */
222 |
223 | a {
224 | color: #004B6B;
225 | text-decoration: underline;
226 | }
227 |
228 | a:hover {
229 | color: #6D4100;
230 | text-decoration: underline;
231 | }
232 |
233 | div.body h1,
234 | div.body h2,
235 | div.body h3,
236 | div.body h4,
237 | div.body h5,
238 | div.body h6 {
239 | font-family: 'Garamond', 'Georgia', serif;
240 | font-weight: normal;
241 | margin: 30px 0px 10px 0px;
242 | padding: 0;
243 | }
244 |
245 | div.body h1 { margin-top: 0; padding-top: 0; font-size: 240%; }
246 | div.body h2 { font-size: 180%; }
247 | div.body h3 { font-size: 150%; }
248 | div.body h4 { font-size: 130%; }
249 | div.body h5 { font-size: 100%; }
250 | div.body h6 { font-size: 100%; }
251 |
252 | a.headerlink {
253 | color: #DDD;
254 | padding: 0 4px;
255 | text-decoration: none;
256 | }
257 |
258 | a.headerlink:hover {
259 | color: #444;
260 | background: #EAEAEA;
261 | }
262 |
263 | div.body p, div.body dd, div.body li {
264 | line-height: 1.4em;
265 | }
266 |
267 | div.admonition {
268 | margin: 20px 0px;
269 | padding: 10px 30px;
270 | background-color: #EEE;
271 | border: 1px solid #CCC;
272 | }
273 |
274 | div.admonition tt.xref, div.admonition code.xref, div.admonition a tt {
275 | background-color: #FBFBFB;
276 | border-bottom: 1px solid #fafafa;
277 | }
278 |
279 | div.admonition p.admonition-title {
280 | font-family: 'Garamond', 'Georgia', serif;
281 | font-weight: normal;
282 | font-size: 24px;
283 | margin: 0 0 10px 0;
284 | padding: 0;
285 | line-height: 1;
286 | }
287 |
288 | div.admonition p.last {
289 | margin-bottom: 0;
290 | }
291 |
292 | div.highlight {
293 | background-color: #fff;
294 | }
295 |
296 | dt:target, .highlight {
297 | background: #FAF3E8;
298 | }
299 |
300 | div.warning {
301 | background-color: #FCC;
302 | border: 1px solid #FAA;
303 | }
304 |
305 | div.danger {
306 | background-color: #FCC;
307 | border: 1px solid #FAA;
308 | -moz-box-shadow: 2px 2px 4px #D52C2C;
309 | -webkit-box-shadow: 2px 2px 4px #D52C2C;
310 | box-shadow: 2px 2px 4px #D52C2C;
311 | }
312 |
313 | div.error {
314 | background-color: #FCC;
315 | border: 1px solid #FAA;
316 | -moz-box-shadow: 2px 2px 4px #D52C2C;
317 | -webkit-box-shadow: 2px 2px 4px #D52C2C;
318 | box-shadow: 2px 2px 4px #D52C2C;
319 | }
320 |
321 | div.caution {
322 | background-color: #FCC;
323 | border: 1px solid #FAA;
324 | }
325 |
326 | div.attention {
327 | background-color: #FCC;
328 | border: 1px solid #FAA;
329 | }
330 |
331 | div.important {
332 | background-color: #EEE;
333 | border: 1px solid #CCC;
334 | }
335 |
336 | div.note {
337 | background-color: #EEE;
338 | border: 1px solid #CCC;
339 | }
340 |
341 | div.tip {
342 | background-color: #EEE;
343 | border: 1px solid #CCC;
344 | }
345 |
346 | div.hint {
347 | background-color: #EEE;
348 | border: 1px solid #CCC;
349 | }
350 |
351 | div.seealso {
352 | background-color: #EEE;
353 | border: 1px solid #CCC;
354 | }
355 |
356 | div.topic {
357 | background-color: #EEE;
358 | }
359 |
360 | p.admonition-title {
361 | display: inline;
362 | }
363 |
364 | p.admonition-title:after {
365 | content: ":";
366 | }
367 |
368 | pre, tt, code {
369 | font-family: 'Consolas', 'Menlo', 'Deja Vu Sans Mono', 'Bitstream Vera Sans Mono', monospace;
370 | font-size: 0.9em;
371 | }
372 |
373 | .hll {
374 | background-color: #FFC;
375 | margin: 0 -12px;
376 | padding: 0 12px;
377 | display: block;
378 | }
379 |
380 | img.screenshot {
381 | }
382 |
383 | tt.descname, tt.descclassname, code.descname, code.descclassname {
384 | font-size: 0.95em;
385 | }
386 |
387 | tt.descname, code.descname {
388 | padding-right: 0.08em;
389 | }
390 |
391 | img.screenshot {
392 | -moz-box-shadow: 2px 2px 4px #EEE;
393 | -webkit-box-shadow: 2px 2px 4px #EEE;
394 | box-shadow: 2px 2px 4px #EEE;
395 | }
396 |
397 | table.docutils {
398 | border: 1px solid #888;
399 | -moz-box-shadow: 2px 2px 4px #EEE;
400 | -webkit-box-shadow: 2px 2px 4px #EEE;
401 | box-shadow: 2px 2px 4px #EEE;
402 | }
403 |
404 | table.docutils td, table.docutils th {
405 | border: 1px solid #888;
406 | padding: 0.25em 0.7em;
407 | }
408 |
409 | table.field-list, table.footnote {
410 | border: none;
411 | -moz-box-shadow: none;
412 | -webkit-box-shadow: none;
413 | box-shadow: none;
414 | }
415 |
416 | table.footnote {
417 | margin: 15px 0;
418 | width: 100%;
419 | border: 1px solid #EEE;
420 | background: #FDFDFD;
421 | font-size: 0.9em;
422 | }
423 |
424 | table.footnote + table.footnote {
425 | margin-top: -15px;
426 | border-top: none;
427 | }
428 |
429 | table.field-list th {
430 | padding: 0 0.8em 0 0;
431 | }
432 |
433 | table.field-list td {
434 | padding: 0;
435 | }
436 |
437 | table.field-list p {
438 | margin-bottom: 0.8em;
439 | }
440 |
441 | /* Cloned from
442 | * https://github.com/sphinx-doc/sphinx/commit/ef60dbfce09286b20b7385333d63a60321784e68
443 | */
444 | .field-name {
445 | -moz-hyphens: manual;
446 | -ms-hyphens: manual;
447 | -webkit-hyphens: manual;
448 | hyphens: manual;
449 | }
450 |
451 | table.footnote td.label {
452 | width: .1px;
453 | padding: 0.3em 0 0.3em 0.5em;
454 | }
455 |
456 | table.footnote td {
457 | padding: 0.3em 0.5em;
458 | }
459 |
460 | dl {
461 | margin: 0;
462 | padding: 0;
463 | }
464 |
465 | dl dd {
466 | margin-left: 30px;
467 | }
468 |
469 | blockquote {
470 | margin: 0 0 0 30px;
471 | padding: 0;
472 | }
473 |
474 | ul, ol {
475 | /* Matches the 30px from the narrow-screen "li > ul" selector below */
476 | margin: 10px 0 10px 30px;
477 | padding: 0;
478 | }
479 |
480 | pre {
481 | background: #EEE;
482 | padding: 7px 30px;
483 | margin: 15px 0px;
484 | line-height: 1.3em;
485 | }
486 |
487 | div.viewcode-block:target {
488 | background: #ffd;
489 | }
490 |
491 | dl pre, blockquote pre, li pre {
492 | margin-left: 0;
493 | padding-left: 30px;
494 | }
495 |
496 | tt, code {
497 | background-color: #ecf0f3;
498 | color: #222;
499 | /* padding: 1px 2px; */
500 | }
501 |
502 | tt.xref, code.xref, a tt {
503 | background-color: #FBFBFB;
504 | border-bottom: 1px solid #fff;
505 | }
506 |
507 | a.reference {
508 | text-decoration: none;
509 | border-bottom: 1px dotted #004B6B;
510 | }
511 |
512 | /* Don't put an underline on images */
513 | a.image-reference, a.image-reference:hover {
514 | border-bottom: none;
515 | }
516 |
517 | a.reference:hover {
518 | border-bottom: 1px solid #6D4100;
519 | }
520 |
521 | a.footnote-reference {
522 | text-decoration: none;
523 | font-size: 0.7em;
524 | vertical-align: top;
525 | border-bottom: 1px dotted #004B6B;
526 | }
527 |
528 | a.footnote-reference:hover {
529 | border-bottom: 1px solid #6D4100;
530 | }
531 |
532 | a:hover tt, a:hover code {
533 | background: #EEE;
534 | }
535 |
536 |
/* Narrow screens: drop the sidebar and let the document use the full width. */
@media screen and (max-width: 870px) {

    div.sphinxsidebar {
        display: none;
    }

    div.document {
        width: 100%;
    }

    /* Collapsed the four individual zero margins into the shorthand. */
    div.documentwrapper {
        margin: 0;
    }

    div.bodywrapper {
        margin: 0;
    }

    ul {
        margin-left: 0;
    }

    li > ul {
        /* Matches the 30px from the "ul, ol" selector above */
        margin-left: 30px;
    }

    .document {
        width: auto;
    }

    /* ".footer { width: auto; }" appeared twice with identical bodies; kept one. */
    .footer {
        width: auto;
    }

    .bodywrapper {
        margin: 0;
    }

    .github {
        display: none;
    }

}
594 |
595 |
596 |
/* Very narrow screens: move the sidebar below the document as a dark footer band. */
@media screen and (max-width: 875px) {

    body {
        margin: 0;
        padding: 20px 30px;
    }

    div.documentwrapper {
        float: none;
        background: #fff;
    }

    div.sphinxsidebar {
        display: block;
        float: none;
        /* Slightly over 100% plus negative margins to bleed past body padding. */
        width: 102.5%;
        margin: 50px -30px -20px -30px;
        padding: 10px 20px;
        background: #333;
        color: #FFF;
    }

    div.sphinxsidebar h3, div.sphinxsidebar h4, div.sphinxsidebar p,
    div.sphinxsidebar h3 a {
        color: #fff;
    }

    div.sphinxsidebar a {
        color: #AAA;
    }

    div.sphinxsidebar p.logo {
        display: none;
    }

    div.document {
        width: 100%;
        margin: 0;
    }

    div.footer {
        display: none;
    }

    div.bodywrapper {
        margin: 0;
    }

    div.body {
        min-height: 0;
        padding: 0;
    }

    .rtd_doc_footer {
        display: none;
    }

    .document {
        width: auto;
    }

    /* ".footer { width: auto; }" appeared twice with identical bodies; kept one. */
    .footer {
        width: auto;
    }

    .github {
        display: none;
    }
}
670 |
671 |
/* misc. */

/* Hide ReadTheDocs inline widgets. */
.revsys-inline {
    display: none!important;
}

/* Make nested-list/multi-paragraph items look better in Releases changelog
 * pages. Without this, docutils' list-wrapping quirks cause inconsistent
 * formatting between different release sub-lists.
 */
div#changelog > div.section > ul > li > p:only-child {
    margin-bottom: 0;
}

/* Hide fugly table cell borders in ..bibliography:: directive output */
table.docutils.citation, table.docutils.citation td, table.docutils.citation th {
    border: none;
    /* Below needed in some edge cases; if not applied, bottom shadows appear */
    -moz-box-shadow: none;
    -webkit-box-shadow: none;
    box-shadow: none;
}
--------------------------------------------------------------------------------
/pyspark_dist_explore/pyspark_dist_explore.py:
--------------------------------------------------------------------------------
from scipy.interpolate import interp1d

try:
    from pyspark.sql.types import NumericType

    import pyspark.sql.functions as F
# Catch only a missing pyspark; a bare `except` would also swallow
# KeyboardInterrupt/SystemExit and hide unrelated import-time errors.
except ImportError:
    pass

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
15 |
16 |
def hist(axis, x, overlapping=False, formatted_yaxis=True, **kwargs):
    """Plots a histogram on an Axis object

    Args:
        :axis: (`Axes`)
            A matplotlib Axes object on which the histogram will be plot.
        :x: (`DataFrame` or `list` of `DataFrame`)
            A DataFrame with one or more numerical columns, or a list of single numerical column DataFrames
        :overlapping: (`bool`, optional)
            Generate overlapping histograms.

            When True the histograms are drawn on top of each other; when False a
            normal grouped histogram is produced. Defaults to False.
        :formatted_yaxis: (`bool`, optional)
            When True the y-axis tick labels are formatted for readability,
            e.g. 1500000 is shown as 1.5M. Defaults to True.

        :\*\*kwargs:
            The keyword arguments as used in matplotlib.pyplot.hist

    Returns:
        :n: (`array` or `list` of `arrays`)
            The values of the histogram bins, one array per input DataFrame
            (a single array when only one DataFrame is given).
        :bins: (`array`)
            The edges of the bins: nbins + 1 values (nbins left edges plus the
            right edge of the last bin). Always a single array, even for
            multiple data sets.
        :patches: (`list` or `list` of `lists`)
            Silent list of the individual patches used to create the histogram,
            or a list of such lists for multiple input data sets.

    """
    # `create_histogram_object` consumes 'bins'/'range' from kwargs before
    # the remaining keywords are forwarded to matplotlib.
    hist_object = create_histogram_object(kwargs)
    hist_object.add_data(x)
    return hist_object.plot_hist(axis, overlapping, formatted_yaxis, **kwargs)
55 |
56 |
def distplot(axis, x, **kwargs):
    """Plots a normalised histogram and a density plot on an Axes object

    Args:
        :axis: (`Axes`)
            A matplotlib Axes object on which the histogram will be plot.
        :x: (`DataFrame` or `list` of `DataFrame`)
            A DataFrame with one or more numerical columns, or a list of single numerical column DataFrames
        :\*\*kwargs:
            The keyword arguments as used in matplotlib.pyplot.hist. Normed is set to True

    Returns:
        :n: (`array` or `list` of `arrays`)
            The values of the histogram bins, one array per input DataFrame
            (a single array when only one DataFrame is given).
        :bins: (`array`)
            The edges of the bins: nbins + 1 values (nbins left edges plus the
            right edge of the last bin). Always a single array, even for
            multiple data sets.
        :patches: (`list` or `list` of `lists`)
            Silent list of the individual patches used to create the histogram,
            or a list of such lists for multiple input data sets.
    """
    hist_object = create_histogram_object(kwargs)
    hist_object.add_data(x)
    n, bins, patches = hist_object.plot_hist(axis, density=True, **kwargs)

    # Reuse the bar colors for the density lines. With a multi-element list
    # input, `patches` is a list of Rectangle lists (one per DataFrame);
    # otherwise it is a single flat list of Rectangle objects.
    if type(x) == list and len(x) > 1:
        colors = [patch_group[0].get_facecolor() for patch_group in patches]
    elif type(patches[0]) is Rectangle:
        colors = [patches[0].get_facecolor()]
    else:
        raise TypeError("Unexpected Patch Type. Expected Rectangle")

    hist_object.plot_density(axis, color=colors)
    return n, bins, patches
98 |
99 |
def pandas_histogram(x, bins=10, range=None):
    """Returns a pandas DataFrame with histograms of the Spark DataFrame

    Bin ranges are formatted as text and put on the Index.

    Args:
        :x: (`DataFrame` or `list` of `DataFrame`)
            A DataFrame with one or more numerical columns, or a list of single numerical column DataFrames
        :bins: (`integer` or `array_like`, optional)
            If an integer is given, bins + 1 bin edges are returned, consistently with numpy.histogram() for
            numpy version >= 1.3.

            Unequally spaced bins are supported if bins is a sequence.

            Default is 10
        :range: (tuple or None, optional)
            The lower and upper range of the bins. Lower and upper outliers are ignored.
            If not provided, range is (x.min(), x.max()). Range has no effect if bins is a sequence.

            If bins is a sequence or range is specified, autoscaling is based on the specified bin range instead
            of the range of x.

            Default is None
    """
    hist_object = Histogram(bins=bins, range=range)
    hist_object.add_data(x)
    return hist_object.to_pandas()
127 |
128 |
def create_histogram_object(kwargs):
    """Build a Histogram from a matplotlib-style keyword dict.

    Consumes the ``'bins'`` and ``'range'`` entries of *kwargs* (the caller
    forwards the remaining keywords straight to matplotlib, which would
    reject them) and uses them to construct the Histogram.

    Args:
        :kwargs: (`dict`)
            Keyword arguments as passed to `hist`/`distplot`; mutated in
            place — 'bins' and 'range' are removed if present.

    Returns:
        A new Histogram configured with the extracted bins/range.
    """
    # dict.pop(key, default) removes the key when present, preserving the
    # original "read then del" semantics in a single call.
    bins = kwargs.pop('bins', 10)
    b_range = kwargs.pop('range', None)

    return Histogram(bins=bins, range=b_range)
142 |
143 |
class Histogram(object):
    """The Histogram object leverages Spark to calculate histograms, and matplotlib to visualize these.

    Args:
        :range: (`tuple`, optional)
            The lower and upper range of the bins.

            Lower and upper outliers are ignored. If not provided, range is (min(x), max(x)). Range has no
            effect if bins is a sequence. If bins is a sequence or range is specified, autoscaling is
            based on the specified bin range instead of the range of x.
        :bins: (`int` or `list` of `str` or `list of `int`, optional)
            If an integer is given: Number of bins in the histogram.

            Defaults to 10.

            If a list is given: Predefined list of bin boundaries.

            The bins are all open to the right except for the last which is closed. e.g. [1,10,20,50] means
            the buckets are [1,10) [10,20) [20,50], which means 1<=x<10, 10<=x<20, 20<=x<=50.

    """
    def __init__(self, bins=10, range=None):
        self.col_list = []  # (DataFrame, column name) pairs registered via add_column()
        self.bin_boundaries = []  # bin edges; [] (or a bare int, see build()) until resolved
        self.hist_dict = {}  # column names: bin weight lists pairs
        self.nr_bins = None  # requested bin count when `bins` is an integer
        self.min_value = None
        self.max_value = None
        self.is_build = False  # memoizes build() so the Spark jobs run only once

        if isinstance(bins, list):
            self.bin_boundaries = [float(bin_border) for bin_border in bins]
        else:
            self.nr_bins = bins

        if range is not None:
            self.min_value = range[0]
            self.max_value = range[1]

    def add_column(self, table):
        """Add single column DataFrame to the histogram object.

        If multiple columns share the same name, a (n) will be appended to the name, where n is
        the next available number.

        Args:
            :table: (:obj:`dataframe`)
                A PySpark DataFrame with a single column

        Raises:
            ValueError: if `table` has more than one column, or its column
                is not of a numeric Spark SQL type.
        """
        if len(table.columns) > 1:
            raise ValueError('More then one column is being added, use add_data() to add multi-column DataFrames')

        column_name = table.columns[0]

        if not isinstance(table.schema.fields[0].dataType, NumericType):
            raise ValueError('Column %s has a non-numeric type (%s), only numeric types are supported'
                             % (column_name, str(table.schema.fields[0].dataType)))

        self.col_list.append((table, column_name))

    def _get_bin_centers(self):
        """Return the midpoint of every bin in self.bin_boundaries."""
        result = []
        for i in range(len(self.bin_boundaries) - 1):
            result.append(((self.bin_boundaries[i + 1] - self.bin_boundaries[i]) / 2) + self.bin_boundaries[i])
        return result

    def _get_col_names(self):
        """Return human-readable 'low - high' labels (2 decimals) for each bin."""
        new_col_names = []
        for i in range(len(self.bin_boundaries) - 1):
            new_col_names.append('%.2f - %.2f' % (self.bin_boundaries[i], self.bin_boundaries[i + 1]))
        return new_col_names

    def _check_col_name(self, column_name):
        """Return a hist_dict key unique for this histogram: `column_name`,
        or 'column_name (n)' with the lowest free n if the name is taken."""
        n = 0
        col_name_new = column_name
        while col_name_new in self.hist_dict.keys():
            n += 1
            col_name_new = '%s (%d)' % (column_name, n)
        return col_name_new

    def _get_min_value(self):
        """Smallest value over all registered columns; runs one Spark job per
        column unless an explicit range was supplied."""
        if self.min_value is not None:
            return self.min_value
        return min([table.select(F.min(F.col(col_name))).collect()[0][0]
                    for table, col_name in self.col_list])

    def _get_max_value(self):
        """Largest value over all registered columns; runs one Spark job per
        column unless an explicit range was supplied."""
        if self.max_value is not None:
            return self.max_value
        return max([table.select(F.max(F.col(col_name))).collect()[0][0]
                    for table, col_name in self.col_list])

    def _calculate_bins(self):
        """Resolve the bin specification for the RDD histogram() call.

        Returns the predefined boundary list when one was given; the bare bin
        count (an int) when Spark can derive boundaries itself (single column,
        no explicit range); otherwise a computed list of evenly spaced
        boundaries spanning the global min/max.
        """
        if len(self.bin_boundaries) > 0:
            return self.bin_boundaries

        if len(self.bin_boundaries) == 0 and len(self.col_list) == 1 \
                and self.min_value is None and self.max_value is None:
            # Only use the amount of bins as input For the histogram function
            return self.nr_bins

        min_value = self._get_min_value()
        max_value = self._get_max_value()

        # expand empty range to avoid empty graph
        return Histogram._calc_n_bins_between(min_value, max_value, self.nr_bins)

    def _add_hist(self, table, column_name):
        """Uses spark to calculate the hist values: for each column a list of weights, and if the bin_list is not set
        a set of bin boundaries"""
        bin_boundaries, bin_weights = table.select(column_name).rdd.flatMap(lambda x: x).histogram(self.bin_boundaries)
        self.hist_dict[self._check_col_name(column_name)] = bin_weights

        if isinstance(self.bin_boundaries, int):  # the bin_list is not set
            if len(bin_boundaries) == 2 and bin_boundaries[0] == bin_boundaries[1]:
                # In case of a column with 1 unique value we need to calculate the histogram ourselves.
                min_value = bin_boundaries[0]
                max_value = bin_boundaries[1]
                self.bin_boundaries = self._calc_n_bins_between(min_value, max_value, self.nr_bins)
                # NOTE(review): this writes under the raw column_name while the assignment above
                # used _check_col_name(); with duplicate column names the two keys differ — confirm.
                self.hist_dict[column_name] = Histogram._calc_weights(self.bin_boundaries, min_value, bin_weights)
            else:
                self.bin_boundaries = bin_boundaries

    @staticmethod
    def _calc_n_bins_between(min_value, max_value, nr_bins):
        """Returns a list of bin borders between min_value and max_value"""
        if min_value == max_value:
            # widen a zero-width range so the histogram still has visible bins
            min_value = min_value - 0.5
            max_value = max_value + 0.5
        step = (float(max_value) - float(min_value)) / nr_bins
        return [min_value + (step * float(bn_nr)) for bn_nr in range(nr_bins + 1)]

    @staticmethod
    def _calc_weights(bins, value, value_count):
        """Calculate weights given a bin list, value within that bin list and a count"""
        # first we get a list of bin boundary tuples
        weights = list()
        bin_boundary_idx = [(idx, idx+2) for idx in range(len(bins)-1)]
        bin_boundaries = [tuple(bins[left_idx:right_idx]) for (left_idx, right_idx) in bin_boundary_idx]
        for left_boundary, right_boundary in bin_boundaries:
            # all counts land in the single bin containing `value`; other bins get 0
            if left_boundary <= value < right_boundary:
                weights.append(value_count[0])
            else:
                weights.append(0)
        return weights

    @staticmethod
    def _convert_number_bmk(axis_value, _):
        """Converts the values on axes to Billions, Millions or Thousands"""
        if axis_value >= 1e9:
            return '{:1.1f}B'.format(axis_value * 1e-9)
        if axis_value >= 1e6:
            return '{:1.1f}M'.format(axis_value * 1e-6)
        if axis_value >= 1e3:
            return '{:1.1f}K'.format(axis_value * 1e-3)
        if axis_value >= 1 or axis_value == 0:
            return '{:1.0f}'.format(axis_value)
        # sub-1 (non-zero) values are returned unformatted
        return axis_value

    def build(self):
        """Calculates the histogram values for each of the columns.

        If the Histogram has already been build, it doesn't build it again.
        """
        if not self.is_build:
            self.bin_boundaries = self._calculate_bins()
            for table, column_name in self.col_list:
                self._add_hist(table, column_name)
            self.is_build = True

    def to_pandas(self, kind='hist'):
        """Returns a pandas dataframe from the Histogram object.

        This function calculates the Histogram function in Spark if it was not done yet.

        Args:
            :kind: (:obj:`str`, optional):
                'hist' or 'density'. When using hist this returns the histogram object
                as pandas dataframe. When using density the index contains the bin centers, and the values in the
                DataFrame are the scaled values. Defaults to 'hist'

        Returns:
            A pandas DataFrame from the Histogram object.
            NOTE(review): implicitly returns None for any other `kind` — confirm
            whether raising ValueError would be preferable.
        """
        self.build()
        if kind == 'hist':
            return pd.DataFrame(self.hist_dict).set_index([self._get_col_names()])
        elif kind == 'density':
            result = pd.DataFrame(self.hist_dict).set_index([self._get_bin_centers()])
            # scale each column by its own maximum
            return result.apply(lambda x: x / x.max(), axis=0)

    def plot_hist(self, ax, overlapping=False, formatted_yaxis=True, **kwargs):
        """Returns a matplotlib style histogram (matplotlib.pyplot.hist)

        Uses the matplotlib object oriented interface to add a Histogram to an matplotlib Axes object.
        All named arguments from pyplot.hist can be used. A new argument called "type" makes it possible to
        make overlapping histogram plots.

        Args:
            :ax: (`Axes`)
                An matplotlib Axes object on which the histogram will be plot
            :overlapping (`bool`, optional):
                If set to true, this will generate an overlapping plot.
                When set to False it will generate a normal grouped histogram. Defaults to False.
            :formatted_yaxis: (`bool`, optional).
                If set to true, the numbers on the yaxis will be formatted
                for better readability. E.g. 1500000 will become 1.5M. Defaults to True
            :**kwargs:
                The keyword arguments as used in matplotlib.pyplot.hist
        """
        self.build()

        if formatted_yaxis:
            # Round the y-axis value to nearest thousand, million, or billion for readable y-axis
            formatter = plt.FuncFormatter(Histogram._convert_number_bmk)
            ax.yaxis.set_major_formatter(formatter)

        if overlapping:
            # NOTE(review): `label` is set to the full set of column names for every
            # overlapped series, and this branch has no return statement (returns
            # None), unlike the grouped branch below — confirm intended.
            for colname in self.hist_dict:
                ax.hist(self._get_bin_centers(),
                        bins=self.bin_boundaries,
                        alpha=0.5,
                        label=self.hist_dict.keys(),
                        weights=self.hist_dict[colname],
                        **kwargs
                        )
        else:
            # One grouped hist call: the pre-computed Spark counts are reproduced
            # by weighting a copy of the bin centers per column.
            weights_multi = [self.hist_dict[colname] for colname in self.hist_dict]
            return ax.hist([self._get_bin_centers()] * len(self.hist_dict),
                           bins=self.bin_boundaries,
                           weights=weights_multi,
                           label=self.hist_dict.keys(),
                           **kwargs)

    def plot_density(self, ax, num=300, **kwargs):
        """Returns a density plot on an Pyplot Axes object.

        Args:
            :ax: (`Axes`)
                An matplotlib Axes object on which the histogram will be plot
            :num: (`int`)
                The number of x values the line is plotted on. Default: 300
            :**kwargs:
                Keyword arguments that are passed on to the pyplot.plot function.
        """
        colors = []

        self.build()
        bin_centers = np.asarray(self._get_bin_centers())
        x_new = np.linspace(bin_centers.min(), bin_centers.max(), num)

        if 'color' in kwargs:
            # colors are applied per line below, not passed to ax.plot()
            colors = kwargs['color']
            del kwargs['color']

        power_smooth = []

        # For each column: build a normalised histogram of the bin centers weighted
        # by the Spark counts, smooth it with quadratic interpolation, and collect
        # alternating x/y arrays so a single ax.plot(*args) call draws every line.
        for (colname, bin_values) in self.hist_dict.items():
            normed_values, ble = np.histogram(self._get_bin_centers(),
                                              bins=self.bin_boundaries,
                                              weights=bin_values,
                                              density=True
                                              )
            interpolation_function = interp1d(bin_centers, normed_values, kind='quadratic')

            power_smooth.append(x_new)
            power_smooth.append(interpolation_function(x_new))

        lines = ax.plot(*power_smooth, **kwargs)

        for i, line in enumerate(lines):
            if len(colors) > 0:
                plt.setp(line, color=colors[i], label=list(self.hist_dict.keys())[i])
            else:
                plt.setp(line, label=list(self.hist_dict.keys())[i])

        return lines

    def add_data(self, data):
        """Adds 1 or more columns to a histogram.

        Multiple options are available:
            * Add a single column dataframe
            * Add a list of single column dataframes
            * Add a dataframe with multiple columns

        Args:
            :data:
                A single column Spark dataframe, a list of single column Spark
                dataframes, or a multi column Spark dataframe.
        """
        if isinstance(data, list):
            for df_column in data:
                self.add_column(df_column)

        elif len(data.columns) > 1:
            # split a multi-column frame into one single-column frame per column
            for col_name in data.columns:
                self.add_column(data.select(col_name))

        else:
            self.add_column(data)
446 |
--------------------------------------------------------------------------------
/docs/build/html/index.html:
--------------------------------------------------------------------------------
1 |
3 |
4 |
5 |
6 |
7 |
8 |
9 | Welcome to pyspark_histogram’s documentation! — pyspark_dist_explore 0.1.0 documentation
10 |
11 |
12 |
13 |
14 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
If set to true, this will generate an overlapping plot.
79 | When set to False it will generate a normal grouped histogram. Defaults to False.
80 |
81 |
82 |
formatted_yaxis:
83 |
(bool, optional)
84 | If set to true, the numbers on the yaxis will be formatted
85 | for better readability. E.g. 1500000 will become 1.5M. Defaults to True
86 |
87 |
88 |
**kwargs:
The keyword arguments as used in matplotlib.pyplot.hist
89 |
90 |
91 |
92 |
93 |
94 |
Returns:
95 |
96 |
97 |
98 |
99 |
n:
(array or list of arrays)
100 | The values of the histogram bins. See normed and weights for a description of the possible semantics.
101 | If input x is an array, then this is an array of length nbins. If input is a sequence arrays
102 | [data1, data2,..], then this is a list of arrays with the values of the histograms for each of the
103 | arrays in the same order.
104 |
105 |
bins:
(array)
106 | The edges of the bins.
107 | Length nbins + 1 (nbins left edges and right edge of last bin). Always a single array even
108 | when multiple data sets are passed in.
109 |
110 |
patches:
(list or list of lists)
111 | Silent list of individual patches used to create the histogram or list of such lists if multiple
112 | input datasets.
Plots a normalised histogram and a density plot on an Axes object
124 |
125 |
Args:
126 |
127 |
128 |
129 |
130 |
axis:
(Axes)
131 | An matplotlib Axes object on which the histogram will be plot.
132 |
133 |
x:
(DataFrame or list of DataFrame)
134 | A DataFrame with one or more numerical columns, or a list of single numerical column DataFrames
135 |
136 |
**kwargs:
The keyword arguments as used in matplotlib.pyplot.hist. Normed is set to True
137 |
138 |
139 |
140 |
141 |
Returns:
142 |
143 |
144 |
145 |
146 |
n:
(array or list of arrays)
147 | The values of the histogram bins. See normed and weights for a description of the possible semantics.
148 | If input x is an array, then this is an array of length nbins. If input is a sequence arrays
149 | [data1, data2,..], then this is a list of arrays with the values of the histograms for each of the
150 | arrays in the same order.
151 |
152 |
bins:
(array)
153 | The edges of the bins.
154 | Length nbins + 1 (nbins left edges and right edge of last bin). Always a single array even
155 | when multiple data sets are passed in.
156 |
157 |
patches:
(list or list of lists)
158 | Silent list of individual patches used to create the histogram or list of such lists if multiple
159 | input datasets.
Returns a pandas DataFrame with histograms of the Spark DataFrame
171 |
Bin ranges are formatted as text and put on the Index.
172 |
173 |
Args:
174 |
175 |
176 |
177 |
178 |
x:
(DataFrame or list of DataFrame)
179 | A DataFrame with one or more numerical columns, or a list of single numerical column DataFrames
180 |
181 |
182 |
bins:
(integer or array_like, optional)
183 | If an integer is given, bins + 1 bin edges are returned, consistently with numpy.histogram() for
184 | numpy version >= 1.3.
185 |
Unequally spaced bins are supported if bins is a sequence.
186 |
Default is 10
187 |
188 |
189 |
range:
(tuple or None, optional)
190 | The lower and upper range of the bins. Lower and upper outliers are ignored.
191 | If not provided, range is (x.min(), x.max()). Range has no effect if bins is a sequence.
192 |
If bins is a sequence or range is specified, autoscaling is based on the specified bin range instead
193 | of the range of x.
194 |
Default is None
195 |
196 |
197 |
198 |
199 |
200 |
201 |
202 |
203 |
204 |
205 | class pyspark_dist_explore.Histogram(bins=10, range=None)[source]¶
206 |
The Histogram object leverages Spark to calculate histograms, and matplotlib to visualize these.
207 |
208 |
Args:
209 |
210 |
211 |
212 |
213 |
range:
(tuple, optional)
214 | The lower and upper range of the bins.
215 | Lower and upper outliers are ignored. If not provided, range is (min(x), max(x)). Range has no
216 | effect if bins is a sequence. If bins is a sequence or range is specified, autoscaling is
217 | based on the specified bin range instead of the range of x.
218 |
219 |
bins:
(int or list of str or list of `int, optional)
220 | If an integer is given: Number of bins in the histogram. Defaults to 10.
221 | If a list is given: Predefined list of bin boundaries.
222 | The bins are all open to the right except for the last which is closed. e.g. [1,10,20,50] means
223 | the buckets are [1,10) [10,20) [20,50], which means 1<=x<10, 10<=x<20, 20<=x<=50.
Returns a matplotlib style histogram (matplotlib.pyplot.hist)
311 |
Uses the matplotlib object oriented interface to add a Histogram to an matplotlib Axes object.
312 | All named arguments from pyplot.hist can be used. A new argument called “type” makes it possible to
313 | make overlapping histogram plots.
314 |
315 |
Args:
316 |
317 |
318 |
319 |
320 |
ax:
(Axes)
321 | An matplotlib Axes object on which the histogram will be plot
322 |
323 |
overlapping (bool, optional):
324 |
If set to true, this will generate an overlapping plot.
325 | When set to False it will generate a normal grouped histogram. Defaults to False.
326 |
327 |
formatted_yaxis:
328 |
(bool, optional).
329 | If set to true, the numbers on the yaxis will be formatted
330 | for better readability. E.g. 1500000 will become 1.5M. Defaults to True
331 |
332 |
**kwargs:
The keyword arguments as used in matplotlib.pyplot.hist
Returns a pandas dataframe from the Histogram object.
344 |
This function calculates the Histogram function in Spark if it was not done yet.
345 |
346 |
Args:
347 |
348 |
349 |
350 |
351 |
kind:
(str, optional):
352 | ‘hist’ or ‘density’. When using hist this returns the histogram object
353 | as pandas dataframe. When using density the index contains the bin centers, and the values in the
354 | DataFrame are the scaled values. Defaults to ‘hist’
\ 689 | Sort by:\ 690 | best rated\ 691 | newest\ 692 | oldest\ 693 |
\ 694 |\ 698 |
Add a comment\ 700 | (markup):
\ 701 |``code``, \ 704 | code blocks:::and an indented block after blank line