├── .gitignore ├── docs ├── build │ ├── html │ │ ├── .nojekyll │ │ ├── _static │ │ │ ├── custom.css │ │ │ ├── down.png │ │ │ ├── file.png │ │ │ ├── plus.png │ │ │ ├── up.png │ │ │ ├── minus.png │ │ │ ├── comment.png │ │ │ ├── ajax-loader.gif │ │ │ ├── up-pressed.png │ │ │ ├── comment-bright.png │ │ │ ├── comment-close.png │ │ │ ├── down-pressed.png │ │ │ ├── pygments.css │ │ │ ├── doctools.js │ │ │ ├── underscore.js │ │ │ ├── basic.css │ │ │ ├── alabaster.css │ │ │ ├── searchtools.js │ │ │ └── websupport.js │ │ ├── objects.inv │ │ ├── .buildinfo │ │ ├── _sources │ │ │ └── index.rst.txt │ │ ├── searchindex.js │ │ ├── _modules │ │ │ └── index.html │ │ ├── search.html │ │ ├── py-modindex.html │ │ ├── genindex.html │ │ └── index.html │ └── doctrees │ │ ├── index.doctree │ │ └── environment.pickle ├── README_files │ └── README_5_0.png ├── Makefile ├── source │ ├── index.rst │ └── conf.py └── README.md ├── pyspark_dist_explore ├── tests │ ├── __init__.py │ ├── test_pyspark_dist_explore.pyc │ └── test_pyspark_dist_explore.py ├── __init__.py ├── requirements.txt └── pyspark_dist_explore.py ├── pyspark_dist_explore.egg-info ├── not-zip-safe ├── dependency_links.txt ├── top_level.txt ├── requires.txt ├── PKG-INFO └── SOURCES.txt ├── README_files ├── README_4_0.png ├── README_5_0.png ├── README_5_1.png ├── README_7_0.png ├── README_7_1.png ├── README_8_0.png └── README_8_1.png ├── setup.py ├── LICENSE └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | -------------------------------------------------------------------------------- /docs/build/html/.nojekyll: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pyspark_dist_explore/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pyspark_dist_explore.egg-info/not-zip-safe: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /pyspark_dist_explore.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /pyspark_dist_explore.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | pyspark_dist_explore 2 | -------------------------------------------------------------------------------- /docs/build/html/_static/custom.css: -------------------------------------------------------------------------------- 1 | /* This file intentionally left blank. 
*/ 2 | -------------------------------------------------------------------------------- /pyspark_dist_explore.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | numpy 3 | scipy 4 | matplotlib 5 | -------------------------------------------------------------------------------- /pyspark_dist_explore/__init__.py: -------------------------------------------------------------------------------- 1 | from .pyspark_dist_explore import Histogram, hist, distplot, pandas_histogram -------------------------------------------------------------------------------- /README_files/README_4_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bergvca/pyspark_dist_explore/HEAD/README_files/README_4_0.png -------------------------------------------------------------------------------- /README_files/README_5_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bergvca/pyspark_dist_explore/HEAD/README_files/README_5_0.png -------------------------------------------------------------------------------- /README_files/README_5_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bergvca/pyspark_dist_explore/HEAD/README_files/README_5_1.png -------------------------------------------------------------------------------- /README_files/README_7_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bergvca/pyspark_dist_explore/HEAD/README_files/README_7_0.png -------------------------------------------------------------------------------- /README_files/README_7_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bergvca/pyspark_dist_explore/HEAD/README_files/README_7_1.png -------------------------------------------------------------------------------- /README_files/README_8_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bergvca/pyspark_dist_explore/HEAD/README_files/README_8_0.png -------------------------------------------------------------------------------- /README_files/README_8_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bergvca/pyspark_dist_explore/HEAD/README_files/README_8_1.png -------------------------------------------------------------------------------- /docs/build/html/objects.inv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bergvca/pyspark_dist_explore/HEAD/docs/build/html/objects.inv -------------------------------------------------------------------------------- /docs/README_files/README_5_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bergvca/pyspark_dist_explore/HEAD/docs/README_files/README_5_0.png -------------------------------------------------------------------------------- /docs/build/html/_static/down.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bergvca/pyspark_dist_explore/HEAD/docs/build/html/_static/down.png -------------------------------------------------------------------------------- 
/docs/build/html/_static/file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bergvca/pyspark_dist_explore/HEAD/docs/build/html/_static/file.png -------------------------------------------------------------------------------- /docs/build/html/_static/plus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bergvca/pyspark_dist_explore/HEAD/docs/build/html/_static/plus.png -------------------------------------------------------------------------------- /docs/build/html/_static/up.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bergvca/pyspark_dist_explore/HEAD/docs/build/html/_static/up.png -------------------------------------------------------------------------------- /docs/build/doctrees/index.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bergvca/pyspark_dist_explore/HEAD/docs/build/doctrees/index.doctree -------------------------------------------------------------------------------- /docs/build/html/_static/minus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bergvca/pyspark_dist_explore/HEAD/docs/build/html/_static/minus.png -------------------------------------------------------------------------------- /docs/build/html/_static/comment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bergvca/pyspark_dist_explore/HEAD/docs/build/html/_static/comment.png -------------------------------------------------------------------------------- /docs/build/doctrees/environment.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bergvca/pyspark_dist_explore/HEAD/docs/build/doctrees/environment.pickle -------------------------------------------------------------------------------- /docs/build/html/_static/ajax-loader.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bergvca/pyspark_dist_explore/HEAD/docs/build/html/_static/ajax-loader.gif -------------------------------------------------------------------------------- /docs/build/html/_static/up-pressed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bergvca/pyspark_dist_explore/HEAD/docs/build/html/_static/up-pressed.png -------------------------------------------------------------------------------- /docs/build/html/_static/comment-bright.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bergvca/pyspark_dist_explore/HEAD/docs/build/html/_static/comment-bright.png -------------------------------------------------------------------------------- /docs/build/html/_static/comment-close.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bergvca/pyspark_dist_explore/HEAD/docs/build/html/_static/comment-close.png -------------------------------------------------------------------------------- /docs/build/html/_static/down-pressed.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Bergvca/pyspark_dist_explore/HEAD/docs/build/html/_static/down-pressed.png -------------------------------------------------------------------------------- /pyspark_dist_explore/tests/test_pyspark_dist_explore.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Bergvca/pyspark_dist_explore/HEAD/pyspark_dist_explore/tests/test_pyspark_dist_explore.pyc -------------------------------------------------------------------------------- /pyspark_dist_explore/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas>=0.20.1 2 | spark_testing_base>=0.6.0 3 | numpy>=1.12.1 4 | scipy>=0.19.0 5 | matplotlib>=2.0.2 6 | findspark>=1.1.0 7 | pyspark>=2.2.2 8 | -------------------------------------------------------------------------------- /docs/build/html/.buildinfo: -------------------------------------------------------------------------------- 1 | # Sphinx build info version 1 2 | # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. 3 | config: 5a1addecaae6ebf371e18734a5b42090 4 | tags: 645f666f9bcd5a90fca523b33c5a78b7 5 | -------------------------------------------------------------------------------- /pyspark_dist_explore.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.0 2 | Name: pyspark-dist-explore 3 | Version: 0.1.5 4 | Summary: Create histogram and density plots from PySpark Dataframes 5 | Home-page: UNKNOWN 6 | Author: Chris van den Berg 7 | Author-email: fake_email@gmail.com 8 | License: MIT License 9 | Description: UNKNOWN 10 | Platform: UNKNOWN 11 | -------------------------------------------------------------------------------- /pyspark_dist_explore.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | README.md 2 | setup.py 3 | pyspark_dist_explore/__init__.py 4 | pyspark_dist_explore/pyspark_dist_explore.py 5 | pyspark_dist_explore.egg-info/PKG-INFO 6 | pyspark_dist_explore.egg-info/SOURCES.txt 7 | pyspark_dist_explore.egg-info/dependency_links.txt 8 | pyspark_dist_explore.egg-info/not-zip-safe 9 | pyspark_dist_explore.egg-info/requires.txt 10 | pyspark_dist_explore.egg-info/top_level.txt -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = python -msphinx 7 | SPHINXPROJ = pyspark_dist_explore 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name='pyspark_dist_explore', 5 | version='0.1.7', 6 | packages=['pyspark_dist_explore'], 7 | license='MIT License', 8 | description='Create histogram and density plots from PySpark Dataframes', 9 | author='Chris van den Berg', 10 | author_email='fake_email@gmail.com', 11 | zip_safe=False, 12 | install_requires=['pandas' 13 | , 'numpy' 14 | , 'scipy' 15 | , 'matplotlib' 16 | # , 'spark_testing_base' # Only required for testing 17 | # , 'findspark' # Only required for testing 18 | ] 19 | 20 | ) 21 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. pyspark_histogram documentation master file, created by 2 | sphinx-quickstart on Tue Jun 27 16:46:34 2017. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to pyspark_histogram's documentation! 7 | ============================================= 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | 14 | 15 | Indices and tables 16 | ================== 17 | 18 | * :ref:`genindex` 19 | * :ref:`modindex` 20 | * :ref:`search` 21 | 22 | 23 | Documentation for the Code 24 | ************************** 25 | .. automodule:: pyspark_dist_explore 26 | :members: hist, distplot, pandas_histogram 27 | 28 | 29 | .. autoclass:: Histogram 30 | :members: add_data, add_column, build, to_pandas, plot_hist, plot_density 31 | -------------------------------------------------------------------------------- /docs/build/html/_sources/index.rst.txt: -------------------------------------------------------------------------------- 1 | .. pyspark_histogram documentation master file, created by 2 | sphinx-quickstart on Tue Jun 27 16:46:34 2017. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to pyspark_histogram's documentation! 7 | ============================================= 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | 14 | 15 | Indices and tables 16 | ================== 17 | 18 | * :ref:`genindex` 19 | * :ref:`modindex` 20 | * :ref:`search` 21 | 22 | 23 | Documentation for the Code 24 | ************************** 25 | .. automodule:: pyspark_dist_explore 26 | :members: hist, distplot, pandas_histogram 27 | 28 | 29 | .. 
autoclass:: Histogram 30 | :members: add_data, add_column, build, to_pandas, plot_hist, plot_density 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Chris van den Berg 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /docs/build/html/searchindex.js: -------------------------------------------------------------------------------- 1 | Search.setIndex({docnames:["index"],envversion:51,filenames:["index.rst"],objects:{"":{pyspark_dist_explore:[0,0,0,"-"]},"pyspark_dist_explore.Histogram":{add_column:[0,2,1,""],add_data:[0,2,1,""],build:[0,2,1,""],plot_density:[0,2,1,""],plot_hist:[0,2,1,""],to_pandas:[0,2,1,""]},pyspark_dist_explore:{Histogram:[0,1,1,""],distplot:[0,3,1,""],hist:[0,3,1,""],pandas_histogram:[0,3,1,""]}},objnames:{"0":["py","module","Python module"],"1":["py","class","Python class"],"2":["py","method","Python method"],"3":["py","function","Python 
function"]},objtypes:{"0":"py:module","1":"py:class","2":"py:method","3":"py:function"},terms:{"class":0,"default":0,"function":0,"int":0,"new":0,"return":0,"true":0,Ads:0,Axes:0,The:0,Uses:0,add:0,add_column:0,add_data:0,again:0,all:0,alreadi:0,alwai:0,append:0,arg:0,argument:0,arrai:0,array_lik:0,autosc:0,avail:0,axi:0,base:0,becom:0,been:0,better:0,bin:0,bool:0,boundari:0,bucket:0,build:0,calcul:0,call:0,can:0,center:0,close:0,column:0,consist:0,contain:0,creat:0,data1:0,data2:0,data:0,datafram:0,dataset:0,densiti:0,descript:0,distplot:0,doesn:0,done:0,each:0,edg:0,effect:0,even:0,except:0,fals:0,format:0,formatted_yaxi:0,from:0,gener:0,given:0,group:0,has:0,hist:0,histogram:0,ignor:0,index:0,individu:0,input:0,instead:0,integ:0,interfac:0,keyword:0,kind:0,kwarg:0,last:0,left:0,length:0,leverag:0,line:0,list:0,lower:0,make:0,matplotlib:0,max:0,mean:0,min:0,modul:0,more:0,multi:0,multipl:0,name:0,nbin:0,next:0,none:0,norm:0,normal:0,normalis:0,num:0,number:0,numer:0,numpi:0,obj:[],object:0,one:0,open:0,option:0,order:0,orient:0,outlier:0,overlap:0,page:0,panda:0,pandas_histogram:0,pass:0,patch:0,plot:0,plot_dens:0,plot_hist:0,possibl:0,predefin:0,provid:0,put:0,pyplot:0,pyspark:0,pyspark_dist_explor:0,rang:0,readabl:0,right:0,same:0,scale:0,search:0,see:0,semant:0,sequenc:0,set:0,share:0,silent:0,singl:0,sourc:0,space:0,spark:0,specifi:0,str:0,style:0,support:0,text:0,thi:0,to_panda:0,tupl:0,type:0,unequ:0,upper:0,use_log10:[],used:0,using:0,valu:0,version:0,visual:0,weight:0,when:0,where:0,which:0,yaxi:0,yet:0},titles:["Welcome to pyspark_histogram’s documentation!"],titleterms:{code:0,document:0,indic:0,pyspark_histogram:0,tabl:0,welcom:0}}) -------------------------------------------------------------------------------- /docs/build/html/_modules/index.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Overview: module code — pyspark_dist_explore 0.1.0 documentation 10 | 11 | 12 | 13 | 14 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 |
40 |
41 |
42 |
43 | 44 |

All modules for which code is available

45 | 47 | 48 |
49 |
50 |
51 | 71 |
72 |
73 | 81 | 82 | 83 | 84 | 85 | 86 | -------------------------------------------------------------------------------- /docs/build/html/search.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Search — pyspark_dist_explore 0.1.0 documentation 10 | 11 | 12 | 13 | 14 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 |
48 |
49 |
50 |
51 | 52 |

Search

53 |
54 | 55 |

56 | Please activate JavaScript to enable the search 57 | functionality. 58 |

59 |
60 |

61 | From here you can search these documents. Enter your search 62 | words into the box below and click "search". Note that the search 63 | function will automatically search for all of the words. Pages 64 | containing fewer words won't appear in the result list. 65 |

66 |
67 | 68 | 69 | 70 |
71 | 72 |
73 | 74 |
75 | 76 |
77 |
78 |
79 | 89 |
90 |
91 | 99 | 100 | 101 | 102 | 103 | 104 | -------------------------------------------------------------------------------- /docs/build/html/py-modindex.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Python Module Index — pyspark_dist_explore 0.1.0 documentation 10 | 11 | 12 | 13 | 14 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 40 | 41 | 42 | 43 | 44 | 45 | 46 |
47 |
48 |
49 |
50 | 51 | 52 |

Python Module Index

53 | 54 |
55 | p 56 |
57 | 58 | 59 | 60 | 62 | 63 | 64 | 67 |
 
61 | p
65 | pyspark_dist_explore 66 |
68 | 69 | 70 |
71 |
72 |
73 | 93 |
94 |
95 | 103 | 104 | 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /docs/build/html/_static/pygments.css: -------------------------------------------------------------------------------- 1 | .highlight .hll { background-color: #ffffcc } 2 | .highlight { background: #eeffcc; } 3 | .highlight .c { color: #408090; font-style: italic } /* Comment */ 4 | .highlight .err { border: 1px solid #FF0000 } /* Error */ 5 | .highlight .k { color: #007020; font-weight: bold } /* Keyword */ 6 | .highlight .o { color: #666666 } /* Operator */ 7 | .highlight .ch { color: #408090; font-style: italic } /* Comment.Hashbang */ 8 | .highlight .cm { color: #408090; font-style: italic } /* Comment.Multiline */ 9 | .highlight .cp { color: #007020 } /* Comment.Preproc */ 10 | .highlight .cpf { color: #408090; font-style: italic } /* Comment.PreprocFile */ 11 | .highlight .c1 { color: #408090; font-style: italic } /* Comment.Single */ 12 | .highlight .cs { color: #408090; background-color: #fff0f0 } /* Comment.Special */ 13 | .highlight .gd { color: #A00000 } /* Generic.Deleted */ 14 | .highlight .ge { font-style: italic } /* Generic.Emph */ 15 | .highlight .gr { color: #FF0000 } /* Generic.Error */ 16 | .highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */ 17 | .highlight .gi { color: #00A000 } /* Generic.Inserted */ 18 | .highlight .go { color: #333333 } /* Generic.Output */ 19 | .highlight .gp { color: #c65d09; font-weight: bold } /* Generic.Prompt */ 20 | .highlight .gs { font-weight: bold } /* Generic.Strong */ 21 | .highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ 22 | .highlight .gt { color: #0044DD } /* Generic.Traceback */ 23 | .highlight .kc { color: #007020; font-weight: bold } /* Keyword.Constant */ 24 | .highlight .kd { color: #007020; font-weight: bold } /* Keyword.Declaration */ 25 | .highlight .kn { color: #007020; font-weight: bold } /* Keyword.Namespace */ 26 | .highlight .kp { color: #007020 } /* Keyword.Pseudo */ 27 | .highlight .kr { color: #007020; font-weight: bold } /* Keyword.Reserved */ 28 | .highlight .kt { color: #902000 } /* Keyword.Type */ 29 | .highlight .m { color: #208050 } /* Literal.Number */ 30 | .highlight .s { color: #4070a0 } /* Literal.String */ 31 | .highlight .na { color: #4070a0 } /* Name.Attribute */ 32 | .highlight .nb { color: #007020 } /* Name.Builtin */ 33 | .highlight .nc { color: #0e84b5; font-weight: bold } /* Name.Class */ 34 | .highlight .no { color: #60add5 } /* Name.Constant */ 35 | .highlight .nd { color: #555555; font-weight: bold } /* Name.Decorator */ 36 | .highlight .ni { color: #d55537; font-weight: bold } /* Name.Entity */ 37 | .highlight .ne { color: #007020 } /* Name.Exception */ 38 | .highlight .nf { color: #06287e } /* Name.Function */ 39 | .highlight .nl { color: #002070; font-weight: bold } /* Name.Label */ 40 | .highlight .nn { color: #0e84b5; font-weight: bold } /* Name.Namespace */ 41 | .highlight .nt { color: #062873; font-weight: bold } /* Name.Tag */ 42 | .highlight .nv { color: #bb60d5 } /* Name.Variable */ 43 | .highlight .ow { color: #007020; font-weight: bold } /* Operator.Word */ 44 | .highlight .w { color: #bbbbbb } /* Text.Whitespace */ 45 | .highlight .mb { color: #208050 } /* Literal.Number.Bin */ 46 | .highlight .mf { color: #208050 } /* Literal.Number.Float */ 47 | .highlight .mh { color: #208050 } /* Literal.Number.Hex */ 48 | .highlight .mi { color: #208050 } /* Literal.Number.Integer */ 49 | .highlight .mo { color: #208050 } /* 
Literal.Number.Oct */ 50 | .highlight .sa { color: #4070a0 } /* Literal.String.Affix */ 51 | .highlight .sb { color: #4070a0 } /* Literal.String.Backtick */ 52 | .highlight .sc { color: #4070a0 } /* Literal.String.Char */ 53 | .highlight .dl { color: #4070a0 } /* Literal.String.Delimiter */ 54 | .highlight .sd { color: #4070a0; font-style: italic } /* Literal.String.Doc */ 55 | .highlight .s2 { color: #4070a0 } /* Literal.String.Double */ 56 | .highlight .se { color: #4070a0; font-weight: bold } /* Literal.String.Escape */ 57 | .highlight .sh { color: #4070a0 } /* Literal.String.Heredoc */ 58 | .highlight .si { color: #70a0d0; font-style: italic } /* Literal.String.Interpol */ 59 | .highlight .sx { color: #c65d09 } /* Literal.String.Other */ 60 | .highlight .sr { color: #235388 } /* Literal.String.Regex */ 61 | .highlight .s1 { color: #4070a0 } /* Literal.String.Single */ 62 | .highlight .ss { color: #517918 } /* Literal.String.Symbol */ 63 | .highlight .bp { color: #007020 } /* Name.Builtin.Pseudo */ 64 | .highlight .fm { color: #06287e } /* Name.Function.Magic */ 65 | .highlight .vc { color: #bb60d5 } /* Name.Variable.Class */ 66 | .highlight .vg { color: #bb60d5 } /* Name.Variable.Global */ 67 | .highlight .vi { color: #bb60d5 } /* Name.Variable.Instance */ 68 | .highlight .vm { color: #bb60d5 } /* Name.Variable.Magic */ 69 | .highlight .il { color: #208050 } /* Literal.Number.Integer.Long */ -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # pyspark_dist_explore documentation build configuration file, created by 4 | # sphinx-quickstart on Tue Jun 27 16:46:34 2017. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | # If extensions (or modules to document with autodoc) are in another directory, 16 | # add these directories to sys.path here. If the directory is relative to the 17 | # documentation root, use os.path.abspath to make it absolute, like shown here. 18 | # 19 | # import os 20 | # import sys 21 | # sys.path.insert(0, os.path.abspath('.')) 22 | 23 | 24 | # -- General configuration ------------------------------------------------ 25 | 26 | # If your documentation needs a minimal Sphinx version, state it here. 27 | # 28 | # needs_sphinx = '1.0' 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 32 | # ones. 33 | extensions = ['sphinx.ext.autodoc', 34 | 'sphinx.ext.intersphinx', 35 | 'sphinx.ext.ifconfig', 36 | 'sphinx.ext.viewcode', 37 | 'sphinx.ext.githubpages'] 38 | 39 | # Add any paths that contain templates here, relative to this directory. 40 | templates_path = ['_templates'] 41 | 42 | # The suffix(es) of source filenames. 43 | # You can specify multiple suffix as a list of string: 44 | # 45 | # source_suffix = ['.rst', '.md'] 46 | source_suffix = '.rst' 47 | 48 | # The master toctree document. 49 | master_doc = 'index' 50 | 51 | # General information about the project. 
52 | project = u'pyspark_dist_explore' 53 | copyright = u'2017, Chris van den Berg' 54 | author = u'Chris van den Berg' 55 | 56 | # The version info for the project you're documenting, acts as replacement for 57 | # |version| and |release|, also used in various other places throughout the 58 | # built documents. 59 | # 60 | # The short X.Y version. 61 | version = u'0.1.0' 62 | # The full version, including alpha/beta/rc tags. 63 | release = u'0.1.0' 64 | 65 | # The language for content autogenerated by Sphinx. Refer to documentation 66 | # for a list of supported languages. 67 | # 68 | # This is also used if you do content translation via gettext catalogs. 69 | # Usually you set "language" from the command line for these cases. 70 | language = None 71 | 72 | # List of patterns, relative to source directory, that match files and 73 | # directories to ignore when looking for source files. 74 | # This patterns also effect to html_static_path and html_extra_path 75 | exclude_patterns = [] 76 | 77 | # The name of the Pygments (syntax highlighting) style to use. 78 | pygments_style = 'sphinx' 79 | 80 | # If true, `todo` and `todoList` produce output, else they produce nothing. 81 | todo_include_todos = False 82 | 83 | 84 | # -- Options for HTML output ---------------------------------------------- 85 | 86 | # The theme to use for HTML and HTML Help pages. See the documentation for 87 | # a list of builtin themes. 88 | # 89 | html_theme = 'alabaster' 90 | 91 | # Theme options are theme-specific and customize the look and feel of a theme 92 | # further. For a list of options available for each theme, see the 93 | # documentation. 94 | # 95 | # html_theme_options = {} 96 | 97 | # Add any paths that contain custom static files (such as style sheets) here, 98 | # relative to this directory. They are copied after the builtin static files, 99 | # so a file named "default.css" will overwrite the builtin "default.css". 100 | html_static_path = ['_static'] 101 | 102 | 103 | # -- Options for HTMLHelp output ------------------------------------------ 104 | 105 | # Output file base name for HTML help builder. 106 | htmlhelp_basename = 'pyspark_histogramdoc' 107 | 108 | 109 | # -- Options for LaTeX output --------------------------------------------- 110 | 111 | latex_elements = { 112 | # The paper size ('letterpaper' or 'a4paper'). 113 | # 114 | # 'papersize': 'letterpaper', 115 | 116 | # The font size ('10pt', '11pt' or '12pt'). 117 | # 118 | # 'pointsize': '10pt', 119 | 120 | # Additional stuff for the LaTeX preamble. 121 | # 122 | # 'preamble': '', 123 | 124 | # Latex figure (float) alignment 125 | # 126 | # 'figure_align': 'htbp', 127 | } 128 | 129 | # Grouping the document tree into LaTeX files. List of tuples 130 | # (source start file, target name, title, 131 | # author, documentclass [howto, manual, or own class]). 132 | latex_documents = [ 133 | (master_doc, 'pyspark_dist_explore.tex', u'pyspark\\_histogram Documentation', 134 | u'Chris van den Berg', 'manual'), 135 | ] 136 | 137 | 138 | # -- Options for manual page output --------------------------------------- 139 | 140 | # One entry per manual page. List of tuples 141 | # (source start file, name, description, authors, manual section). 142 | man_pages = [ 143 | (master_doc, 'pyspark_dist_explore', u'pyspark_dist_explore Documentation', 144 | [author], 1) 145 | ] 146 | 147 | 148 | # -- Options for Texinfo output ------------------------------------------- 149 | 150 | # Grouping the document tree into Texinfo files. 
List of tuples 151 | # (source start file, target name, title, author, 152 | # dir menu entry, description, category) 153 | texinfo_documents = [ 154 | (master_doc, 'pyspark_dist_explore', u'pyspark_dist_explore Documentation', 155 | author, 'pyspark_dist_explore', 'One line description of project.', 156 | 'Miscellaneous'), 157 | ] 158 | 159 | 160 | 161 | 162 | # Example configuration for intersphinx: refer to the Python standard library. 163 | intersphinx_mapping = {'https://docs.python.org/': None} 164 | 165 | import sys 166 | import os 167 | sys.path.append(os.path.abspath('../../')) 168 | -------------------------------------------------------------------------------- /docs/build/html/genindex.html: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | Index — pyspark_dist_explore 0.1.0 documentation 11 | 12 | 13 | 14 | 15 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 |
41 |
42 |
43 |
44 | 45 | 46 |

Index

47 | 48 |
49 | A 50 | | B 51 | | D 52 | | H 53 | | P 54 | | T 55 | 56 |
57 |

A

58 | 59 | 63 | 67 |
68 | 69 |

B

70 | 71 | 75 |
76 | 77 |

D

78 | 79 | 83 |
84 | 85 |

H

86 | 87 | 91 | 95 |
96 | 97 |

P

98 | 99 | 105 | 111 |
112 | 113 |

T

114 | 115 | 119 |
120 | 121 | 122 | 123 |
124 |
125 |
126 | 149 |
150 |
151 | 159 | 160 | 161 | 162 | 163 | 164 | -------------------------------------------------------------------------------- /docs/build/html/_static/doctools.js: -------------------------------------------------------------------------------- 1 | /* 2 | * doctools.js 3 | * ~~~~~~~~~~~ 4 | * 5 | * Sphinx JavaScript utilities for all documentation. 6 | * 7 | * :copyright: Copyright 2007-2017 by the Sphinx team, see AUTHORS. 8 | * :license: BSD, see LICENSE for details. 9 | * 10 | */ 11 | 12 | /** 13 | * select a different prefix for underscore 14 | */ 15 | $u = _.noConflict(); 16 | 17 | /** 18 | * make the code below compatible with browsers without 19 | * an installed firebug like debugger 20 | if (!window.console || !console.firebug) { 21 | var names = ["log", "debug", "info", "warn", "error", "assert", "dir", 22 | "dirxml", "group", "groupEnd", "time", "timeEnd", "count", "trace", 23 | "profile", "profileEnd"]; 24 | window.console = {}; 25 | for (var i = 0; i < names.length; ++i) 26 | window.console[names[i]] = function() {}; 27 | } 28 | */ 29 | 30 | /** 31 | * small helper function to urldecode strings 32 | */ 33 | jQuery.urldecode = function(x) { 34 | return decodeURIComponent(x).replace(/\+/g, ' '); 35 | }; 36 | 37 | /** 38 | * small helper function to urlencode strings 39 | */ 40 | jQuery.urlencode = encodeURIComponent; 41 | 42 | /** 43 | * This function returns the parsed url parameters of the 44 | * current request. Multiple values per key are supported, 45 | * it will always return arrays of strings for the value parts. 46 | */ 47 | jQuery.getQueryParameters = function(s) { 48 | if (typeof s == 'undefined') 49 | s = document.location.search; 50 | var parts = s.substr(s.indexOf('?') + 1).split('&'); 51 | var result = {}; 52 | for (var i = 0; i < parts.length; i++) { 53 | var tmp = parts[i].split('=', 2); 54 | var key = jQuery.urldecode(tmp[0]); 55 | var value = jQuery.urldecode(tmp[1]); 56 | if (key in result) 57 | result[key].push(value); 58 | else 59 | result[key] = [value]; 60 | } 61 | return result; 62 | }; 63 | 64 | /** 65 | * highlight a given string on a jquery object by wrapping it in 66 | * span elements with the given class name. 67 | */ 68 | jQuery.fn.highlightText = function(text, className) { 69 | function highlight(node) { 70 | if (node.nodeType == 3) { 71 | var val = node.nodeValue; 72 | var pos = val.toLowerCase().indexOf(text); 73 | if (pos >= 0 && !jQuery(node.parentNode).hasClass(className)) { 74 | var span = document.createElement("span"); 75 | span.className = className; 76 | span.appendChild(document.createTextNode(val.substr(pos, text.length))); 77 | node.parentNode.insertBefore(span, node.parentNode.insertBefore( 78 | document.createTextNode(val.substr(pos + text.length)), 79 | node.nextSibling)); 80 | node.nodeValue = val.substr(0, pos); 81 | } 82 | } 83 | else if (!jQuery(node).is("button, select, textarea")) { 84 | jQuery.each(node.childNodes, function() { 85 | highlight(this); 86 | }); 87 | } 88 | } 89 | return this.each(function() { 90 | highlight(this); 91 | }); 92 | }; 93 | 94 | /* 95 | * backward compatibility for jQuery.browser 96 | * This will be supported until firefox bug is fixed. 97 | */ 98 | if (!jQuery.browser) { 99 | jQuery.uaMatch = function(ua) { 100 | ua = ua.toLowerCase(); 101 | 102 | var match = /(chrome)[ \/]([\w.]+)/.exec(ua) || 103 | /(webkit)[ \/]([\w.]+)/.exec(ua) || 104 | /(opera)(?:.*version|)[ \/]([\w.]+)/.exec(ua) || 105 | /(msie) ([\w.]+)/.exec(ua) || 106 | ua.indexOf("compatible") < 0 && /(mozilla)(?:.*? 
rv:([\w.]+)|)/.exec(ua) || 107 | []; 108 | 109 | return { 110 | browser: match[ 1 ] || "", 111 | version: match[ 2 ] || "0" 112 | }; 113 | }; 114 | jQuery.browser = {}; 115 | jQuery.browser[jQuery.uaMatch(navigator.userAgent).browser] = true; 116 | } 117 | 118 | /** 119 | * Small JavaScript module for the documentation. 120 | */ 121 | var Documentation = { 122 | 123 | init : function() { 124 | this.fixFirefoxAnchorBug(); 125 | this.highlightSearchWords(); 126 | this.initIndexTable(); 127 | 128 | }, 129 | 130 | /** 131 | * i18n support 132 | */ 133 | TRANSLATIONS : {}, 134 | PLURAL_EXPR : function(n) { return n == 1 ? 0 : 1; }, 135 | LOCALE : 'unknown', 136 | 137 | // gettext and ngettext don't access this so that the functions 138 | // can safely bound to a different name (_ = Documentation.gettext) 139 | gettext : function(string) { 140 | var translated = Documentation.TRANSLATIONS[string]; 141 | if (typeof translated == 'undefined') 142 | return string; 143 | return (typeof translated == 'string') ? translated : translated[0]; 144 | }, 145 | 146 | ngettext : function(singular, plural, n) { 147 | var translated = Documentation.TRANSLATIONS[singular]; 148 | if (typeof translated == 'undefined') 149 | return (n == 1) ? singular : plural; 150 | return translated[Documentation.PLURALEXPR(n)]; 151 | }, 152 | 153 | addTranslations : function(catalog) { 154 | for (var key in catalog.messages) 155 | this.TRANSLATIONS[key] = catalog.messages[key]; 156 | this.PLURAL_EXPR = new Function('n', 'return +(' + catalog.plural_expr + ')'); 157 | this.LOCALE = catalog.locale; 158 | }, 159 | 160 | /** 161 | * add context elements like header anchor links 162 | */ 163 | addContextElements : function() { 164 | $('div[id] > :header:first').each(function() { 165 | $('\u00B6'). 166 | attr('href', '#' + this.id). 167 | attr('title', _('Permalink to this headline')). 168 | appendTo(this); 169 | }); 170 | $('dt[id]').each(function() { 171 | $('\u00B6'). 172 | attr('href', '#' + this.id). 173 | attr('title', _('Permalink to this definition')). 174 | appendTo(this); 175 | }); 176 | }, 177 | 178 | /** 179 | * workaround a firefox stupidity 180 | * see: https://bugzilla.mozilla.org/show_bug.cgi?id=645075 181 | */ 182 | fixFirefoxAnchorBug : function() { 183 | if (document.location.hash) 184 | window.setTimeout(function() { 185 | document.location.href += ''; 186 | }, 10); 187 | }, 188 | 189 | /** 190 | * highlight the search words provided in the url in the text 191 | */ 192 | highlightSearchWords : function() { 193 | var params = $.getQueryParameters(); 194 | var terms = (params.highlight) ? 
params.highlight[0].split(/\s+/) : []; 195 | if (terms.length) { 196 | var body = $('div.body'); 197 | if (!body.length) { 198 | body = $('body'); 199 | } 200 | window.setTimeout(function() { 201 | $.each(terms, function() { 202 | body.highlightText(this.toLowerCase(), 'highlighted'); 203 | }); 204 | }, 10); 205 | $('') 207 | .appendTo($('#searchbox')); 208 | } 209 | }, 210 | 211 | /** 212 | * init the domain index toggle buttons 213 | */ 214 | initIndexTable : function() { 215 | var togglers = $('img.toggler').click(function() { 216 | var src = $(this).attr('src'); 217 | var idnum = $(this).attr('id').substr(7); 218 | $('tr.cg-' + idnum).toggle(); 219 | if (src.substr(-9) == 'minus.png') 220 | $(this).attr('src', src.substr(0, src.length-9) + 'plus.png'); 221 | else 222 | $(this).attr('src', src.substr(0, src.length-8) + 'minus.png'); 223 | }).css('display', ''); 224 | if (DOCUMENTATION_OPTIONS.COLLAPSE_INDEX) { 225 | togglers.click(); 226 | } 227 | }, 228 | 229 | /** 230 | * helper function to hide the search marks again 231 | */ 232 | hideSearchWords : function() { 233 | $('#searchbox .highlight-link').fadeOut(300); 234 | $('span.highlighted').removeClass('highlighted'); 235 | }, 236 | 237 | /** 238 | * make the url absolute 239 | */ 240 | makeURL : function(relativeURL) { 241 | return DOCUMENTATION_OPTIONS.URL_ROOT + '/' + relativeURL; 242 | }, 243 | 244 | /** 245 | * get the current relative url 246 | */ 247 | getCurrentURL : function() { 248 | var path = document.location.pathname; 249 | var parts = path.split(/\//); 250 | $.each(DOCUMENTATION_OPTIONS.URL_ROOT.split(/\//), function() { 251 | if (this == '..') 252 | parts.pop(); 253 | }); 254 | var url = parts.join('/'); 255 | return path.substring(url.lastIndexOf('/') + 1, path.length - 1); 256 | }, 257 | 258 | initOnKeyListeners: function() { 259 | $(document).keyup(function(event) { 260 | var activeElementType = document.activeElement.tagName; 261 | // don't navigate when in search box or textarea 262 | if (activeElementType !== 'TEXTAREA' && activeElementType !== 'INPUT' && activeElementType !== 'SELECT') { 263 | switch (event.keyCode) { 264 | case 37: // left 265 | var prevHref = $('link[rel="prev"]').prop('href'); 266 | if (prevHref) { 267 | window.location.href = prevHref; 268 | return false; 269 | } 270 | case 39: // right 271 | var nextHref = $('link[rel="next"]').prop('href'); 272 | if (nextHref) { 273 | window.location.href = nextHref; 274 | return false; 275 | } 276 | } 277 | } 278 | }); 279 | } 280 | }; 281 | 282 | // quick alias for translations 283 | _ = Documentation.gettext; 284 | 285 | $(document).ready(function() { 286 | Documentation.init(); 287 | }); -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # pyspark_dist_explore 3 | ______________________________ 4 | 5 | ## PySpark Dataframe Distribution Explorer 6 | 7 | Pyspark_dist_explore is a plotting library to get quick insights on data in Spark DataFrames through histograms and density plots, where the heavy lifting is done in Spark. 8 | 9 | Pypsark_dist_explore has two ways of working: there are 3 functions to create matplotlib graphs or pandas dataframes easily, and a class (Histogram) to do more advanced explorations while minimizing the amount of computation needed. 10 | 11 | ## Functions: 12 | * **hist(ax, x, \*\*kwargs)**. The *hist* function is almost exactly the same as the matplotlib hist function. 
See [here](https://matplotlib.org/examples/statistics/histogram_demo_multihist.html) for examples. The only two differences are: 13 | * Instead of being a function of an Axes object, an Axes object is needed as input. 14 | * Instead of having a numpy array, a list of arrays, or a matrix as input, the function works on Spark DataFrames with a single column, a list of single-column Spark DataFrames, or a Spark DataFrame with multiple columns. All other keyword arguments of the [Matplotlib hist](https://matplotlib.org/api/_as_gen/matplotlib.axes.Axes.hist.html) function can be used. 15 | 16 | 17 | * **distplot(ax, x, \*\*kwargs)**. Combines a normalized histogram of each column in x with a density plot of the same column. 18 | 19 | * **pandas_histogram(x, bins=None, range=None)**. Creates histograms for all columns in x and converts them to a Pandas DataFrame. 20 | 21 | ## Installing: 22 | Install from PyPI: 23 | 24 | ```pip install pyspark_dist_explore``` 25 | 26 | Or directly from GitHub: 27 | 28 | ``` 29 | git clone https://github.com/Bergvca/pyspark_dist_explore.git 30 | cd pyspark_dist_explore 31 | pip install . 32 | ``` 33 | ### Examples 34 | 35 | 36 | 37 | ```python 38 | import pyspark 39 | import pandas as pd 40 | import numpy as np 41 | import pyspark.sql.functions as F 42 | import matplotlib.pyplot as plt 43 | import seaborn as sns 44 | 45 | from IPython.display import display, HTML, display_html # useful to display wide tables 46 | from pyspark_dist_explore import Histogram, hist, distplot, pandas_histogram 47 | from pyspark.sql import Row 48 | 49 | sc = pyspark.SparkContext() 50 | sqlContext = pyspark.SQLContext(sc) 51 | %matplotlib inline 52 | ``` 53 | 54 | ```python 55 | # Create some data in a Spark DataFrame: 56 | n_observations = 200 57 | 58 | random_dist_1 = np.random.logistic(100, 1000, n_observations) 59 | random_dist_2 = np.random.logistic(400, 500, n_observations) 60 | age_dist_1 = 20 * np.random.randn(n_observations) + 40 61 | age_dist_2 = 15 * np.random.randn(n_observations) + 30 62 | 63 | list_male = [('M', rand_value, age_dist_1[i]) for i, rand_value in enumerate(random_dist_1)] 64 | list_female = [('F', rand_value, age_dist_2[i]) for i, rand_value in enumerate(random_dist_2)] 65 | 66 | list_male_female = list_male + list_female 67 | 68 | rdd = sc.parallelize(list_male_female) 69 | transactions = rdd.map(lambda x: Row(gender=x[0], amount=float(x[1]), age=float(x[2]))) 70 | transactions_df = sqlContext.createDataFrame(transactions) 71 | 72 | ``` 73 | 74 | 75 | ```python 76 | # Create some selections on this data 77 | 78 | filtered_by_gender_m = transactions_df.filter(F.col('gender') == 'M').select(F.col('amount').alias('amount_m')) 79 | filtered_by_gender_f = transactions_df.filter(F.col('gender') == 'F').select(F.col('amount').alias('amount_f')) 80 | filtered_by_age_50_plus = transactions_df.filter(F.col('age') > 50).select(F.col('amount').alias('amount_50_plus')) 81 | filtered_by_age_50_minus = transactions_df.filter(F.col('age') <= 50).select(F.col('amount').alias('amount_50_minus')) 82 | 83 | # Create the plots 84 | 85 | fig, axes = plt.subplots(nrows=2, ncols=2) 86 | fig.set_size_inches(20, 20) 87 | 88 | # Use the hist function to plot histograms on the Axes 89 | hist(axes[0, 0], [filtered_by_gender_m, filtered_by_gender_f], bins = 20, color=['red', 'tan']) 90 | axes[0, 0].set_title('01. Compare Genders') 91 | axes[0, 0].legend() 92 | 93 | hist(axes[0, 1], [filtered_by_age_50_plus, filtered_by_age_50_minus], overlapping=True) 94 | axes[0, 1].set_title('02. Compare Age') 95 | axes[0, 1].legend() 96 | 97 | # Use the distplot function to plot (scaled) histograms + density plots on the Axes 98 | distplot(axes[1, 0], [filtered_by_gender_m, filtered_by_gender_f], bins=20) 99 | axes[1, 0].set_title('03. Compare distribution per gender') 100 | axes[1, 0].legend() 101 | 102 | distplot(axes[1, 1], [filtered_by_age_50_plus, filtered_by_age_50_minus], bins=20, color=['orange', 'green']) 103 | axes[1, 1].set_title('04. Compare distribution per age group') 104 | _ = axes[1, 1].legend() 105 | 106 | ``` 107 | 108 | 109 | ![png](README_files/README_5_0.png) 110 | 111 | 112 | 113 | ```python 114 | # Convert histograms of the 4 datasets to a pandas DataFrame 115 | 116 | # Put the outliers in separate bins: 117 | bins = [-6000, -3000] + [bin_range for bin_range in range(-2500, 4000, 500)] + [6000] 118 | 119 | 120 | compare_all_df = pandas_histogram([filtered_by_gender_m, 121 | filtered_by_gender_f, 122 | filtered_by_age_50_plus, 123 | filtered_by_age_50_minus], 124 | bins=bins, range=(-4000, 4000)) 125 | display(compare_all_df) 126 | ``` 127 | 128 | 129 |
130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 |
 amount_50_minus | amount_50_plus | amount_f | amount_m
-6000.00 - -3000.004215
-3000.00 - -2500.004206
-2500.00 - -2000.00113113
-2000.00 - -1500.00105411
-1500.00 - -1000.00216918
-1000.00 - -500.003291625
-500.00 - 0.003983017
0.00 - 500.0068135229
500.00 - 1000.0046174320
1000.00 - 1500.002942211
1500.00 - 2000.002491320
2000.00 - 2500.0010569
2500.00 - 3000.004325
3000.00 - 3500.005014
3500.00 - 6000.002305
249 |
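Because `pandas_histogram` returns an ordinary pandas DataFrame, the binned counts shown above can be reused with any pandas or matplotlib tooling without touching Spark again. The snippet below is a minimal, optional sketch (not part of the original notebook); it assumes the `compare_all_df` DataFrame and the `matplotlib.pyplot as plt` import from the cells above:

```python
# Hypothetical follow-up: reuse the bin counts already computed by pandas_histogram.
# Assumes compare_all_df and plt from the example above.
fig, ax = plt.subplots(figsize=(12, 6))
compare_all_df.plot.bar(ax=ax)   # one group of bars per bin, one colour per column
ax.set_xlabel('amount bin')
ax.set_ylabel('count')
_ = ax.set_title('Binned counts plotted directly from the pandas DataFrame')
```

No further Spark work is needed at this point; the aggregation already happened when `pandas_histogram` was called.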
250 | 251 | 252 | ## The Histogram Class 253 | 254 | In addition to the functions above for quick results, the pyspark_dist_explore library contains a Histogram class. The advantage of using this class is that it retains state, so once the histogram is built, multiple actions can be done without recalculating the bin values. 255 | 256 | ### Examples 257 | 258 | 259 | ```python 260 | age_hist = Histogram(range=(-4000, 4000), bins=15) 261 | 262 | # Create a histogram for different age groups 263 | for age in range(0, 90, 10): 264 | age_hist.add_data( 265 | transactions_df. 266 | filter((F.col('age') > age) & (F.col('age') <= age+10)). 267 | select(F.col('amount').alias('amount_%d_%d' % (age, age+10))) 268 | ) 269 | 270 | fig, axes = plt.subplots(nrows=2) 271 | fig.set_size_inches(20, 10) 272 | 273 | age_hist.plot_hist(axes[0], histtype='step', linewidth=2.0, fill=False, cumulative=True) # The Histogram is built here 274 | age_hist.plot_density(axes[1]) # The density plot is created from the already built histogram 275 | 276 | # Set the legends 277 | axes[0].legend(loc='upper left') 278 | axes[0].set_title('Cumulative Histogram') 279 | axes[1].legend() 280 | axes[1].set_title('Kernel Density Plot') 281 | 282 | age_hist_pd_df = age_hist.to_pandas() # Again, the histograms don't need to be recalculated. 283 | 284 | # Create a heatmap from the Pandas DataFrame 285 | 286 | fig, axes = plt.subplots() 287 | fig.set_size_inches(10, 10) 288 | ax = sns.heatmap(age_hist_pd_df, annot=True, ax=axes) 289 | _ = ax.set_title('Heatmap') 290 | ``` 291 | 292 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | 2 | # pyspark_dist_explore 3 | ______________________________ 4 | 5 | ## PySpark Dataframe Distribution Explorer 6 | 7 | Pyspark_dist_explore is a plotting library to get quick insights on data in Spark DataFrames through histograms and density plots, where the heavy lifting is done in Spark. 8 | 9 | Pyspark_dist_explore has two ways of working: there are 3 functions to create matplotlib graphs or pandas dataframes easily, and a class (Histogram) to do more advanced explorations while minimizing the amount of computation needed. 10 | 11 | ## Functions: 12 | * **hist(ax, x, \*\*kwargs)**. The *hist* function is almost exactly the same as the matplotlib hist function. See [here](https://matplotlib.org/examples/statistics/histogram_demo_multihist.html) for examples. The only two differences are: 13 | * Instead of being a function of an Axes object, an Axes object is needed as input. 14 | * Instead of having a numpy array, a list of arrays, or a matrix as input, the function works on Spark DataFrames with a single column, a list of single-column Spark DataFrames, or a Spark DataFrame with multiple columns. All other keyword arguments of the [Matplotlib hist](https://matplotlib.org/api/_as_gen/matplotlib.axes.Axes.hist.html) function can be used. 15 | 16 | 17 | * **distplot(ax, x, \*\*kwargs)**. Combines a normalized histogram of each column in x with a density plot of the same column. 18 | 19 | * **pandas_histogram(x, bins=None, range=None)**.
Creates histograms for all columns in x and converts this to a Pandas DataFrame 20 | 21 | ## Installing: 22 | Install from PyPi: 23 | 24 | ```pip install pyspark_dist_explore``` 25 | 26 | Or directly from github: 27 | 28 | ``` 29 | git clone https://github.com/Bergvca/pyspark_dist_explore.git 30 | cd pyspark_dist_explore 31 | pip install . 32 | ``` 33 | ### Examples 34 | 35 | 36 | 37 | ```python 38 | import pyspark 39 | import pandas as pd 40 | import numpy as np 41 | import pyspark.sql.functions as F 42 | import matplotlib.pyplot as plt 43 | import seaborn as sns 44 | 45 | from IPython.display import display, HTML, display_html #usefull to display wide tables 46 | from pyspark_dist_explore import Histogram, hist, distplot, pandas_histogram 47 | from pyspark.sql import Row 48 | 49 | sc = pyspark.SparkContext() 50 | sqlContext = pyspark.SQLContext(sc) 51 | %matplotlib inline 52 | ``` 53 | 54 | ```python 55 | # Create some data in a Spark DataFrame: 56 | n_observations = 200 57 | 58 | random_dist_1 = np.random.logistic(100, 1000, n_observations) 59 | random_dist_2 = np.random.logistic(400, 500, n_observations) 60 | age_dist_1 = 20 * np.random.randn(n_observations) + 40 61 | age_dist_2 = 15 * np.random.randn(n_observations) + 30 62 | 63 | list_male = [('M', rand_value, age_dist_1[i]) for i, rand_value in enumerate(random_dist_1)] 64 | list_female = [('F', rand_value, age_dist_2[i]) for i, rand_value in enumerate(random_dist_2)] 65 | 66 | list_male_female = list_male + list_female 67 | 68 | rdd = sc.parallelize(list_male_female) 69 | transactions = rdd.map(lambda x: Row(gender=x[0], amount=float(x[1]), age=float(x[2]))) 70 | transactions_df = sqlContext.createDataFrame(transactions) 71 | 72 | ``` 73 | 74 | 75 | ```python 76 | # Create some selections on this data 77 | 78 | filtered_by_gender_m = transactions_df.filter(F.col('gender') == 'M').select(F.col('amount').alias('amount_m')) 79 | filtered_by_gender_f = transactions_df.filter(F.col('gender') == 'F').select(F.col('amount').alias('amount_f') ) 80 | filtered_by_age_50_plus = transactions_df.filter(F.col('age') > 50).select(F.col('amount').alias('amount_50_plus')) 81 | filtered_by_age_50_minus = transactions_df.filter(F.col('age') <= 50).select(F.col('amount').alias('amount_50_minus')) 82 | 83 | # Create the plots 84 | 85 | fig, axes = plt.subplots(nrows=2, ncols=2) 86 | fig.set_size_inches(20, 20) 87 | 88 | # Use the hist function to plot histograms on the Axes 89 | hist(axes[0, 0], [filtered_by_gender_m, filtered_by_gender_f], bins = 20, color=['red', 'tan']) 90 | axes[0, 0].set_title('01. Compare Genders') 91 | axes[0, 0].legend() 92 | 93 | hist(axes[0, 1], [filtered_by_age_50_plus, filtered_by_age_50_minus], overlapping=True) 94 | axes[0, 1].set_title('02. Compare Age') 95 | axes[0, 1].legend() 96 | 97 | # Use the distplot function to plot (scaled) histograms + density plots on the Axes 98 | distplot(axes[1, 0], [filtered_by_gender_m, filtered_by_gender_f], bins=20) 99 | axes[1, 0].set_title('03. Compare distribution per gender') 100 | axes[1, 0].legend() 101 | 102 | distplot(axes[1, 1], [filtered_by_age_50_plus, filtered_by_age_50_minus], bins=20, color=['orange', 'green']) 103 | axes[1, 1].set_title('03. 
Compare distribution per age group') 104 | _ = axes[1, 1].legend() 105 | 106 | ``` 107 | 108 | 109 | ![png](README_files/README_5_0.png) 110 | 111 | 112 | 113 | ```python 114 | # Convert Histograms of the 4 datasets to a pandas dataframe 115 | 116 | # Put the outliers in seperate bins: 117 | bins = [-6000, -3000] + [bin_range for bin_range in range(-2500, 4000, 500)] + [6000] 118 | 119 | 120 | compare_all_df = pandas_histogram([filtered_by_gender_m, 121 | filtered_by_gender_f, 122 | filtered_by_age_50_plus, 123 | filtered_by_age_50_minus], 124 | bins=bins, range=(-4000, 4000)) 125 | display(compare_all_df) 126 | ``` 127 | 128 | 129 |
130 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 |
 amount_50_minus | amount_50_plus | amount_f | amount_m
-6000.00 - -3000.004215
-3000.00 - -2500.004206
-2500.00 - -2000.00113113
-2000.00 - -1500.00105411
-1500.00 - -1000.00216918
-1000.00 - -500.003291625
-500.00 - 0.003983017
0.00 - 500.0068135229
500.00 - 1000.0046174320
1000.00 - 1500.002942211
1500.00 - 2000.002491320
2000.00 - 2500.0010569
2500.00 - 3000.004325
3000.00 - 3500.005014
3500.00 - 6000.002305
261 |
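Since `pandas_histogram` returns an ordinary pandas DataFrame, the binned counts above can be reused directly with the usual pandas tooling (note that `range` has no effect here, because `bins` is passed as an explicit list). A minimal sketch, assuming the `compare_all_df` from the cell above, that turns the counts into a grouped bar chart:

```python
# Plot the binned counts straight from the DataFrame returned by pandas_histogram.
ax = compare_all_df.plot.bar(figsize=(12, 6))
ax.set_xlabel('amount bin')
ax.set_ylabel('count')
```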
262 | 263 | 264 | ## The Histogram Class 265 | 266 | In addition to running the functions as above to get results quickly, the pyspark_dist_explore library also contains a Histogram class. The advantage of using this class is that it retains state: once the histogram has been built, multiple actions can be performed without recalculating the bin values. 267 | 268 | ### Examples 269 | 270 | 271 | ```python 272 | age_hist = Histogram(range=(-4000, 4000), bins=15) 273 | 274 | # Create a histogram for different age groups 275 | for age in range(0, 90, 10): 276 | age_hist.add_data( 277 | transactions_df. 278 | filter((F.col('age') > age) & (F.col('age') <= age+10)). 279 | select(F.col('amount').alias('amount_%d_%d' % (age, age+10))) 280 | ) 281 | 282 | fig, axes = plt.subplots(nrows=2) 283 | fig.set_size_inches(20, 10) 284 | 285 | age_hist.plot_hist(axes[0], histtype='step', linewidth=2.0, fill=False, cumulative=True) # The histogram is built here 286 | age_hist.plot_density(axes[1]) # The density plot is created from the already built histogram 287 | 288 | # Set the legends 289 | axes[0].legend(loc='upper left') 290 | axes[0].set_title('Cumulative Histogram') 291 | axes[1].legend() 292 | axes[1].set_title('Kernel Density Plot') 293 | 294 | age_hist_pd_df = age_hist.to_pandas() # Again, the histograms don't need to be recalculated. 295 | 296 | # Create a heatmap from the pandas DataFrame 297 | 298 | fig, axes = plt.subplots() 299 | fig.set_size_inches(10, 10) 300 | ax = sns.heatmap(age_hist_pd_df, annot=True, ax=axes) 301 | _ = ax.set_title('Heatmap') 302 | ``` 303 | 304 | -------------------------------------------------------------------------------- /docs/build/html/_static/underscore.js: -------------------------------------------------------------------------------- 1 | // Underscore.js 1.3.1 2 | // (c) 2009-2012 Jeremy Ashkenas, DocumentCloud Inc. 3 | // Underscore is freely distributable under the MIT license. 4 | // Portions of Underscore are inspired or borrowed from Prototype, 5 | // Oliver Steele's Functional, and John Resig's Micro-Templating.
6 | // For all details and documentation: 7 | // http://documentcloud.github.com/underscore 8 | (function(){function q(a,c,d){if(a===c)return a!==0||1/a==1/c;if(a==null||c==null)return a===c;if(a._chain)a=a._wrapped;if(c._chain)c=c._wrapped;if(a.isEqual&&b.isFunction(a.isEqual))return a.isEqual(c);if(c.isEqual&&b.isFunction(c.isEqual))return c.isEqual(a);var e=l.call(a);if(e!=l.call(c))return false;switch(e){case "[object String]":return a==String(c);case "[object Number]":return a!=+a?c!=+c:a==0?1/a==1/c:a==+c;case "[object Date]":case "[object Boolean]":return+a==+c;case "[object RegExp]":return a.source== 9 | c.source&&a.global==c.global&&a.multiline==c.multiline&&a.ignoreCase==c.ignoreCase}if(typeof a!="object"||typeof c!="object")return false;for(var f=d.length;f--;)if(d[f]==a)return true;d.push(a);var f=0,g=true;if(e=="[object Array]"){if(f=a.length,g=f==c.length)for(;f--;)if(!(g=f in a==f in c&&q(a[f],c[f],d)))break}else{if("constructor"in a!="constructor"in c||a.constructor!=c.constructor)return false;for(var h in a)if(b.has(a,h)&&(f++,!(g=b.has(c,h)&&q(a[h],c[h],d))))break;if(g){for(h in c)if(b.has(c, 10 | h)&&!f--)break;g=!f}}d.pop();return g}var r=this,G=r._,n={},k=Array.prototype,o=Object.prototype,i=k.slice,H=k.unshift,l=o.toString,I=o.hasOwnProperty,w=k.forEach,x=k.map,y=k.reduce,z=k.reduceRight,A=k.filter,B=k.every,C=k.some,p=k.indexOf,D=k.lastIndexOf,o=Array.isArray,J=Object.keys,s=Function.prototype.bind,b=function(a){return new m(a)};if(typeof exports!=="undefined"){if(typeof module!=="undefined"&&module.exports)exports=module.exports=b;exports._=b}else r._=b;b.VERSION="1.3.1";var j=b.each= 11 | b.forEach=function(a,c,d){if(a!=null)if(w&&a.forEach===w)a.forEach(c,d);else if(a.length===+a.length)for(var e=0,f=a.length;e2;a== 12 | null&&(a=[]);if(y&&a.reduce===y)return e&&(c=b.bind(c,e)),f?a.reduce(c,d):a.reduce(c);j(a,function(a,b,i){f?d=c.call(e,d,a,b,i):(d=a,f=true)});if(!f)throw new TypeError("Reduce of empty array with no initial value");return d};b.reduceRight=b.foldr=function(a,c,d,e){var f=arguments.length>2;a==null&&(a=[]);if(z&&a.reduceRight===z)return e&&(c=b.bind(c,e)),f?a.reduceRight(c,d):a.reduceRight(c);var g=b.toArray(a).reverse();e&&!f&&(c=b.bind(c,e));return f?b.reduce(g,c,d,e):b.reduce(g,c)};b.find=b.detect= 13 | function(a,c,b){var e;E(a,function(a,g,h){if(c.call(b,a,g,h))return e=a,true});return e};b.filter=b.select=function(a,c,b){var e=[];if(a==null)return e;if(A&&a.filter===A)return a.filter(c,b);j(a,function(a,g,h){c.call(b,a,g,h)&&(e[e.length]=a)});return e};b.reject=function(a,c,b){var e=[];if(a==null)return e;j(a,function(a,g,h){c.call(b,a,g,h)||(e[e.length]=a)});return e};b.every=b.all=function(a,c,b){var e=true;if(a==null)return e;if(B&&a.every===B)return a.every(c,b);j(a,function(a,g,h){if(!(e= 14 | e&&c.call(b,a,g,h)))return n});return e};var E=b.some=b.any=function(a,c,d){c||(c=b.identity);var e=false;if(a==null)return e;if(C&&a.some===C)return a.some(c,d);j(a,function(a,b,h){if(e||(e=c.call(d,a,b,h)))return n});return!!e};b.include=b.contains=function(a,c){var b=false;if(a==null)return b;return p&&a.indexOf===p?a.indexOf(c)!=-1:b=E(a,function(a){return a===c})};b.invoke=function(a,c){var d=i.call(arguments,2);return b.map(a,function(a){return(b.isFunction(c)?c||a:a[c]).apply(a,d)})};b.pluck= 15 | function(a,c){return b.map(a,function(a){return a[c]})};b.max=function(a,c,d){if(!c&&b.isArray(a))return Math.max.apply(Math,a);if(!c&&b.isEmpty(a))return-Infinity;var 
e={computed:-Infinity};j(a,function(a,b,h){b=c?c.call(d,a,b,h):a;b>=e.computed&&(e={value:a,computed:b})});return e.value};b.min=function(a,c,d){if(!c&&b.isArray(a))return Math.min.apply(Math,a);if(!c&&b.isEmpty(a))return Infinity;var e={computed:Infinity};j(a,function(a,b,h){b=c?c.call(d,a,b,h):a;bd?1:0}),"value")};b.groupBy=function(a,c){var d={},e=b.isFunction(c)?c:function(a){return a[c]};j(a,function(a,b){var c=e(a,b);(d[c]||(d[c]=[])).push(a)});return d};b.sortedIndex=function(a, 17 | c,d){d||(d=b.identity);for(var e=0,f=a.length;e>1;d(a[g])=0})})};b.difference=function(a){var c=b.flatten(i.call(arguments,1));return b.filter(a,function(a){return!b.include(c,a)})};b.zip=function(){for(var a=i.call(arguments),c=b.max(b.pluck(a,"length")),d=Array(c),e=0;e=0;d--)b=[a[d].apply(this,b)];return b[0]}}; 24 | b.after=function(a,b){return a<=0?b():function(){if(--a<1)return b.apply(this,arguments)}};b.keys=J||function(a){if(a!==Object(a))throw new TypeError("Invalid object");var c=[],d;for(d in a)b.has(a,d)&&(c[c.length]=d);return c};b.values=function(a){return b.map(a,b.identity)};b.functions=b.methods=function(a){var c=[],d;for(d in a)b.isFunction(a[d])&&c.push(d);return c.sort()};b.extend=function(a){j(i.call(arguments,1),function(b){for(var d in b)a[d]=b[d]});return a};b.defaults=function(a){j(i.call(arguments, 25 | 1),function(b){for(var d in b)a[d]==null&&(a[d]=b[d])});return a};b.clone=function(a){return!b.isObject(a)?a:b.isArray(a)?a.slice():b.extend({},a)};b.tap=function(a,b){b(a);return a};b.isEqual=function(a,b){return q(a,b,[])};b.isEmpty=function(a){if(b.isArray(a)||b.isString(a))return a.length===0;for(var c in a)if(b.has(a,c))return false;return true};b.isElement=function(a){return!!(a&&a.nodeType==1)};b.isArray=o||function(a){return l.call(a)=="[object Array]"};b.isObject=function(a){return a===Object(a)}; 26 | b.isArguments=function(a){return l.call(a)=="[object Arguments]"};if(!b.isArguments(arguments))b.isArguments=function(a){return!(!a||!b.has(a,"callee"))};b.isFunction=function(a){return l.call(a)=="[object Function]"};b.isString=function(a){return l.call(a)=="[object String]"};b.isNumber=function(a){return l.call(a)=="[object Number]"};b.isNaN=function(a){return a!==a};b.isBoolean=function(a){return a===true||a===false||l.call(a)=="[object Boolean]"};b.isDate=function(a){return l.call(a)=="[object Date]"}; 27 | b.isRegExp=function(a){return l.call(a)=="[object RegExp]"};b.isNull=function(a){return a===null};b.isUndefined=function(a){return a===void 0};b.has=function(a,b){return I.call(a,b)};b.noConflict=function(){r._=G;return this};b.identity=function(a){return a};b.times=function(a,b,d){for(var e=0;e/g,">").replace(/"/g,""").replace(/'/g,"'").replace(/\//g,"/")};b.mixin=function(a){j(b.functions(a), 28 | function(c){K(c,b[c]=a[c])})};var L=0;b.uniqueId=function(a){var b=L++;return a?a+b:b};b.templateSettings={evaluate:/<%([\s\S]+?)%>/g,interpolate:/<%=([\s\S]+?)%>/g,escape:/<%-([\s\S]+?)%>/g};var t=/.^/,u=function(a){return a.replace(/\\\\/g,"\\").replace(/\\'/g,"'")};b.template=function(a,c){var d=b.templateSettings,d="var __p=[],print=function(){__p.push.apply(__p,arguments);};with(obj||{}){__p.push('"+a.replace(/\\/g,"\\\\").replace(/'/g,"\\'").replace(d.escape||t,function(a,b){return"',_.escape("+ 29 | u(b)+"),'"}).replace(d.interpolate||t,function(a,b){return"',"+u(b)+",'"}).replace(d.evaluate||t,function(a,b){return"');"+u(b).replace(/[\r\n\t]/g," ")+";__p.push('"}).replace(/\r/g,"\\r").replace(/\n/g,"\\n").replace(/\t/g,"\\t")+"');}return __p.join('');",e=new 
Function("obj","_",d);return c?e(c,b):function(a){return e.call(this,a,b)}};b.chain=function(a){return b(a).chain()};var m=function(a){this._wrapped=a};b.prototype=m.prototype;var v=function(a,c){return c?b(a).chain():a},K=function(a,c){m.prototype[a]= 30 | function(){var a=i.call(arguments);H.call(a,this._wrapped);return v(c.apply(b,a),this._chain)}};b.mixin(b);j("pop,push,reverse,shift,sort,splice,unshift".split(","),function(a){var b=k[a];m.prototype[a]=function(){var d=this._wrapped;b.apply(d,arguments);var e=d.length;(a=="shift"||a=="splice")&&e===0&&delete d[0];return v(d,this._chain)}});j(["concat","join","slice"],function(a){var b=k[a];m.prototype[a]=function(){return v(b.apply(this._wrapped,arguments),this._chain)}});m.prototype.chain=function(){this._chain= 31 | true;return this};m.prototype.value=function(){return this._wrapped}}).call(this); 32 | -------------------------------------------------------------------------------- /docs/build/html/_static/basic.css: -------------------------------------------------------------------------------- 1 | /* 2 | * basic.css 3 | * ~~~~~~~~~ 4 | * 5 | * Sphinx stylesheet -- basic theme. 6 | * 7 | * :copyright: Copyright 2007-2017 by the Sphinx team, see AUTHORS. 8 | * :license: BSD, see LICENSE for details. 9 | * 10 | */ 11 | 12 | /* -- main layout ----------------------------------------------------------- */ 13 | 14 | div.clearer { 15 | clear: both; 16 | } 17 | 18 | /* -- relbar ---------------------------------------------------------------- */ 19 | 20 | div.related { 21 | width: 100%; 22 | font-size: 90%; 23 | } 24 | 25 | div.related h3 { 26 | display: none; 27 | } 28 | 29 | div.related ul { 30 | margin: 0; 31 | padding: 0 0 0 10px; 32 | list-style: none; 33 | } 34 | 35 | div.related li { 36 | display: inline; 37 | } 38 | 39 | div.related li.right { 40 | float: right; 41 | margin-right: 5px; 42 | } 43 | 44 | /* -- sidebar --------------------------------------------------------------- */ 45 | 46 | div.sphinxsidebarwrapper { 47 | padding: 10px 5px 0 10px; 48 | } 49 | 50 | div.sphinxsidebar { 51 | float: left; 52 | width: 230px; 53 | margin-left: -100%; 54 | font-size: 90%; 55 | word-wrap: break-word; 56 | overflow-wrap : break-word; 57 | } 58 | 59 | div.sphinxsidebar ul { 60 | list-style: none; 61 | } 62 | 63 | div.sphinxsidebar ul ul, 64 | div.sphinxsidebar ul.want-points { 65 | margin-left: 20px; 66 | list-style: square; 67 | } 68 | 69 | div.sphinxsidebar ul ul { 70 | margin-top: 0; 71 | margin-bottom: 0; 72 | } 73 | 74 | div.sphinxsidebar form { 75 | margin-top: 10px; 76 | } 77 | 78 | div.sphinxsidebar input { 79 | border: 1px solid #98dbcc; 80 | font-family: sans-serif; 81 | font-size: 1em; 82 | } 83 | 84 | div.sphinxsidebar #searchbox input[type="text"] { 85 | width: 170px; 86 | } 87 | 88 | img { 89 | border: 0; 90 | max-width: 100%; 91 | } 92 | 93 | /* -- search page ----------------------------------------------------------- */ 94 | 95 | ul.search { 96 | margin: 10px 0 0 20px; 97 | padding: 0; 98 | } 99 | 100 | ul.search li { 101 | padding: 5px 0 5px 20px; 102 | background-image: url(file.png); 103 | background-repeat: no-repeat; 104 | background-position: 0 7px; 105 | } 106 | 107 | ul.search li a { 108 | font-weight: bold; 109 | } 110 | 111 | ul.search li div.context { 112 | color: #888; 113 | margin: 2px 0 0 30px; 114 | text-align: left; 115 | } 116 | 117 | ul.keywordmatches li.goodmatch a { 118 | font-weight: bold; 119 | } 120 | 121 | /* -- index page ------------------------------------------------------------ */ 122 | 123 | 
table.contentstable { 124 | width: 90%; 125 | margin-left: auto; 126 | margin-right: auto; 127 | } 128 | 129 | table.contentstable p.biglink { 130 | line-height: 150%; 131 | } 132 | 133 | a.biglink { 134 | font-size: 1.3em; 135 | } 136 | 137 | span.linkdescr { 138 | font-style: italic; 139 | padding-top: 5px; 140 | font-size: 90%; 141 | } 142 | 143 | /* -- general index --------------------------------------------------------- */ 144 | 145 | table.indextable { 146 | width: 100%; 147 | } 148 | 149 | table.indextable td { 150 | text-align: left; 151 | vertical-align: top; 152 | } 153 | 154 | table.indextable ul { 155 | margin-top: 0; 156 | margin-bottom: 0; 157 | list-style-type: none; 158 | } 159 | 160 | table.indextable > tbody > tr > td > ul { 161 | padding-left: 0em; 162 | } 163 | 164 | table.indextable tr.pcap { 165 | height: 10px; 166 | } 167 | 168 | table.indextable tr.cap { 169 | margin-top: 10px; 170 | background-color: #f2f2f2; 171 | } 172 | 173 | img.toggler { 174 | margin-right: 3px; 175 | margin-top: 3px; 176 | cursor: pointer; 177 | } 178 | 179 | div.modindex-jumpbox { 180 | border-top: 1px solid #ddd; 181 | border-bottom: 1px solid #ddd; 182 | margin: 1em 0 1em 0; 183 | padding: 0.4em; 184 | } 185 | 186 | div.genindex-jumpbox { 187 | border-top: 1px solid #ddd; 188 | border-bottom: 1px solid #ddd; 189 | margin: 1em 0 1em 0; 190 | padding: 0.4em; 191 | } 192 | 193 | /* -- domain module index --------------------------------------------------- */ 194 | 195 | table.modindextable td { 196 | padding: 2px; 197 | border-collapse: collapse; 198 | } 199 | 200 | /* -- general body styles --------------------------------------------------- */ 201 | 202 | div.body p, div.body dd, div.body li, div.body blockquote { 203 | -moz-hyphens: auto; 204 | -ms-hyphens: auto; 205 | -webkit-hyphens: auto; 206 | hyphens: auto; 207 | } 208 | 209 | a.headerlink { 210 | visibility: hidden; 211 | } 212 | 213 | h1:hover > a.headerlink, 214 | h2:hover > a.headerlink, 215 | h3:hover > a.headerlink, 216 | h4:hover > a.headerlink, 217 | h5:hover > a.headerlink, 218 | h6:hover > a.headerlink, 219 | dt:hover > a.headerlink, 220 | caption:hover > a.headerlink, 221 | p.caption:hover > a.headerlink, 222 | div.code-block-caption:hover > a.headerlink { 223 | visibility: visible; 224 | } 225 | 226 | div.body p.caption { 227 | text-align: inherit; 228 | } 229 | 230 | div.body td { 231 | text-align: left; 232 | } 233 | 234 | .first { 235 | margin-top: 0 !important; 236 | } 237 | 238 | p.rubric { 239 | margin-top: 30px; 240 | font-weight: bold; 241 | } 242 | 243 | img.align-left, .figure.align-left, object.align-left { 244 | clear: left; 245 | float: left; 246 | margin-right: 1em; 247 | } 248 | 249 | img.align-right, .figure.align-right, object.align-right { 250 | clear: right; 251 | float: right; 252 | margin-left: 1em; 253 | } 254 | 255 | img.align-center, .figure.align-center, object.align-center { 256 | display: block; 257 | margin-left: auto; 258 | margin-right: auto; 259 | } 260 | 261 | .align-left { 262 | text-align: left; 263 | } 264 | 265 | .align-center { 266 | text-align: center; 267 | } 268 | 269 | .align-right { 270 | text-align: right; 271 | } 272 | 273 | /* -- sidebars -------------------------------------------------------------- */ 274 | 275 | div.sidebar { 276 | margin: 0 0 0.5em 1em; 277 | border: 1px solid #ddb; 278 | padding: 7px 7px 0 7px; 279 | background-color: #ffe; 280 | width: 40%; 281 | float: right; 282 | } 283 | 284 | p.sidebar-title { 285 | font-weight: bold; 286 | } 287 | 288 | /* -- topics 
---------------------------------------------------------------- */ 289 | 290 | div.topic { 291 | border: 1px solid #ccc; 292 | padding: 7px 7px 0 7px; 293 | margin: 10px 0 10px 0; 294 | } 295 | 296 | p.topic-title { 297 | font-size: 1.1em; 298 | font-weight: bold; 299 | margin-top: 10px; 300 | } 301 | 302 | /* -- admonitions ----------------------------------------------------------- */ 303 | 304 | div.admonition { 305 | margin-top: 10px; 306 | margin-bottom: 10px; 307 | padding: 7px; 308 | } 309 | 310 | div.admonition dt { 311 | font-weight: bold; 312 | } 313 | 314 | div.admonition dl { 315 | margin-bottom: 0; 316 | } 317 | 318 | p.admonition-title { 319 | margin: 0px 10px 5px 0px; 320 | font-weight: bold; 321 | } 322 | 323 | div.body p.centered { 324 | text-align: center; 325 | margin-top: 25px; 326 | } 327 | 328 | /* -- tables ---------------------------------------------------------------- */ 329 | 330 | table.docutils { 331 | border: 0; 332 | border-collapse: collapse; 333 | } 334 | 335 | table caption span.caption-number { 336 | font-style: italic; 337 | } 338 | 339 | table caption span.caption-text { 340 | } 341 | 342 | table.docutils td, table.docutils th { 343 | padding: 1px 8px 1px 5px; 344 | border-top: 0; 345 | border-left: 0; 346 | border-right: 0; 347 | border-bottom: 1px solid #aaa; 348 | } 349 | 350 | table.footnote td, table.footnote th { 351 | border: 0 !important; 352 | } 353 | 354 | th { 355 | text-align: left; 356 | padding-right: 5px; 357 | } 358 | 359 | table.citation { 360 | border-left: solid 1px gray; 361 | margin-left: 1px; 362 | } 363 | 364 | table.citation td { 365 | border-bottom: none; 366 | } 367 | 368 | /* -- figures --------------------------------------------------------------- */ 369 | 370 | div.figure { 371 | margin: 0.5em; 372 | padding: 0.5em; 373 | } 374 | 375 | div.figure p.caption { 376 | padding: 0.3em; 377 | } 378 | 379 | div.figure p.caption span.caption-number { 380 | font-style: italic; 381 | } 382 | 383 | div.figure p.caption span.caption-text { 384 | } 385 | 386 | /* -- field list styles ----------------------------------------------------- */ 387 | 388 | table.field-list td, table.field-list th { 389 | border: 0 !important; 390 | } 391 | 392 | .field-list ul { 393 | margin: 0; 394 | padding-left: 1em; 395 | } 396 | 397 | .field-list p { 398 | margin: 0; 399 | } 400 | 401 | /* -- other body styles ----------------------------------------------------- */ 402 | 403 | ol.arabic { 404 | list-style: decimal; 405 | } 406 | 407 | ol.loweralpha { 408 | list-style: lower-alpha; 409 | } 410 | 411 | ol.upperalpha { 412 | list-style: upper-alpha; 413 | } 414 | 415 | ol.lowerroman { 416 | list-style: lower-roman; 417 | } 418 | 419 | ol.upperroman { 420 | list-style: upper-roman; 421 | } 422 | 423 | dl { 424 | margin-bottom: 15px; 425 | } 426 | 427 | dd p { 428 | margin-top: 0px; 429 | } 430 | 431 | dd ul, dd table { 432 | margin-bottom: 10px; 433 | } 434 | 435 | dd { 436 | margin-top: 3px; 437 | margin-bottom: 10px; 438 | margin-left: 30px; 439 | } 440 | 441 | dt:target, .highlighted { 442 | background-color: #fbe54e; 443 | } 444 | 445 | dl.glossary dt { 446 | font-weight: bold; 447 | font-size: 1.1em; 448 | } 449 | 450 | .optional { 451 | font-size: 1.3em; 452 | } 453 | 454 | .sig-paren { 455 | font-size: larger; 456 | } 457 | 458 | .versionmodified { 459 | font-style: italic; 460 | } 461 | 462 | .system-message { 463 | background-color: #fda; 464 | padding: 5px; 465 | border: 3px solid red; 466 | } 467 | 468 | .footnote:target { 469 | 
background-color: #ffa; 470 | } 471 | 472 | .line-block { 473 | display: block; 474 | margin-top: 1em; 475 | margin-bottom: 1em; 476 | } 477 | 478 | .line-block .line-block { 479 | margin-top: 0; 480 | margin-bottom: 0; 481 | margin-left: 1.5em; 482 | } 483 | 484 | .guilabel, .menuselection { 485 | font-family: sans-serif; 486 | } 487 | 488 | .accelerator { 489 | text-decoration: underline; 490 | } 491 | 492 | .classifier { 493 | font-style: oblique; 494 | } 495 | 496 | abbr, acronym { 497 | border-bottom: dotted 1px; 498 | cursor: help; 499 | } 500 | 501 | /* -- code displays --------------------------------------------------------- */ 502 | 503 | pre { 504 | overflow: auto; 505 | overflow-y: hidden; /* fixes display issues on Chrome browsers */ 506 | } 507 | 508 | span.pre { 509 | -moz-hyphens: none; 510 | -ms-hyphens: none; 511 | -webkit-hyphens: none; 512 | hyphens: none; 513 | } 514 | 515 | td.linenos pre { 516 | padding: 5px 0px; 517 | border: 0; 518 | background-color: transparent; 519 | color: #aaa; 520 | } 521 | 522 | table.highlighttable { 523 | margin-left: 0.5em; 524 | } 525 | 526 | table.highlighttable td { 527 | padding: 0 0.5em 0 0.5em; 528 | } 529 | 530 | div.code-block-caption { 531 | padding: 2px 5px; 532 | font-size: small; 533 | } 534 | 535 | div.code-block-caption code { 536 | background-color: transparent; 537 | } 538 | 539 | div.code-block-caption + div > div.highlight > pre { 540 | margin-top: 0; 541 | } 542 | 543 | div.code-block-caption span.caption-number { 544 | padding: 0.1em 0.3em; 545 | font-style: italic; 546 | } 547 | 548 | div.code-block-caption span.caption-text { 549 | } 550 | 551 | div.literal-block-wrapper { 552 | padding: 1em 1em 0; 553 | } 554 | 555 | div.literal-block-wrapper div.highlight { 556 | margin: 0; 557 | } 558 | 559 | code.descname { 560 | background-color: transparent; 561 | font-weight: bold; 562 | font-size: 1.2em; 563 | } 564 | 565 | code.descclassname { 566 | background-color: transparent; 567 | } 568 | 569 | code.xref, a code { 570 | background-color: transparent; 571 | font-weight: bold; 572 | } 573 | 574 | h1 code, h2 code, h3 code, h4 code, h5 code, h6 code { 575 | background-color: transparent; 576 | } 577 | 578 | .viewcode-link { 579 | float: right; 580 | } 581 | 582 | .viewcode-back { 583 | float: right; 584 | font-family: sans-serif; 585 | } 586 | 587 | div.viewcode-block:target { 588 | margin: -1px -10px; 589 | padding: 0 10px; 590 | } 591 | 592 | /* -- math display ---------------------------------------------------------- */ 593 | 594 | img.math { 595 | vertical-align: middle; 596 | } 597 | 598 | div.body div.math p { 599 | text-align: center; 600 | } 601 | 602 | span.eqno { 603 | float: right; 604 | } 605 | 606 | span.eqno a.headerlink { 607 | position: relative; 608 | left: 0px; 609 | z-index: 1; 610 | } 611 | 612 | div.math:hover a.headerlink { 613 | visibility: visible; 614 | } 615 | 616 | /* -- printout stylesheet --------------------------------------------------- */ 617 | 618 | @media print { 619 | div.document, 620 | div.documentwrapper, 621 | div.bodywrapper { 622 | margin: 0 !important; 623 | width: 100%; 624 | } 625 | 626 | div.sphinxsidebar, 627 | div.related, 628 | div.footer, 629 | #top-link { 630 | display: none; 631 | } 632 | } -------------------------------------------------------------------------------- /pyspark_dist_explore/tests/test_pyspark_dist_explore.py: -------------------------------------------------------------------------------- 1 | import findspark 2 | 
findspark.init('/media/chris/data/spark-2.4.0-bin-hadoop2.7/') 3 | 4 | import pyspark.sql.functions as F 5 | import sparktestingbase.sqltestcase 6 | import pandas as pd 7 | import unittest 8 | import math 9 | from pyspark.sql import Row 10 | from unittest import mock 11 | 12 | import sys 13 | sys.path.append('../' ) 14 | from pyspark_dist_explore import Histogram 15 | from pyspark_dist_explore.pyspark_dist_explore import create_histogram_object 16 | 17 | 18 | class HistogramTest(sparktestingbase.sqltestcase.SQLTestCase): 19 | def test_init_default(self): 20 | """Should set default settings when no arguments are given""" 21 | hist = Histogram() 22 | self.assertIsNone(hist.min_value) 23 | self.assertIsNone(hist.max_value) 24 | self.assertEqual(10, hist.nr_bins) 25 | self.assertEqual(0, len(hist.bin_boundaries)) 26 | self.assertEqual(0, len(hist.hist_dict)) 27 | self.assertEqual(0, len(hist.col_list)) 28 | self.assertFalse(hist.is_build) 29 | 30 | def test_init_non_default(self): 31 | """"Should set min bin, max bin, and number of bins""" 32 | hist = Histogram(bins=10, range=(5, 8)) 33 | self.assertEqual(10, hist.nr_bins) 34 | self.assertEqual(5, hist.min_value) 35 | self.assertEqual(8, hist.max_value) 36 | self.assertEqual(0, len(hist.bin_boundaries)) 37 | 38 | def test_init_bins_given(self): 39 | """"Should set the list of bins when given in the constructor, 40 | bins are converted to float""" 41 | hist = Histogram(bins=[1, 2, '3']) 42 | self.assertListEqual([1, 2, 3], hist.bin_boundaries) 43 | 44 | def create_test_df(self): 45 | test_list = [(1, 2), (2, 3), (3, 4)] 46 | rdd = self.sc.parallelize(test_list) 47 | rdd_f = rdd.map(lambda x: Row(value=x[0], value2=x[1])) 48 | return self.sqlCtx.createDataFrame(rdd_f) 49 | 50 | def test_add_column(self): 51 | """"Should add a column name, column tuple to the col_list when a single column data frame is given""" 52 | hist = Histogram(bins=10) 53 | test_df = self.create_test_df() 54 | hist.add_column(test_df.select(F.col('value'))) 55 | self.assertEqual(1, len(hist.col_list)) 56 | self.assertEqual('value', hist.col_list[0][1]) 57 | self.assertDataFrameEqual(test_df.select(F.col('value')), hist.col_list[0][0]) 58 | 59 | def test_add_column_more_then_1_column_in_dataframe(self): 60 | """"Should throw an error when the input data frame contains more then one column""" 61 | hist = Histogram(bins=10) 62 | test_df = self.create_test_df() 63 | with self.assertRaises(ValueError): 64 | hist.add_column(test_df) 65 | 66 | def test_add_column_non_numeric(self): 67 | """Should raise an ValueError if a non-numeric column is added""" 68 | test_list = ['a', 'b'] 69 | rdd = self.sc.parallelize(test_list) 70 | rdd_f = rdd.map(lambda x: Row(value=x)) 71 | spark_df = self.sqlCtx.createDataFrame(rdd_f) 72 | hist = Histogram() 73 | with self.assertRaises(ValueError): 74 | hist.add_column(spark_df) 75 | 76 | def test_add_multiple_columns(self): 77 | """Adds new items to the col_list when new items are added""" 78 | hist = Histogram(bins=10) 79 | test_df = self.create_test_df() 80 | hist.add_column(test_df.select(F.col('value'))) 81 | hist.add_column(test_df.select(F.col('value2'))) 82 | self.assertEqual(2, len(hist.col_list)) 83 | self.assertEqual('value', hist.col_list[0][1]) 84 | self.assertDataFrameEqual(test_df.select(F.col('value')), hist.col_list[0][0]) 85 | self.assertEqual('value2', hist.col_list[1][1]) 86 | self.assertDataFrameEqual(test_df.select(F.col('value2')), hist.col_list[1][0]) 87 | 88 | def test_get_min_value(self): 89 | """Should return the minimum 
value over all columns in a Histogram""" 90 | hist = Histogram(bins=10) 91 | test_df = self.create_test_df() 92 | hist.add_column(test_df.select(F.col('value'))) 93 | hist.add_column(test_df.select(F.col('value2'))) 94 | self.assertEqual(1, hist._get_min_value()) 95 | 96 | def test_get_max_value(self): 97 | """Should return the maximum value over all columns in a Histogram""" 98 | hist = Histogram(bins=10) 99 | test_df = self.create_test_df() 100 | hist.add_column(test_df.select(F.col('value'))) 101 | hist.add_column(test_df.select(F.col('value2'))) 102 | self.assertEqual(4, hist._get_max_value()) 103 | 104 | def test_calculate_bins(self): 105 | """Should return a list of evenly spaced bins between min and max bin if they are set""" 106 | hist = Histogram(range=(5, 10), bins=2) 107 | self.assertListEqual([5, 7.5, 10], hist._calculate_bins()) 108 | 109 | def test_calculate_bins_bins_set(self): 110 | """Should just return the list of bins edges when this was set in the constructor""" 111 | hist = Histogram(bins=[1, 2, 3]) 112 | self.assertListEqual([1, 2, 3], hist._calculate_bins()) 113 | 114 | def test_calculate_bins_single_column(self): 115 | """Should return the number of bins when there is only a single column, and no min and max is set""" 116 | hist = Histogram(bins=5) 117 | test_df = self.create_test_df() 118 | hist.add_column(test_df.select(F.col('value'))) 119 | self.assertEqual(5, hist._calculate_bins()) 120 | 121 | def test_calculate_bins_multiple_columns(self): 122 | """Should return a list of evenly spaced bins between the smallest and highest value over all columns""" 123 | hist = Histogram(bins=3) 124 | test_df = self.create_test_df() # The lowest value in this DF is 1, the highest is 4 125 | hist.add_column(test_df.select(F.col('value'))) 126 | hist.add_column(test_df.select(F.col('value2'))) 127 | self.assertListEqual([1, 2, 3, 4], hist._calculate_bins()) 128 | 129 | def test_add_hist_single_column(self): 130 | """Should add a list of bin values (e.g. the number of values that fall in a bin) to the hist_dict, where 131 | the key is the column name. 
If multiple columns have the same name a number is appended""" 132 | hist = Histogram(bins=2) 133 | test_df = self.create_test_df() 134 | column_to_ad = test_df.select(F.col('value')) 135 | hist.add_column(column_to_ad) 136 | hist.bin_boundaries = hist._calculate_bins() 137 | hist._add_hist(column_to_ad, 'value') 138 | self.assertEqual(1, len(hist.hist_dict)) 139 | self.assertListEqual([1, 2], hist.hist_dict['value']) 140 | 141 | def test_add_hist_single_column_sets_bin_list(self): 142 | """Should set the bin list if this is a single number""" 143 | hist = Histogram(bins=2) 144 | test_df = self.create_test_df() 145 | column_to_ad = test_df.select(F.col('value')) 146 | hist.add_column(column_to_ad) 147 | hist.bin_boundaries = hist._calculate_bins() 148 | hist._add_hist(column_to_ad, 'value') 149 | self.assertEqual(3, len(hist.bin_boundaries)) 150 | 151 | def test_add_hist_multiple_column(self): 152 | """Should add a second list of bin values to the hist_dict""" 153 | hist = Histogram(bins=2) 154 | test_df = self.create_test_df() 155 | column_to_ad = test_df.select(F.col('value')) 156 | column_to_ad_2 = test_df.select(F.col('value2')) 157 | hist.add_column(column_to_ad) 158 | hist.add_column(column_to_ad_2) 159 | hist.bin_boundaries = hist._calculate_bins() 160 | hist._add_hist(column_to_ad, 'value') 161 | hist._add_hist(column_to_ad_2, 'value2') 162 | self.assertEqual(2, len(hist.hist_dict)) 163 | self.assertListEqual([1, 2], hist.hist_dict['value2']) 164 | 165 | def test_add_hist_multiple_column_rename_column(self): 166 | """Should rename the column name if the same column name is added""" 167 | hist = Histogram(bins=2) 168 | test_df = self.create_test_df() 169 | column_to_ad = test_df.select(F.col('value')) 170 | column_to_ad_2 = test_df.select(F.col('value')) 171 | hist.add_column(column_to_ad) 172 | hist.add_column(column_to_ad_2) 173 | hist.bin_boundaries = hist._calculate_bins() 174 | hist._add_hist(column_to_ad, 'value') 175 | hist._add_hist(column_to_ad_2, 'value') 176 | self.assertEqual(2, len(hist.hist_dict)) 177 | self.assertTrue('value (1)' in hist.hist_dict) 178 | 179 | def test_add_hist_single_value(self): 180 | """Should set the bin list to n (self.nr_bins) bins (n+1 bin borders) where the min bin border is the 181 | single value -0.5 and the max bin border is the single value +0.5 incase a column is input with only a 182 | single value""" 183 | single_column_value = 1 184 | nr_bins = 5 185 | column_values = [single_column_value] * 100 186 | test_df = self.sqlCtx.createDataFrame(pd.DataFrame({'foo': column_values})) 187 | hist = Histogram(bins=nr_bins) 188 | hist.add_column(test_df.select(F.col('foo'))) 189 | hist.build() 190 | self.assertEqual(6, len(hist.bin_boundaries)) 191 | self.assertEqual(single_column_value - 0.5, min(hist.bin_boundaries)) 192 | self.assertEqual(single_column_value + 0.5, max(hist.bin_boundaries)) 193 | self.assertEqual(len(column_values), hist.hist_dict['foo'][math.floor(nr_bins/2)]) 194 | 195 | def test_build(self): 196 | """Should calculate the bin list, and hist values for each column in the Histogram, if the 197 | histogram hasn't been build before""" 198 | hist = Histogram(bins=2) 199 | test_df = self.create_test_df() 200 | column_to_ad = test_df.select(F.col('value')) 201 | column_to_ad_2 = test_df.select(F.col('value2')) 202 | hist.add_column(column_to_ad) 203 | hist.add_column(column_to_ad_2) 204 | hist.build() 205 | self.assertEqual(3, len(hist.bin_boundaries)) 206 | self.assertEqual(2, len(hist.hist_dict)) 207 | 
self.assertTrue(hist.is_build) 208 | 209 | @mock.patch('pyspark_dist_explore.Histogram._add_hist') 210 | @mock.patch('pyspark_dist_explore.Histogram._calculate_bins') 211 | def test_build_already_build(self, calculate_bins_func, add_hist_func): 212 | """Should not rebuild if Histogram was already build before""" 213 | hist = Histogram() 214 | hist.is_build = True 215 | hist.build() 216 | self.assertFalse(add_hist_func.called) 217 | self.assertFalse(calculate_bins_func.called) 218 | 219 | def test_to_pandas_default(self): 220 | """Should create a pandas dataframe from the Histogram object""" 221 | hist = Histogram(bins=2) 222 | test_df = self.create_test_df() 223 | column_to_ad = test_df.select(F.col('value')) 224 | column_to_ad_2 = test_df.select(F.col('value2')) 225 | hist.add_column(column_to_ad) 226 | hist.add_column(column_to_ad_2) 227 | expected_df = pd.DataFrame({'value': [2, 1], 228 | 'value2': [1, 2]}).set_index([['1.00 - 2.50', '2.50 - 4.00']]) 229 | self.assertTrue(expected_df.equals(hist.to_pandas())) 230 | 231 | def test_to_pandas_density(self): 232 | """Should create a pandas dataframe of a denisty plot of the histogram""" 233 | hist = Histogram(bins=2) 234 | test_df = self.create_test_df() 235 | column_to_ad = test_df.select(F.col('value')) 236 | column_to_ad_2 = test_df.select(F.col('value2')) 237 | hist.add_column(column_to_ad) 238 | hist.add_column(column_to_ad_2) 239 | expected_df = pd.DataFrame({'value': [1.0, 0.5], 'value2': [0.5, 1.0]}).set_index([[1.75, 3.25]]) 240 | self.assertTrue(expected_df.equals(hist.to_pandas('density'))) 241 | 242 | def test_add_data_single_column(self): 243 | """Should add a single column of data to the Histogram""" 244 | hist = Histogram() 245 | test_df = self.create_test_df() 246 | column_to_ad = test_df.select(F.col('value')) 247 | hist.add_data(column_to_ad) 248 | self.assertEqual(1, len(hist.col_list)) 249 | 250 | def test_add_data_list_of_columns(self): 251 | """Should add all columns from the list of columns to the Histogram""" 252 | test_df = self.create_test_df() 253 | column_to_ad = test_df.select(F.col('value')) 254 | column_to_ad_2 = test_df.select(F.col('value2')) 255 | hist = Histogram() 256 | hist.add_data([column_to_ad, column_to_ad_2]) 257 | self.assertEqual(2, len(hist.col_list)) 258 | 259 | def test_add_data_entire_dataframe(self): 260 | """Should add all columns of a dataframe to the histogram""" 261 | test_df = self.create_test_df() 262 | hist = Histogram() 263 | hist.add_data(test_df) 264 | self.assertEqual(2, len(hist.col_list)) 265 | 266 | 267 | class FunctionsTest(unittest.TestCase): 268 | def test_create_histogram_object_default(self): 269 | """Should return an histogram object with default settings""" 270 | test_hist = create_histogram_object(dict()) 271 | self.assertEqual(10, test_hist.nr_bins) 272 | self.assertIsNone(test_hist.min_value) 273 | self.assertIsNone(test_hist.max_value) 274 | 275 | def test_create_histogram_object_non_default(self): 276 | """Should return an histogram object with 'bins' and 'range' set""" 277 | test_kwargs = dict(bins=11, range=(10, 20)) 278 | test_hist = create_histogram_object(test_kwargs) 279 | self.assertEqual(11, test_hist.nr_bins) 280 | self.assertEqual(10, test_hist.min_value) 281 | self.assertEqual(20, test_hist.max_value) 282 | 283 | 284 | if __name__ == "__main__": 285 | unittest.main() 286 | -------------------------------------------------------------------------------- /docs/build/html/_static/alabaster.css: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | @import url("basic.css"); 54 | 55 | /* -- page layout ----------------------------------------------------------- */ 56 | 57 | body { 58 | font-family: 'goudy old style', 'minion pro', 'bell mt', Georgia, 'Hiragino Mincho Pro', serif; 59 | font-size: 17px; 60 | background-color: #fff; 61 | color: #000; 62 | margin: 0; 63 | padding: 0; 64 | } 65 | 66 | 67 | div.document { 68 | width: 940px; 69 | margin: 30px auto 0 auto; 70 | } 71 | 72 | div.documentwrapper { 73 | float: left; 74 | width: 100%; 75 | } 76 | 77 | div.bodywrapper { 78 | margin: 0 0 0 220px; 79 | } 80 | 81 | div.sphinxsidebar { 82 | width: 220px; 83 | font-size: 14px; 84 | line-height: 1.5; 85 | } 86 | 87 | hr { 88 | border: 1px solid #B1B4B6; 89 | } 90 | 91 | div.body { 92 | background-color: #fff; 93 | color: #3E4349; 94 | padding: 0 30px 0 30px; 95 | } 96 | 97 | div.body > .section { 98 | text-align: left; 99 | } 100 | 101 | div.footer { 102 | width: 940px; 103 | margin: 20px auto 30px auto; 104 | font-size: 14px; 105 | color: #888; 106 | text-align: right; 107 | } 108 | 109 | div.footer a { 110 | color: #888; 111 | } 112 | 113 | p.caption { 114 | font-family: inherit; 115 | font-size: inherit; 116 | } 117 | 118 | 119 | div.relations { 120 | display: none; 121 | } 122 | 123 | 124 | div.sphinxsidebar a { 125 | color: #444; 126 | text-decoration: none; 127 | border-bottom: 1px dotted #999; 128 | } 129 | 130 | div.sphinxsidebar a:hover { 131 | border-bottom: 1px solid #999; 132 | } 133 | 134 | div.sphinxsidebarwrapper { 135 | padding: 18px 10px; 136 | } 137 | 138 | div.sphinxsidebarwrapper p.logo { 139 | padding: 0; 140 | margin: -10px 0 0 0px; 141 | text-align: center; 142 | } 143 | 144 | div.sphinxsidebarwrapper h1.logo { 145 | margin-top: -10px; 146 | text-align: center; 147 | margin-bottom: 5px; 148 | text-align: left; 149 | } 150 | 151 | div.sphinxsidebarwrapper h1.logo-name { 152 | margin-top: 0px; 153 | } 154 | 155 | div.sphinxsidebarwrapper p.blurb { 156 | margin-top: 0; 157 | font-style: normal; 158 | } 159 | 160 | div.sphinxsidebar h3, 161 | div.sphinxsidebar h4 { 162 | font-family: 'Garamond', 'Georgia', serif; 163 | color: #444; 164 | font-size: 24px; 165 | font-weight: normal; 166 | margin: 0 0 5px 0; 167 | padding: 0; 168 | } 169 | 170 | div.sphinxsidebar h4 { 171 | font-size: 20px; 172 | } 173 | 174 | div.sphinxsidebar h3 a { 175 | color: #444; 176 | } 177 | 178 | div.sphinxsidebar p.logo a, 179 | div.sphinxsidebar h3 a, 180 | div.sphinxsidebar p.logo a:hover, 181 | div.sphinxsidebar h3 a:hover { 182 | border: none; 183 | } 184 | 185 | div.sphinxsidebar p { 186 | color: #555; 187 | margin: 10px 0; 188 | } 189 | 190 | div.sphinxsidebar ul { 191 | margin: 10px 0; 192 | padding: 0; 193 | color: #000; 194 | } 195 | 196 | div.sphinxsidebar ul li.toctree-l1 > a { 197 | font-size: 120%; 198 | } 199 | 200 | div.sphinxsidebar ul li.toctree-l2 > a { 201 | font-size: 110%; 202 | } 203 | 204 | div.sphinxsidebar input { 205 | border: 1px solid #CCC; 206 | font-family: 'goudy old style', 'minion pro', 'bell mt', Georgia, 'Hiragino Mincho Pro', serif; 207 | font-size: 1em; 208 | } 209 | 210 | div.sphinxsidebar hr { 211 | border: none; 212 | height: 1px; 213 | color: #AAA; 214 | background: #AAA; 
215 | 216 | text-align: left; 217 | margin-left: 0; 218 | width: 50%; 219 | } 220 | 221 | /* -- body styles ----------------------------------------------------------- */ 222 | 223 | a { 224 | color: #004B6B; 225 | text-decoration: underline; 226 | } 227 | 228 | a:hover { 229 | color: #6D4100; 230 | text-decoration: underline; 231 | } 232 | 233 | div.body h1, 234 | div.body h2, 235 | div.body h3, 236 | div.body h4, 237 | div.body h5, 238 | div.body h6 { 239 | font-family: 'Garamond', 'Georgia', serif; 240 | font-weight: normal; 241 | margin: 30px 0px 10px 0px; 242 | padding: 0; 243 | } 244 | 245 | div.body h1 { margin-top: 0; padding-top: 0; font-size: 240%; } 246 | div.body h2 { font-size: 180%; } 247 | div.body h3 { font-size: 150%; } 248 | div.body h4 { font-size: 130%; } 249 | div.body h5 { font-size: 100%; } 250 | div.body h6 { font-size: 100%; } 251 | 252 | a.headerlink { 253 | color: #DDD; 254 | padding: 0 4px; 255 | text-decoration: none; 256 | } 257 | 258 | a.headerlink:hover { 259 | color: #444; 260 | background: #EAEAEA; 261 | } 262 | 263 | div.body p, div.body dd, div.body li { 264 | line-height: 1.4em; 265 | } 266 | 267 | div.admonition { 268 | margin: 20px 0px; 269 | padding: 10px 30px; 270 | background-color: #EEE; 271 | border: 1px solid #CCC; 272 | } 273 | 274 | div.admonition tt.xref, div.admonition code.xref, div.admonition a tt { 275 | background-color: #FBFBFB; 276 | border-bottom: 1px solid #fafafa; 277 | } 278 | 279 | div.admonition p.admonition-title { 280 | font-family: 'Garamond', 'Georgia', serif; 281 | font-weight: normal; 282 | font-size: 24px; 283 | margin: 0 0 10px 0; 284 | padding: 0; 285 | line-height: 1; 286 | } 287 | 288 | div.admonition p.last { 289 | margin-bottom: 0; 290 | } 291 | 292 | div.highlight { 293 | background-color: #fff; 294 | } 295 | 296 | dt:target, .highlight { 297 | background: #FAF3E8; 298 | } 299 | 300 | div.warning { 301 | background-color: #FCC; 302 | border: 1px solid #FAA; 303 | } 304 | 305 | div.danger { 306 | background-color: #FCC; 307 | border: 1px solid #FAA; 308 | -moz-box-shadow: 2px 2px 4px #D52C2C; 309 | -webkit-box-shadow: 2px 2px 4px #D52C2C; 310 | box-shadow: 2px 2px 4px #D52C2C; 311 | } 312 | 313 | div.error { 314 | background-color: #FCC; 315 | border: 1px solid #FAA; 316 | -moz-box-shadow: 2px 2px 4px #D52C2C; 317 | -webkit-box-shadow: 2px 2px 4px #D52C2C; 318 | box-shadow: 2px 2px 4px #D52C2C; 319 | } 320 | 321 | div.caution { 322 | background-color: #FCC; 323 | border: 1px solid #FAA; 324 | } 325 | 326 | div.attention { 327 | background-color: #FCC; 328 | border: 1px solid #FAA; 329 | } 330 | 331 | div.important { 332 | background-color: #EEE; 333 | border: 1px solid #CCC; 334 | } 335 | 336 | div.note { 337 | background-color: #EEE; 338 | border: 1px solid #CCC; 339 | } 340 | 341 | div.tip { 342 | background-color: #EEE; 343 | border: 1px solid #CCC; 344 | } 345 | 346 | div.hint { 347 | background-color: #EEE; 348 | border: 1px solid #CCC; 349 | } 350 | 351 | div.seealso { 352 | background-color: #EEE; 353 | border: 1px solid #CCC; 354 | } 355 | 356 | div.topic { 357 | background-color: #EEE; 358 | } 359 | 360 | p.admonition-title { 361 | display: inline; 362 | } 363 | 364 | p.admonition-title:after { 365 | content: ":"; 366 | } 367 | 368 | pre, tt, code { 369 | font-family: 'Consolas', 'Menlo', 'Deja Vu Sans Mono', 'Bitstream Vera Sans Mono', monospace; 370 | font-size: 0.9em; 371 | } 372 | 373 | .hll { 374 | background-color: #FFC; 375 | margin: 0 -12px; 376 | padding: 0 12px; 377 | display: block; 378 | } 379 | 
380 | img.screenshot { 381 | } 382 | 383 | tt.descname, tt.descclassname, code.descname, code.descclassname { 384 | font-size: 0.95em; 385 | } 386 | 387 | tt.descname, code.descname { 388 | padding-right: 0.08em; 389 | } 390 | 391 | img.screenshot { 392 | -moz-box-shadow: 2px 2px 4px #EEE; 393 | -webkit-box-shadow: 2px 2px 4px #EEE; 394 | box-shadow: 2px 2px 4px #EEE; 395 | } 396 | 397 | table.docutils { 398 | border: 1px solid #888; 399 | -moz-box-shadow: 2px 2px 4px #EEE; 400 | -webkit-box-shadow: 2px 2px 4px #EEE; 401 | box-shadow: 2px 2px 4px #EEE; 402 | } 403 | 404 | table.docutils td, table.docutils th { 405 | border: 1px solid #888; 406 | padding: 0.25em 0.7em; 407 | } 408 | 409 | table.field-list, table.footnote { 410 | border: none; 411 | -moz-box-shadow: none; 412 | -webkit-box-shadow: none; 413 | box-shadow: none; 414 | } 415 | 416 | table.footnote { 417 | margin: 15px 0; 418 | width: 100%; 419 | border: 1px solid #EEE; 420 | background: #FDFDFD; 421 | font-size: 0.9em; 422 | } 423 | 424 | table.footnote + table.footnote { 425 | margin-top: -15px; 426 | border-top: none; 427 | } 428 | 429 | table.field-list th { 430 | padding: 0 0.8em 0 0; 431 | } 432 | 433 | table.field-list td { 434 | padding: 0; 435 | } 436 | 437 | table.field-list p { 438 | margin-bottom: 0.8em; 439 | } 440 | 441 | /* Cloned from 442 | * https://github.com/sphinx-doc/sphinx/commit/ef60dbfce09286b20b7385333d63a60321784e68 443 | */ 444 | .field-name { 445 | -moz-hyphens: manual; 446 | -ms-hyphens: manual; 447 | -webkit-hyphens: manual; 448 | hyphens: manual; 449 | } 450 | 451 | table.footnote td.label { 452 | width: .1px; 453 | padding: 0.3em 0 0.3em 0.5em; 454 | } 455 | 456 | table.footnote td { 457 | padding: 0.3em 0.5em; 458 | } 459 | 460 | dl { 461 | margin: 0; 462 | padding: 0; 463 | } 464 | 465 | dl dd { 466 | margin-left: 30px; 467 | } 468 | 469 | blockquote { 470 | margin: 0 0 0 30px; 471 | padding: 0; 472 | } 473 | 474 | ul, ol { 475 | /* Matches the 30px from the narrow-screen "li > ul" selector below */ 476 | margin: 10px 0 10px 30px; 477 | padding: 0; 478 | } 479 | 480 | pre { 481 | background: #EEE; 482 | padding: 7px 30px; 483 | margin: 15px 0px; 484 | line-height: 1.3em; 485 | } 486 | 487 | div.viewcode-block:target { 488 | background: #ffd; 489 | } 490 | 491 | dl pre, blockquote pre, li pre { 492 | margin-left: 0; 493 | padding-left: 30px; 494 | } 495 | 496 | tt, code { 497 | background-color: #ecf0f3; 498 | color: #222; 499 | /* padding: 1px 2px; */ 500 | } 501 | 502 | tt.xref, code.xref, a tt { 503 | background-color: #FBFBFB; 504 | border-bottom: 1px solid #fff; 505 | } 506 | 507 | a.reference { 508 | text-decoration: none; 509 | border-bottom: 1px dotted #004B6B; 510 | } 511 | 512 | /* Don't put an underline on images */ 513 | a.image-reference, a.image-reference:hover { 514 | border-bottom: none; 515 | } 516 | 517 | a.reference:hover { 518 | border-bottom: 1px solid #6D4100; 519 | } 520 | 521 | a.footnote-reference { 522 | text-decoration: none; 523 | font-size: 0.7em; 524 | vertical-align: top; 525 | border-bottom: 1px dotted #004B6B; 526 | } 527 | 528 | a.footnote-reference:hover { 529 | border-bottom: 1px solid #6D4100; 530 | } 531 | 532 | a:hover tt, a:hover code { 533 | background: #EEE; 534 | } 535 | 536 | 537 | @media screen and (max-width: 870px) { 538 | 539 | div.sphinxsidebar { 540 | display: none; 541 | } 542 | 543 | div.document { 544 | width: 100%; 545 | 546 | } 547 | 548 | div.documentwrapper { 549 | margin-left: 0; 550 | margin-top: 0; 551 | margin-right: 0; 552 | 
margin-bottom: 0; 553 | } 554 | 555 | div.bodywrapper { 556 | margin-top: 0; 557 | margin-right: 0; 558 | margin-bottom: 0; 559 | margin-left: 0; 560 | } 561 | 562 | ul { 563 | margin-left: 0; 564 | } 565 | 566 | li > ul { 567 | /* Matches the 30px from the "ul, ol" selector above */ 568 | margin-left: 30px; 569 | } 570 | 571 | .document { 572 | width: auto; 573 | } 574 | 575 | .footer { 576 | width: auto; 577 | } 578 | 579 | .bodywrapper { 580 | margin: 0; 581 | } 582 | 583 | .footer { 584 | width: auto; 585 | } 586 | 587 | .github { 588 | display: none; 589 | } 590 | 591 | 592 | 593 | } 594 | 595 | 596 | 597 | @media screen and (max-width: 875px) { 598 | 599 | body { 600 | margin: 0; 601 | padding: 20px 30px; 602 | } 603 | 604 | div.documentwrapper { 605 | float: none; 606 | background: #fff; 607 | } 608 | 609 | div.sphinxsidebar { 610 | display: block; 611 | float: none; 612 | width: 102.5%; 613 | margin: 50px -30px -20px -30px; 614 | padding: 10px 20px; 615 | background: #333; 616 | color: #FFF; 617 | } 618 | 619 | div.sphinxsidebar h3, div.sphinxsidebar h4, div.sphinxsidebar p, 620 | div.sphinxsidebar h3 a { 621 | color: #fff; 622 | } 623 | 624 | div.sphinxsidebar a { 625 | color: #AAA; 626 | } 627 | 628 | div.sphinxsidebar p.logo { 629 | display: none; 630 | } 631 | 632 | div.document { 633 | width: 100%; 634 | margin: 0; 635 | } 636 | 637 | div.footer { 638 | display: none; 639 | } 640 | 641 | div.bodywrapper { 642 | margin: 0; 643 | } 644 | 645 | div.body { 646 | min-height: 0; 647 | padding: 0; 648 | } 649 | 650 | .rtd_doc_footer { 651 | display: none; 652 | } 653 | 654 | .document { 655 | width: auto; 656 | } 657 | 658 | .footer { 659 | width: auto; 660 | } 661 | 662 | .footer { 663 | width: auto; 664 | } 665 | 666 | .github { 667 | display: none; 668 | } 669 | } 670 | 671 | 672 | /* misc. */ 673 | 674 | .revsys-inline { 675 | display: none!important; 676 | } 677 | 678 | /* Make nested-list/multi-paragraph items look better in Releases changelog 679 | * pages. Without this, docutils' magical list fuckery causes inconsistent 680 | * formatting between different release sub-lists. 681 | */ 682 | div#changelog > div.section > ul > li > p:only-child { 683 | margin-bottom: 0; 684 | } 685 | 686 | /* Hide fugly table cell borders in ..bibliography:: directive output */ 687 | table.docutils.citation, table.docutils.citation td, table.docutils.citation th { 688 | border: none; 689 | /* Below needed in some edge cases; if not applied, bottom shadows appear */ 690 | -moz-box-shadow: none; 691 | -webkit-box-shadow: none; 692 | box-shadow: none; 693 | } -------------------------------------------------------------------------------- /pyspark_dist_explore/pyspark_dist_explore.py: -------------------------------------------------------------------------------- 1 | from scipy.interpolate import interp1d 2 | 3 | try: 4 | from pyspark.sql.types import NumericType 5 | 6 | import pyspark.sql.functions as F 7 | except: 8 | pass 9 | 10 | import pandas as pd 11 | import numpy as np 12 | 13 | import matplotlib.pyplot as plt 14 | from matplotlib.patches import Rectangle 15 | 16 | 17 | def hist(axis, x, overlapping=False, formatted_yaxis=True, **kwargs): 18 | """Plots a histogram on an Axis object 19 | 20 | Args: 21 | :axis: (`Axes`) 22 | An matplotlib Axes object on which the histogram will be plot. 
23 | :x: (`DataFrame` or `list` of `DataFrame`) 24 | A DataFrame with one or more numerical columns, or a list of single numerical column DataFrames 25 | :overlapping: (`bool`, optional) 26 | Generate overlapping histograms. 27 | 28 | If set to true, this will generate an overlapping plot. 29 | When set to False it will generate a normal grouped histogram. Defaults to False. 30 | :formatted_yaxis: (`bool`, optional) 31 | If set to true, the numbers on the yaxis will be formatted 32 | for better readability. E.g. 1500000 will become 1.5M. Defaults to True 33 | 34 | :\*\*kwargs: 35 | The keyword arguments as used in matplotlib.pyplot.hist 36 | 37 | Returns: 38 | :n: (`array` or `list` of `arrays`) 39 | The values of the histogram bins. See normed and weights for a description of the possible semantics. 40 | If input x is an array, then this is an array of length nbins. If input is a sequence arrays 41 | [data1, data2,..], then this is a list of arrays with the values of the histograms for each of the 42 | arrays in the same order. 43 | :bins: (`array`) 44 | The edges of the bins. 45 | Length nbins + 1 (nbins left edges and right edge of last bin). Always a single array even 46 | when multiple data sets are passed in. 47 | :patches: (`list` or `list` of `lists`) 48 | Silent list of individual patches used to create the histogram or list of such lists if multiple 49 | input datasets. 50 | 51 | """ 52 | histogram = create_histogram_object(kwargs) 53 | histogram.add_data(x) 54 | return histogram.plot_hist(axis, overlapping, formatted_yaxis, **kwargs) 55 | 56 | 57 | def distplot(axis, x, **kwargs): 58 | """Plots a normalised histogram and a density plot on an Axes object 59 | 60 | Args: 61 | :axis: (`Axes`) 62 | An matplotlib Axes object on which the histogram will be plot. 63 | :x: (`DataFrame` or `list` of `DataFrame`) 64 | A DataFrame with one or more numerical columns, or a list of single numerical column DataFrames 65 | :\*\*kwargs: 66 | The keyword arguments as used in matplotlib.pyplot.hist. Normed is set to True 67 | 68 | Returns: 69 | :n: (`array` or `list` of `arrays`) 70 | The values of the histogram bins. See normed and weights for a description of the possible semantics. 71 | If input x is an array, then this is an array of length nbins. If input is a sequence arrays 72 | [data1, data2,..], then this is a list of arrays with the values of the histograms for each of the 73 | arrays in the same order. 74 | :bins: (`array`) 75 | The edges of the bins. 76 | Length nbins + 1 (nbins left edges and right edge of last bin). Always a single array even 77 | when multiple data sets are passed in. 78 | :patches: (`list` or `list` of `lists`) 79 | Silent list of individual patches used to create the histogram or list of such lists if multiple 80 | input datasets. 81 | """ 82 | histogram = create_histogram_object(kwargs) 83 | histogram.add_data(x) 84 | n, bins, patches = histogram.plot_hist(axis, density=True, **kwargs) 85 | 86 | # If working with a list of DataFrames as input, patches will be a list of lists with Rectangle objects 87 | # We will get the color of the first Rectangle object. If there is only one DataFrame patches is a single list 88 | # Of Rectangle objects 89 | if type(x) == list and len(x) > 1: 90 | colors = [patch[0].get_facecolor() for patch in patches] 91 | elif type(patches[0]) is Rectangle: 92 | colors = [patches[0].get_facecolor()] 93 | else: 94 | raise TypeError("Unexpected Patch Type. 
Expected Rectangle") 95 | 96 | histogram.plot_density(axis, color=colors) 97 | return n, bins, patches 98 | 99 | 100 | def pandas_histogram(x, bins=10, range=None): 101 | """Returns a pandas DataFrame with histograms of the Spark DataFrame 102 | 103 | Bin ranges are formatted as text an put on the Index. 104 | 105 | Args: 106 | :x: (`DataFrame` or `list` of `DataFrame`) 107 | A DataFrame with one or more numerical columns, or a list of single numerical column DataFrames 108 | :bins: (`integer` or `array_like`, optional) 109 | If an integer is given, bins + 1 bin edges are returned, consistently with numpy.histogram() for 110 | numpy version >= 1.3. 111 | 112 | Unequally spaced bins are supported if bins is a sequence. 113 | 114 | Default is 10 115 | :range: (tuple or None, optional) 116 | The lower and upper range of the bins. Lower and upper outliers are ignored. 117 | If not provided, range is (x.min(), x.max()). Range has no effect if bins is a sequence. 118 | 119 | If bins is a sequence or range is specified, autoscaling is based on the specified bin range instead 120 | of the range of x. 121 | 122 | Default is None 123 | """ 124 | histogram = Histogram(bins=bins, range=range) 125 | histogram.add_data(x) 126 | return histogram.to_pandas() 127 | 128 | 129 | def create_histogram_object(kwargs): 130 | bins = 10 131 | b_range = None 132 | 133 | if 'bins' in kwargs: 134 | bins = kwargs['bins'] 135 | del kwargs['bins'] 136 | 137 | if 'range' in kwargs: 138 | b_range = kwargs['range'] 139 | del kwargs['range'] 140 | 141 | return Histogram(bins=bins, range=b_range) 142 | 143 | 144 | class Histogram(object): 145 | """The Histogram object leverages Spark to calculate histograms, and matplotlib to visualize these. 146 | 147 | Args: 148 | :range: (`tuple`, optional) 149 | The lower and upper range of the bins. 150 | 151 | Lower and upper outliers are ignored. If not provided, range is (min(x), max(x)). Range has no 152 | effect if bins is a sequence. If bins is a sequence or range is specified, autoscaling is 153 | based on the specified bin range instead of the range of x. 154 | :bins: (`int` or `list` of `str` or `list of `int`, optional) 155 | If an integer is given: Number of bins in the histogram. 156 | 157 | Defaults to 10. 158 | 159 | If a list is given: Predefined list of bin boundaries. 160 | 161 | The bins are all open to the right except for the last which is closed. e.g. [1,10,20,50] means 162 | the buckets are [1,10) [10,20) [20,50], which means 1<=x<10, 10<=x<20, 20<=x<=50. 163 | 164 | """ 165 | def __init__(self, bins=10, range=None): 166 | self.col_list = [] 167 | self.bin_boundaries = [] 168 | self.hist_dict = {} # column names: bin weight lists pairs 169 | self.nr_bins = None 170 | self.min_value = None 171 | self.max_value = None 172 | self.is_build = False 173 | 174 | if isinstance(bins, list): 175 | self.bin_boundaries = [float(bin_border) for bin_border in bins] 176 | else: 177 | self.nr_bins = bins 178 | 179 | if range is not None: 180 | self.min_value = range[0] 181 | self.max_value = range[1] 182 | 183 | def add_column(self, table): 184 | """Add single column DataFrame to the histogram object. 185 | 186 | If multiple columns share the same name, a (n) will be appended to the name, where n is 187 | the next available number. 
188 | 
189 |         Args:
190 |             :table: (:obj:`dataframe`)
191 |                 A PySpark DataFrame with a single column
192 | 
193 |         """
194 |         if len(table.columns) > 1:
195 |             raise ValueError('More than one column is being added, use add_data() to add multi-column DataFrames')
196 | 
197 |         column_name = table.columns[0]
198 | 
199 |         if not isinstance(table.schema.fields[0].dataType, NumericType):
200 |             raise ValueError('Column %s has a non-numeric type (%s), only numeric types are supported'
201 |                              % (column_name, str(table.schema.fields[0].dataType)))
202 | 
203 |         self.col_list.append((table, column_name))
204 | 
205 |     def _get_bin_centers(self):
206 |         result = []
207 |         for i in range(len(self.bin_boundaries) - 1):
208 |             result.append(((self.bin_boundaries[i + 1] - self.bin_boundaries[i]) / 2) + self.bin_boundaries[i])
209 |         return result
210 | 
211 |     def _get_col_names(self):
212 |         new_col_names = []
213 |         for i in range(len(self.bin_boundaries) - 1):
214 |             new_col_names.append('%.2f - %.2f' % (self.bin_boundaries[i], self.bin_boundaries[i + 1]))
215 |         return new_col_names
216 | 
217 |     def _check_col_name(self, column_name):
218 |         n = 0
219 |         col_name_new = column_name
220 |         while col_name_new in self.hist_dict.keys():
221 |             n += 1
222 |             col_name_new = '%s (%d)' % (column_name, n)
223 |         return col_name_new
224 | 
225 |     def _get_min_value(self):
226 |         if self.min_value is not None:
227 |             return self.min_value
228 |         return min([table.select(F.min(F.col(col_name))).collect()[0][0]
229 |                     for table, col_name in self.col_list])
230 | 
231 |     def _get_max_value(self):
232 |         if self.max_value is not None:
233 |             return self.max_value
234 |         return max([table.select(F.max(F.col(col_name))).collect()[0][0]
235 |                     for table, col_name in self.col_list])
236 | 
237 |     def _calculate_bins(self):
238 |         if len(self.bin_boundaries) > 0:
239 |             return self.bin_boundaries
240 | 
241 |         if len(self.bin_boundaries) == 0 and len(self.col_list) == 1 \
242 |                 and self.min_value is None and self.max_value is None:
243 |             # Only use the number of bins as input for the histogram function
244 |             return self.nr_bins
245 | 
246 |         min_value = self._get_min_value()
247 |         max_value = self._get_max_value()
248 | 
249 |         # expand empty range to avoid empty graph
250 |         return Histogram._calc_n_bins_between(min_value, max_value, self.nr_bins)
251 | 
252 |     def _add_hist(self, table, column_name):
253 |         """Uses Spark to calculate the hist values: for each column a list of weights, and if the bin_list is not set
254 |         a set of bin boundaries"""
255 |         bin_boundaries, bin_weights = table.select(column_name).rdd.flatMap(lambda x: x).histogram(self.bin_boundaries)
256 |         self.hist_dict[self._check_col_name(column_name)] = bin_weights
257 | 
258 |         if isinstance(self.bin_boundaries, int):  # the bin_list is not set
259 |             if len(bin_boundaries) == 2 and bin_boundaries[0] == bin_boundaries[1]:
260 |                 # In case of a column with 1 unique value we need to calculate the histogram ourselves.
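                # For example, a column holding only the value 3.0 makes rdd.histogram() return two identical
                # boundaries (the case checked above); _calc_n_bins_between() then widens that range by 0.5 on each
                # side and _calc_weights() puts the full count into the bin containing the value, so the plot is not empty.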
261 |                 min_value = bin_boundaries[0]
262 |                 max_value = bin_boundaries[1]
263 |                 self.bin_boundaries = self._calc_n_bins_between(min_value, max_value, self.nr_bins)
264 |                 self.hist_dict[column_name] = Histogram._calc_weights(self.bin_boundaries, min_value, bin_weights)
265 |             else:
266 |                 self.bin_boundaries = bin_boundaries
267 | 
268 |     @staticmethod
269 |     def _calc_n_bins_between(min_value, max_value, nr_bins):
270 |         """Returns a list of bin borders between min_value and max_value"""
271 |         if min_value == max_value:
272 |             min_value = min_value - 0.5
273 |             max_value = max_value + 0.5
274 |         step = (float(max_value) - float(min_value)) / nr_bins
275 |         return [min_value + (step * float(bn_nr)) for bn_nr in range(nr_bins + 1)]
276 | 
277 |     @staticmethod
278 |     def _calc_weights(bins, value, value_count):
279 |         """Calculate weights given a bin list, value within that bin list and a count"""
280 |         # first we get a list of bin boundary tuples
281 |         weights = list()
282 |         bin_boundary_idx = [(idx, idx+2) for idx in range(len(bins)-1)]
283 |         bin_boundaries = [tuple(bins[left_idx:right_idx]) for (left_idx, right_idx) in bin_boundary_idx]
284 |         for left_boundary, right_boundary in bin_boundaries:
285 |             if left_boundary <= value < right_boundary:
286 |                 weights.append(value_count[0])
287 |             else:
288 |                 weights.append(0)
289 |         return weights
290 | 
291 |     @staticmethod
292 |     def _convert_number_bmk(axis_value, _):
293 |         """Converts the values on axes to Billions, Millions or Thousands"""
294 |         if axis_value >= 1e9:
295 |             return '{:1.1f}B'.format(axis_value * 1e-9)
296 |         if axis_value >= 1e6:
297 |             return '{:1.1f}M'.format(axis_value * 1e-6)
298 |         if axis_value >= 1e3:
299 |             return '{:1.1f}K'.format(axis_value * 1e-3)
300 |         if axis_value >= 1 or axis_value == 0:
301 |             return '{:1.0f}'.format(axis_value)
302 |         return axis_value
303 | 
304 |     def build(self):
305 |         """Calculates the histogram values for each of the columns.
306 | 
307 |         If the Histogram has already been built, it doesn't build it again.
308 |         """
309 |         if not self.is_build:
310 |             self.bin_boundaries = self._calculate_bins()
311 |             for table, column_name in self.col_list:
312 |                 self._add_hist(table, column_name)
313 |             self.is_build = True
314 | 
315 |     def to_pandas(self, kind='hist'):
316 |         """Returns a pandas dataframe from the Histogram object.
317 | 
318 |         This function calculates the histogram in Spark if that has not been done yet.
319 | 
320 |         Args:
321 |             :kind: (:obj:`str`, optional):
322 |                 'hist' or 'density'. When using hist this returns the histogram object
323 |                 as pandas dataframe. When using density the index contains the bin centers, and the values in the
324 |                 DataFrame are the scaled values. Defaults to 'hist'
325 | 
326 |         Returns:
327 |             A pandas DataFrame from the Histogram object.
328 |         """
329 |         self.build()
330 |         if kind == 'hist':
331 |             return pd.DataFrame(self.hist_dict).set_index([self._get_col_names()])
332 |         elif kind == 'density':
333 |             result = pd.DataFrame(self.hist_dict).set_index([self._get_bin_centers()])
334 |             return result.apply(lambda x: x / x.max(), axis=0)
335 | 
336 |     def plot_hist(self, ax, overlapping=False, formatted_yaxis=True, **kwargs):
337 |         """Returns a matplotlib style histogram (matplotlib.pyplot.hist)
338 | 
339 |         Uses the matplotlib object oriented interface to add a Histogram to a matplotlib Axes object.
340 |         All named arguments from pyplot.hist can be used. A new argument called "overlapping" makes it possible to
341 |         make overlapping histogram plots.
342 | 
343 |         Args:
344 |             :ax: (`Axes`)
345 |                 A matplotlib Axes object on which the histogram will be plotted
346 |             :overlapping: (`bool`, optional)
347 |                 If set to true, this will generate an overlapping plot.
348 |                 When set to False it will generate a normal grouped histogram. Defaults to False.
349 |             :formatted_yaxis: (`bool`, optional)
350 |                 If set to true, the numbers on the y-axis will be formatted
351 |                 for better readability. E.g. 1500000 will become 1.5M. Defaults to True.
352 |             :**kwargs:
353 |                 The keyword arguments as used in matplotlib.pyplot.hist
354 |         """
355 |         self.build()
356 | 
357 |         if formatted_yaxis:
358 |             # Round the y-axis value to nearest thousand, million, or billion for readable y-axis
359 |             formatter = plt.FuncFormatter(Histogram._convert_number_bmk)
360 |             ax.yaxis.set_major_formatter(formatter)
361 | 
362 |         if overlapping:
363 |             for colname in self.hist_dict:
364 |                 ax.hist(self._get_bin_centers(),
365 |                         bins=self.bin_boundaries,
366 |                         alpha=0.5,
367 |                         label=colname,
368 |                         weights=self.hist_dict[colname],
369 |                         **kwargs
370 |                         )
371 |         else:
372 |             weights_multi = [self.hist_dict[colname] for colname in self.hist_dict]
373 |             return ax.hist([self._get_bin_centers()] * len(self.hist_dict),
374 |                            bins=self.bin_boundaries,
375 |                            weights=weights_multi,
376 |                            label=self.hist_dict.keys(),
377 |                            **kwargs)
378 | 
379 |     def plot_density(self, ax, num=300, **kwargs):
380 |         """Returns a density plot on a Pyplot Axes object.
381 | 
382 |         Args:
383 |             :ax: (`Axes`)
384 |                 A matplotlib Axes object on which the density plot will be plotted
385 |             :num: (`int`)
386 |                 The number of x values the line is plotted on. Default: 300
387 |             :**kwargs:
388 |                 Keyword arguments that are passed on to the pyplot.plot function.
389 |         """
390 |         colors = []
391 | 
392 |         self.build()
393 |         bin_centers = np.asarray(self._get_bin_centers())
394 |         x_new = np.linspace(bin_centers.min(), bin_centers.max(), num)
395 | 
396 |         if 'color' in kwargs:
397 |             colors = kwargs['color']
398 |             del kwargs['color']
399 | 
400 |         power_smooth = []
401 | 
402 |         for (colname, bin_values) in self.hist_dict.items():
403 |             normed_values, ble = np.histogram(self._get_bin_centers(),
404 |                                               bins=self.bin_boundaries,
405 |                                               weights=bin_values,
406 |                                               density=True
407 |                                               )
408 |             interpolation_function = interp1d(bin_centers, normed_values, kind='quadratic')
409 | 
410 |             power_smooth.append(x_new)
411 |             power_smooth.append(interpolation_function(x_new))
412 | 
413 |         lines = ax.plot(*power_smooth, **kwargs)
414 | 
415 |         for i, line in enumerate(lines):
416 |             if len(colors) > 0:
417 |                 plt.setp(line, color=colors[i], label=list(self.hist_dict.keys())[i])
418 |             else:
419 |                 plt.setp(line, label=list(self.hist_dict.keys())[i])
420 | 
421 |         return lines
422 | 
423 |     def add_data(self, data):
424 |         """Adds one or more columns to a histogram.
425 | 
426 |         Multiple options are available:
427 |             * Add a single column dataframe
428 |             * Add a list of single column dataframes
429 |             * Add a dataframe with multiple columns
430 | 
431 |         Args:
432 |             :data:
433 |                 A single column Spark dataframe, a list of single column Spark
434 |                 dataframes, or a multi column Spark dataframe.
435 |         """
436 |         if isinstance(data, list):
437 |             for df_column in data:
438 |                 self.add_column(df_column)
439 | 
440 |         elif len(data.columns) > 1:
441 |             for col_name in data.columns:
442 |                 self.add_column(data.select(col_name))
443 | 
444 |         else:
445 |             self.add_column(data)
446 | 
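A minimal usage sketch of the public functions above, assuming a running SparkSession; the data and column names below are made up for illustration and are not part of the repository source:

import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
from pyspark_dist_explore import hist, distplot, pandas_histogram

spark = SparkSession.builder.getOrCreate()

# Two single-column DataFrames with numeric values (illustrative data)
df_a = spark.createDataFrame([(float(i),) for i in range(100)], ['age'])
df_b = spark.createDataFrame([(float(i) / 2,) for i in range(100)], ['age'])

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))

# Grouped histogram of both DataFrames; bins/range are stripped from kwargs by create_histogram_object()
hist(ax1, [df_a, df_b], bins=20, color=['red', 'tan'])
ax1.set_title('hist()')
ax1.legend()

# Normalised histogram plus an interpolated density line per column
distplot(ax2, [df_a, df_b], bins=20)
ax2.set_title('distplot()')
ax2.legend()

# Text version: bin ranges on the index, one column of counts per input DataFrame
print(pandas_histogram([df_a, df_b], bins=10))

plt.show()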
--------------------------------------------------------------------------------
/docs/build/html/index.html:
--------------------------------------------------------------------------------
[Sphinx-built HTML page: "Welcome to pyspark_histogram's documentation! — pyspark_dist_explore 0.1.0 documentation". Auto-generated API reference for pyspark_dist_explore.hist(), distplot(), pandas_histogram() and the Histogram class, rendered from the docstrings in pyspark_dist_explore/pyspark_dist_explore.py above.]
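As an illustration of the object-oriented workflow that the reference page above documents (a sketch: it assumes a running SparkSession, and the DataFrame contents and column names are invented):

import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
from pyspark_dist_explore import Histogram

spark = SparkSession.builder.getOrCreate()

# Invented example data: two age-like numeric columns in one DataFrame
df = spark.createDataFrame(
    [(20.0 + (i % 60), 18.0 + (i % 70)) for i in range(200)],
    ['age', 'age_partner']
)

# Explicit bin boundaries: buckets [0, 18) [18, 35) [35, 65) [65, 120], the last one closed
h = Histogram(bins=[0, 18, 35, 65, 120])
h.add_data(df)                      # multi-column DataFrame: every column is added

fig, ax = plt.subplots()
h.plot_hist(ax, overlapping=True)   # the first plotting call triggers build(), i.e. the Spark computation
h.plot_density(ax)                  # one interpolated density line per column
ax.legend()

print(h.to_pandas())                # bin counts, with the formatted bin ranges as index
print(h.to_pandas(kind='density'))  # scaled values, with the bin centres as index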
411 | 422 | 423 | 424 | 425 | 426 | 427 | -------------------------------------------------------------------------------- /docs/build/html/_static/searchtools.js: -------------------------------------------------------------------------------- 1 | /* 2 | * searchtools.js_t 3 | * ~~~~~~~~~~~~~~~~ 4 | * 5 | * Sphinx JavaScript utilities for the full-text search. 6 | * 7 | * :copyright: Copyright 2007-2017 by the Sphinx team, see AUTHORS. 8 | * :license: BSD, see LICENSE for details. 9 | * 10 | */ 11 | 12 | 13 | /* Non-minified version JS is _stemmer.js if file is provided */ 14 | /** 15 | * Porter Stemmer 16 | */ 17 | var Stemmer = function() { 18 | 19 | var step2list = { 20 | ational: 'ate', 21 | tional: 'tion', 22 | enci: 'ence', 23 | anci: 'ance', 24 | izer: 'ize', 25 | bli: 'ble', 26 | alli: 'al', 27 | entli: 'ent', 28 | eli: 'e', 29 | ousli: 'ous', 30 | ization: 'ize', 31 | ation: 'ate', 32 | ator: 'ate', 33 | alism: 'al', 34 | iveness: 'ive', 35 | fulness: 'ful', 36 | ousness: 'ous', 37 | aliti: 'al', 38 | iviti: 'ive', 39 | biliti: 'ble', 40 | logi: 'log' 41 | }; 42 | 43 | var step3list = { 44 | icate: 'ic', 45 | ative: '', 46 | alize: 'al', 47 | iciti: 'ic', 48 | ical: 'ic', 49 | ful: '', 50 | ness: '' 51 | }; 52 | 53 | var c = "[^aeiou]"; // consonant 54 | var v = "[aeiouy]"; // vowel 55 | var C = c + "[^aeiouy]*"; // consonant sequence 56 | var V = v + "[aeiou]*"; // vowel sequence 57 | 58 | var mgr0 = "^(" + C + ")?" + V + C; // [C]VC... is m>0 59 | var meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$"; // [C]VC[V] is m=1 60 | var mgr1 = "^(" + C + ")?" + V + C + V + C; // [C]VCVC... is m>1 61 | var s_v = "^(" + C + ")?" + v; // vowel in stem 62 | 63 | this.stemWord = function (w) { 64 | var stem; 65 | var suffix; 66 | var firstch; 67 | var origword = w; 68 | 69 | if (w.length < 3) 70 | return w; 71 | 72 | var re; 73 | var re2; 74 | var re3; 75 | var re4; 76 | 77 | firstch = w.substr(0,1); 78 | if (firstch == "y") 79 | w = firstch.toUpperCase() + w.substr(1); 80 | 81 | // Step 1a 82 | re = /^(.+?)(ss|i)es$/; 83 | re2 = /^(.+?)([^s])s$/; 84 | 85 | if (re.test(w)) 86 | w = w.replace(re,"$1$2"); 87 | else if (re2.test(w)) 88 | w = w.replace(re2,"$1$2"); 89 | 90 | // Step 1b 91 | re = /^(.+?)eed$/; 92 | re2 = /^(.+?)(ed|ing)$/; 93 | if (re.test(w)) { 94 | var fp = re.exec(w); 95 | re = new RegExp(mgr0); 96 | if (re.test(fp[1])) { 97 | re = /.$/; 98 | w = w.replace(re,""); 99 | } 100 | } 101 | else if (re2.test(w)) { 102 | var fp = re2.exec(w); 103 | stem = fp[1]; 104 | re2 = new RegExp(s_v); 105 | if (re2.test(stem)) { 106 | w = stem; 107 | re2 = /(at|bl|iz)$/; 108 | re3 = new RegExp("([^aeiouylsz])\\1$"); 109 | re4 = new RegExp("^" + C + v + "[^aeiouwxy]$"); 110 | if (re2.test(w)) 111 | w = w + "e"; 112 | else if (re3.test(w)) { 113 | re = /.$/; 114 | w = w.replace(re,""); 115 | } 116 | else if (re4.test(w)) 117 | w = w + "e"; 118 | } 119 | } 120 | 121 | // Step 1c 122 | re = /^(.+?)y$/; 123 | if (re.test(w)) { 124 | var fp = re.exec(w); 125 | stem = fp[1]; 126 | re = new RegExp(s_v); 127 | if (re.test(stem)) 128 | w = stem + "i"; 129 | } 130 | 131 | // Step 2 132 | re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/; 133 | if (re.test(w)) { 134 | var fp = re.exec(w); 135 | stem = fp[1]; 136 | suffix = fp[2]; 137 | re = new RegExp(mgr0); 138 | if (re.test(stem)) 139 | w = stem + step2list[suffix]; 140 | } 141 | 142 | // Step 3 143 | re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/; 144 | 
if (re.test(w)) { 145 | var fp = re.exec(w); 146 | stem = fp[1]; 147 | suffix = fp[2]; 148 | re = new RegExp(mgr0); 149 | if (re.test(stem)) 150 | w = stem + step3list[suffix]; 151 | } 152 | 153 | // Step 4 154 | re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/; 155 | re2 = /^(.+?)(s|t)(ion)$/; 156 | if (re.test(w)) { 157 | var fp = re.exec(w); 158 | stem = fp[1]; 159 | re = new RegExp(mgr1); 160 | if (re.test(stem)) 161 | w = stem; 162 | } 163 | else if (re2.test(w)) { 164 | var fp = re2.exec(w); 165 | stem = fp[1] + fp[2]; 166 | re2 = new RegExp(mgr1); 167 | if (re2.test(stem)) 168 | w = stem; 169 | } 170 | 171 | // Step 5 172 | re = /^(.+?)e$/; 173 | if (re.test(w)) { 174 | var fp = re.exec(w); 175 | stem = fp[1]; 176 | re = new RegExp(mgr1); 177 | re2 = new RegExp(meq1); 178 | re3 = new RegExp("^" + C + v + "[^aeiouwxy]$"); 179 | if (re.test(stem) || (re2.test(stem) && !(re3.test(stem)))) 180 | w = stem; 181 | } 182 | re = /ll$/; 183 | re2 = new RegExp(mgr1); 184 | if (re.test(w) && re2.test(w)) { 185 | re = /.$/; 186 | w = w.replace(re,""); 187 | } 188 | 189 | // and turn initial Y back to y 190 | if (firstch == "y") 191 | w = firstch.toLowerCase() + w.substr(1); 192 | return w; 193 | } 194 | } 195 | 196 | 197 | 198 | /** 199 | * Simple result scoring code. 200 | */ 201 | var Scorer = { 202 | // Implement the following function to further tweak the score for each result 203 | // The function takes a result array [filename, title, anchor, descr, score] 204 | // and returns the new score. 205 | /* 206 | score: function(result) { 207 | return result[4]; 208 | }, 209 | */ 210 | 211 | // query matches the full name of an object 212 | objNameMatch: 11, 213 | // or matches in the last dotted part of the object name 214 | objPartialMatch: 6, 215 | // Additive scores depending on the priority of the object 216 | objPrio: {0: 15, // used to be importantResults 217 | 1: 5, // used to be objectResults 218 | 2: -5}, // used to be unimportantResults 219 | // Used when the priority is not in the mapping. 
220 | objPrioDefault: 0, 221 | 222 | // query found in title 223 | title: 15, 224 | // query found in terms 225 | term: 5 226 | }; 227 | 228 | 229 | 230 | 231 | 232 | var splitChars = (function() { 233 | var result = {}; 234 | var singles = [96, 180, 187, 191, 215, 247, 749, 885, 903, 907, 909, 930, 1014, 1648, 235 | 1748, 1809, 2416, 2473, 2481, 2526, 2601, 2609, 2612, 2615, 2653, 2702, 236 | 2706, 2729, 2737, 2740, 2857, 2865, 2868, 2910, 2928, 2948, 2961, 2971, 237 | 2973, 3085, 3089, 3113, 3124, 3213, 3217, 3241, 3252, 3295, 3341, 3345, 238 | 3369, 3506, 3516, 3633, 3715, 3721, 3736, 3744, 3748, 3750, 3756, 3761, 239 | 3781, 3912, 4239, 4347, 4681, 4695, 4697, 4745, 4785, 4799, 4801, 4823, 240 | 4881, 5760, 5901, 5997, 6313, 7405, 8024, 8026, 8028, 8030, 8117, 8125, 241 | 8133, 8181, 8468, 8485, 8487, 8489, 8494, 8527, 11311, 11359, 11687, 11695, 242 | 11703, 11711, 11719, 11727, 11735, 12448, 12539, 43010, 43014, 43019, 43587, 243 | 43696, 43713, 64286, 64297, 64311, 64317, 64319, 64322, 64325, 65141]; 244 | var i, j, start, end; 245 | for (i = 0; i < singles.length; i++) { 246 | result[singles[i]] = true; 247 | } 248 | var ranges = [[0, 47], [58, 64], [91, 94], [123, 169], [171, 177], [182, 184], [706, 709], 249 | [722, 735], [741, 747], [751, 879], [888, 889], [894, 901], [1154, 1161], 250 | [1318, 1328], [1367, 1368], [1370, 1376], [1416, 1487], [1515, 1519], [1523, 1568], 251 | [1611, 1631], [1642, 1645], [1750, 1764], [1767, 1773], [1789, 1790], [1792, 1807], 252 | [1840, 1868], [1958, 1968], [1970, 1983], [2027, 2035], [2038, 2041], [2043, 2047], 253 | [2070, 2073], [2075, 2083], [2085, 2087], [2089, 2307], [2362, 2364], [2366, 2383], 254 | [2385, 2391], [2402, 2405], [2419, 2424], [2432, 2436], [2445, 2446], [2449, 2450], 255 | [2483, 2485], [2490, 2492], [2494, 2509], [2511, 2523], [2530, 2533], [2546, 2547], 256 | [2554, 2564], [2571, 2574], [2577, 2578], [2618, 2648], [2655, 2661], [2672, 2673], 257 | [2677, 2692], [2746, 2748], [2750, 2767], [2769, 2783], [2786, 2789], [2800, 2820], 258 | [2829, 2830], [2833, 2834], [2874, 2876], [2878, 2907], [2914, 2917], [2930, 2946], 259 | [2955, 2957], [2966, 2968], [2976, 2978], [2981, 2983], [2987, 2989], [3002, 3023], 260 | [3025, 3045], [3059, 3076], [3130, 3132], [3134, 3159], [3162, 3167], [3170, 3173], 261 | [3184, 3191], [3199, 3204], [3258, 3260], [3262, 3293], [3298, 3301], [3312, 3332], 262 | [3386, 3388], [3390, 3423], [3426, 3429], [3446, 3449], [3456, 3460], [3479, 3481], 263 | [3518, 3519], [3527, 3584], [3636, 3647], [3655, 3663], [3674, 3712], [3717, 3718], 264 | [3723, 3724], [3726, 3731], [3752, 3753], [3764, 3772], [3774, 3775], [3783, 3791], 265 | [3802, 3803], [3806, 3839], [3841, 3871], [3892, 3903], [3949, 3975], [3980, 4095], 266 | [4139, 4158], [4170, 4175], [4182, 4185], [4190, 4192], [4194, 4196], [4199, 4205], 267 | [4209, 4212], [4226, 4237], [4250, 4255], [4294, 4303], [4349, 4351], [4686, 4687], 268 | [4702, 4703], [4750, 4751], [4790, 4791], [4806, 4807], [4886, 4887], [4955, 4968], 269 | [4989, 4991], [5008, 5023], [5109, 5120], [5741, 5742], [5787, 5791], [5867, 5869], 270 | [5873, 5887], [5906, 5919], [5938, 5951], [5970, 5983], [6001, 6015], [6068, 6102], 271 | [6104, 6107], [6109, 6111], [6122, 6127], [6138, 6159], [6170, 6175], [6264, 6271], 272 | [6315, 6319], [6390, 6399], [6429, 6469], [6510, 6511], [6517, 6527], [6572, 6592], 273 | [6600, 6607], [6619, 6655], [6679, 6687], [6741, 6783], [6794, 6799], [6810, 6822], 274 | [6824, 6916], [6964, 6980], [6988, 6991], [7002, 7042], [7073, 7085], 
[7098, 7167], 275 | [7204, 7231], [7242, 7244], [7294, 7400], [7410, 7423], [7616, 7679], [7958, 7959], 276 | [7966, 7967], [8006, 8007], [8014, 8015], [8062, 8063], [8127, 8129], [8141, 8143], 277 | [8148, 8149], [8156, 8159], [8173, 8177], [8189, 8303], [8306, 8307], [8314, 8318], 278 | [8330, 8335], [8341, 8449], [8451, 8454], [8456, 8457], [8470, 8472], [8478, 8483], 279 | [8506, 8507], [8512, 8516], [8522, 8525], [8586, 9311], [9372, 9449], [9472, 10101], 280 | [10132, 11263], [11493, 11498], [11503, 11516], [11518, 11519], [11558, 11567], 281 | [11622, 11630], [11632, 11647], [11671, 11679], [11743, 11822], [11824, 12292], 282 | [12296, 12320], [12330, 12336], [12342, 12343], [12349, 12352], [12439, 12444], 283 | [12544, 12548], [12590, 12592], [12687, 12689], [12694, 12703], [12728, 12783], 284 | [12800, 12831], [12842, 12880], [12896, 12927], [12938, 12976], [12992, 13311], 285 | [19894, 19967], [40908, 40959], [42125, 42191], [42238, 42239], [42509, 42511], 286 | [42540, 42559], [42592, 42593], [42607, 42622], [42648, 42655], [42736, 42774], 287 | [42784, 42785], [42889, 42890], [42893, 43002], [43043, 43055], [43062, 43071], 288 | [43124, 43137], [43188, 43215], [43226, 43249], [43256, 43258], [43260, 43263], 289 | [43302, 43311], [43335, 43359], [43389, 43395], [43443, 43470], [43482, 43519], 290 | [43561, 43583], [43596, 43599], [43610, 43615], [43639, 43641], [43643, 43647], 291 | [43698, 43700], [43703, 43704], [43710, 43711], [43715, 43738], [43742, 43967], 292 | [44003, 44015], [44026, 44031], [55204, 55215], [55239, 55242], [55292, 55295], 293 | [57344, 63743], [64046, 64047], [64110, 64111], [64218, 64255], [64263, 64274], 294 | [64280, 64284], [64434, 64466], [64830, 64847], [64912, 64913], [64968, 65007], 295 | [65020, 65135], [65277, 65295], [65306, 65312], [65339, 65344], [65371, 65381], 296 | [65471, 65473], [65480, 65481], [65488, 65489], [65496, 65497]]; 297 | for (i = 0; i < ranges.length; i++) { 298 | start = ranges[i][0]; 299 | end = ranges[i][1]; 300 | for (j = start; j <= end; j++) { 301 | result[j] = true; 302 | } 303 | } 304 | return result; 305 | })(); 306 | 307 | function splitQuery(query) { 308 | var result = []; 309 | var start = -1; 310 | for (var i = 0; i < query.length; i++) { 311 | if (splitChars[query.charCodeAt(i)]) { 312 | if (start !== -1) { 313 | result.push(query.slice(start, i)); 314 | start = -1; 315 | } 316 | } else if (start === -1) { 317 | start = i; 318 | } 319 | } 320 | if (start !== -1) { 321 | result.push(query.slice(start)); 322 | } 323 | return result; 324 | } 325 | 326 | 327 | 328 | 329 | /** 330 | * Search Module 331 | */ 332 | var Search = { 333 | 334 | _index : null, 335 | _queued_query : null, 336 | _pulse_status : -1, 337 | 338 | init : function() { 339 | var params = $.getQueryParameters(); 340 | if (params.q) { 341 | var query = params.q[0]; 342 | $('input[name="q"]')[0].value = query; 343 | this.performSearch(query); 344 | } 345 | }, 346 | 347 | loadIndex : function(url) { 348 | $.ajax({type: "GET", url: url, data: null, 349 | dataType: "script", cache: true, 350 | complete: function(jqxhr, textstatus) { 351 | if (textstatus != "success") { 352 | document.getElementById("searchindexloader").src = url; 353 | } 354 | }}); 355 | }, 356 | 357 | setIndex : function(index) { 358 | var q; 359 | this._index = index; 360 | if ((q = this._queued_query) !== null) { 361 | this._queued_query = null; 362 | Search.query(q); 363 | } 364 | }, 365 | 366 | hasIndex : function() { 367 | return this._index !== null; 368 | }, 369 | 370 | 
deferQuery : function(query) { 371 | this._queued_query = query; 372 | }, 373 | 374 | stopPulse : function() { 375 | this._pulse_status = 0; 376 | }, 377 | 378 | startPulse : function() { 379 | if (this._pulse_status >= 0) 380 | return; 381 | function pulse() { 382 | var i; 383 | Search._pulse_status = (Search._pulse_status + 1) % 4; 384 | var dotString = ''; 385 | for (i = 0; i < Search._pulse_status; i++) 386 | dotString += '.'; 387 | Search.dots.text(dotString); 388 | if (Search._pulse_status > -1) 389 | window.setTimeout(pulse, 500); 390 | } 391 | pulse(); 392 | }, 393 | 394 | /** 395 | * perform a search for something (or wait until index is loaded) 396 | */ 397 | performSearch : function(query) { 398 | // create the required interface elements 399 | this.out = $('#search-results'); 400 | this.title = $('

' + _('Searching') + '').appendTo(this.out); 401 | this.dots = $('').appendTo(this.title); 402 | this.status = $('').appendTo(this.out); 403 | this.output = $('