├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── doc ├── Makefile ├── conf.py ├── index.rst └── make.bat ├── faststat ├── __init__.py ├── _faststat.c ├── cache.py ├── faststat.py ├── format.py └── test.py ├── setup.py └── test └── biased_quantile_stream.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | 21 | # Installer logs 22 | pip-log.txt 23 | 24 | # Unit test / coverage reports 25 | .coverage 26 | .tox 27 | nosetests.xml 28 | 29 | # Translations 30 | *.mo 31 | 32 | # Mr Developer 33 | .mr.developer.cfg 34 | .project 35 | .pydevproject 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013 Kurt Rose 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include faststat *.py *.c 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | faststat 2 | ======== 3 | 4 | fast online statistics collection 5 | 6 | very simple API: 7 | 8 | ```python 9 | >>> import faststat 10 | >>> stats = faststat.Stats() 11 | >>> stats.add(point) 12 | >>> stats.add(point) 13 | ... 14 | ``` 15 | 16 | 17 | The following properties are accessible on a Stats object: n, min, max, variance, skewness, and kurtosis. 18 | In addition, a Stats object tracks percentiles. 19 | 20 | Performance is pretty good: 0.63 microseconds per point on my machine, provided the C module is available. 21 | 22 | In pure Python mode, performance is about 9 microseconds per point.
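Once points have been added, the collected statistics can be read straight off the object. A minimal sketch (attribute names as listed above and in the docs; `get_percentiles()` is the percentile accessor exposed by the C extension's method table, and the printed values will vary run to run):

```python
import random

import faststat

stats = faststat.Stats()
for _ in range(10000):
    stats.add(random.normalvariate(1.0, 1.0))

print(stats.n)         # 10000
print(stats.mean)      # ~1.0
print(stats.variance)  # ~1.0
print(stats.skewness)  # ~0.0
print(stats.kurtosis)  # ~0.0
print(stats.get_percentiles())  # e.g. {0.25: ..., 0.5: ..., 0.75: ...}
```

Sample benchmark output, with the C extension first and the pure-Python fallback second: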
23 | 24 | ```python 25 | 0.615999937057 microseconds per point 26 | mean (should be 1) 0.998333953189 27 | kurtosis / reference kurtosis -0.0021881144433 -0.00220621681959 28 | variance / reference variance 0.999219190297 0.999219190297 29 | skewness (should be 0) -0.0071960817771 30 | max, min 5.83625092886 -3.4749002526 31 | m2, m3, m4 999218.191078 -7187.64448532 2993126.28574 32 | 9.00099992752 microseconds per point 33 | mean (should be 1) 0.998333953189 34 | kurtosis / reference kurtosis -0.0021881144433 -0.00220621681959 35 | variance / reference variance 0.999219190297 0.999219190297 36 | skewness (should be 0) -0.0071960817771 37 | max, min 5.83625092886 -3.4749002526 38 | m2, m3, m4 999218.191078 -7187.64448532 2993126.28574 39 | ``` 40 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | rm -rf $(BUILDDIR)/* 51 | 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | dirhtml: 58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 61 | 62 | singlehtml: 63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 64 | @echo 65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 66 | 67 | pickle: 68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 69 | @echo 70 | @echo "Build finished; now you can process the pickle files." 71 | 72 | json: 73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 74 | @echo 75 | @echo "Build finished; now you can process the JSON files." 76 | 77 | htmlhelp: 78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 79 | @echo 80 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 81 | ".hhp project file in $(BUILDDIR)/htmlhelp." 82 | 83 | qthelp: 84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 85 | @echo 86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/faststat.qhcp" 89 | @echo "To view the help file:" 90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/faststat.qhc" 91 | 92 | devhelp: 93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 94 | @echo 95 | @echo "Build finished." 96 | @echo "To view the help file:" 97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/faststat" 98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/faststat" 99 | @echo "# devhelp" 100 | 101 | epub: 102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 103 | @echo 104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 
105 | 106 | latex: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo 109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 111 | "(use \`make latexpdf' here to do that automatically)." 112 | 113 | latexpdf: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo "Running LaTeX files through pdflatex..." 116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 118 | 119 | latexpdfja: 120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 121 | @echo "Running LaTeX files through platex and dvipdfmx..." 122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 124 | 125 | text: 126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 127 | @echo 128 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 129 | 130 | man: 131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 132 | @echo 133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 134 | 135 | texinfo: 136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 137 | @echo 138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 139 | @echo "Run \`make' in that directory to run these through makeinfo" \ 140 | "(use \`make info' here to do that automatically)." 141 | 142 | info: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo "Running Texinfo files through makeinfo..." 145 | make -C $(BUILDDIR)/texinfo info 146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 147 | 148 | gettext: 149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 150 | @echo 151 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 152 | 153 | changes: 154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 155 | @echo 156 | @echo "The overview file is in $(BUILDDIR)/changes." 157 | 158 | linkcheck: 159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 160 | @echo 161 | @echo "Link check complete; look for any errors in the above output " \ 162 | "or in $(BUILDDIR)/linkcheck/output.txt." 163 | 164 | doctest: 165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 166 | @echo "Testing of doctests in the sources finished, look at the " \ 167 | "results in $(BUILDDIR)/doctest/output.txt." 168 | 169 | xml: 170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 171 | @echo 172 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 173 | 174 | pseudoxml: 175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 176 | @echo 177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 178 | -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # faststat documentation build configuration file, created by 4 | # sphinx-quickstart on Mon Apr 14 00:45:48 2014. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 
14 | 15 | import sys 16 | import os 17 | 18 | # If extensions (or modules to document with autodoc) are in another directory, 19 | # add these directories to sys.path here. If the directory is relative to the 20 | # documentation root, use os.path.abspath to make it absolute, like shown here. 21 | #sys.path.insert(0, os.path.abspath('.')) 22 | 23 | # -- General configuration ------------------------------------------------ 24 | 25 | # If your documentation needs a minimal Sphinx version, state it here. 26 | #needs_sphinx = '1.0' 27 | 28 | # Add any Sphinx extension module names here, as strings. They can be 29 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 30 | # ones. 31 | extensions = [ 32 | 'sphinx.ext.autodoc', 33 | 'sphinx.ext.intersphinx', 34 | 'sphinx.ext.mathjax', 35 | 'sphinx.ext.viewcode', 36 | ] 37 | 38 | # Add any paths that contain templates here, relative to this directory. 39 | templates_path = ['_templates'] 40 | 41 | # The suffix of source filenames. 42 | source_suffix = '.rst' 43 | 44 | # The encoding of source files. 45 | #source_encoding = 'utf-8-sig' 46 | 47 | # The master toctree document. 48 | master_doc = 'index' 49 | 50 | # General information about the project. 51 | project = u'faststat' 52 | copyright = u'2014, Kurt Rose' 53 | 54 | # The version info for the project you're documenting, acts as replacement for 55 | # |version| and |release|, also used in various other places throughout the 56 | # built documents. 57 | # 58 | # The short X.Y version. 59 | version = '0.7' 60 | # The full version, including alpha/beta/rc tags. 61 | release = '0.7' 62 | 63 | # The language for content autogenerated by Sphinx. Refer to documentation 64 | # for a list of supported languages. 65 | #language = None 66 | 67 | # There are two options for replacing |today|: either, you set today to some 68 | # non-false value, then it is used: 69 | #today = '' 70 | # Else, today_fmt is used as the format for a strftime call. 71 | #today_fmt = '%B %d, %Y' 72 | 73 | # List of patterns, relative to source directory, that match files and 74 | # directories to ignore when looking for source files. 75 | exclude_patterns = ['_build'] 76 | 77 | # The reST default role (used for this markup: `text`) to use for all 78 | # documents. 79 | #default_role = None 80 | 81 | # If true, '()' will be appended to :func: etc. cross-reference text. 82 | #add_function_parentheses = True 83 | 84 | # If true, the current module name will be prepended to all description 85 | # unit titles (such as .. function::). 86 | #add_module_names = True 87 | 88 | # If true, sectionauthor and moduleauthor directives will be shown in the 89 | # output. They are ignored by default. 90 | #show_authors = False 91 | 92 | # The name of the Pygments (syntax highlighting) style to use. 93 | pygments_style = 'sphinx' 94 | 95 | # A list of ignored prefixes for module index sorting. 96 | #modindex_common_prefix = [] 97 | 98 | # If true, keep warnings as "system message" paragraphs in the built documents. 99 | #keep_warnings = False 100 | 101 | 102 | # -- Options for HTML output ---------------------------------------------- 103 | 104 | # The theme to use for HTML and HTML Help pages. See the documentation for 105 | # a list of builtin themes. 106 | html_theme = 'default' 107 | 108 | # Theme options are theme-specific and customize the look and feel of a theme 109 | # further. For a list of options available for each theme, see the 110 | # documentation. 
111 | #html_theme_options = {} 112 | 113 | # Add any paths that contain custom themes here, relative to this directory. 114 | #html_theme_path = [] 115 | 116 | # The name for this set of Sphinx documents. If None, it defaults to 117 | # " v documentation". 118 | #html_title = None 119 | 120 | # A shorter title for the navigation bar. Default is the same as html_title. 121 | #html_short_title = None 122 | 123 | # The name of an image file (relative to this directory) to place at the top 124 | # of the sidebar. 125 | #html_logo = None 126 | 127 | # The name of an image file (within the static path) to use as favicon of the 128 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 129 | # pixels large. 130 | #html_favicon = None 131 | 132 | # Add any paths that contain custom static files (such as style sheets) here, 133 | # relative to this directory. They are copied after the builtin static files, 134 | # so a file named "default.css" will overwrite the builtin "default.css". 135 | html_static_path = ['_static'] 136 | 137 | # Add any extra paths that contain custom files (such as robots.txt or 138 | # .htaccess) here, relative to this directory. These files are copied 139 | # directly to the root of the documentation. 140 | #html_extra_path = [] 141 | 142 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 143 | # using the given strftime format. 144 | #html_last_updated_fmt = '%b %d, %Y' 145 | 146 | # If true, SmartyPants will be used to convert quotes and dashes to 147 | # typographically correct entities. 148 | #html_use_smartypants = True 149 | 150 | # Custom sidebar templates, maps document names to template names. 151 | #html_sidebars = {} 152 | 153 | # Additional templates that should be rendered to pages, maps page names to 154 | # template names. 155 | #html_additional_pages = {} 156 | 157 | # If false, no module index is generated. 158 | #html_domain_indices = True 159 | 160 | # If false, no index is generated. 161 | #html_use_index = True 162 | 163 | # If true, the index is split into individual pages for each letter. 164 | #html_split_index = False 165 | 166 | # If true, links to the reST sources are added to the pages. 167 | #html_show_sourcelink = True 168 | 169 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 170 | #html_show_sphinx = True 171 | 172 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 173 | #html_show_copyright = True 174 | 175 | # If true, an OpenSearch description file will be output, and all pages will 176 | # contain a tag referring to it. The value of this option must be the 177 | # base URL from which the finished HTML is served. 178 | #html_use_opensearch = '' 179 | 180 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 181 | #html_file_suffix = None 182 | 183 | # Output file base name for HTML help builder. 184 | htmlhelp_basename = 'faststatdoc' 185 | 186 | 187 | # -- Options for LaTeX output --------------------------------------------- 188 | 189 | latex_elements = { 190 | # The paper size ('letterpaper' or 'a4paper'). 191 | #'papersize': 'letterpaper', 192 | 193 | # The font size ('10pt', '11pt' or '12pt'). 194 | #'pointsize': '10pt', 195 | 196 | # Additional stuff for the LaTeX preamble. 197 | #'preamble': '', 198 | } 199 | 200 | # Grouping the document tree into LaTeX files. List of tuples 201 | # (source start file, target name, title, 202 | # author, documentclass [howto, manual, or own class]). 
latex_documents = [ 204 | ('index', 'faststat.tex', u'faststat Documentation', 205 | u'Kurt Rose', 'manual'), 206 | ] 207 | 208 | # The name of an image file (relative to this directory) to place at the top of 209 | # the title page. 210 | #latex_logo = None 211 | 212 | # For "manual" documents, if this is true, then toplevel headings are parts, 213 | # not chapters. 214 | #latex_use_parts = False 215 | 216 | # If true, show page references after internal links. 217 | #latex_show_pagerefs = False 218 | 219 | # If true, show URL addresses after external links. 220 | #latex_show_urls = False 221 | 222 | # Documents to append as an appendix to all manuals. 223 | #latex_appendices = [] 224 | 225 | # If false, no module index is generated. 226 | #latex_domain_indices = True 227 | 228 | 229 | # -- Options for manual page output --------------------------------------- 230 | 231 | # One entry per manual page. List of tuples 232 | # (source start file, name, description, authors, manual section). 233 | man_pages = [ 234 | ('index', 'faststat', u'faststat Documentation', 235 | [u'Kurt Rose'], 1) 236 | ] 237 | 238 | # If true, show URL addresses after external links. 239 | #man_show_urls = False 240 | 241 | 242 | # -- Options for Texinfo output ------------------------------------------- 243 | 244 | # Grouping the document tree into Texinfo files. List of tuples 245 | # (source start file, target name, title, author, 246 | # dir menu entry, description, category) 247 | texinfo_documents = [ 248 | ('index', 'faststat', u'faststat Documentation', 249 | u'Kurt Rose', 'faststat', 'One line description of project.', 250 | 'Miscellaneous'), 251 | ] 252 | 253 | # Documents to append as an appendix to all manuals. 254 | #texinfo_appendices = [] 255 | 256 | # If false, no module index is generated. 257 | #texinfo_domain_indices = True 258 | 259 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 260 | #texinfo_show_urls = 'footnote' 261 | 262 | # If true, do not generate a @detailmenu in the "Top" node's menu. 263 | #texinfo_no_detailmenu = False 264 | 265 | 266 | # Example configuration for intersphinx: refer to the Python standard library. 267 | intersphinx_mapping = {'http://docs.python.org/': None} 268 | 269 | import sys 270 | import os.path 271 | sys.path = [os.path.dirname(os.path.dirname(os.path.abspath(__file__)))] + sys.path 272 | 273 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | faststat 2 | ======== 3 | 4 | faststat is a *streaming*, *light-weight* statistics library designed for embedding 5 | in other Python applications. *Streaming* means 6 | that faststat operates on data points as they arrive, without needing to store 7 | previous data points. *Light-weight* means that statistics do not take up a great 8 | deal of CPU or RAM. Adding a data point to a stat object is a 0.5 - 3 microsecond operation. 9 | Each stat object takes about 4kiB of memory. 10 | 11 | .. toctree:: 12 | :maxdepth: 2 13 | 14 | Basic usage 15 | ----------- 16 | The most basic usage of faststat is to create a Stats object to represent some continuous 17 | variable, and then add data points to it. 18 | 19 | :: 20 | 21 | >>> import faststat 22 | >>> s = faststat.Stats() 23 | >>> for i in range(100): 24 | ... s.add(i) 25 | ... 26 | >>> s 27 | 28 | 29 | Collected data 30 | -------------- 31 | The following data is collected for each point.
A data point is considered to be `(x, t)` 32 | where `x` is a floating point value, and `t` is the system clock at the time the data was passed 33 | to faststat. 34 | 35 | =============== ==================================================================================== 36 | attribute description 37 | =============== ==================================================================================== 38 | n The number of data points. 39 | 40 | mean The arithmetic mean, also known as expected value E(x) or 41 | :math:`\bar{x}`. Defined as :math:`\bar{x} = \frac{x_1 + x_2 + ... + x_n}{n}` 42 | 43 | max The largest value seen. 44 | 45 | maxtime The time of the largest data point. 46 | 47 | min The smallest value seen. 48 | 49 | mintime The time of the smallest data point. 50 | 51 | lasttime The time of the most recent data point. 52 | 53 | percentiles A dictionary of approximate percentiles. 54 | 55 | buckets Counts of data points which have occurred in different ranges. 56 | Essentially logarithmic-scale histogram data. 57 | 58 | variance The variance. See http://en.wikipedia.org/wiki/Variance 59 | 60 | skewness The skewness. See http://en.wikipedia.org/wiki/Skewness 61 | 62 | kurtosis The kurtosis. See http://en.wikipedia.org/wiki/Kurtosis 63 | 64 | geometric_mean The geometric mean, or NaN if any points are <= 0. 65 | See http://en.wikipedia.org/wiki/Geometric_mean 66 | 67 | harmonic_mean The harmonic mean, or NaN if any points are <= 0. 68 | See http://en.wikipedia.org/wiki/Harmonic_mean 69 | 70 | expo_avgs A dictionary mapping exponential decay factors to current values. 71 | See http://en.wikipedia.org/wiki/Moving_average#Exponential_moving_average 72 | 73 | get_prev() The most recent data points. 74 | 75 | get_topN() The largest data points. 76 | 77 | window_avg The mean of the data points in get_prev(). 78 | =============== ==================================================================================== 79 | 80 | 81 | Error 82 | ----- 83 | 84 | There is no proven error bound on the algorithm used to calculate percentiles. However, empirically 85 | the error is observed to be low. It is worth noting that the percentile algorithm used (P2) performs 86 | interpolation of values. Therefore, for a sequence consisting of ~50% 1's and ~50% 2's, the algorithm 87 | would report a median around 1.5. 88 | 89 | 90 | Examples 91 | -------- 92 | 93 | Tracking the average number of items present each time a new item was added. 94 | 95 | .. code-block:: python 96 | 97 | import collections 98 | import faststat 99 | 100 | class Queue(object): 101 | def __init__(self): 102 | self.deq = collections.deque() 103 | self.put_stats = faststat.Stats() 104 | 105 | def put(self, item): 106 | self.put_stats.add(len(self.deq)) 107 | self.deq.append(item) 108 | 109 | def get(self): 110 | return self.deq.popleft() 111 | 112 | -------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% .
11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | goto end 41 | ) 42 | 43 | if "%1" == "clean" ( 44 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 45 | del /q /s %BUILDDIR%\* 46 | goto end 47 | ) 48 | 49 | 50 | %SPHINXBUILD% 2> nul 51 | if errorlevel 9009 ( 52 | echo. 53 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 54 | echo.installed, then set the SPHINXBUILD environment variable to point 55 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 56 | echo.may add the Sphinx directory to PATH. 57 | echo. 58 | echo.If you don't have Sphinx installed, grab it from 59 | echo.http://sphinx-doc.org/ 60 | exit /b 1 61 | ) 62 | 63 | if "%1" == "html" ( 64 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 68 | goto end 69 | ) 70 | 71 | if "%1" == "dirhtml" ( 72 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 76 | goto end 77 | ) 78 | 79 | if "%1" == "singlehtml" ( 80 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 84 | goto end 85 | ) 86 | 87 | if "%1" == "pickle" ( 88 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can process the pickle files. 92 | goto end 93 | ) 94 | 95 | if "%1" == "json" ( 96 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 97 | if errorlevel 1 exit /b 1 98 | echo. 99 | echo.Build finished; now you can process the JSON files. 100 | goto end 101 | ) 102 | 103 | if "%1" == "htmlhelp" ( 104 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 105 | if errorlevel 1 exit /b 1 106 | echo. 107 | echo.Build finished; now you can run HTML Help Workshop with the ^ 108 | .hhp project file in %BUILDDIR%/htmlhelp. 109 | goto end 110 | ) 111 | 112 | if "%1" == "qthelp" ( 113 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 114 | if errorlevel 1 exit /b 1 115 | echo. 
116 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 117 | .qhcp project file in %BUILDDIR%/qthelp, like this: 118 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\faststat.qhcp 119 | echo.To view the help file: 120 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\faststat.qhc 121 | goto end 122 | ) 123 | 124 | if "%1" == "devhelp" ( 125 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished. 129 | goto end 130 | ) 131 | 132 | if "%1" == "epub" ( 133 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 137 | goto end 138 | ) 139 | 140 | if "%1" == "latex" ( 141 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 145 | goto end 146 | ) 147 | 148 | if "%1" == "latexpdf" ( 149 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 150 | cd %BUILDDIR%/latex 151 | make all-pdf 152 | cd %BUILDDIR%/.. 153 | echo. 154 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 155 | goto end 156 | ) 157 | 158 | if "%1" == "latexpdfja" ( 159 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 160 | cd %BUILDDIR%/latex 161 | make all-pdf-ja 162 | cd %BUILDDIR%/.. 163 | echo. 164 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 165 | goto end 166 | ) 167 | 168 | if "%1" == "text" ( 169 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 170 | if errorlevel 1 exit /b 1 171 | echo. 172 | echo.Build finished. The text files are in %BUILDDIR%/text. 173 | goto end 174 | ) 175 | 176 | if "%1" == "man" ( 177 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 178 | if errorlevel 1 exit /b 1 179 | echo. 180 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 181 | goto end 182 | ) 183 | 184 | if "%1" == "texinfo" ( 185 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 186 | if errorlevel 1 exit /b 1 187 | echo. 188 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 189 | goto end 190 | ) 191 | 192 | if "%1" == "gettext" ( 193 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 194 | if errorlevel 1 exit /b 1 195 | echo. 196 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 197 | goto end 198 | ) 199 | 200 | if "%1" == "changes" ( 201 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 202 | if errorlevel 1 exit /b 1 203 | echo. 204 | echo.The overview file is in %BUILDDIR%/changes. 205 | goto end 206 | ) 207 | 208 | if "%1" == "linkcheck" ( 209 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 210 | if errorlevel 1 exit /b 1 211 | echo. 212 | echo.Link check complete; look for any errors in the above output ^ 213 | or in %BUILDDIR%/linkcheck/output.txt. 214 | goto end 215 | ) 216 | 217 | if "%1" == "doctest" ( 218 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 219 | if errorlevel 1 exit /b 1 220 | echo. 221 | echo.Testing of doctests in the sources finished, look at the ^ 222 | results in %BUILDDIR%/doctest/output.txt. 223 | goto end 224 | ) 225 | 226 | if "%1" == "xml" ( 227 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 228 | if errorlevel 1 exit /b 1 229 | echo. 230 | echo.Build finished. The XML files are in %BUILDDIR%/xml.
231 | goto end 232 | ) 233 | 234 | if "%1" == "pseudoxml" ( 235 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 236 | if errorlevel 1 exit /b 1 237 | echo. 238 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 239 | goto end 240 | ) 241 | 242 | :end 243 | -------------------------------------------------------------------------------- /faststat/__init__.py: -------------------------------------------------------------------------------- 1 | from faststat import * 2 | from format import * 3 | -------------------------------------------------------------------------------- /faststat/_faststat.c: -------------------------------------------------------------------------------- 1 | #include <Python.h> 2 | #include <structmember.h> 3 | #include <math.h> 4 | #include <string.h> 5 | #include <stdio.h> 6 | 7 | //define _nanotime() which returns integral nanoseconds since epoch 8 | //as a 64 bit integer for a variety of platforms 9 | #ifdef _WIN32 10 | //windows has its own brand of time keeping functions 11 | #include <windows.h> 12 | #define DELTA_EPOCH_IN_SECS 11644473600ULL 13 | //difference between Jan 1, 1601 and Jan 1, 1970 (unix epoch) 14 | 15 | static unsigned long long _nanotime(void) { 16 | FILETIME ft; 17 | ULARGE_INTEGER result; 18 | GetSystemTimeAsFileTime(&ft); //returns time in 100ns intervals since Jan 1, 1601 19 | result.HighPart = ft.dwHighDateTime; 20 | result.LowPart = ft.dwLowDateTime; 21 | result.QuadPart -= DELTA_EPOCH_IN_SECS * 10000000ULL; // 1000 (ms) * 1000 (us) * 10 (100ns) 22 | return result.QuadPart * 100; 23 | } 24 | 25 | // for old versions of MSVC which do not include NAN macro 26 | #ifndef NAN 27 | static const unsigned long __nan[2] = {0xffffffff, 0x7fffffff}; 28 | #define NAN (*(const float *) __nan) 29 | #endif 30 | 31 | #elif defined linux 32 | //linux has clock_gettime(CLOCK_REALTIME) which is ns since epoch -- perfect 33 | #include <time.h> 34 | 35 | static unsigned long long _nanotime(void) { 36 | struct timespec ts; 37 | if(clock_gettime(CLOCK_REALTIME, &ts) == -1) { 38 | return 0; 39 | } 40 | return 1000000000ULL * ts.tv_sec + ts.tv_nsec; 41 | } 42 | 43 | #else 44 | //for those oddballs like OSX and BSD, fall back on gettimeofday() which is at least microseconds 45 | #include <sys/time.h> 46 | 47 | static unsigned long long _nanotime(void) { 48 | struct timeval tv; 49 | if(gettimeofday(&tv, NULL) == -1) { 50 | return 0; 51 | } 52 | return tv.tv_sec * 1000000000ULL + tv.tv_usec * 1000ULL; 53 | } 54 | 55 | #endif 56 | 57 | static unsigned long long nanotime_override = 0; 58 | 59 | static unsigned long long nanotime(void) { 60 | if(nanotime_override) { 61 | return nanotime_override; 62 | } 63 | return _nanotime(); 64 | } 65 | 66 | //percentile point for usage in P2 algorithm 67 | typedef struct { 68 | unsigned short percentile; //divide by 0x10000 to get a float between 0 and 1 69 | double val; //estimate of current percentile value 70 | unsigned int n; //estimate of how many values were less than this 71 | } faststat_P2Percentile; 72 | 73 | 74 | typedef struct { 75 | float max; 76 | unsigned int count; 77 | } faststat_Bucket; 78 | 79 | 80 | // keeping this size a power of 2 makes pointer arithmetic in heap more efficient 81 | typedef struct { 82 | double value; 83 | unsigned long long nanotime; 84 | } faststat_DataPoint; 85 | 86 | // represents an exponential moving average 87 | // aka a low pass filter, infinite impulse response filter 88 | typedef struct { 89 | double val; 90 | double alpha; 91 | } faststat_ExpoAvg; 92 | 93 | 94 | // represents a count for a given interval, 95 | // aligned on unix epoch 96 |
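// how the window counts work: each faststat_WindowCount (below) is a circular
// buffer of counters; a timestamp t selects slot
// (t / window_size_nanosecs) & (num_windows - 1), and since num_windows is a
// power of 2 the modulo reduces to a cheap bitwise AND. stale slots are lazily
// re-zeroed in _rezero_window_counts() before the current slot is incremented
// in _update_window_counts().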
typedef struct { 97 | unsigned short num_windows; // number of counts -- MUST BE A POWER OF 2 98 | unsigned long long window_size_nanosecs; // size of each window in nanoseconds 99 | unsigned int *counts; // counts for the previous num_windows intervals 100 | } faststat_WindowCount; 101 | 102 | 103 | // for representing a normally distributed variable 104 | typedef struct faststat_Stats_struct { 105 | PyObject_HEAD 106 | unsigned long long n; 107 | double mean, min, max, m2, m3, m4; 108 | double sum_of_logs, sum_of_inv; // for geometric and harmonic mean 109 | unsigned long long mintime, maxtime, lasttime; 110 | unsigned int num_percentiles; 111 | faststat_P2Percentile *percentiles; 112 | unsigned int num_buckets; 113 | faststat_Bucket *buckets; // last bucket MUST BE +inf 114 | unsigned int num_expo_avgs; 115 | faststat_ExpoAvg *expo_avgs; 116 | double window_avg; 117 | unsigned int num_prev; // MUST BE A POWER OF 2! 118 | faststat_DataPoint *lastN; 119 | unsigned int num_top; // MUST BE A POWER OF 2! 120 | faststat_DataPoint *topN; 121 | unsigned int num_window_counts; 122 | //window counts must be sorted by window_size, to 123 | //make handling code cleaner/smaller 124 | faststat_WindowCount *window_counts; 125 | struct faststat_Stats_struct *interval; 126 | } faststat_Stats; 127 | 128 | /* 129 | typedef struct { 130 | unsigned int n; 131 | unsigned int num_prev; 132 | 133 | } faststat_StatsGroup; 134 | */ 135 | 136 | char* NEW_ARGS[] = {"buckets", "lastN", "percentiles", "interval", "expo_avgs", 137 | "window_counts", "num_top", NULL}; 138 | 139 | 140 | static PyObject* faststat_Stats_new(PyTypeObject *type, PyObject *args, PyObject *kwds) { 141 | faststat_Stats *self; 142 | PyObject *buckets, *percentiles, *interval, *expo_avgs, *window_counts, *cur; 143 | int num_prev, num_buckets, num_percentiles, num_expo_avgs, num_window_counts, num_top; 144 | int i, total, offset; 145 | double temp; 146 | if(!PyArg_ParseTupleAndKeywords(args, kwds, "OiOOOOi", NEW_ARGS, 147 | &buckets, &num_prev, &percentiles, &interval, &expo_avgs, &window_counts, &num_top)) { 148 | return NULL; 149 | } 150 | 151 | buckets = PySequence_Fast(buckets, "expected a sequence"); 152 | percentiles = PySequence_Fast(percentiles, "expected a sequence"); 153 | expo_avgs = PySequence_Fast(expo_avgs, "expected a sequence"); 154 | window_counts = PySequence_Fast(window_counts, "expected a sequence"); 155 | if(!buckets || !percentiles || !expo_avgs || !window_counts) { 156 | // TODO: decref on buckets and percentiles 157 | return NULL; 158 | } 159 | num_buckets = (int)PySequence_Fast_GET_SIZE(buckets); 160 | num_percentiles = (int)PySequence_Fast_GET_SIZE(percentiles); 161 | num_expo_avgs = (int)PySequence_Fast_GET_SIZE(expo_avgs); 162 | num_window_counts = (int)PySequence_Fast_GET_SIZE(window_counts); 163 | 164 | self = (faststat_Stats*)type->tp_alloc(type, 0); 165 | if(self != NULL) { 166 | self->interval = NULL; 167 | self->n = 0; 168 | self->mean = self->m2 = self->m3 = self->m4 = self->min = self->max = 0; 169 | self->sum_of_logs = self->sum_of_inv = 0; 170 | self->mintime = self->maxtime = self->lasttime = 0; 171 | self->num_percentiles = num_percentiles; 172 | if(interval != Py_None ) { 173 | self->interval = (faststat_Stats*)interval; // WARNING: incompatible pointer type..
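// (when provided, the interval Stats object is fed the nanosecond gap between
// successive add() calls -- see faststat_Stats_add -- so it ends up describing
// the arrival rate of this stat's data points)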
174 | } else { // TODO: figure out a better test of type here 175 | self->interval = NULL; 176 | } 177 | if(num_percentiles) { 178 | self->percentiles = PyMem_New(faststat_P2Percentile, num_percentiles); 179 | for(i=0; i < num_percentiles; i++) { 180 | temp = PyFloat_AsDouble(PySequence_Fast_GET_ITEM(percentiles, i)); 181 | self->percentiles[i].percentile = (unsigned short)(temp * 0x10000); 182 | self->percentiles[i].val = 0; 183 | self->percentiles[i].n = i + 1; 184 | } 185 | } else { 186 | self->percentiles = NULL; 187 | } 188 | self->num_buckets = num_buckets; 189 | if(num_buckets) { 190 | self->buckets = PyMem_New(faststat_Bucket, num_buckets); 191 | for(i=0; i < num_buckets; i++) { 192 | self->buckets[i].count = 0; 193 | self->buckets[i].max = (float)PyFloat_AsDouble(PySequence_Fast_GET_ITEM(buckets, i)); 194 | // don't bother checking for error; let it raise later 195 | } 196 | } else { 197 | self->buckets = NULL; 198 | } 199 | self->num_expo_avgs = num_expo_avgs; 200 | if(num_expo_avgs) { 201 | self->expo_avgs = PyMem_New(faststat_ExpoAvg, num_expo_avgs); 202 | for(i=0; i < num_expo_avgs; i++) { 203 | self->expo_avgs[i].val = 0; 204 | self->expo_avgs[i].alpha = (double)PyFloat_AsDouble(PySequence_Fast_GET_ITEM(expo_avgs, i)); 205 | } 206 | } else { 207 | self->expo_avgs = NULL; 208 | } 209 | self->num_prev = num_prev; 210 | if(num_prev) { 211 | self->lastN = PyMem_New(faststat_DataPoint, num_prev); 212 | for(i=0; i < num_prev; i++) { 213 | self->lastN[i].value = 0; 214 | self->lastN[i].nanotime = 0; 215 | } 216 | } else { 217 | self->lastN = NULL; 218 | } 219 | if(num_top == 0) { 220 | num_top = 1; 221 | } 222 | self->num_top = num_top; 223 | self->topN = PyMem_New(faststat_DataPoint, num_top); 224 | memset(self->topN, 0, sizeof(faststat_DataPoint) * num_top); 225 | self->topN -= 1; //use 1 based indexing 226 | 227 | self->num_window_counts = num_window_counts; 228 | if(num_window_counts) { 229 | self->window_counts = PyMem_New(faststat_WindowCount, num_window_counts); 230 | PyList_Sort(window_counts); 231 | total = 0; 232 | for(i=0; i < num_window_counts; i++) { 233 | cur = PySequence_Fast_GET_ITEM(window_counts, i); 234 | // each entry is a (num_windows, window_size_nanosecs) pair 235 | PyArg_ParseTuple( 236 | cur, 237 | "HK", 238 | &(self->window_counts[i].num_windows), 239 | &(self->window_counts[i].window_size_nanosecs)); 240 | total += self->window_counts[i].num_windows; 241 | } 242 | // allocate all of the window counts as one contiguous block 243 | self->window_counts[0].counts = PyMem_New(unsigned int, total); 244 | memset(self->window_counts[0].counts, 0, sizeof(unsigned int) * total); 245 | offset = self->window_counts[0].num_windows; 246 | for(i=1; i < num_window_counts; i++) { 247 | self->window_counts[i].counts = self->window_counts[0].counts + offset; 248 | offset += self->window_counts[i].num_windows; 249 | } 250 | } else { 251 | self->window_counts = NULL; 252 | } 253 | } 254 | 255 | if(PyErr_Occurred()) { 256 | Py_DECREF(self); 257 | return NULL; 258 | } 259 | 260 | return (PyObject*) self; 261 | } 262 | 263 | 264 | static void faststat_Stats_dealloc(faststat_Stats* self) { 265 | if(self->percentiles) { 266 | PyMem_Del(self->percentiles); 267 | } 268 | if(self->buckets) { 269 | PyMem_Del(self->buckets); 270 | } 271 | if(self->expo_avgs) { 272 | PyMem_Del(self->expo_avgs); 273 | } 274 | if(self->lastN) { 275 | PyMem_Del(self->lastN); 276 | } 277 | if(self->topN) { 278 | PyMem_Del(self->topN + 1); // undo 1-based indexing 279 | } 280 | if(self->window_counts) { 281 | // see constructor; all window_counts are allocated as one chunk 282 | PyMem_Del(self->window_counts[0].counts); 283 | PyMem_Del(self->window_counts); 284 | } 285 | self->ob_type->tp_free((PyObject*)self); 286 | } 287 | 288 | #define STR_VAL(arg) #arg 289 | #define DBL_MEMBER(name, description) {STR_VAL(name), T_DOUBLE, offsetof(faststat_Stats, name), 0, description} 290 | #define DBL_MEMBER1(name) {STR_VAL(name), T_DOUBLE, offsetof(faststat_Stats, name), 0,
STR_VAL(name)} 291 | #define MEMBER(name, type, description) {STR_VAL(name), type, offsetof(faststat_Stats, name), 0, description} 292 | static PyMemberDef faststat_Stats_members[] = { 293 | MEMBER(n, T_ULONGLONG, "number of points"), 294 | DBL_MEMBER1(mean), DBL_MEMBER1(min), DBL_MEMBER1(max), 295 | DBL_MEMBER(sum_of_logs, "sum of logs of values, for geometric mean; NaN if undefined"), 296 | DBL_MEMBER(sum_of_inv, "sum of inverses of values, for harmonic mean; NaN if undefined"), 297 | MEMBER(lasttime, T_ULONGLONG, "time (in nanoseconds since epoch) of last point"), 298 | MEMBER(mintime, T_ULONGLONG, "time (in nanoseconds since epoch) of min value"), 299 | MEMBER(maxtime, T_ULONGLONG, "time (in nanoseconds since epoch) of max value"), 300 | DBL_MEMBER1(m2), DBL_MEMBER1(m3), DBL_MEMBER1(m4), 301 | MEMBER(interval, T_OBJECT, "another Stat object which measures the time interval between data points"), 302 | DBL_MEMBER(window_avg, "average of stored most recent data points"), 303 | MEMBER(num_prev, T_UINT, "number of most recent data points stored (accessible via get_prev() )"), 304 | {NULL} 305 | }; 306 | #undef MEMBER 307 | #undef DBL_MEMBER 308 | #undef DBL_MEMBER1 309 | #undef STR_VAL 310 | 311 | 312 | //update mean, and second third and fourth moments 313 | static void _update_moments(faststat_Stats *self, double x) { 314 | double n, delta, delta_n, delta_m2, delta_m3, delta_m4; 315 | n = (double)self->n; // note: math with 32 bit ints can cause problems 316 | //pre-compute a bunch of intermediate values 317 | delta = x - self->mean; 318 | delta_n = delta / n; 319 | delta_m2 = delta * delta_n * (n - 1); 320 | delta_m3 = delta_m2 * delta_n * (n - 2); 321 | delta_m4 = delta_m2 * delta_n * delta_n * (n * (n - 3) + 3); 322 | //compute updated values 323 | self->mean = self->mean + delta_n; 324 | //note: order matters here 325 | self->m4 += delta_m4 + delta_n * (6 * delta_n * self->m2 - 4 * self->m3); 326 | self->m3 += delta_m3 + delta_n * 3 * self->m2; 327 | self->m2 += delta_m2; 328 | } 329 | 330 | 331 | //helper for _update_percentiles 332 | static void _p2_update_point(double l_v, double l_n, faststat_P2Percentile *cur, 333 | double r_v, double r_n, unsigned long long n) { 334 | int d; 335 | double percentile, new_val, c_v, c_n, diff; 336 | percentile = ((double)cur->percentile) / 0x10000; 337 | c_n = cur->n; 338 | diff = (n - 1) * percentile + 1 - c_n; 339 | // clamp d at +/- 1 340 | if(diff >= 1) { 341 | d = 1; 342 | } else if(diff <= -1) { 343 | d = -1; 344 | } else { 345 | return; 346 | } 347 | c_v = cur->val; 348 | if(l_n < c_n + d && c_n + d < r_n) { // try updating estimate with parabolic 349 | new_val = c_v + (d / (r_n - l_n)) * ( 350 | (c_n - l_n + d) * (r_v - c_v) / (r_n - c_n) + 351 | (r_n - c_n - d) * (c_v - l_v) / (c_n - l_n)); 352 | if(l_v >= new_val || r_v <= new_val) { // fall back on linear 353 | if(d == 1) { 354 | new_val = c_v + (r_v - c_v) / (r_n - c_n); 355 | } else { // d == -1 356 | new_val = c_v - (l_v - c_v) / (l_n - c_n); 357 | } 358 | } 359 | cur->val = new_val; 360 | cur->n += d; 361 | } 362 | } 363 | 364 | 365 | static void _insert_percentile_sorted(faststat_Stats *self, double x) { 366 | unsigned long long num, i; // prevent loss of precision compiler warning 367 | double tmp; 368 | num = self->n < self->num_percentiles ?
self->n : self->num_percentiles; 369 | for(i = 0; i < num-1; i++) { //insert in sorted order 370 | if(x < self->percentiles[i].val) { 371 | tmp = x; 372 | x = self->percentiles[i].val; 373 | self->percentiles[i].val = tmp; 374 | } 375 | } 376 | self->percentiles[num-1].val = x; 377 | } 378 | 379 | 380 | //update percentiles using the piecewise-parabolic (P2) algorithm 381 | static void _update_percentiles(faststat_Stats *self, double x) { 382 | unsigned int i; 383 | //double percentile; //TODO: remove me 384 | faststat_P2Percentile *right, *left, *cur, *prev, *nxt; 385 | right = &(self->percentiles[self->num_percentiles-1]); 386 | left = &(self->percentiles[0]); 387 | if(!(right->n < self->n) ) { // just insert until self->n > self->num_percentiles 388 | _insert_percentile_sorted(self, x); 389 | return; 390 | } 391 | //right-most is stopping case; handle first 392 | if(x < right->val && right->n + 1 < self->n) { 393 | right->n++; 394 | } 395 | //handle the rest of the points 396 | prev = right; 397 | for(i = self->num_percentiles-2; ; i--) { 398 | cur = &(self->percentiles[i]); 399 | if(x < cur->val && cur->n + 1 < prev->n) { 400 | cur->n++; 401 | } 402 | prev = cur; 403 | if(i == 0) { //making i unsigned fixes some warnings 404 | break; 405 | } 406 | } 407 | //left-most point is a special case 408 | nxt = &(self->percentiles[1]); 409 | _p2_update_point(self->min, 0, left, nxt->val, nxt->n, self->n); 410 | cur = left; 411 | for(i=1; i < self->num_percentiles - 1; i++) { 412 | prev = cur; 413 | cur = nxt; 414 | nxt = &(self->percentiles[i+1]); 415 | _p2_update_point(prev->val, prev->n, cur, nxt->val, nxt->n, self->n); 416 | } 417 | _p2_update_point(cur->val, cur->n, right, (double)self->max, (double)self->n, self->n); 418 | } 419 | 420 | // be careful; if-condition must properly terminate when max == +inf, even for nan and +inf 421 | #define OFFSET(n) if(!(x >= self->buckets[i+n].max)) { self->buckets[i+n].count++; break; } 422 | 423 | static void _update_buckets(faststat_Stats *self, double x) { 424 | unsigned int i; 425 | for(i=0; ; i += 16) { 426 | OFFSET( 0) OFFSET( 1) OFFSET( 2) OFFSET( 3) 427 | OFFSET( 4) OFFSET( 5) OFFSET( 6) OFFSET( 7) 428 | OFFSET( 8) OFFSET( 9) OFFSET(10) OFFSET(11) 429 | OFFSET(12) OFFSET(13) OFFSET(14) OFFSET(15) 430 | } 431 | } 432 | 433 | #undef OFFSET 434 | 435 | static void _update_lastN(faststat_Stats *self, double x) { 436 | unsigned int offset; 437 | if(self->num_prev == 0) { return; } 438 | offset = (self->n - 1) & (self->num_prev - 1); 439 | self->window_avg -= self->lastN[offset].value / (1.0 * self->num_prev); 440 | self->window_avg += x / (1.0 * self->num_prev); 441 | self->lastN[offset].value = x; 442 | self->lastN[offset].nanotime = self->lasttime; 443 | } 444 | 445 | 446 | //this algorithm deterministically favors storing newer values over older values 447 | static void _update_topN(faststat_Stats *self, double x, unsigned long long t) { 448 | faststat_DataPoint *cur, *left, *right, *end, *min, *topN; 449 | unsigned int cur_i, left_i, right_i; 450 | // uses one based indexing to save additions when navigating down heap 451 | topN = self->topN; 452 | if(x < topN[1].value) { 453 | return; 454 | } 455 | // replace the smallest element of the topN with the new point 456 | topN[1].value = x; 457 | topN[1].nanotime = t; 458 | // restore the heap condition 459 | cur = topN + 1; //use pointers instead of array indices 460 | cur_i = 1; 461 | left = topN + 2; 462 | left_i = 2; 463 | right = topN + 3; 464 | right_i = 3; 465 | end = topN + 1 + self->num_top; 466 |
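// sift the replaced root down to restore the min-heap invariant: at each
// level, pick the smaller child (value ties broken by nanotime) and swap
// until the new point settles. with 1-based indexing, node i's children are
// at 2i and 2i+1, hence left_i = cur_i * 2 and right_i = left_i + 1 below.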
while(right < end) { 467 | if(left->value == right->value) { 468 | min = left->nanotime > right->nanotime ? left : right; 469 | } else { 470 | min = left->value < right->value ? left : right; 471 | } 472 | if(cur->value < min->value) { 473 | break; 474 | } 475 | // swap cur with min of left, right 476 | x = min->value; 477 | t = min->nanotime; 478 | min->value = cur->value; 479 | min->nanotime = cur->nanotime; 480 | cur->value = x; 481 | cur->nanotime = t; 482 | // set up for the next layer of the heap 483 | cur = min; 484 | cur_i = min == left ? left_i : right_i; 485 | left_i = cur_i * 2; 486 | right_i = left_i + 1; 487 | left = topN + left_i; 488 | right = topN + right_i; 489 | } 490 | } 491 | 492 | 493 | static void _update_expo_avgs(faststat_Stats *self, double x) { 494 | unsigned int i; 495 | double val, alpha; 496 | for(i = 0; i < self->num_expo_avgs; i++) { 497 | val = self->expo_avgs[i].val; 498 | alpha = self->expo_avgs[i].alpha; 499 | // this equation ensures no "gain" 500 | val = x * alpha + val * (1 - alpha); 501 | self->expo_avgs[i].val = val; 502 | } 503 | } 504 | 505 | // re-zero all of the windows which have been "missed" between self->lasttime and t 506 | static void _rezero_window_counts(faststat_Stats *self, unsigned long long t) { 507 | faststat_WindowCount *cur; 508 | unsigned int i; 509 | unsigned long long j, last_window, cur_window; 510 | for(i = 0; i < self->num_window_counts; i++) { 511 | cur = &(self->window_counts[i]); 512 | last_window = self->lasttime / cur->window_size_nanosecs; 513 | cur_window = t / cur->window_size_nanosecs; 514 | if(last_window == cur_window) { 515 | break; // because window_counts are sorted by window_size_nanosecs, 516 | } // the next window cannot miss unless the current one does 517 | if(cur_window - last_window >= cur->num_windows) { 518 | memset(cur->counts, 0, sizeof(*cur->counts) * cur->num_windows); 519 | continue; // if the entire array is getting zero'd, just use memset 520 | } 521 | // TODO: convert this to a memset instead of a loop (perhaps) 522 | for(j = last_window + 1; j <= cur_window; j++) { 523 | cur->counts[j & (cur->num_windows - 1)] = 0; 524 | } // zero out all the "missed" windows 525 | } 526 | } 527 | 528 | 529 | static void _update_window_counts(faststat_Stats *self, unsigned long long t) { 530 | faststat_WindowCount *cur; 531 | unsigned int i, cur_count; 532 | _rezero_window_counts(self, t); 533 | // step 2 -- increment current counts 534 | for(i = 0; i < self->num_window_counts; i++) { 535 | cur = &(self->window_counts[i]); 536 | // use the current time as the index into the circular array to save some memory 537 | cur_count = (t / cur->window_size_nanosecs) & (cur->num_windows - 1); 538 | ++(cur->counts[cur_count]); 539 | } 540 | } 541 | 542 | 543 | static void _add(faststat_Stats *self, double x, unsigned long long t) { 544 | //update extremely basic values: number, min, and max 545 | self->lasttime = t; 546 | self->n++; 547 | if(self->n == 1) { 548 | self->min = self->max = x; 549 | self->mintime = self->maxtime = self->lasttime; 550 | } 551 | if(x <= self->min) { 552 | self->mintime = self->lasttime; 553 | self->min = x; 554 | } 555 | if(x >= self->max) { 556 | self->maxtime = self->lasttime; 557 | self->max = x; 558 | } 559 | // TODO: any platforms not support the NAN macro? 560 | self->sum_of_logs += x > 0 ? log(x) : NAN; 561 | self->sum_of_inv += x > 0 ? 
1 / x : NAN; 562 | _update_moments(self, x); 563 | _update_percentiles(self, x); 564 | _update_buckets(self, x); 565 | _update_expo_avgs(self, x); 566 | _update_lastN(self, x); 567 | _update_window_counts(self, t); 568 | _update_topN(self, x, t); 569 | } 570 | 571 | 572 | static PyObject* faststat_Stats_add(faststat_Stats *self, PyObject *args) { 573 | //Visual Studio hates in-line variable declarations 574 | double x; 575 | unsigned long long t; 576 | x = 0; 577 | t = 0; 578 | if(PyArg_ParseTuple(args, "d", &x)) { 579 | t = nanotime(); 580 | if(self->interval && self->lasttime) { 581 | unsigned long long interval; 582 | interval = t - self->lasttime; 583 | if(interval == 0) { 584 | interval = 1; 585 | } 586 | // ensure interval is at least 1 nanosecond to not mess up 587 | // harmonic and geometric mean (1 ns is noise on this scale) 588 | _add(self->interval, (double)(interval), t); 589 | } 590 | _add(self, x, t); 591 | } 592 | if(PyErr_Occurred()) { return NULL; } 593 | Py_INCREF(Py_None); 594 | return Py_None; 595 | } 596 | 597 | 598 | static PyObject* faststat_Stats_end(faststat_Stats *self, PyObject *args) { 599 | unsigned long long end; 600 | unsigned long long start; 601 | end = start = 0; 602 | if(PyArg_ParseTuple(args, "K", &start)) { 603 | end = nanotime(); 604 | if(self->interval && self->lasttime) { 605 | _add(self->interval, (double)(end - self->lasttime), end); 606 | } 607 | _add(self, (double)(end - start), end); 608 | } 609 | if(PyErr_Occurred()) { return NULL; } 610 | Py_INCREF(Py_None); 611 | return Py_None; 612 | } 613 | 614 | 615 | static PyObject* faststat_Stats_tick(faststat_Stats *self, PyObject *args) { 616 | //tricky part is how to handle the first tick 617 | // weird part will be that calling tick N times results in N-1 data points 618 | unsigned long long t; 619 | t = nanotime(); 620 | if(self->lasttime) { 621 | _add(self, (double)(t - self->lasttime), t); 622 | } else { 623 | self->lasttime = t; 624 | } 625 | if(PyErr_Occurred()) { return NULL; } 626 | Py_INCREF(Py_None); 627 | return Py_None; 628 | } 629 | 630 | 631 | static PyObject* faststat_Stats_get_percentiles(faststat_Stats* self, PyObject *args) { 632 | PyObject *p_dict; 633 | faststat_P2Percentile *cur; 634 | double cur_val; 635 | unsigned int i; 636 | p_dict = PyDict_New(); 637 | for(i = 0; i < self->num_percentiles; i++) { 638 | cur = &(self->percentiles[i]); 639 | cur_val = ((double)cur->percentile) / 0x10000; 640 | cur_val = floor(10000 * cur_val + 0.5) / 10000; 641 | //re-round to handle slop from being 16 bit number 642 | // (note: windows math.h does not include round; use floor) 643 | PyDict_SetItem( 644 | p_dict, 645 | PyFloat_FromDouble(cur_val), 646 | PyFloat_FromDouble(cur->val)); 647 | } 648 | if(PyErr_Occurred()) { 649 | Py_DECREF(p_dict); 650 | return NULL; 651 | } 652 | return p_dict; 653 | } 654 | 655 | 656 | static PyObject* faststat_Stats_get_buckets(faststat_Stats* self, PyObject *args) { 657 | PyObject *b_dict; 658 | faststat_Bucket *cur; 659 | unsigned int i; 660 | unsigned long long leftover; 661 | leftover = self->n; 662 | b_dict = PyDict_New(); 663 | for(i = 0; i < self->num_buckets; i++) { 664 | cur = &(self->buckets[i]); 665 | leftover -= cur->count; 666 | PyDict_SetItem( 667 | b_dict, 668 | PyFloat_FromDouble(cur->max), 669 | PyLong_FromUnsignedLongLong(cur->count)); 670 | } 671 | PyDict_SetItem(b_dict, Py_None, PyLong_FromUnsignedLongLong(leftover)); 672 | if(PyErr_Occurred()) { 673 | Py_DECREF(b_dict); 674 | return NULL; 675 | } 676 | return b_dict; 677 | } 678 | 679 | 680
680 | static PyObject* faststat_Stats_get_expoavgs(faststat_Stats *self, PyObject *args) {
681 |     PyObject *b_dict;
682 |     faststat_ExpoAvg *cur;
683 |     unsigned int i;
684 |     b_dict = PyDict_New();
685 |     for(i = 0; i < self->num_expo_avgs; i++) {
686 |         cur = &(self->expo_avgs[i]);
687 |         PyDict_SetItem(b_dict,
688 |             PyFloat_FromDouble(cur->alpha),
689 |             PyFloat_FromDouble(cur->val));
690 |     }
691 |     return b_dict;
692 | }
693 | 
694 | 
695 | static PyObject* faststat_Stats_get_prev(faststat_Stats *self, PyObject *args) {
696 |     int offset;
697 |     double val;
698 |     PyObject *tuple, *pyval, *pytime;
699 |     unsigned long long nanotime;
700 |     if(self->num_prev == 0) {
701 |         Py_INCREF(Py_None);
702 |         return Py_None;
703 |     }
704 | 
705 |     offset = 0;
706 |     if(PyArg_ParseTuple(args, "i", &offset)) {
707 |         offset = ((self->n - 1) + (self->num_prev - offset)) & (self->num_prev - 1);
708 |         val = self->lastN[offset].value;
709 |         nanotime = self->lastN[offset].nanotime;
710 |         pyval = PyFloat_FromDouble(val);
711 |         pytime = PyLong_FromUnsignedLongLong(nanotime);
712 |         if(pyval != NULL && pytime != NULL) {
713 |             tuple = PyTuple_Pack(2, pytime, pyval);
714 |             if(tuple != NULL) {
715 |                 return tuple;
716 |             }
717 |         }
718 |     }
719 |     if(PyErr_Occurred()) { return NULL; }
720 |     Py_INCREF(Py_None);
721 |     return Py_None;
722 | }
723 | 
724 | 
725 | static PyObject* faststat_Stats_get_topN(faststat_Stats *self, PyObject *args) {
726 |     PyObject *ret;
727 |     unsigned int i;
728 |     ret = PyList_New(self->num_top);
729 |     for(i = 1; i < self->num_top + 1; i++) {
730 |         PyList_SetItem(ret, i - 1, Py_BuildValue(
731 |             "(dK)", self->topN[i].value, self->topN[i].nanotime));
732 |     }
733 |     if(PyErr_Occurred()) {
734 |         Py_DECREF(ret);
735 |         return NULL;
736 |     }
737 |     return ret;
738 | }
739 | 
740 | 
741 | static PyObject* faststat_Stats_get_window_counts(faststat_Stats *self, PyObject *args) {
742 |     unsigned long long t;
743 |     PyObject *window_count_dict, *cur_items;
744 |     faststat_WindowCount *cur;
745 |     unsigned long long i, j, cur_window;
746 |     t = nanotime();
747 |     _rezero_window_counts(self, t);
748 |     window_count_dict = PyDict_New();
749 |     for(i = 0; i < self->num_window_counts; i++) {
750 |         cur = self->window_counts + i;
751 |         cur_items = PyTuple_New(cur->num_windows);
752 |         cur_window = t / cur->window_size_nanosecs;
753 |         for(j = 0; j < cur->num_windows; j++) {
754 |             PyTuple_SetItem(
755 |                 cur_items, (Py_ssize_t)j,
756 |                 PyLong_FromUnsignedLong(
757 |                     cur->counts[(j + cur_window) & (cur->num_windows - 1)]));
758 |         }
759 |         PyDict_SetItem(
760 |             window_count_dict,
761 |             PyLong_FromUnsignedLongLong(cur->window_size_nanosecs),
762 |             cur_items);
763 |     }
764 |     if(PyErr_Occurred()) {
765 |         Py_DECREF(window_count_dict);
766 |         return NULL;
767 |     }
768 |     return window_count_dict;
769 | }
770 | 
771 | 
772 | static PyMethodDef faststat_Stats_methods[] = {
773 |     {"add", (PyCFunction)faststat_Stats_add, METH_VARARGS, "add a data point"},
774 |     {"end", (PyCFunction)faststat_Stats_end, METH_VARARGS,
775 |         "add a duration data point, whose start time is passed"},
776 |     {"tick", (PyCFunction)faststat_Stats_tick, METH_NOARGS,
777 |         "add an interval data point between now and the last tick"},
778 |     {"get_percentiles", (PyCFunction)faststat_Stats_get_percentiles, METH_NOARGS,
779 |         "construct percentiles dictionary"},
780 |     {"get_buckets", (PyCFunction)faststat_Stats_get_buckets, METH_NOARGS,
781 |         "construct buckets dictionary"},
782 |     {"get_expo_avgs", (PyCFunction)faststat_Stats_get_expoavgs, METH_NOARGS,
783 |         "get a dictionary of decay rates to previous averages"},
784 | {"get_prev", (PyCFunction)faststat_Stats_get_prev, METH_VARARGS, 785 | "get the nth previous sample"}, 786 | {"get_topN", (PyCFunction)faststat_Stats_get_topN, METH_NOARGS, 787 | "get the highest values"}, 788 | {"get_window_counts", (PyCFunction)faststat_Stats_get_window_counts, METH_NOARGS, 789 | "get a dictionary of window intervals to window counts"}, 790 | {NULL} 791 | }; 792 | 793 | 794 | static PyTypeObject faststat_StatsType = { 795 | PyObject_HEAD_INIT(NULL) 796 | 0, /*ob_size*/ 797 | "_faststat.Stats", /*tp_name*/ 798 | sizeof(faststat_Stats), /*tp_basicsize*/ 799 | 0, /*tp_itemsize*/ 800 | (destructor)faststat_Stats_dealloc, /*tp_dealloc*/ 801 | 0, /*tp_print*/ 802 | 0, /*tp_getattr*/ 803 | 0, /*tp_setattr*/ 804 | 0, /*tp_compare*/ 805 | 0, /*tp_repr*/ 806 | 0, /*tp_as_number*/ 807 | 0, /*tp_as_sequence*/ 808 | 0, /*tp_as_mapping*/ 809 | 0, /*tp_hash */ 810 | 0, /*tp_call*/ 811 | 0, /*tp_str*/ 812 | 0, /*tp_getattro*/ 813 | 0, /*tp_setattro*/ 814 | 0, /*tp_as_buffer*/ 815 | Py_TPFLAGS_DEFAULT, /*tp_flags*/ 816 | "online stats collector", /* tp_doc */ 817 | 0, /* tp_traverse */ 818 | 0, /* tp_clear */ 819 | 0, /* tp_richcompare */ 820 | 0, /* tp_weaklistoffset */ 821 | 0, /* tp_iter */ 822 | 0, /* tp_iternext */ 823 | faststat_Stats_methods, /* tp_methods */ 824 | faststat_Stats_members, /* tp_members */ 825 | 0, /* tp_getset */ 826 | 0, /* tp_base */ 827 | 0, /* tp_dict */ 828 | 0, /* tp_descr_get */ 829 | 0, /* tp_descr_set */ 830 | 0, /* tp_dictoffset */ 831 | 0, /* tp_init */ 832 | 0, /* tp_alloc */ 833 | faststat_Stats_new, /* tp_new */ 834 | }; 835 | 836 | 837 | static PyObject* pynanotime(PyObject *_) { 838 | PyObject *result; 839 | result = PyLong_FromUnsignedLongLong(nanotime()); 840 | if(PyErr_Occurred()) { return NULL; } 841 | return result; 842 | } 843 | 844 | static PyObject* pynanotime_override(PyObject *_, PyObject *args) { 845 | unsigned long long t; 846 | if(PyArg_ParseTuple(args, "K", &t)) { 847 | nanotime_override = t; 848 | } 849 | if(PyErr_Occurred()) { return NULL; } 850 | Py_INCREF(Py_None); 851 | return Py_None; 852 | } 853 | 854 | 855 | static PyMethodDef module_methods[] = { 856 | {"nanotime", (PyCFunction)pynanotime, METH_NOARGS, 857 | "get integral nanoseconds since unix epoch"}, 858 | {"_nanotime_override", (PyCFunction)pynanotime_override, METH_VARARGS, 859 | "override time seen by all faststat operations, useful for testing time based algoritmhs"}, 860 | {NULL} }; 861 | 862 | 863 | #ifndef PyMODINIT_FUNC /* declarations for DLL import/export */ 864 | #define PyMODINIT_FUNC void 865 | #endif 866 | PyMODINIT_FUNC init_faststat(void) { 867 | PyObject *module; 868 | 869 | if(PyType_Ready(&faststat_StatsType) < 0) 870 | return; 871 | 872 | module = Py_InitModule3("_faststat", module_methods, "fast statistics"); 873 | 874 | if(module == NULL) 875 | return; 876 | 877 | Py_INCREF(&faststat_StatsType); 878 | PyModule_AddObject(module, "Stats", (PyObject*)&faststat_StatsType); 879 | nanotime_override = 0; 880 | } 881 | -------------------------------------------------------------------------------- /faststat/cache.py: -------------------------------------------------------------------------------- 1 | class LRUCache(object): 2 | ''' 3 | Implements an LRU cache based on a linked list. 
4 |     Performance is about 1.1 microseconds per set/get on a core i7
5 |     '''
6 |     def __init__(self, maxlen=10000):
7 |         self.map = {}
8 |         self.root = root = []
9 |         root[:] = [root, root]
10 |         self.maxlen = maxlen
11 | 
12 |     def __getitem__(self, key):
13 |         val = self.map[key][3]
14 |         self[key] = val
15 |         return val
16 | 
17 |     def add(self, key, val):
18 |         # node[0] = prev; node[1] = next
19 |         root = self.root
20 |         if key in self.map:
21 |             # remove from map
22 |             link = self.map.pop(key)
23 |             # remove from list, and update the stored value
24 |             link[0][1], link[1][0], link[3] = link[1], link[0], val
25 |         else:
26 |             link = [None, None, key, val]
27 |         discard = None
28 |         if len(self.map) >= self.maxlen:
29 |             # pop and discard the oldest item
30 |             discard = root[0]
31 |             discard[0][1], root[0] = root, discard[0]
32 |             self.map.pop(discard[2])
33 |         # insert into map
34 |         self.map[key] = link
35 |         # insert into list
36 |         link[0], link[1] = root, root[1]
37 |         root[1][0] = link
38 |         root[1] = link
39 |         if root[0] is root:
40 |             root[0] = root[1]
41 |         if discard:
42 |             return discard[2], discard[3]
43 | 
44 |     __setitem__ = add
45 | 
46 |     _unset = object()
47 | 
48 |     def pop(self, key=_unset):
49 |         # remove from map and list
50 |         if key is LRUCache._unset:
51 |             link = self.root[0]
52 |             self.map.pop(link[2])
53 |         else:
54 |             link = self.map.pop(key)
55 |         link[0][1], link[1][0] = link[1], link[0]
56 |         return link[3]
57 | 
58 |     def __contains__(self, key):
59 |         return key in self.map
60 | 
61 |     def __len__(self):
62 |         return len(self.map)
63 | 
64 |     def items(self):
65 |         return [(k, self.map[k][3]) for k in self.map]
66 | 
67 |     def keys(self):
68 |         return self.map.keys()
69 | 
70 |     def values(self):
71 |         return [self.map[k][3] for k in self.map]
72 | 
73 | 
74 | class SegmentedCache(object):
75 |     '''
76 |     Implements a Segmented LRU cache based on an LRU cache.
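    New keys enter the probationary segment; a hit on a probationary key
    promotes it to the protected segment (capped at maxlen / 2 entries),
    and keys evicted from protected fall back into probationary.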
77 |     '''
78 |     def __init__(self, maxlen=10000):
79 |         self.probationary = LRUCache(maxlen)
80 |         self.protected = LRUCache(maxlen / 2)
81 |         self.maxlen = maxlen
82 | 
83 |     def __getitem__(self, key):
84 |         if key in self.protected.map:
85 |             # already protected, nothing to do
86 |             return self.protected[key]
87 |         if key in self.probationary.map:
88 |             # promote to protected
89 |             val = self.probationary.pop(key)
90 |             discard = self.protected.add(key, val)
91 |             if discard:
92 |                 self.probationary.add(discard[0], discard[1])
93 |             return val
94 |         raise KeyError(key)
95 | 
96 |     def add(self, key, val):
97 |         if key in self.protected:
98 |             self.protected[key] = val
99 |         elif key in self.probationary:
100 |             self.probationary.pop(key)
101 |             discard = self.protected.add(key, val)
102 |             if discard:
103 |                 self.probationary.add(discard[0], discard[1])
104 |         else:  # totally brand new key being added
105 |             self.probationary.add(key, val)
106 |             if len(self.probationary.map) + len(self.protected.map) > self.maxlen:
107 |                 self.probationary.pop()
108 | 
109 |     __setitem__ = add
110 | 
111 |     def __contains__(self, key):
112 |         return key in self.probationary or key in self.protected
113 | 
114 |     def __len__(self):
115 |         return len(self.protected) + len(self.probationary)
116 | 
117 |     def items(self):
118 |         return self.protected.items() + self.probationary.items()
119 | 
120 |     def keys(self):
121 |         return self.protected.keys() + self.probationary.keys()
122 | 
123 |     def values(self):
124 |         return self.protected.values() + self.probationary.values()
125 | 
126 | 
127 | if __name__ == "__main__":
128 |     cache_size = 7
129 |     sg = SegmentedCache(cache_size)
130 |     r = range(10000)
131 |     for i in r:
132 |         sg[i] = i
133 |     for i in r[-cache_size:]:
134 |         assert i in sg
135 | 
136 |     import time
137 | 
138 |     r = range(int(5e5))
139 |     s = time.time()
140 |     for i in r:
141 |         sg[i] = i
142 |         sg[i] = i
143 |     print "{0:.2f}us".format(time.time() - s)  # 1e6 ops, so elapsed seconds == microseconds per op
144 | 
--------------------------------------------------------------------------------
/faststat/faststat.py:
--------------------------------------------------------------------------------
1 | '''
2 | faststat is a *streaming*, *light-weight* statistics library designed for embedding
3 | in other Python applications. *Streaming* means
4 | that faststat operates on data points as they arrive, without needing to store
5 | previous data points. *Light-weight* means that statistics do not take up a great
6 | deal of CPU or RAM. Adding a data point to a stat object is a 0.5 - 3 microsecond operation.
7 | Each stat object takes about 4kiB of memory.
8 | '''
9 | import array
10 | import random
11 | import math
12 | import collections
13 | import time
14 | import functools
15 | import json
16 | import weakref
17 | 
18 | import _faststat
19 | import cache
20 | import format
21 | 
22 | 
23 | class Sample(object):
24 |     '''
25 |     This class implements Reservoir Sampling to keep a random sample of an infinite stream.
26 |     See http://gregable.com/2007/10/reservoir-sampling.html for one description.
27 | 
28 |     This class is kept separate from the other stats, because its memory usage is far greater.
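    (With the default sample_size of 2**14 and the 4-byte 'f' array type,
    the reservoir alone occupies 64KiB, versus roughly 4kiB for an entire
    Stats object.)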
29 |     '''
30 |     def __init__(self, sample_size=2**14, type='f'):
31 |         self.sample = array.array(type)
32 |         self.sample_size = sample_size
33 |         self.num_vals = 0
34 | 
35 |     def add(self, val):
36 |         if self.num_vals < self.sample_size:
37 |             self.sample.append(val)
38 |         else:
39 |             pos = random.randint(0, self.num_vals)
40 |             if pos < self.sample_size:
41 |                 self.sample[pos] = val
42 |         self.num_vals += 1
43 | 
44 | DEFAULT_PERCENTILES = (0.25, 0.50, 0.75, 0.90, 0.95, 0.99)
45 | EXPO_AVGS = (1.0/2, 1.0/4, 1.0/8, 1.0/16, 1.0/32, 1.0/64)
46 | 
47 | 
48 | # keep buckets for intervals in size from 100ns to ~14 hours
49 | TIME_BUCKETS = sum(
50 |     [(1*10**x, 2*10**x, 5*10**x) for x in range(2, 13)], ())
51 | # useful buckets for unsigned integers up to 64 bits
52 | UINT_BUCKETS = (1, 2, 3, 4, 5, 6, 7, 8, 9) + sum(
53 |     [(1*10**x, 2*10**x, 5*10**x) for x in range(1, 20)], ())
54 | # useful buckets for signed integers up to 64 bits
55 | INT_BUCKETS = tuple(reversed([-e for e in UINT_BUCKETS[:-3]])) + (0,) + UINT_BUCKETS[:-3]
56 | 
57 | DEFAULT_BUCKETS = (0, 1e-5, 1e-4, 1e-3, 1e-2, 2e-2, 5e-2, 0.1, 0.2, 0.5, 1, 2, 3, 4, 5, 6, 7, 8, 9,
58 |                    10, 20, 50, 100, 200, 500, 1e3, 2e3, 5e3, 1e4, 1e5, 1e6)
59 | DEFAULT_BUCKETS = tuple(reversed([-e for e in DEFAULT_BUCKETS])) + (0,) + DEFAULT_BUCKETS
60 | 
61 | ONE_MIN_NS = int(60e9)
62 | ONE_HOUR_NS = 60 * ONE_MIN_NS
63 | 
64 | WINDOW_COUNTS = [(64, ONE_MIN_NS), (32, ONE_HOUR_NS)]
65 | 
66 | class _BaseStats(object):
67 |     'base class to avoid repeating code'
68 |     def __init__(self, buckets, lastN, percentiles, interval, expo_avgs,
69 |                  window_counts, num_top):
70 |         buckets = buckets + (float("inf"),)
71 |         lastN = int(2**math.ceil(math.log(lastN)/math.log(2)))
72 |         num_top = int(2**math.ceil(math.log(num_top)/math.log(2)))
73 |         if interval:
74 |             self.interval = Interval(window_counts=())
75 |             interval = self.interval._stats
76 |         else:
77 |             interval = None
78 |         self._stats = _faststat.Stats(buckets, lastN, percentiles, interval,
79 |                                       expo_avgs, window_counts, num_top)
80 | 
81 |     @property
82 |     def variance(self):
83 |         if self.n < 2:
84 |             return float("nan")
85 |         return self.m2 / (self.n - 1)
86 | 
87 |     @property
88 |     def trimean(self):
89 |         p = self.get_percentiles()
90 |         return (p[0.25] + 2 * p[0.5] + p[0.75]) / 4
91 | 
92 |     @property
93 |     def skewness(self):
94 |         if not self.m2:
95 |             return float("nan")
96 |         return self.n ** 0.5 * self.m3 / self.m2 ** 1.5
97 | 
98 |     @property
99 |     def kurtosis(self):
100 |         if not self.m2:
101 |             return float("nan")
102 |         return self.n * self.m4 / self.m2 ** 2 - 3
103 | 
104 |     @property
105 |     def percentiles(self):
106 |         return self.get_percentiles()
107 | 
108 |     @property
109 |     def buckets(self):
110 |         return self.get_buckets()
111 | 
112 |     @property
113 |     def geometric_mean(self):
114 |         'nth root of product of data points'
115 |         return math.exp(self.sum_of_logs / self.n)
116 | 
117 |     @property
118 |     def harmonic_mean(self):
119 |         'inverse of mean of inverses of data points'
120 |         return self.n / self.sum_of_inv
121 | 
122 |     @property
123 |     def window_median(self):
124 |         if self.num_prev:
125 |             prev = sorted([self.get_prev(i)[1] for i in range(self.num_prev)])
126 |             return prev[len(prev)/2]
127 | 
128 |     @property
129 |     def expo_avgs(self):
130 |         return self.get_expo_avgs()
131 | 
132 |     @property
133 |     def lag_avgs(self):
134 |         '''
135 |         same data as expo_avgs, but with keys as the average age
136 |         of the data -- assuming evenly spaced data points -- rather
137 |         than decay rates
138 |         '''
139 |         if not self.interval:
140 |             return
141 |         interval = self.interval.mean
142 |         return dict([(interval/alpha, val)
143 |                      for alpha, val in self.get_expo_avgs().items()])
144 | 
145 |     def __repr__(self):
146 |         p = self.percentiles
147 |         if self.n < len(p):
148 |             quartiles = "(n too small)"
149 |         else:
150 |             quartiles = (_sigfigs(p.get(0.25, -1)),
151 |                          _sigfigs(p.get(0.5, -1)), _sigfigs(p.get(0.75, -1)))
152 |         return '<faststat.{0} n={1} mean={2} quartiles={3}>'.format(
153 |             type(self).__name__, self.n, _sigfigs(self.mean), quartiles)
154 | 
155 | 
156 | class Stats(_BaseStats):
157 |     '''
158 |     Call add(value) to add a data point.
159 |     '''
160 |     def __init__(self, buckets=DEFAULT_BUCKETS, lastN=64, percentiles=DEFAULT_PERCENTILES,
161 |                  interval=True):
162 |         super(Stats, self).__init__(buckets, lastN, percentiles, interval, EXPO_AVGS,
163 |                                     WINDOW_COUNTS, num_top=64)
164 |         self.add = self._stats.add
165 | 
166 |     def __getattr__(self, name):
167 |         if name not in ("tick", "end"):
168 |             return getattr(self._stats, name)
169 | 
170 | 
171 | class _TimeStats(_BaseStats):
172 |     def get_percentiles(self):
173 |         data = self._stats.get_percentiles()
174 |         for k in data:
175 |             data[k] = ntd_float(data[k])
176 |         return data
177 | 
178 |     @property
179 |     def mean(self):
180 |         return ntd_float(self._stats.mean)
181 | 
182 |     @property
183 |     def max(self):
184 |         return ntd_float(self._stats.max)
185 | 
186 |     @property
187 |     def min(self):
188 |         return ntd_float(self._stats.min)
189 | 
190 | 
191 | class Interval(_TimeStats):
192 |     '''
193 |     Call tick() to register occurrences.
194 |     Note that calling tick() N times results in N-1 data points.
195 |     '''
196 |     def __init__(self, buckets=TIME_BUCKETS, lastN=64, percentiles=DEFAULT_PERCENTILES,
197 |                  window_counts=WINDOW_COUNTS):
198 |         super(Interval, self).__init__(buckets, lastN, percentiles, False, (), window_counts, num_top=64)
199 |         self.tick = self._stats.tick
200 | 
201 |     def __getattr__(self, name):
202 |         if name not in ("add", "end"):
203 |             return getattr(self._stats, name)
204 | 
205 | 
206 | class Duration(_TimeStats):
207 |     '''
208 |     Represents statistics for a duration.
209 |     Call end(start_time_nanos) to add a data point.
210 |     '''
211 |     def __init__(self, buckets=TIME_BUCKETS, lastN=64, percentiles=DEFAULT_PERCENTILES,
212 |                  interval=True):
213 |         super(Duration, self).__init__(buckets, lastN, percentiles, interval, EXPO_AVGS,
214 |                                        WINDOW_COUNTS, num_top=64)
215 |         self.end = self._stats.end
216 | 
217 |     def __getattr__(self, name):
218 |         if name not in ("add", "tick"):
219 |             return getattr(self._stats, name)
220 | 
221 | 
222 | class Markov(object):
223 |     '''
224 |     Represents the states of a Markov process. The transitions between states are
225 |     modeled as Intervals, and the time spent in a given state is modeled as a
226 |     Duration.
227 |     '''
228 |     def __init__(self):
229 |         self.state_durations = collections.defaultdict(Duration)
230 |         self.transition_intervals = collections.defaultdict(Interval)
231 |         self.transitor_states = collections.defaultdict(int)
232 |         self._weakref_holder = {}
233 |         self.state_counts = collections.defaultdict(functools.partial(Stats, interval=False))
234 | 
235 |     def _transition(self, nxt, cur=None, since=None):
236 |         '''
237 |         Register that a transition has taken place.
238 |         nxt is an identifier for the state being entered.
239 |         cur is an identifier for the state being left.
240 |         since is the time at which the previous state was entered.
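        Typical use goes through make_transitor() rather than calling this
        directly; times are integer nanoseconds as returned by
        faststat.nanotime(). A minimal sketch (mirroring test.py):

            m = faststat.Markov()
            t = m.make_transitor('foo')  # enters state 'foo'
            t.transition('bar')          # records the ('foo', 'bar') interval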
241 |         '''
242 |         self.transition_intervals[(cur, nxt)].tick()
243 |         if since:
244 |             self.state_durations[cur].end(since)
245 | 
246 |     def make_transitor(self, state):
247 |         '''
248 |         Creates and returns a new Markov.Transitor, in the passed state.
249 |         '''
250 |         return Markov.Transitor(self, state)
251 | 
252 |     def _cleanup(self, ref):
253 |         'cleanup after a transitor weakref fires'
254 |         self.transitor_states[self._weakref_holder[ref]] -= 1
255 |         del self._weakref_holder[ref]
256 | 
257 |     class Transitor(object):
258 |         '''
259 |         An extremely light-weight object that simply tracks a current
260 |         state and the time of the last transition.
261 |         '''
262 |         def __init__(self, markov, state):
263 |             self.markov = markov
264 |             self.state = state
265 |             self.markov._transition(state)
266 |             self.markov.transitor_states[state] += 1
267 |             state_count = self.markov.transitor_states[state]
268 |             self.markov.state_counts[state].add(state_count)
269 |             self.weakref = weakref.ref(self, markov._cleanup)
270 |             self.markov._weakref_holder[self.weakref] = state
271 |             self.last_transition = nanotime()
272 | 
273 |         def transition(self, state):
274 |             '''
275 |             Notify the parent Markov stats object of a transition
276 |             from the current state to the passed state.
277 |             '''
278 |             self.markov.transitor_states[self.state] -= 1
279 |             self.markov.transitor_states[state] += 1
280 |             old_state_count = self.markov.transitor_states[self.state]
281 |             new_state_count = self.markov.transitor_states[state]
282 |             self.markov.state_counts[self.state].add(old_state_count)
283 |             self.markov.state_counts[state].add(new_state_count)
284 |             self.markov._weakref_holder[self.weakref] = state
285 |             self.markov._transition(state, self.state, self.last_transition)
286 |             self.last_transition, self.state = nanotime(), state
287 | 
288 |         def __repr__(self):
289 |             return '<Markov.Transitor state={0}>'.format(self.state)
290 | 
291 | 
292 | class PathStats(object):
293 |     '''
294 |     Represents a set of paths taken. Unlike Markov, these states remember
295 |     their "history".
296 | 
297 |     Because there are likely to be many more paths than states, PathStats
298 |     is more aggressively memory-optimized, and also employs a SegmentedCache
299 |     internally to limit the number of unique path durations that will be
300 |     stored.
301 |     '''
302 |     def __init__(self, maxsize=2048):
303 |         self.path_stats = cache.SegmentedCache(maxsize)
304 |         self._weakref_path_map = {}
305 | 
306 |     def make_walker(self, start):
307 |         'returns a walker object that tracks a path'
308 |         return PathStats.Walker(self, start)
309 | 
310 |     def _commit(self, ref):
311 |         "commit a walker's data after it is collected"
312 |         path_times = self._weakref_path_map[ref]
313 |         path_times.append(nanotime())
314 |         del self._weakref_path_map[ref]
315 |         path = tuple(path_times[1::2])
316 |         times = path_times[::2]
317 |         if path not in self.path_stats:
318 |             # tuple to save a tiny bit of memory
319 |             self.path_stats[path] = tuple([
320 |                 Duration(interval=False) for i in range(len(path))])
321 |         path_stats = self.path_stats[path]
322 |         for i in range(1, len(times)):
323 |             path_stats[i - 1]._stats.add(times[i] - times[i - 1])
324 | 
325 |     def pformat(self, prefix=()):
326 |         '''
327 |         Makes a pretty ASCII format of the data, suitable for
328 |         displaying in a console or saving to a text file.
329 |         Returns a list of lines.
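        Each line is rendered via the FMT template in sformat() below,
        e.g. (illustrative values only):

            fetch: n=42, mean=1.5, p50/95=1.2/2.9, max=3.8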
330 |         '''
331 |         nan = float("nan")
332 | 
333 |         def sformat(segment, stat):
334 |             FMT = "n={0}, mean={1}, p50/95={2}/{3}, max={4}"
335 |             line_segs = [segment]
336 |             for s in [stat]:
337 |                 p = s.get_percentiles()
338 |                 p50, p95 = p.get(0.50, nan), p.get(0.95, nan)
339 |                 line_segs.append(FMT.format(s.n, s.mean, p50, p95, s.max))
340 |             return '{0}: {1}'.format(*line_segs)
341 | 
342 |         lines = []
343 |         for path in sorted(self.path_stats.keys()):
344 |             lines.append('=====================')
345 |             for seg, stat in zip(path, self.path_stats[path]):
346 |                 lines.append(sformat(seg, stat))
347 |         return lines
348 | 
349 |     def _finished_segment(self, path, since, start):
350 |         if path not in self.path_stats:
351 |             self.path_stats[path] = (
352 |                 Duration(interval=False), Duration(interval=False))
353 |         dur, offset = self.path_stats[path]
354 |         dur.end(since)
355 |         offset.end(start)
356 | 
357 |     class Walker(object):
358 |         '''
359 |         A light-weight object that tracks a current path and the time of
360 |         the last transition. Similar to Transitor for Markov.
361 |         '''
362 |         def __init__(self, pathstats, segment="NEW"):
363 |             self.pathstats = pathstats
364 |             self._commiter = weakref.ref(self, self.pathstats._commit)
365 |             self.path = self.pathstats._weakref_path_map[self._commiter] = []
366 |             self.push(segment)
367 | 
368 |         def push(self, segment):
369 |             '''
370 |             pushes a new segment onto the path, closing out the previous segment
371 |             '''
372 |             self.path.append(nanotime())
373 |             self.path.append(segment)
374 |             self.curseg = segment
375 | 
376 |         def pop(self):
377 |             self.push(PathStats.POP)
378 | 
379 |         def branch(self):
380 |             child = PathStats.Walker(
381 |                 self.pathstats, PathStats.BRANCH_C)
382 |             self.push(PathStats.BRANCH_P)
383 |             return child
384 | 
385 |         def join(self, walker):
386 |             self.push((PathStats.JOIN, tuple(walker.path)))
387 |             walker.push(PathStats.JOINED)
388 | 
389 |     BRANCH_P, BRANCH_C, JOIN, JOINED, POP = "BRANCH_P", "BRANCH_C", "JOIN", "JOINED", "POP"
390 | 
391 |     def __repr__(self):
392 |         return "<PathStats paths={0}>".format(len(self.path_stats))
393 | 
394 | 
395 | class ntd_float(float):
396 |     'a float which represents the difference of two timestamps in nanoseconds'
397 |     def __repr__(self):
398 |         return format.si_format(self / 1e9, "s")
399 | 
400 |     def __format__(self, format_spec):
401 |         return repr(self)
402 | 
403 | 
404 | TimeSeries = functools.partial(Stats, interval=False)
405 | nanotime = _faststat.nanotime
406 | 
407 | 
408 | def _sigfigs(n, sigfigs=3):
409 |     'helper function to round a number to significant figures'
410 |     n = float(n)
411 |     if n == 0 or math.isnan(n):  # avoid math domain errors
412 |         return n
413 |     return round(n, -int(math.floor(math.log10(abs(n))) - sigfigs + 1))
414 | 
415 | 
416 | def merge_moments(m_a, m_a2, m_a3, m_a4, n_a, m_b, m_b2, m_b3, m_b4, n_b):
417 |     '''
418 |     Merge moments of two samples A and B.
419 |     parameters are
420 |     m_a, ..., m_a4 = first through fourth moment of sample A
421 |     n_a = size of sample A
422 |     m_b, ..., m_b4 = first through fourth moment of sample B
423 |     n_b = size of sample B
424 |     '''
425 |     delta = m_b - m_a
426 |     delta_2 = delta * delta
427 |     delta_3 = delta * delta_2
428 |     delta_4 = delta * delta_3
429 |     n_x = n_a + n_b
430 |     m_x = m_a + delta * n_b / n_x
431 |     m_x2 = m_a2 + m_b2 + delta_2 * n_a * n_b / n_x
432 |     m_x3 = m_a3 + m_b3 + delta_3 * n_a * n_b * (n_a - n_b) / (n_x ** 2) + 3 * delta * (n_a * m_b2 - n_b * m_a2) / n_x
433 |     m_x4 = (m_a4 + m_b4 + delta_4 * (n_a * n_b * (n_a * n_a - n_a * n_b + n_b * n_b)) / (n_x ** 3) +
434 |             6 * delta_2 * (n_a * n_a * m_b2 + n_b * n_b * m_a2) / (n_x ** 2) +
435 |             4 * delta * (n_a * m_b3 - n_b * m_a3) / n_x)
436 |     return m_x, m_x2, m_x3, m_x4, n_x
437 | 
--------------------------------------------------------------------------------
/faststat/format.py:
--------------------------------------------------------------------------------
1 | import math
2 | import json
3 | 
4 | 
5 | def stat2json(stat):
6 |     prev = [stat.get_prev(i) for i in range(stat.num_prev)]
7 |     # timestamp = 0 means not yet valid
8 |     prev = [(p[0] / 1e6, p[1]) for p in prev if p[0]]
9 |     return json.dumps({
10 |         "n": stat.n,
11 |         "mean": stat.mean,
12 |         "max": stat.max,
13 |         "min": stat.min,
14 |         "percentiles": stat.get_percentiles(),
15 |         "prev": prev
16 |     })
17 | 
18 | 
19 | def stat2html(stat):
20 |     return TEMPLATE.replace('"==THE_STAT=="', stat2json(stat))
21 | 
22 | 
23 | def si_round(val):
24 |     '''
25 |     round to a "scientific notation" tuple of (factor, exponent)
26 |     such that 1 <= factor < 1000, and factor * 10 ** exponent == val
27 |     '''
28 |     if val < 0:
29 |         neg = True
30 |         val = -val
31 |     elif val == 0:
32 |         return 0, 0
33 |     else:
34 |         neg = False
35 |     exp = math.log(val) / math.log(1000)
36 |     if exp < 0:
37 |         exp = int(exp) - 1
38 |     else:
39 |         exp = int(exp)
40 |     val = val / 1000.0 ** exp
41 |     if neg:
42 |         val = -val
43 |     return val, 3 * exp
44 | 
45 | 
46 | def si_format(val, unit=''):
47 |     if math.isnan(val):
48 |         return "nan"
49 |     if math.isinf(val):
50 |         if val < 0:
51 |             return "-inf"
52 |         return "inf"
53 |     val, exp = si_round(val)
54 |     if exp:
55 |         if exp in _SCALES and unit:
56 |             exps = _SCALES[exp] + unit
57 |         elif exp == -3:
58 |             # special case, do 0.123 instead of 123e-3
59 |             return "{0:0.3f}".format(val / 1e3)
60 |         else:
61 |             exps = 'e' + str(exp) + unit
62 |     else:
63 |         exps = unit
64 |     if val >= 100 or val <= -100:
65 |         return '{0:0.0f}{1}'.format(val, exps)
66 |     if val >= 10 or val <= -10:
67 |         return '{0:0.1f}{1}'.format(val, exps)
68 |     return '{0:0.2f}{1}'.format(val, exps)
69 | 
70 | 
71 | def sib_round(val):
72 |     '''
73 |     round to a binary SI tuple of (factor, exponent)
74 |     such that 1 <= factor < 1024, and factor * 1024 ** exponent == val
75 |     '''
76 |     if val < 0:
77 |         neg = True
78 |         val = -val
79 |     elif val == 0:
80 |         return 0, 0
81 |     else:
82 |         neg = False
83 |     exp = math.log(val) / math.log(1024)
84 |     if exp < 0:
85 |         exp = int(exp) - 1
86 |     else:
87 |         exp = int(exp)
88 |     val = val / 1024.0 ** exp
89 |     if neg:
90 |         val = -val
91 |     return val, exp
92 | 
93 | 
94 | def sib_format(val):
95 |     if math.isnan(val):
96 |         return "nan"
97 |     if math.isinf(val):
98 |         if val < 0:
99 |             return "-inf"
100 |         return "inf"
101 |     val, exp = sib_round(val)
102 |     if exp < 0 or exp >= len(_BSCALES):
103 |         raise ValueError("{0} out of format range".format(val))
104 |     exps = _BSCALES[exp]
105 |     if not exps:
106 |         return str(int(val)) + 'B'
107 |     if val >= 100 or val <= -100:
108 |         return '{0:0.0f}{1}B'.format(val, exps)
109 |     if val >= 10 or val <= -10:
110 |         return '{0:0.1f}{1}B'.format(val, exps)
111 |     return '{0:0.2f}{1}B'.format(val, exps)
112 | 
113 | 
114 | def sigfigs(n, sigfigs=3):
115 |     'helper function to round a number to significant figures'
116 |     n = float(n)
117 |     if n == 0 or math.isnan(n):  # avoid math domain errors
118 |         return n
119 |     return round(n, -int(math.floor(math.log10(abs(n))) - sigfigs + 1))
120 | 
121 | 
122 | _SCALES = {
123 |     # Peta, Tera, Giga, Mega, kilo
124 |     15: 'P', 12: 'T', 9: 'G', 6: 'M', 3: 'k',
125 |     -3: 'm', -6: 'u', -9: 'n', -12: 'p', -15: 'f'
126 |     # milli, micro, nano, pico, femto
127 | }
128 | 
129 | 
130 | _BSCALES = ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi', 'Yi']
131 | 
132 | 
133 | JAVASCRIPT_HTML_HEAD = '''
134 | 
135 | 
136 | 
192 | '''
193 | 
194 | TEMPLATE = '''
195 | 
196 | 
197 | 
198 |
199 | 200 | 205 |
206 | 
207 | '''.replace('', JAVASCRIPT_HTML_HEAD)
208 | 
--------------------------------------------------------------------------------
/faststat/test.py:
--------------------------------------------------------------------------------
1 | '''
2 | test implementation from wikipedia
3 | http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Higher-order_statistics
4 | '''
5 | import random
6 | import time
7 | from faststat import Stats
8 | import faststat
9 | import format
10 | # TODO: fix these to absolute imports
11 | 
12 | 
13 | def p2_parabolic(l_v, l_n, c_v, c_n, r_v, r_n, d):
14 |     return (c_v + (d / (r_n - l_n)) * (
15 |         (c_n - l_n + d) * (r_v - c_v) / (r_n - c_n) +
16 |         (r_n - c_n - d) * (c_v - l_v) / (c_n - l_n)))
17 | 
18 | 
19 | def online_variance(data):
20 |     n = 0
21 |     mean = 0
22 |     M2 = 0
23 | 
24 |     for x in data:
25 |         n = n + 1
26 |         delta = x - mean
27 |         mean = mean + delta/n
28 |         M2 = M2 + delta*(x - mean)
29 | 
30 |     variance = M2/(n - 1)
31 |     return variance
32 | 
33 | 
34 | def online_kurtosis(data):
35 |     n = 0
36 |     mean = 0
37 |     M2 = 0
38 |     M3 = 0
39 |     M4 = 0
40 | 
41 |     for x in data:
42 |         n1 = n
43 |         n = n + 1
44 |         delta = x - mean
45 |         delta_n = delta / n
46 |         delta_n2 = delta_n * delta_n
47 |         term1 = delta * delta_n * n1
48 |         mean = mean + delta_n
49 |         M4 = M4 + term1 * delta_n2 * (n*n - 3*n + 3) + 6 * delta_n2 * M2 - 4 * delta_n * M3
50 |         M3 = M3 + term1 * delta_n * (n - 2) - 3 * delta_n * M2
51 |         M2 = M2 + term1
52 | 
53 |     kurtosis = (n*M4) / (M2*M2) - 3
54 |     return kurtosis
55 | 
56 | 
57 | def test():
58 |     # random.seed(103)  # make test repeatable
59 |     data = [random.normalvariate(1.0, 1.0) for i in range(int(1e6))]
60 |     stats = Stats()
61 |     start = time.time()
62 |     for d in data:
63 |         stats.add(d)
64 |     print time.time() - start, "microseconds per point"  # 1e6 points, so elapsed seconds == us/point
65 |     print "mean (should be 1)", stats.mean
66 |     print "kurtosis / reference kurtosis", stats.kurtosis, online_kurtosis(data)
67 |     print "variance / reference variance", stats.variance, online_variance(data)
68 |     print "skewness (should be 0)", stats.skewness
69 |     print "max, min, mintime, maxtime", stats.max, stats.min, stats.mintime, stats.maxtime
70 |     print "m2, m3, m4", stats.m2, stats.m3, stats.m4
71 |     print "geometric_mean, harmonic mean", stats.geometric_mean, stats.harmonic_mean
72 |     print "interval.min, interval.geometric_mean, interval.harmonic_mean",
73 |     print stats.interval.min, stats.interval.geometric_mean, stats.interval.harmonic_mean
74 |     print "expo_avgs (should be 1)", stats.expo_avgs
75 |     print "window_counts", stats.get_window_counts()
76 |     print "top 10", sorted(stats.get_topN())[-10:]
77 |     open('test_html.html', 'w').write(faststat.format.stat2html(stats))
78 |     return stats
79 | 
80 | 
81 | def test_breadth():
82 |     m = faststat.Markov()
83 |     t = m.make_transitor('foo')
84 |     t.transition('bar')
85 | 
86 |     p = faststat.PathStats()
87 |     w = p.make_walker('foo')
88 |     w.push('bar')
89 |     w.push('baz')
90 |     b = w.branch()
91 |     b.push('branch1')
92 |     b.push('branch2')
93 |     w.join(b)
94 |     w.push('end')
95 |     del w
96 | 
97 |     # import pprint
98 |     # pprint.pprint(p.path_stats.keys())
99 |     # pprint.pprint(p.unique_paths())
100 |     print '\n'.join(p.pformat())
101 | 
102 | 
103 | if __name__ == "__main__":
104 |     test_breadth()
105 |     test()
106 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import platform
2 | from setuptools import Extension, setup
3 | 
4 | extra_compile_args = []
5 | if platform.system() == 'Windows':
6 |     extra_compile_args = ['/MT',]
7 | libraries = []
8 | if platform.system() not in ('Windows', 'Darwin'):
9 |     libraries = ['rt',]
10 | 
11 | _faststat = Extension('_faststat', sources=['faststat/_faststat.c'],
12 |                       libraries=libraries, extra_compile_args=extra_compile_args)
13 | 
14 | setup(
15 |     name='faststat',
16 |     version='17.10.0',
17 |     author="Kurt Rose",
18 |     author_email="kurt@kurtrose.com",
19 |     description='fast online statistics collection',
20 |     license="MIT",
21 |     url="http://github.com/doublereedkurt/faststat",
22 |     long_description='...',
23 |     classifiers=[
24 |         'Development Status :: 4 - Beta',
25 |         'License :: OSI Approved :: MIT License',
26 |     ],
27 |     packages=['faststat'],
28 |     ext_modules=[_faststat])
29 | 
30 | 
31 | 
--------------------------------------------------------------------------------
/test/biased_quantile_stream.py:
--------------------------------------------------------------------------------
1 | import math
2 | 
3 | # This is a fairly naive implementation of the algorithm described in
4 | # "Effective Computation of Biased Quantiles over Data Streams"
5 | 
6 | class Quantiles(object):
7 |     def __init__(self, f=None):
8 |         self.points = None
9 |         self.n = 0
10 |         self.f = f or biased_quantiles_f
11 | 
12 |     def insert(self, val):
13 |         if self.points is None or val < self.points.val:
14 |             # less than first; count it before returning
15 |             self.points = _point(val, 1, 0, self.points)
16 |             self.n += 1; return
17 |         prev = self.points
18 |         cur = self.points.next
19 |         r = 0
20 |         while cur and cur.next:
21 |             if val < cur.val:
22 |                 break
23 |             r += cur.delta
24 |             prev = cur
25 |             cur = cur.next
26 |         else:  # ran off the end; count it before returning
27 |             (cur or prev).next = _point(val, 1, 0, None)
28 |             self.n += 1; return
29 |         new = _point(val, 1, max(int(self.f(r, self.n)) - 1, 0), cur)
30 |         prev.next = new
31 |         self.n += 1
32 |         if self.n % COMPRESS_INTERVAL == 0:
33 |             self.compress()
34 | 
35 |     def compress(self):
36 |         pointlist = []
37 |         cur = self.points
38 |         r = [0]
39 |         while cur:
40 |             pointlist.append(cur)
41 |             r.append(r[-1] + cur.delta)
42 |             cur = cur.next
43 |         for i in range(len(pointlist) - 2, 0, -1):
44 |             if pointlist[i].delta + pointlist[i + 1].delta + pointlist[i + 1].width <= self.f(r[i], self.n):
45 |                 # merge
46 |                 pointlist[i].next = pointlist[i + 1].next
47 |                 pointlist[i].val = pointlist[i + 1].val
48 |                 pointlist[i].delta += pointlist[i + 1].delta
49 |                 pointlist[i].width = pointlist[i + 1].width
50 | 
51 |     def query(self, p):
52 |         cur_r = 0
53 |         cur_point = self.points
54 |         while cur_point:
55 |             cur_r += cur_point.delta
56 |             if (cur_r + cur_point.next.delta + cur_point.next.width >
57 |                     self.n * p + self.f(self.n * p, self.n) / 2):
58 |                 return cur_point, (cur_r + cur_point.delta, cur_r + cur_point.delta + cur_point.width)
59 |             cur_point = cur_point.next
60 | 
61 |     def get_pointlist(self):
62 |         if not self.points:
63 |             return []
64 |         p = [self.points]
65 |         while p[-1].next:
66 |             p.append(p[-1].next)
67 |         return p
68 | 
69 | 
70 | import math
71 | 
72 | 
73 | class DistributedQuantiles(object):
74 |     def __init__(self, error):
75 |         self.size = math.ceil(1 / error)
76 |         self.inbuf = []
77 |         self.qs = []
78 |         self.n = 0
79 | 
80 |     def insert(self, val):
81 |         self.inbuf.append(val)
82 |         if len(self.inbuf) >= self.size:
83 |             self.inbuf.sort()
84 |             i, j = 0, 0
85 |             new_qs = []
86 |             if self.qs and self.inbuf[0] < self.qs[0][0]:
87 |                 while i < len(self.inbuf) and self.inbuf[i] < self.qs[j][0]:
88 |                     new_qs.append((self.inbuf[i], i, i)); i += 1
89 |             while i < len(self.inbuf) and j < len(self.qs):
90 |                 nxtmin = i + self.qs[j][1]
91 |                 nxtmax = i + self.qs[j][2] - 1
92 |                 if self.inbuf[i] < self.qs[j][0]:
93 |                     nxtval = self.inbuf[i]
94 |                     i += 1
95 |                 elif self.qs[j][0] < self.inbuf[i]:
96 |                     nxtval = self.qs[j][0]
97 |                     j += 1
98 |                 else:
99 |                     val = self.inbuf[i]
100 |                     if len(self.inbuf) > i + 1 and self.inbuf[i + 1] == val:
101 |                         if len(self.qs) > j + 1 and self.qs[j + 1] == val:
102 |                             new_qs.append((val, ))
103 |                             # the paper does not cover this equal-value case, so a
104 |                             # supplemental proof is provided below; this branch is unfinished
105 |                             pass
106 |                 new_qs.append((nxtval, nxtmin, nxtmax))
107 |             self.qs = new_qs
108 |             self.prune()
109 | 
110 |     def prune(self):
111 |         pass
112 | 
113 | 
114 | '''
115 | Supplemental proof of correctness of the equal-value element case.
116 | (notation is kept as close as possible to the original paper
117 | within the limitations of ASCII)
118 | 
119 | MOTIVATION:
120 | The error bound as given in the paper depends on the fact that,
121 | when merging any Q' and Q'', the next-lowest y_s of Q'' and the
122 | next-highest y_t of Q'' relative to any x of Q' must be consecutive.
123 | 
124 | However, in the general case there may be many elements whose
125 | values are equal. Consider
126 |     Q'  = [...,-1,0,1,...]
127 |     Q'' = [...,-1,0,0,1,...]
128 | 
129 | For x of Q' = 0, y_s = -1 and y_t = 1.
130 | y_s and y_t are not consecutive.
131 | The merge operation is no longer correct, because inductively
132 | rmax_Q''(y_t) - rmin_Q''(y_s) <= 3 eps n''; which is not <= 2 eps n''
133 | 
134 | We therefore define a new merge operation for elements of Q' and Q''
135 | which are equal.
136 | 
137 | This merge operation will also inductively guarantee that only
138 | two consecutive elements may share the same value.
139 | 
140 | 
141 | INTUITION:
142 | The intuitive explanation of the meaning of multiple elements
143 | with the same value is that it represents an unbroken
144 | sequence of observations with that same value. The first
145 | element represents the rank of the beginning of this sequence.
146 | The second element represents the rank of the end of this
147 | sequence.
148 | 
149 | Because of this, there should never be more than two elements
150 | with the same value. (If there is a third element, it can be
151 | discarded keeping only the maximum and minimum.)
152 | 
153 | This also means that if there are two elements with the same
154 | value, rmin may be set to rmax for the lower ranked element,
155 | and rmax may be set to rmin for the higher ranked element.
156 | 
157 | Because every rank between the two elements of the same
158 | value is known to also be of that value, the top-most rank
159 | in the range of the first value and bottom-most rank in the
160 | range of the second value are known to be the exact locations
161 | of an element of the given value.
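For example, two consecutive elements that both have value a, with
(rmin, rmax) ranges [3, 5] and [7, 9], can be tightened to [5, 5] and
[7, 7]: an unbroken run of a's covers every rank from 5 through 7, so
rank 5 (the top of the first range) and rank 7 (the bottom of the
second range) are each known to hold an a exactly.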
162 | 
163 | 
164 | NEW MERGE OPERATION:
165 | 
166 | The proposed new merge operation considers 4 points at a time:
167 | 
168 |     x_r, x_r+1, y_t, y_t+1
169 | 
170 | where x_r is the minimum un-merged element of Q'
171 |       x_r+1 is the next consecutive element of Q'
172 |       y_t is the minimum un-merged element of Q''
173 |       y_t+1 is the next consecutive element of Q''
174 | 
175 | in order to generate z_i of Q
176 | 
177 | all points whose values are equal are consumed in a single step
178 | 
179 | there are five cases:
180 | 
181 | for notation convenience,
182 | define the ADD operation of x of Q' and y of Q''
183 | to be z of Q:
184 |     rmax_Q(z) = rmax_Q'(x) + rmax_Q''(y)
185 |     rmin_Q(z) = rmin_Q'(x) + rmin_Q''(y)
186 | 
187 | CASE I:
188 |     x_r != y_t:
189 |     apply the merge operation as in the original paper
190 | CASE II:
191 |     x_r = y_t, and x_r != x_r+1, and y_t != y_t+1:
192 |     z_i = ADD(x_r, y_t)
193 |     move on to x_r+1, y_t+1
194 | CASE III:
195 |     x_r = y_t = x_r+1 != y_t+1:
196 |     generate two points, z_i and z_i'
197 |     z_i = ADD(x_r, y_t)
198 |     z_i' = ADD(x_r+1, y_t)
199 |     apply SHRINK to the pair of points,
200 |     append the result (either one or two points) to Q,
201 |     move on to x_r+2 and y_t+1
202 | CASE IV:
203 |     x_r = y_t = y_t+1 != x_r+1:
204 |     generate two points z_i and z_i'
205 |     z_i = ADD(x_r, y_t)
206 |     z_i' = ADD(x_r, y_t+1)
207 |     apply SHRINK to the pair of points,
208 |     append the result (either one or two points) to Q,
209 |     move on to x_r+1, y_t+2
210 | CASE V:
211 |     x_r = y_t = x_r+1 = y_t+1:
212 |     this means that there are two runs of the same value
213 |     in each of the merging sequences.
214 | 
215 |     generate two points z_i, z_i':
216 |     z_i = ADD(x_r, y_t)
217 |     z_i' = ADD(x_r+1, y_t+1)
218 |     append both to Q
219 |     move on to x_r+2, y_t+2
220 | 
221 | 
222 | PROOF:
223 | Let eps be the error bound.
224 | Let Q' and Q'' be two quantile summaries being merged, each of
225 | which inductively has error <= eps.
226 | Let x be an element of Q' and y be an element of Q''.
227 | Let n' be the number of observations covered by Q' and
228 | n'' be the number of observations covered by Q''.
229 | 
230 | The proposed merge operation is:
231 |     rmax_Q(z_i) = rmax_Q'(x) + rmax_Q''(y)
232 |     rmin_Q(z_i) = rmin_Q'(x) + rmin_Q''(y)
233 | 
234 | We wish to show that rmax_Q(z_i+1) - rmin_Q(z_i) <= 2 eps (n' + n'')
235 | 
236 | let r be the index of x in Q'
237 | let s be the index of y in Q''
238 | 
239 | Several things must be proven about the new MERGE:
240 | 1- that z_i and z_i+1 are within the error bounds when z_i is the
241 |    result of CASE II, III, IV, or V, and z_i+1 is the result of CASE I
242 | 2- that z_i and z_i+1 are within the error bounds when z_i is the
243 |    result of CASE I, and z_i+1 is the result of CASE II, III, IV, or V
244 | 3- that z_i and z_i+1 are within the error bounds when both are
245 |    results of CASE II, III, IV, or V
246 | 
247 | There are 3 cases:
248 | 
249 | 
250 | case I: z_i+1 was merged using the new merge operation
251 | case II: z_i+1 was merged using the merge operation defined in the paper
252 |     case IIA: z_i+1 came from x_r+1 of Q'
253 |         case IIAi: z_i+1 = z_i
254 |         case IIAii: z_i+1 > z_i
255 |     case IIB: z_i+1 came from y_s+1 of Q''
256 |         case IIBi: z_i+1 = z_i
257 |         case IIBii: z_i+1 > z_i
258 | 
259 | (Note, if x_r+1 = x_r and y_s+1 = y_s, this is back to case I)
260 | 
261 | case I:
262 |     rmax_Q(z_i+1) - rmin_Q(z_i) = rmax_Q'(x_r+1) + rmax_Q''(y_s+1)
263 |                                 - rmin_Q'(x_r) - rmin_Q''(y_s)
264 |                                 = rmax_Q'(x_r+1) - rmin_Q'(x_r)
265 |                                 + rmax_Q''(y_s+1) - rmin_Q''(y_s)
266 |     by inductive property of Q' and Q''
267 |     <= 2 n' eps + 2 n'' eps
268 |     <= 2 eps (n' + n'')
269 | 
270 | case IIAi:
271 |     if x_r = x_r+1:
272 |         by consecutive element contraction lemma:
273 |         if x_r and x_r+1 overlap, combine them
274 |         and there is no z_i, only z_i+1
275 | 
276 |         otherwise, rmin_S(x_r) = rmax_S(x_r)
277 |         and rmin_S(x_r+1) = rmax_S(x_r+1)
278 | 
279 |         rmax_Q(z_i+1) - rmin_Q(z_i)
280 |         =
281 | 
282 | 
283 | 
284 | 
285 | 
286 | 
287 | consecutive element contraction lemma:
288 | let s and s' be consecutive elements of summary S,
289 | such that s = s'
290 | CASE I:
291 |     if rmin_S(s') < rmax_S(s):
292 |         maximum rank of s' = rmax_S(s)
293 |         minimum rank of s = rmin_S(s')
294 |         [s and s' are now identical, discard one]
295 | CASE II:
296 |     if rmin_S(s') >= rmax_S(s):
297 |         maximum rank of s' = rmin_S(s')
298 |         minimum rank of s = rmax_S(s)
299 | 
300 | let '[' represent rmin_S, and ']' represent rmax_S:
301 |     [ x_r ]
302 |             [ x_r+1 ]
303 | let a = the value of x_r and x_r+1;
304 | We know that somewhere between rmax_S(x_r) and rmin_S(x_r+1)
305 | there is a sequence of consecutive 'a' values in the data
306 | set. Therefore, there is at least one a in that interval.
307 | 
308 | 
309 | 
310 | -----------------------
311 | Notation:
312 | for readability, everything after a "_" should be read as a subscript
313 | For example, z_i+1 should be read z sub(i+1), not z sub(i) + 1
314 | 
315 | x of Q should be read "x element of Q"
316 | 
317 | Also, the original paper assumes that every element is unique.
318 | Therefore, rmin(v) is a function that takes value v and returns
319 | the minimum rank. Because we allow two elements to have the same
320 | value, v must be considered to be a tuple. rmin(v) is selecting
321 | one item from that tuple, rmax(v) is selecting another element.
322 | 
323 | This does not affect the correctness of any of the proofs from the
324 | original paper.
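As a worked example of the ADD operation under this notation: if x of Q'
has (rmin, rmax) = (3, 5) and y of Q'' has (rmin, rmax) = (4, 6), then
z = ADD(x, y) has rmin_Q(z) = 3 + 4 = 7 and rmax_Q(z) = 5 + 6 = 11.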
325 | 
326 | '''
327 | 
328 | 
329 | 
330 | class _point(object):
331 |     def __init__(self, val, delta, width, next):
332 |         self.val, self.delta, self.width, self.next = val, delta, width, next
333 | 
334 |     def __repr__(self):
335 |         return "_point(val={0}, delta={1}, width={2})".format(self.val, self.delta, self.width)
336 | 
337 | 
338 | def biased_quantiles_f(r_i, n):
339 |     return 2 * ERROR_RATE * r_i
340 | 
341 | 
342 | def targeted_quantiles_f(percentiles_errors):
343 |     def f(r_i, n):
344 |         bounds = []
345 |         for p, e in percentiles_errors:
346 |             if r_i < p * n:
347 |                 bounds.append(2 * e * (n - r_i) / (1 - e))
348 |             else:
349 |                 bounds.append(2 * e * r_i / p)
350 |         return min(bounds)
351 |     return f
352 | 
353 | 
354 | ERROR_RATE = 0.001
355 | COMPRESS_INTERVAL = 10  # int(1 / ERROR_RATE)
356 | 
357 | # val is the current (approximate) value
358 | 
359 | # delta is the difference between the lowest possible rank of the current
360 | # point/value and the previous point
361 | 
362 | # width is the difference between the lowest and highest possible rank
363 | # of the current point/value
364 | 
365 | # this data structure ensures that new points can be inserted into
366 | # the middle of the linked list
367 | 
368 | 
369 | # performance of the naive algorithm is very bad -- 300 - 700 microseconds
370 | # (0.3 to 0.7 ms) per point; this is about 20-40x slower than the pure Python
371 | # piece-wise parabolic algorithm, and ~300x slower than the C piece-wise parabolic
372 | def test(q=None):
373 |     import random, time
374 | 
375 |     data = [random.normalvariate(1.0, 1.0) for i in range(int(1e4))]
376 |     q = q or Quantiles()
377 |     start = time.time()
378 |     for d in data:
379 |         q.insert(d)
380 |     print (time.time() - start) * 1e6 / len(data), "microseconds per point"
381 |     return q
382 | 
383 | # about 400 microseconds per point
384 | def test_targeted():
385 |     TARGETS = ((0.25, 0.001), (0.5, 0.001), (0.75, 0.001), (0.9, 0.001), (0.95, 0.001), (0.99, 0.001))
386 |     f = targeted_quantiles_f(TARGETS)
387 |     return test(q=Quantiles(f))
388 | 
389 | if __name__ == "__main__":
390 |     print 'biased quantile condition'
391 |     test()
392 |     print 'targeted quantile condition'
393 |     test_targeted()
394 | 
--------------------------------------------------------------------------------