├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── doc ├── Makefile ├── conf.py ├── index.rst └── make.bat ├── faststat ├── __init__.py ├── _faststat.c ├── cache.py ├── faststat.py ├── format.py └── test.py ├── setup.py └── test └── biased_quantile_stream.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | 21 | # Installer logs 22 | pip-log.txt 23 | 24 | # Unit test / coverage reports 25 | .coverage 26 | .tox 27 | nosetests.xml 28 | 29 | # Translations 30 | *.mo 31 | 32 | # Mr Developer 33 | .mr.developer.cfg 34 | .project 35 | .pydevproject 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013 Kurt Rose 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include faststat *.py *.c 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | faststat 2 | ======== 3 | 4 | fast online statistics collection 5 | 6 | very simple API: 7 | 8 | ```python 9 | >>> import faststat 10 | >>> stats = faststat.Stats() 11 | >>> stats.add(point) 12 | >>> stats.add(point) 13 | ... 14 | ``` 15 | 16 | 17 | The following properties are accessible on a Stats object: n, min, max, variance, skewness, and kurtosis. 18 | In addition, a Stats object tracks percentiles. 19 | 20 | Performance is pretty good: 0.63 microseconds per point on my machine, provided the C module is available. 21 | 22 | In pure Python mode, performance is about 9 microseconds per point.
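Once points have been added, the collected statistics can be read straight off the object. A minimal sketch (attribute names as listed above and in the docs; `get_percentiles()` is the percentile accessor exposed by the C extension's method table, and the printed values will vary run to run):

```python
import random

import faststat

stats = faststat.Stats()
for _ in range(10000):
    stats.add(random.normalvariate(1.0, 1.0))

print(stats.n)         # 10000
print(stats.mean)      # ~1.0
print(stats.variance)  # ~1.0
print(stats.skewness)  # ~0.0
print(stats.kurtosis)  # ~0.0
print(stats.get_percentiles())  # e.g. {0.25: ..., 0.5: ..., 0.75: ...}
```

Sample benchmark output, with the C extension first and the pure-Python fallback second: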
23 | 24 | ```python 25 | 0.615999937057 microseconds per point 26 | mean (should be 1) 0.998333953189 27 | kurtosis / reference kurtosis -0.0021881144433 -0.00220621681959 28 | variance / reference variance 0.999219190297 0.999219190297 29 | skewness (should be 0) -0.0071960817771 30 | max, min 5.83625092886 -3.4749002526 31 | m2, m3, m4 999218.191078 -7187.64448532 2993126.28574 32 | 9.00099992752 microseconds per point 33 | mean (should be 1) 0.998333953189 34 | kurtosis / reference kurtosis -0.0021881144433 -0.00220621681959 35 | variance / reference variance 0.999219190297 0.999219190297 36 | skewness (should be 0) -0.0071960817771 37 | max, min 5.83625092886 -3.4749002526 38 | m2, m3, m4 999218.191078 -7187.64448532 2993126.28574 39 | ``` 40 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | rm -rf $(BUILDDIR)/* 51 | 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | dirhtml: 58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 61 | 62 | singlehtml: 63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 64 | @echo 65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 66 | 67 | pickle: 68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 69 | @echo 70 | @echo "Build finished; now you can process the pickle files." 71 | 72 | json: 73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 74 | @echo 75 | @echo "Build finished; now you can process the JSON files." 76 | 77 | htmlhelp: 78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 79 | @echo 80 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 81 | ".hhp project file in $(BUILDDIR)/htmlhelp." 82 | 83 | qthelp: 84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 85 | @echo 86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/faststat.qhcp" 89 | @echo "To view the help file:" 90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/faststat.qhc" 91 | 92 | devhelp: 93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 94 | @echo 95 | @echo "Build finished." 96 | @echo "To view the help file:" 97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/faststat" 98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/faststat" 99 | @echo "# devhelp" 100 | 101 | epub: 102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 103 | @echo 104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 
105 | 106 | latex: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo 109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 111 | "(use \`make latexpdf' here to do that automatically)." 112 | 113 | latexpdf: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo "Running LaTeX files through pdflatex..." 116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 118 | 119 | latexpdfja: 120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 121 | @echo "Running LaTeX files through platex and dvipdfmx..." 122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 124 | 125 | text: 126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 127 | @echo 128 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 129 | 130 | man: 131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 132 | @echo 133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 134 | 135 | texinfo: 136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 137 | @echo 138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 139 | @echo "Run \`make' in that directory to run these through makeinfo" \ 140 | "(use \`make info' here to do that automatically)." 141 | 142 | info: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo "Running Texinfo files through makeinfo..." 145 | make -C $(BUILDDIR)/texinfo info 146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 147 | 148 | gettext: 149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 150 | @echo 151 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 152 | 153 | changes: 154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 155 | @echo 156 | @echo "The overview file is in $(BUILDDIR)/changes." 157 | 158 | linkcheck: 159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 160 | @echo 161 | @echo "Link check complete; look for any errors in the above output " \ 162 | "or in $(BUILDDIR)/linkcheck/output.txt." 163 | 164 | doctest: 165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 166 | @echo "Testing of doctests in the sources finished, look at the " \ 167 | "results in $(BUILDDIR)/doctest/output.txt." 168 | 169 | xml: 170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 171 | @echo 172 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 173 | 174 | pseudoxml: 175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 176 | @echo 177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 178 | -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # faststat documentation build configuration file, created by 4 | # sphinx-quickstart on Mon Apr 14 00:45:48 2014. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 
14 | 15 | import sys 16 | import os 17 | 18 | # If extensions (or modules to document with autodoc) are in another directory, 19 | # add these directories to sys.path here. If the directory is relative to the 20 | # documentation root, use os.path.abspath to make it absolute, like shown here. 21 | #sys.path.insert(0, os.path.abspath('.')) 22 | 23 | # -- General configuration ------------------------------------------------ 24 | 25 | # If your documentation needs a minimal Sphinx version, state it here. 26 | #needs_sphinx = '1.0' 27 | 28 | # Add any Sphinx extension module names here, as strings. They can be 29 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 30 | # ones. 31 | extensions = [ 32 | 'sphinx.ext.autodoc', 33 | 'sphinx.ext.intersphinx', 34 | 'sphinx.ext.mathjax', 35 | 'sphinx.ext.viewcode', 36 | ] 37 | 38 | # Add any paths that contain templates here, relative to this directory. 39 | templates_path = ['_templates'] 40 | 41 | # The suffix of source filenames. 42 | source_suffix = '.rst' 43 | 44 | # The encoding of source files. 45 | #source_encoding = 'utf-8-sig' 46 | 47 | # The master toctree document. 48 | master_doc = 'index' 49 | 50 | # General information about the project. 51 | project = u'faststat' 52 | copyright = u'2014, Kurt Rose' 53 | 54 | # The version info for the project you're documenting, acts as replacement for 55 | # |version| and |release|, also used in various other places throughout the 56 | # built documents. 57 | # 58 | # The short X.Y version. 59 | version = '0.7' 60 | # The full version, including alpha/beta/rc tags. 61 | release = '0.7' 62 | 63 | # The language for content autogenerated by Sphinx. Refer to documentation 64 | # for a list of supported languages. 65 | #language = None 66 | 67 | # There are two options for replacing |today|: either, you set today to some 68 | # non-false value, then it is used: 69 | #today = '' 70 | # Else, today_fmt is used as the format for a strftime call. 71 | #today_fmt = '%B %d, %Y' 72 | 73 | # List of patterns, relative to source directory, that match files and 74 | # directories to ignore when looking for source files. 75 | exclude_patterns = ['_build'] 76 | 77 | # The reST default role (used for this markup: `text`) to use for all 78 | # documents. 79 | #default_role = None 80 | 81 | # If true, '()' will be appended to :func: etc. cross-reference text. 82 | #add_function_parentheses = True 83 | 84 | # If true, the current module name will be prepended to all description 85 | # unit titles (such as .. function::). 86 | #add_module_names = True 87 | 88 | # If true, sectionauthor and moduleauthor directives will be shown in the 89 | # output. They are ignored by default. 90 | #show_authors = False 91 | 92 | # The name of the Pygments (syntax highlighting) style to use. 93 | pygments_style = 'sphinx' 94 | 95 | # A list of ignored prefixes for module index sorting. 96 | #modindex_common_prefix = [] 97 | 98 | # If true, keep warnings as "system message" paragraphs in the built documents. 99 | #keep_warnings = False 100 | 101 | 102 | # -- Options for HTML output ---------------------------------------------- 103 | 104 | # The theme to use for HTML and HTML Help pages. See the documentation for 105 | # a list of builtin themes. 106 | html_theme = 'default' 107 | 108 | # Theme options are theme-specific and customize the look and feel of a theme 109 | # further. For a list of options available for each theme, see the 110 | # documentation. 
111 | #html_theme_options = {} 112 | 113 | # Add any paths that contain custom themes here, relative to this directory. 114 | #html_theme_path = [] 115 | 116 | # The name for this set of Sphinx documents. If None, it defaults to 117 | # " v documentation". 118 | #html_title = None 119 | 120 | # A shorter title for the navigation bar. Default is the same as html_title. 121 | #html_short_title = None 122 | 123 | # The name of an image file (relative to this directory) to place at the top 124 | # of the sidebar. 125 | #html_logo = None 126 | 127 | # The name of an image file (within the static path) to use as favicon of the 128 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 129 | # pixels large. 130 | #html_favicon = None 131 | 132 | # Add any paths that contain custom static files (such as style sheets) here, 133 | # relative to this directory. They are copied after the builtin static files, 134 | # so a file named "default.css" will overwrite the builtin "default.css". 135 | html_static_path = ['_static'] 136 | 137 | # Add any extra paths that contain custom files (such as robots.txt or 138 | # .htaccess) here, relative to this directory. These files are copied 139 | # directly to the root of the documentation. 140 | #html_extra_path = [] 141 | 142 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 143 | # using the given strftime format. 144 | #html_last_updated_fmt = '%b %d, %Y' 145 | 146 | # If true, SmartyPants will be used to convert quotes and dashes to 147 | # typographically correct entities. 148 | #html_use_smartypants = True 149 | 150 | # Custom sidebar templates, maps document names to template names. 151 | #html_sidebars = {} 152 | 153 | # Additional templates that should be rendered to pages, maps page names to 154 | # template names. 155 | #html_additional_pages = {} 156 | 157 | # If false, no module index is generated. 158 | #html_domain_indices = True 159 | 160 | # If false, no index is generated. 161 | #html_use_index = True 162 | 163 | # If true, the index is split into individual pages for each letter. 164 | #html_split_index = False 165 | 166 | # If true, links to the reST sources are added to the pages. 167 | #html_show_sourcelink = True 168 | 169 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 170 | #html_show_sphinx = True 171 | 172 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 173 | #html_show_copyright = True 174 | 175 | # If true, an OpenSearch description file will be output, and all pages will 176 | # contain a tag referring to it. The value of this option must be the 177 | # base URL from which the finished HTML is served. 178 | #html_use_opensearch = '' 179 | 180 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 181 | #html_file_suffix = None 182 | 183 | # Output file base name for HTML help builder. 184 | htmlhelp_basename = 'faststatdoc' 185 | 186 | 187 | # -- Options for LaTeX output --------------------------------------------- 188 | 189 | latex_elements = { 190 | # The paper size ('letterpaper' or 'a4paper'). 191 | #'papersize': 'letterpaper', 192 | 193 | # The font size ('10pt', '11pt' or '12pt'). 194 | #'pointsize': '10pt', 195 | 196 | # Additional stuff for the LaTeX preamble. 197 | #'preamble': '', 198 | } 199 | 200 | # Grouping the document tree into LaTeX files. List of tuples 201 | # (source start file, target name, title, 202 | # author, documentclass [howto, manual, or own class]). 
latex_documents = [ 204 | ('index', 'faststat.tex', u'faststat Documentation', 205 | u'Kurt Rose', 'manual'), 206 | ] 207 | 208 | # The name of an image file (relative to this directory) to place at the top of 209 | # the title page. 210 | #latex_logo = None 211 | 212 | # For "manual" documents, if this is true, then toplevel headings are parts, 213 | # not chapters. 214 | #latex_use_parts = False 215 | 216 | # If true, show page references after internal links. 217 | #latex_show_pagerefs = False 218 | 219 | # If true, show URL addresses after external links. 220 | #latex_show_urls = False 221 | 222 | # Documents to append as an appendix to all manuals. 223 | #latex_appendices = [] 224 | 225 | # If false, no module index is generated. 226 | #latex_domain_indices = True 227 | 228 | 229 | # -- Options for manual page output --------------------------------------- 230 | 231 | # One entry per manual page. List of tuples 232 | # (source start file, name, description, authors, manual section). 233 | man_pages = [ 234 | ('index', 'faststat', u'faststat Documentation', 235 | [u'Kurt Rose'], 1) 236 | ] 237 | 238 | # If true, show URL addresses after external links. 239 | #man_show_urls = False 240 | 241 | 242 | # -- Options for Texinfo output ------------------------------------------- 243 | 244 | # Grouping the document tree into Texinfo files. List of tuples 245 | # (source start file, target name, title, author, 246 | # dir menu entry, description, category) 247 | texinfo_documents = [ 248 | ('index', 'faststat', u'faststat Documentation', 249 | u'Kurt Rose', 'faststat', 'One line description of project.', 250 | 'Miscellaneous'), 251 | ] 252 | 253 | # Documents to append as an appendix to all manuals. 254 | #texinfo_appendices = [] 255 | 256 | # If false, no module index is generated. 257 | #texinfo_domain_indices = True 258 | 259 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 260 | #texinfo_show_urls = 'footnote' 261 | 262 | # If true, do not generate a @detailmenu in the "Top" node's menu. 263 | #texinfo_no_detailmenu = False 264 | 265 | 266 | # Example configuration for intersphinx: refer to the Python standard library. 267 | intersphinx_mapping = {'http://docs.python.org/': None} 268 | 269 | import sys 270 | import os.path 271 | sys.path = [os.path.dirname(os.path.dirname(os.path.abspath(__file__)))] + sys.path 272 | 273 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | faststat 2 | ======== 3 | 4 | faststat is a *streaming*, *light-weight* statistics library designed for embedding 5 | in other Python applications. *Streaming* means 6 | that faststat operates on data points as they arrive, without needing to store 7 | previous data points. *Light-weight* means that statistics do not take up a great 8 | deal of CPU or RAM. Adding a data point to a stat object is a 0.5 - 3 microsecond operation. 9 | Each stat object takes about 4kiB of memory. 10 | 11 | .. toctree:: 12 | :maxdepth: 2 13 | 14 | Basic usage 15 | ----------- 16 | The most basic usage of faststat is to create a Stats object to represent some continuous 17 | variable, and then add data points to it. 18 | 19 | :: 20 | 21 | >>> import faststat 22 | >>> s = faststat.Stats() 23 | >>> for i in range(100): 24 | ... s.add(i) 25 | ... 26 | >>> s 27 | 28 | 29 | Collected data 30 | -------------- 31 | The following data is collected for each point.
A data point is considered to be `(x, t)` 32 | where `x` is a floating point value, and `t` is the system clock at the time the data was passed 33 | to faststat. 34 | 35 | =============== ==================================================================================== 36 | attribute description 37 | =============== ==================================================================================== 38 | n The number of data points. 39 | 40 | mean The arithmetic mean, also known as expected value E(x) or 41 | :math:`\bar{x}`. Defined as :math:`\bar{x} = \frac{x_1 + x_2 + ... + x_n}{n}` 42 | 43 | max The largest value seen. 44 | 45 | maxtime The time of the largest data point. 46 | 47 | min The smallest value seen. 48 | 49 | mintime The time of the smallest data point. 50 | 51 | lasttime The time of the most recent data point. 52 | 53 | percentiles A dictionary of approximate percentiles. 54 | 55 | buckets Counts of data points which have occurred in different ranges. 56 | Essentially logarithmic-scale histogram data. 57 | 58 | variance The variance. See http://en.wikipedia.org/wiki/Variance 59 | 60 | skewness The skewness. See http://en.wikipedia.org/wiki/Skewness 61 | 62 | kurtosis The kurtosis. See http://en.wikipedia.org/wiki/Kurtosis 63 | 64 | geometric_mean The geometric mean, or NaN if any points are <= 0. 65 | See http://en.wikipedia.org/wiki/Geometric_mean 66 | 67 | harmonic_mean The harmonic mean, or NaN if any points are <= 0. 68 | See http://en.wikipedia.org/wiki/Harmonic_mean 69 | 70 | expo_avgs A dictionary mapping exponential decay factors to current values. 71 | See http://en.wikipedia.org/wiki/Moving_average#Exponential_moving_average 72 | 73 | get_prev() The most recent data points. 74 | 75 | get_topN() The largest data points. 76 | 77 | window_avg The mean of the data points in get_prev(). 78 | =============== ==================================================================================== 79 | 80 | 81 | Error 82 | ----- 83 | 84 | There is no proven error bound on the algorithm used to calculate percentiles. However, empirically 85 | the error is observed to be low. It is worth noting that the percentile algorithm used (P2) performs 86 | interpolation of values. Therefore, for a sequence consisting of ~50% 1's and ~50% 2's, the algorithm 87 | would report a median around 1.5. 88 | 89 | 90 | Examples 91 | -------- 92 | 93 | Tracking the average number of items present each time a new item was added. 94 | 95 | .. code-block:: python 96 | 97 | import collections 98 | import faststat 99 | 100 | class Queue(object): 101 | def __init__(self): 102 | self.deq = collections.deque() 103 | self.put_stats = faststat.Stats() 104 | 105 | def put(self, item): 106 | self.put_stats.add(len(self.deq)) 107 | self.deq.append(item) 108 | 109 | def get(self): 110 | return self.deq.popleft() 111 | 112 | -------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% .
11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | goto end 41 | ) 42 | 43 | if "%1" == "clean" ( 44 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 45 | del /q /s %BUILDDIR%\* 46 | goto end 47 | ) 48 | 49 | 50 | %SPHINXBUILD% 2> nul 51 | if errorlevel 9009 ( 52 | echo. 53 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 54 | echo.installed, then set the SPHINXBUILD environment variable to point 55 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 56 | echo.may add the Sphinx directory to PATH. 57 | echo. 58 | echo.If you don't have Sphinx installed, grab it from 59 | echo.http://sphinx-doc.org/ 60 | exit /b 1 61 | ) 62 | 63 | if "%1" == "html" ( 64 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 68 | goto end 69 | ) 70 | 71 | if "%1" == "dirhtml" ( 72 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 76 | goto end 77 | ) 78 | 79 | if "%1" == "singlehtml" ( 80 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 84 | goto end 85 | ) 86 | 87 | if "%1" == "pickle" ( 88 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can process the pickle files. 92 | goto end 93 | ) 94 | 95 | if "%1" == "json" ( 96 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 97 | if errorlevel 1 exit /b 1 98 | echo. 99 | echo.Build finished; now you can process the JSON files. 100 | goto end 101 | ) 102 | 103 | if "%1" == "htmlhelp" ( 104 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 105 | if errorlevel 1 exit /b 1 106 | echo. 107 | echo.Build finished; now you can run HTML Help Workshop with the ^ 108 | .hhp project file in %BUILDDIR%/htmlhelp. 109 | goto end 110 | ) 111 | 112 | if "%1" == "qthelp" ( 113 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 114 | if errorlevel 1 exit /b 1 115 | echo. 
116 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 117 | .qhcp project file in %BUILDDIR%/qthelp, like this: 118 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\faststat.qhcp 119 | echo.To view the help file: 120 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\faststat.qhc 121 | goto end 122 | ) 123 | 124 | if "%1" == "devhelp" ( 125 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished. 129 | goto end 130 | ) 131 | 132 | if "%1" == "epub" ( 133 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 137 | goto end 138 | ) 139 | 140 | if "%1" == "latex" ( 141 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 145 | goto end 146 | ) 147 | 148 | if "%1" == "latexpdf" ( 149 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 150 | cd %BUILDDIR%/latex 151 | make all-pdf 152 | cd %BUILDDIR%/.. 153 | echo. 154 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 155 | goto end 156 | ) 157 | 158 | if "%1" == "latexpdfja" ( 159 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 160 | cd %BUILDDIR%/latex 161 | make all-pdf-ja 162 | cd %BUILDDIR%/.. 163 | echo. 164 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 165 | goto end 166 | ) 167 | 168 | if "%1" == "text" ( 169 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 170 | if errorlevel 1 exit /b 1 171 | echo. 172 | echo.Build finished. The text files are in %BUILDDIR%/text. 173 | goto end 174 | ) 175 | 176 | if "%1" == "man" ( 177 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 178 | if errorlevel 1 exit /b 1 179 | echo. 180 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 181 | goto end 182 | ) 183 | 184 | if "%1" == "texinfo" ( 185 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 186 | if errorlevel 1 exit /b 1 187 | echo. 188 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 189 | goto end 190 | ) 191 | 192 | if "%1" == "gettext" ( 193 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 194 | if errorlevel 1 exit /b 1 195 | echo. 196 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 197 | goto end 198 | ) 199 | 200 | if "%1" == "changes" ( 201 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 202 | if errorlevel 1 exit /b 1 203 | echo. 204 | echo.The overview file is in %BUILDDIR%/changes. 205 | goto end 206 | ) 207 | 208 | if "%1" == "linkcheck" ( 209 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 210 | if errorlevel 1 exit /b 1 211 | echo. 212 | echo.Link check complete; look for any errors in the above output ^ 213 | or in %BUILDDIR%/linkcheck/output.txt. 214 | goto end 215 | ) 216 | 217 | if "%1" == "doctest" ( 218 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 219 | if errorlevel 1 exit /b 1 220 | echo. 221 | echo.Testing of doctests in the sources finished, look at the ^ 222 | results in %BUILDDIR%/doctest/output.txt. 223 | goto end 224 | ) 225 | 226 | if "%1" == "xml" ( 227 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 228 | if errorlevel 1 exit /b 1 229 | echo. 230 | echo.Build finished. The XML files are in %BUILDDIR%/xml.
231 | goto end 232 | ) 233 | 234 | if "%1" == "pseudoxml" ( 235 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 236 | if errorlevel 1 exit /b 1 237 | echo. 238 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 239 | goto end 240 | ) 241 | 242 | :end 243 | -------------------------------------------------------------------------------- /faststat/__init__.py: -------------------------------------------------------------------------------- 1 | from faststat import * 2 | from format import * 3 | -------------------------------------------------------------------------------- /faststat/_faststat.c: -------------------------------------------------------------------------------- 1 | #include <Python.h> 2 | #include <structmember.h> 3 | #include <math.h> 4 | #include <string.h> 5 | #include <stdio.h> 6 | 7 | //define _nanotime() which returns integral nanoseconds since epoch 8 | //as a 64 bit integer for a variety of platforms 9 | #ifdef _WIN32 10 | //windows has its own brand of time keeping functions 11 | #include <windows.h> 12 | #define DELTA_EPOCH_IN_SECS 11644473600ULL 13 | //difference between Jan 1, 1601 and Jan 1, 1970 (unix epoch) 14 | 15 | static unsigned long long _nanotime(void) { 16 | FILETIME ft; 17 | ULARGE_INTEGER result; 18 | GetSystemTimeAsFileTime(&ft); //returns time in 100ns intervals since Jan 1, 1601 19 | result.HighPart = ft.dwHighDateTime; 20 | result.LowPart = ft.dwLowDateTime; 21 | result.QuadPart -= DELTA_EPOCH_IN_SECS * 10000000ULL; // 1000 (ms) * 1000 (us) * 10 (100ns) 22 | return result.QuadPart * 100; 23 | } 24 | 25 | // for old versions of MSVC which do not include NAN macro 26 | #ifndef NAN 27 | static const unsigned long __nan[2] = {0xffffffff, 0x7fffffff}; 28 | #define NAN (*(const float *) __nan) 29 | #endif 30 | 31 | #elif defined linux 32 | //linux has clock_gettime(CLOCK_REALTIME) which is ns since epoch -- perfect 33 | #include <time.h> 34 | 35 | static unsigned long long _nanotime(void) { 36 | struct timespec ts; 37 | if(clock_gettime(CLOCK_REALTIME, &ts) == -1) { 38 | return 0; 39 | } 40 | return 1000000000ULL * ts.tv_sec + ts.tv_nsec; 41 | } 42 | 43 | #else 44 | //for those oddballs like OSX and BSD, fall back on gettimeofday() which is at least microseconds 45 | #include <sys/time.h> 46 | 47 | static unsigned long long _nanotime(void) { 48 | struct timeval tv; 49 | if(gettimeofday(&tv, NULL) == -1) { 50 | return 0; 51 | } 52 | return tv.tv_sec * 1000000000ULL + tv.tv_usec * 1000ULL; 53 | } 54 | 55 | #endif 56 | 57 | static unsigned long long nanotime_override = 0; 58 | 59 | static unsigned long long nanotime(void) { 60 | if(nanotime_override) { 61 | return nanotime_override; 62 | } 63 | return _nanotime(); 64 | } 65 | 66 | //percentile point for usage in P2 algorithm 67 | typedef struct { 68 | unsigned short percentile; //divide by 0x10000 to get a float between 0 and 1 69 | double val; //estimate of current percentile value 70 | unsigned int n; //estimate of how many values were less than this 71 | } faststat_P2Percentile; 72 | 73 | 74 | typedef struct { 75 | float max; 76 | unsigned int count; 77 | } faststat_Bucket; 78 | 79 | 80 | // keeping this size a power of 2 makes pointer arithmetic in heap more efficient 81 | typedef struct { 82 | double value; 83 | unsigned long long nanotime; 84 | } faststat_DataPoint; 85 | 86 | // represents an exponential moving average 87 | // aka a low pass filter, infinite impulse response filter 88 | typedef struct { 89 | double val; 90 | double alpha; 91 | } faststat_ExpoAvg; 92 | 93 | 94 | // represents a count for a given interval, 95 | // aligned on unix epoch 96 |
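// how the window counts work: each faststat_WindowCount (below) is a circular
// buffer of counters; a timestamp t selects slot
// (t / window_size_nanosecs) & (num_windows - 1), and since num_windows is a
// power of 2 the modulo reduces to a cheap bitwise AND. stale slots are lazily
// re-zeroed in _rezero_window_counts() before the current slot is incremented
// in _update_window_counts().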
typedef struct { 97 | unsigned short num_windows; // number of counts -- MUST BE A POWER OF 2 98 | unsigned long long window_size_nanosecs; // size of each window in nanoseconds 99 | unsigned int *counts; // counts for the previous num_windows intervals 100 | } faststat_WindowCount; 101 | 102 | 103 | // for representing a normally distributed variable 104 | typedef struct faststat_Stats_struct { 105 | PyObject_HEAD 106 | unsigned long long n; 107 | double mean, min, max, m2, m3, m4; 108 | double sum_of_logs, sum_of_inv; // for geometric and harmonic mean 109 | unsigned long long mintime, maxtime, lasttime; 110 | unsigned int num_percentiles; 111 | faststat_P2Percentile *percentiles; 112 | unsigned int num_buckets; 113 | faststat_Bucket *buckets; // last bucket MUST BE +inf 114 | unsigned int num_expo_avgs; 115 | faststat_ExpoAvg *expo_avgs; 116 | double window_avg; 117 | unsigned int num_prev; // MUST BE A POWER OF 2! 118 | faststat_DataPoint *lastN; 119 | unsigned int num_top; // MUST BE A POWER OF 2! 120 | faststat_DataPoint *topN; 121 | unsigned int num_window_counts; 122 | //window counts must be sorted by window_size, to 123 | //make handling code cleaner/smaller 124 | faststat_WindowCount *window_counts; 125 | struct faststat_Stats_struct *interval; 126 | } faststat_Stats; 127 | 128 | /* 129 | typedef struct { 130 | unsigned int n; 131 | unsigned int num_prev; 132 | 133 | } faststat_StatsGroup; 134 | */ 135 | 136 | char* NEW_ARGS[] = {"buckets", "lastN", "percentiles", "interval", "expo_avgs", 137 | "window_counts", "num_top", NULL}; 138 | 139 | 140 | static PyObject* faststat_Stats_new(PyTypeObject *type, PyObject *args, PyObject *kwds) { 141 | faststat_Stats *self; 142 | PyObject *buckets, *percentiles, *interval, *expo_avgs, *window_counts, *cur; 143 | int num_prev, num_buckets, num_percentiles, num_expo_avgs, num_window_counts, num_top; 144 | int i, total, offset; 145 | double temp; 146 | if(!PyArg_ParseTupleAndKeywords(args, kwds, "OiOOOOi", NEW_ARGS, 147 | &buckets, &num_prev, &percentiles, &interval, &expo_avgs, &window_counts, &num_top)) { 148 | return NULL; 149 | } 150 | 151 | buckets = PySequence_Fast(buckets, "expected a sequence"); 152 | percentiles = PySequence_Fast(percentiles, "expected a sequence"); 153 | expo_avgs = PySequence_Fast(expo_avgs, "expected a sequence"); 154 | window_counts = PySequence_Fast(window_counts, "expected a sequence"); 155 | if(!buckets || !percentiles || !expo_avgs || !window_counts) { 156 | // TODO: decref on buckets and percentiles 157 | return NULL; 158 | } 159 | num_buckets = (int)PySequence_Fast_GET_SIZE(buckets); 160 | num_percentiles = (int)PySequence_Fast_GET_SIZE(percentiles); 161 | num_expo_avgs = (int)PySequence_Fast_GET_SIZE(expo_avgs); 162 | num_window_counts = (int)PySequence_Fast_GET_SIZE(window_counts); 163 | 164 | self = (faststat_Stats*)type->tp_alloc(type, 0); 165 | if(self != NULL) { 166 | self->interval = NULL; 167 | self->n = 0; 168 | self->mean = self->m2 = self->m3 = self->m4 = self->min = self->max = 0; 169 | self->sum_of_logs = self->sum_of_inv = 0; 170 | self->mintime = self->maxtime = self->lasttime = 0; 171 | self->num_percentiles = num_percentiles; 172 | if(interval != Py_None ) { 173 | self->interval = (faststat_Stats*)interval; // WARNING: incompatible pointer type..
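// (when provided, the interval Stats object is fed the nanosecond gap between
// successive add() calls -- see faststat_Stats_add -- so it ends up describing
// the arrival rate of this stat's data points)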
174 | } else { // TODO: figure out a better test of type here 175 | self->interval = NULL; 176 | } 177 | if(num_percentiles) { 178 | self->percentiles = PyMem_New(faststat_P2Percentile, num_percentiles); 179 | for(i=0; i < num_percentiles; i++) { 180 | temp = PyFloat_AsDouble(PySequence_Fast_GET_ITEM(percentiles, i)); 181 | self->percentiles[i].percentile = (unsigned short)(temp * 0x10000); 182 | self->percentiles[i].val = 0; 183 | self->percentiles[i].n = i + 1; 184 | } 185 | } else { 186 | self->percentiles = NULL; 187 | } 188 | self->num_buckets = num_buckets; 189 | if(num_buckets) { 190 | self->buckets = PyMem_New(faststat_Bucket, num_buckets); 191 | for(i=0; i < num_buckets; i++) { 192 | self->buckets[i].count = 0; 193 | self->buckets[i].max = (float)PyFloat_AsDouble(PySequence_Fast_GET_ITEM(buckets, i)); 194 | // don't bother checking for error; let it raise later 195 | } 196 | } else { 197 | self->buckets = NULL; 198 | } 199 | self->num_expo_avgs = num_expo_avgs; 200 | if(num_expo_avgs) { 201 | self->expo_avgs = PyMem_New(faststat_ExpoAvg, num_expo_avgs); 202 | for(i=0; i < num_expo_avgs; i++) { 203 | self->expo_avgs[i].val = 0; 204 | self->expo_avgs[i].alpha = (double)PyFloat_AsDouble(PySequence_Fast_GET_ITEM(expo_avgs, i)); 205 | } 206 | } else { 207 | self->expo_avgs = NULL; 208 | } 209 | self->num_prev = num_prev; 210 | if(num_prev) { 211 | self->lastN = PyMem_New(faststat_DataPoint, num_prev); 212 | for(i=0; i < num_prev; i++) { 213 | self->lastN[i].value = 0; 214 | self->lastN[i].nanotime = 0; 215 | } 216 | } else { 217 | self->lastN = NULL; 218 | } 219 | if(num_top == 0) { 220 | num_top = 1; 221 | } 222 | self->num_top = num_top; 223 | self->topN = PyMem_New(faststat_DataPoint, num_top); 224 | memset(self->topN, 0, sizeof(faststat_DataPoint) * num_top); 225 | self->topN -= 1; //use 1 based indexing 226 | 227 | self->num_window_counts = num_window_counts; 228 | if(num_window_counts) { 229 | self->window_counts = PyMem_New(faststat_WindowCount, num_window_counts); 230 | PyList_Sort(window_counts); 231 | total = 0; 232 | for(i=0; i < num_window_counts; i++) { 233 | cur = PySequence_Fast_GET_ITEM(window_counts, i); 234 | // each entry is a (num_windows, window_size_nanosecs) pair 235 | PyArg_ParseTuple( 236 | cur, 237 | "HK", 238 | &(self->window_counts[i].num_windows), 239 | &(self->window_counts[i].window_size_nanosecs)); 240 | total += self->window_counts[i].num_windows; 241 | } 242 | // allocate all of the window counts as one contiguous block 243 | self->window_counts[0].counts = PyMem_New(unsigned int, total); 244 | memset(self->window_counts[0].counts, 0, sizeof(unsigned int) * total); 245 | offset = self->window_counts[0].num_windows; 246 | for(i=1; i < num_window_counts; i++) { 247 | self->window_counts[i].counts = self->window_counts[0].counts + offset; 248 | offset += self->window_counts[i].num_windows; 249 | } 250 | } else { 251 | self->window_counts = NULL; 252 | } 253 | } 254 | 255 | if(PyErr_Occurred()) { 256 | Py_DECREF(self); 257 | return NULL; 258 | } 259 | 260 | return (PyObject*) self; 261 | } 262 | 263 | 264 | static void faststat_Stats_dealloc(faststat_Stats* self) { 265 | if(self->percentiles) { 266 | PyMem_Del(self->percentiles); 267 | } 268 | if(self->buckets) { 269 | PyMem_Del(self->buckets); 270 | } 271 | if(self->expo_avgs) { 272 | PyMem_Del(self->expo_avgs); 273 | } 274 | if(self->lastN) { 275 | PyMem_Del(self->lastN); 276 | } 277 | if(self->topN) { 278 | PyMem_Del(self->topN + 1); // undo 1-based indexing 279 | } 280 | if(self->window_counts) { 281 | // see constructor; all window_counts are allocated as one chunk 282 | PyMem_Del(self->window_counts[0].counts); 283 | PyMem_Del(self->window_counts); 284 | } 285 | self->ob_type->tp_free((PyObject*)self); 286 | } 287 | 288 | #define STR_VAL(arg) #arg 289 | #define DBL_MEMBER(name, description) {STR_VAL(name), T_DOUBLE, offsetof(faststat_Stats, name), 0, description} 290 | #define DBL_MEMBER1(name) {STR_VAL(name), T_DOUBLE, offsetof(faststat_Stats, name), 0,
STR_VAL(name)} 291 | #define MEMBER(name, type, description) {STR_VAL(name), type, offsetof(faststat_Stats, name), 0, description} 292 | static PyMemberDef faststat_Stats_members[] = { 293 | MEMBER(n, T_ULONGLONG, "number of points"), 294 | DBL_MEMBER1(mean), DBL_MEMBER1(min), DBL_MEMBER1(max), 295 | DBL_MEMBER(sum_of_logs, "sum of logs of values, for geometric mean; NaN if undefined"), 296 | DBL_MEMBER(sum_of_inv, "sum of inverses of values, for harmonic mean; NaN if undefined"), 297 | MEMBER(lasttime, T_ULONGLONG, "time (in nanoseconds since epoch) of last point"), 298 | MEMBER(mintime, T_ULONGLONG, "time (in nanoseconds since epoch) of min value"), 299 | MEMBER(maxtime, T_ULONGLONG, "time (in nanoseconds since epoch) of max value"), 300 | DBL_MEMBER1(m2), DBL_MEMBER1(m3), DBL_MEMBER1(m4), 301 | MEMBER(interval, T_OBJECT, "another Stat object which measures the time interval between data points"), 302 | DBL_MEMBER(window_avg, "average of stored most recent data points"), 303 | MEMBER(num_prev, T_UINT, "number of most recent data points stored (accessible via get_prev() )"), 304 | {NULL} 305 | }; 306 | #undef MEMBER 307 | #undef DBL_MEMBER 308 | #undef DBL_MEMBER1 309 | #undef STR_VAL 310 | 311 | 312 | //update mean, and second third and fourth moments 313 | static void _update_moments(faststat_Stats *self, double x) { 314 | double n, delta, delta_n, delta_m2, delta_m3, delta_m4; 315 | n = (double)self->n; // note: math with 32 bit ints can cause problems 316 | //pre-compute a bunch of intermediate values 317 | delta = x - self->mean; 318 | delta_n = delta / n; 319 | delta_m2 = delta * delta_n * (n - 1); 320 | delta_m3 = delta_m2 * delta_n * (n - 2); 321 | delta_m4 = delta_m2 * delta_n * delta_n * (n * (n - 3) + 3); 322 | //compute updated values 323 | self->mean = self->mean + delta_n; 324 | //note: order matters here 325 | self->m4 += delta_m4 + delta_n * (6 * delta_n * self->m2 - 4 * self->m3); 326 | self->m3 += delta_m3 + delta_n * 3 * self->m2; 327 | self->m2 += delta_m2; 328 | } 329 | 330 | 331 | //helper for _update_percentiles 332 | static void _p2_update_point(double l_v, double l_n, faststat_P2Percentile *cur, 333 | double r_v, double r_n, unsigned long long n) { 334 | int d; 335 | double percentile, new_val, c_v, c_n, diff; 336 | percentile = ((double)cur->percentile) / 0x10000; 337 | c_n = cur->n; 338 | diff = (n - 1) * percentile + 1 - c_n; 339 | // clamp d at +/- 1 340 | if(diff >= 1) { 341 | d = 1; 342 | } else if(diff <= -1) { 343 | d = -1; 344 | } else { 345 | return; 346 | } 347 | c_v = cur->val; 348 | if(l_n < c_n + d && c_n + d < r_n) { // try updating estimate with parabolic 349 | new_val = c_v + (d / (r_n - l_n)) * ( 350 | (c_n - l_n + d) * (r_v - c_v) / (r_n - c_n) + 351 | (r_n - c_n - d) * (c_v - l_v) / (c_n - l_n)); 352 | if(l_v >= new_val || r_v <= new_val) { // fall back on linear 353 | if(d == 1) { 354 | new_val = c_v + (r_v - c_v) / (r_n - c_n); 355 | } else { // d == -1 356 | new_val = c_v - (l_v - c_v) / (l_n - c_n); 357 | } 358 | } 359 | cur->val = new_val; 360 | cur->n += d; 361 | } 362 | } 363 | 364 | 365 | static void _insert_percentile_sorted(faststat_Stats *self, double x) { 366 | unsigned long long num, i; // prevent loss of precision compiler warning 367 | double tmp; 368 | num = self->n < self->num_percentiles ?
self->n : self->num_percentiles; 369 | for(i = 0; i < num-1; i++) { //insert in sorted order 370 | if(x < self->percentiles[i].val) { 371 | tmp = x; 372 | x = self->percentiles[i].val; 373 | self->percentiles[i].val = tmp; 374 | } 375 | } 376 | self->percentiles[num-1].val = x; 377 | } 378 | 379 | 380 | //update percentiles using the piecewise-parabolic (P2) algorithm 381 | static void _update_percentiles(faststat_Stats *self, double x) { 382 | unsigned int i; 383 | //double percentile; //TODO: remove me 384 | faststat_P2Percentile *right, *left, *cur, *prev, *nxt; 385 | right = &(self->percentiles[self->num_percentiles-1]); 386 | left = &(self->percentiles[0]); 387 | if(!(right->n < self->n) ) { // just insert until self->n > self->num_percentiles 388 | _insert_percentile_sorted(self, x); 389 | return; 390 | } 391 | //right-most is stopping case; handle first 392 | if(x < right->val && right->n + 1 < self->n) { 393 | right->n++; 394 | } 395 | //handle the rest of the points 396 | prev = right; 397 | for(i = self->num_percentiles-2; ; i--) { 398 | cur = &(self->percentiles[i]); 399 | if(x < cur->val && cur->n + 1 < prev->n) { 400 | cur->n++; 401 | } 402 | prev = cur; 403 | if(i == 0) { //making i unsigned fixes some warnings 404 | break; 405 | } 406 | } 407 | //left-most point is a special case 408 | nxt = &(self->percentiles[1]); 409 | _p2_update_point(self->min, 0, left, nxt->val, nxt->n, self->n); 410 | cur = left; 411 | for(i=1; i < self->num_percentiles - 1; i++) { 412 | prev = cur; 413 | cur = nxt; 414 | nxt = &(self->percentiles[i+1]); 415 | _p2_update_point(prev->val, prev->n, cur, nxt->val, nxt->n, self->n); 416 | } 417 | _p2_update_point(cur->val, cur->n, right, (double)self->max, (double)self->n, self->n); 418 | } 419 | 420 | // be careful; if-condition must properly terminate when max == +inf, even for nan and +inf 421 | #define OFFSET(n) if(!(x >= self->buckets[i+n].max)) { self->buckets[i+n].count++; break; } 422 | 423 | static void _update_buckets(faststat_Stats *self, double x) { 424 | unsigned int i; 425 | for(i=0; ; i += 16) { 426 | OFFSET( 0) OFFSET( 1) OFFSET( 2) OFFSET( 3) 427 | OFFSET( 4) OFFSET( 5) OFFSET( 6) OFFSET( 7) 428 | OFFSET( 8) OFFSET( 9) OFFSET(10) OFFSET(11) 429 | OFFSET(12) OFFSET(13) OFFSET(14) OFFSET(15) 430 | } 431 | } 432 | 433 | #undef OFFSET 434 | 435 | static void _update_lastN(faststat_Stats *self, double x) { 436 | unsigned int offset; 437 | if(self->num_prev == 0) { return; } 438 | offset = (self->n - 1) & (self->num_prev - 1); 439 | self->window_avg -= self->lastN[offset].value / (1.0 * self->num_prev); 440 | self->window_avg += x / (1.0 * self->num_prev); 441 | self->lastN[offset].value = x; 442 | self->lastN[offset].nanotime = self->lasttime; 443 | } 444 | 445 | 446 | //this algorithm deterministically favors storing newer values over older values 447 | static void _update_topN(faststat_Stats *self, double x, unsigned long long t) { 448 | faststat_DataPoint *cur, *left, *right, *end, *min, *topN; 449 | unsigned int cur_i, left_i, right_i; 450 | // uses one based indexing to save additions when navigating down heap 451 | topN = self->topN; 452 | if(x < topN[1].value) { 453 | return; 454 | } 455 | // replace the smallest element of the topN with the new point 456 | topN[1].value = x; 457 | topN[1].nanotime = t; 458 | // restore the heap condition 459 | cur = topN + 1; //use pointers instead of array indices 460 | cur_i = 1; 461 | left = topN + 2; 462 | left_i = 2; 463 | right = topN + 3; 464 | right_i = 3; 465 | end = topN + 1 + self->num_top; 466 |
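// sift the replaced root down to restore the min-heap invariant: at each
// level, pick the smaller child (value ties broken by nanotime) and swap
// until the new point settles. with 1-based indexing, node i's children are
// at 2i and 2i+1, hence left_i = cur_i * 2 and right_i = left_i + 1 below.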
while(right < end) { 467 | if(left->value == right->value) { 468 | min = left->nanotime > right->nanotime ? left : right; 469 | } else { 470 | min = left->value < right->value ? left : right; 471 | } 472 | if(cur->value < min->value) { 473 | break; 474 | } 475 | // swap cur with min of left, right 476 | x = min->value; 477 | t = min->nanotime; 478 | min->value = cur->value; 479 | min->nanotime = cur->nanotime; 480 | cur->value = x; 481 | cur->nanotime = t; 482 | // set up for the next layer of the heap 483 | cur = min; 484 | cur_i = min == left ? left_i : right_i; 485 | left_i = cur_i * 2; 486 | right_i = left_i + 1; 487 | left = topN + left_i; 488 | right = topN + right_i; 489 | } 490 | } 491 | 492 | 493 | static void _update_expo_avgs(faststat_Stats *self, double x) { 494 | unsigned int i; 495 | double val, alpha; 496 | for(i = 0; i < self->num_expo_avgs; i++) { 497 | val = self->expo_avgs[i].val; 498 | alpha = self->expo_avgs[i].alpha; 499 | // this equation ensures no "gain" 500 | val = x * alpha + val * (1 - alpha); 501 | self->expo_avgs[i].val = val; 502 | } 503 | } 504 | 505 | // re-zero all of the windows which have been "missed" between self->lasttime and t 506 | static void _rezero_window_counts(faststat_Stats *self, unsigned long long t) { 507 | faststat_WindowCount *cur; 508 | unsigned int i; 509 | unsigned long long j, last_window, cur_window; 510 | for(i = 0; i < self->num_window_counts; i++) { 511 | cur = &(self->window_counts[i]); 512 | last_window = self->lasttime / cur->window_size_nanosecs; 513 | cur_window = t / cur->window_size_nanosecs; 514 | if(last_window == cur_window) { 515 | break; // because window_counts are sorted by window_size_nanosecs, 516 | } // the next window cannot miss unless the current one does 517 | if(cur_window - last_window >= cur->num_windows) { 518 | memset(cur->counts, 0, sizeof(*cur->counts) * cur->num_windows); 519 | continue; // if the entire array is getting zero'd, just use memset 520 | } 521 | // TODO: convert this to a memset instead of a loop (perhaps) 522 | for(j = last_window + 1; j <= cur_window; j++) { 523 | cur->counts[j & (cur->num_windows - 1)] = 0; 524 | } // zero out all the "missed" windows 525 | } 526 | } 527 | 528 | 529 | static void _update_window_counts(faststat_Stats *self, unsigned long long t) { 530 | faststat_WindowCount *cur; 531 | unsigned int i, cur_count; 532 | _rezero_window_counts(self, t); 533 | // step 2 -- increment current counts 534 | for(i = 0; i < self->num_window_counts; i++) { 535 | cur = &(self->window_counts[i]); 536 | // use the current time as the index into the circular array to save some memory 537 | cur_count = (t / cur->window_size_nanosecs) & (cur->num_windows - 1); 538 | ++(cur->counts[cur_count]); 539 | } 540 | } 541 | 542 | 543 | static void _add(faststat_Stats *self, double x, unsigned long long t) { 544 | //update extremely basic values: number, min, and max 545 | self->lasttime = t; 546 | self->n++; 547 | if(self->n == 1) { 548 | self->min = self->max = x; 549 | self->mintime = self->maxtime = self->lasttime; 550 | } 551 | if(x <= self->min) { 552 | self->mintime = self->lasttime; 553 | self->min = x; 554 | } 555 | if(x >= self->max) { 556 | self->maxtime = self->lasttime; 557 | self->max = x; 558 | } 559 | // TODO: any platforms not support the NAN macro? 560 | self->sum_of_logs += x > 0 ? log(x) : NAN; 561 | self->sum_of_inv += x > 0 ? 
1 / x : NAN; 562 | _update_moments(self, x); 563 | _update_percentiles(self, x); 564 | _update_buckets(self, x); 565 | _update_expo_avgs(self, x); 566 | _update_lastN(self, x); 567 | _update_window_counts(self, t); 568 | _update_topN(self, x, t); 569 | } 570 | 571 | 572 | static PyObject* faststat_Stats_add(faststat_Stats *self, PyObject *args) { 573 | //Visual Studio hates in-line variable declarations 574 | double x; 575 | unsigned long long t; 576 | x = 0; 577 | t = 0; 578 | if(PyArg_ParseTuple(args, "d", &x)) { 579 | t = nanotime(); 580 | if(self->interval && self->lasttime) { 581 | unsigned long long interval; 582 | interval = t - self->lasttime; 583 | if(interval == 0) { 584 | interval = 1; 585 | } 586 | // ensure interval is at least 1 nanosecond to not mess up 587 | // harmonic and geometric mean (1 ns is noise on this scale) 588 | _add(self->interval, (double)(interval), t); 589 | } 590 | _add(self, x, t); 591 | } 592 | if(PyErr_Occurred()) { return NULL; } 593 | Py_INCREF(Py_None); 594 | return Py_None; 595 | } 596 | 597 | 598 | static PyObject* faststat_Stats_end(faststat_Stats *self, PyObject *args) { 599 | unsigned long long end; 600 | unsigned long long start; 601 | end = start = 0; 602 | if(PyArg_ParseTuple(args, "K", &start)) { 603 | end = nanotime(); 604 | if(self->interval && self->lasttime) { 605 | _add(self->interval, (double)(end - self->lasttime), end); 606 | } 607 | _add(self, (double)(end - start), end); 608 | } 609 | if(PyErr_Occurred()) { return NULL; } 610 | Py_INCREF(Py_None); 611 | return Py_None; 612 | } 613 | 614 | 615 | static PyObject* faststat_Stats_tick(faststat_Stats *self, PyObject *args) { 616 | //tricky part is how to handle the first tick 617 | // weird part will be that calling tick N times results in N-1 data points 618 | unsigned long long t; 619 | t = nanotime(); 620 | if(self->lasttime) { 621 | _add(self, (double)(t - self->lasttime), t); 622 | } else { 623 | self->lasttime = t; 624 | } 625 | if(PyErr_Occurred()) { return NULL; } 626 | Py_INCREF(Py_None); 627 | return Py_None; 628 | } 629 | 630 | 631 | static PyObject* faststat_Stats_get_percentiles(faststat_Stats* self, PyObject *args) { 632 | PyObject *p_dict; 633 | faststat_P2Percentile *cur; 634 | double cur_val; 635 | unsigned int i; 636 | p_dict = PyDict_New(); 637 | for(i = 0; i < self->num_percentiles; i++) { 638 | cur = &(self->percentiles[i]); 639 | cur_val = ((double)cur->percentile) / 0x10000; 640 | cur_val = floor(10000 * cur_val + 0.5) / 10000; 641 | //re-round to handle slop from being 16 bit number 642 | // (note: windows math.h does not include round; use floor) 643 | PyDict_SetItem( 644 | p_dict, 645 | PyFloat_FromDouble(cur_val), 646 | PyFloat_FromDouble(cur->val)); 647 | } 648 | if(PyErr_Occurred()) { 649 | Py_DECREF(p_dict); 650 | return NULL; 651 | } 652 | return p_dict; 653 | } 654 | 655 | 656 | static PyObject* faststat_Stats_get_buckets(faststat_Stats* self, PyObject *args) { 657 | PyObject *b_dict; 658 | faststat_Bucket *cur; 659 | unsigned int i; 660 | unsigned long long leftover; 661 | leftover = self->n; 662 | b_dict = PyDict_New(); 663 | for(i = 0; i < self->num_buckets; i++) { 664 | cur = &(self->buckets[i]); 665 | leftover -= cur->count; 666 | PyDict_SetItem( 667 | b_dict, 668 | PyFloat_FromDouble(cur->max), 669 | PyLong_FromUnsignedLongLong(cur->count)); 670 | } 671 | PyDict_SetItem(b_dict, Py_None, PyLong_FromUnsignedLongLong(leftover)); 672 | if(PyErr_Occurred()) { 673 | Py_DECREF(b_dict); 674 | return NULL; 675 | } 676 | return b_dict; 677 | } 678 | 679 | 680
680 | static PyObject* faststat_Stats_get_expoavgs(faststat_Stats *self, PyObject *args) {
681 |     PyObject *b_dict;
682 |     faststat_ExpoAvg *cur;
683 |     unsigned int i;
684 |     b_dict = PyDict_New();
685 |     for(i = 0; i < self->num_expo_avgs; i++) {
686 |         cur = &(self->expo_avgs[i]);
687 |         PyDict_SetItem(b_dict,
688 |             PyFloat_FromDouble(cur->alpha),
689 |             PyFloat_FromDouble(cur->val));
690 |     }
691 |     return b_dict;
692 | }
693 | 
694 | 
695 | static PyObject* faststat_Stats_get_prev(faststat_Stats *self, PyObject *args) {
696 |     int offset;
697 |     double val;
698 |     PyObject *tuple, *pyval, *pytime;
699 |     unsigned long long nanotime;
700 |     if(self->num_prev == 0) {
701 |         Py_INCREF(Py_None);
702 |         return Py_None;
703 |     }
704 | 
705 |     offset = 0;
706 |     if(PyArg_ParseTuple(args, "i", &offset)) {
707 |         offset = ((self->n - 1) + (self->num_prev - offset)) & (self->num_prev - 1);
708 |         val = self->lastN[offset].value;
709 |         nanotime = self->lastN[offset].nanotime;
710 |         pyval = PyFloat_FromDouble(val);
711 |         pytime = PyLong_FromUnsignedLongLong(nanotime);
712 |         if(pyval != NULL && pytime != NULL) {
713 |             tuple = PyTuple_Pack(2, pytime, pyval);
714 |             if(tuple != NULL) {
715 |                 return tuple;
716 |             }
717 |         }
718 |     }
719 |     if(PyErr_Occurred()) { return NULL; }
720 |     Py_INCREF(Py_None);
721 |     return Py_None;
722 | }
723 | 
724 | 
725 | static PyObject* faststat_Stats_get_topN(faststat_Stats *self, PyObject *args) {
726 |     PyObject *ret;
727 |     unsigned int i;
728 |     ret = PyList_New(self->num_top);
729 |     for(i = 1; i < self->num_top + 1; i++) {
730 |         PyList_SetItem(ret, i - 1, Py_BuildValue(
731 |             "(dK)", self->topN[i].value, self->topN[i].nanotime));
732 |     }
733 |     if(PyErr_Occurred()) {
734 |         Py_DECREF(ret);
735 |         return NULL;
736 |     }
737 |     return ret;
738 | }
739 | 
740 | 
741 | static PyObject* faststat_Stats_get_window_counts(faststat_Stats *self, PyObject *args) {
742 |     unsigned long long t;
743 |     PyObject *window_count_dict, *cur_items;
744 |     faststat_WindowCount *cur;
745 |     unsigned long long i, j, cur_window;
746 |     t = nanotime();
747 |     _rezero_window_counts(self, t);
748 |     window_count_dict = PyDict_New();
749 |     for(i = 0; i < self->num_window_counts; i++) {
750 |         cur = self->window_counts + i;
751 |         cur_items = PyTuple_New(cur->num_windows);
752 |         cur_window = t / cur->window_size_nanosecs;
753 |         for(j = 0; j < cur->num_windows; j++) {
754 |             PyTuple_SetItem(
755 |                 cur_items, (Py_ssize_t)j,
756 |                 PyLong_FromUnsignedLong(
757 |                     cur->counts[(j + cur_window) & (cur->num_windows - 1)]));
758 |         }
759 |         PyDict_SetItem(
760 |             window_count_dict,
761 |             PyLong_FromUnsignedLongLong(cur->window_size_nanosecs),
762 |             cur_items);
763 |     }
764 |     if(PyErr_Occurred()) {
765 |         Py_DECREF(window_count_dict);
766 |         return NULL;
767 |     }
768 |     return window_count_dict;
769 | }
770 | 
771 | 
772 | static PyMethodDef faststat_Stats_methods[] = {
773 |     {"add", (PyCFunction)faststat_Stats_add, METH_VARARGS, "add a data point"},
774 |     {"end", (PyCFunction)faststat_Stats_end, METH_VARARGS,
775 |         "add a duration data point, whose start time is passed"},
776 |     {"tick", (PyCFunction)faststat_Stats_tick, METH_NOARGS,
777 |         "add an interval data point between now and the last tick"},
778 |     {"get_percentiles", (PyCFunction)faststat_Stats_get_percentiles, METH_NOARGS,
779 |         "construct percentiles dictionary"},
780 |     {"get_buckets", (PyCFunction)faststat_Stats_get_buckets, METH_NOARGS,
781 |         "construct buckets dictionary"},
782 |     {"get_expo_avgs", (PyCFunction)faststat_Stats_get_expoavgs, METH_NOARGS,
783 |         "get a dictionary of decay rates to previous averages"},
784 | {"get_prev", (PyCFunction)faststat_Stats_get_prev, METH_VARARGS, 785 | "get the nth previous sample"}, 786 | {"get_topN", (PyCFunction)faststat_Stats_get_topN, METH_NOARGS, 787 | "get the highest values"}, 788 | {"get_window_counts", (PyCFunction)faststat_Stats_get_window_counts, METH_NOARGS, 789 | "get a dictionary of window intervals to window counts"}, 790 | {NULL} 791 | }; 792 | 793 | 794 | static PyTypeObject faststat_StatsType = { 795 | PyObject_HEAD_INIT(NULL) 796 | 0, /*ob_size*/ 797 | "_faststat.Stats", /*tp_name*/ 798 | sizeof(faststat_Stats), /*tp_basicsize*/ 799 | 0, /*tp_itemsize*/ 800 | (destructor)faststat_Stats_dealloc, /*tp_dealloc*/ 801 | 0, /*tp_print*/ 802 | 0, /*tp_getattr*/ 803 | 0, /*tp_setattr*/ 804 | 0, /*tp_compare*/ 805 | 0, /*tp_repr*/ 806 | 0, /*tp_as_number*/ 807 | 0, /*tp_as_sequence*/ 808 | 0, /*tp_as_mapping*/ 809 | 0, /*tp_hash */ 810 | 0, /*tp_call*/ 811 | 0, /*tp_str*/ 812 | 0, /*tp_getattro*/ 813 | 0, /*tp_setattro*/ 814 | 0, /*tp_as_buffer*/ 815 | Py_TPFLAGS_DEFAULT, /*tp_flags*/ 816 | "online stats collector", /* tp_doc */ 817 | 0, /* tp_traverse */ 818 | 0, /* tp_clear */ 819 | 0, /* tp_richcompare */ 820 | 0, /* tp_weaklistoffset */ 821 | 0, /* tp_iter */ 822 | 0, /* tp_iternext */ 823 | faststat_Stats_methods, /* tp_methods */ 824 | faststat_Stats_members, /* tp_members */ 825 | 0, /* tp_getset */ 826 | 0, /* tp_base */ 827 | 0, /* tp_dict */ 828 | 0, /* tp_descr_get */ 829 | 0, /* tp_descr_set */ 830 | 0, /* tp_dictoffset */ 831 | 0, /* tp_init */ 832 | 0, /* tp_alloc */ 833 | faststat_Stats_new, /* tp_new */ 834 | }; 835 | 836 | 837 | static PyObject* pynanotime(PyObject *_) { 838 | PyObject *result; 839 | result = PyLong_FromUnsignedLongLong(nanotime()); 840 | if(PyErr_Occurred()) { return NULL; } 841 | return result; 842 | } 843 | 844 | static PyObject* pynanotime_override(PyObject *_, PyObject *args) { 845 | unsigned long long t; 846 | if(PyArg_ParseTuple(args, "K", &t)) { 847 | nanotime_override = t; 848 | } 849 | if(PyErr_Occurred()) { return NULL; } 850 | Py_INCREF(Py_None); 851 | return Py_None; 852 | } 853 | 854 | 855 | static PyMethodDef module_methods[] = { 856 | {"nanotime", (PyCFunction)pynanotime, METH_NOARGS, 857 | "get integral nanoseconds since unix epoch"}, 858 | {"_nanotime_override", (PyCFunction)pynanotime_override, METH_VARARGS, 859 | "override time seen by all faststat operations, useful for testing time based algoritmhs"}, 860 | {NULL} }; 861 | 862 | 863 | #ifndef PyMODINIT_FUNC /* declarations for DLL import/export */ 864 | #define PyMODINIT_FUNC void 865 | #endif 866 | PyMODINIT_FUNC init_faststat(void) { 867 | PyObject *module; 868 | 869 | if(PyType_Ready(&faststat_StatsType) < 0) 870 | return; 871 | 872 | module = Py_InitModule3("_faststat", module_methods, "fast statistics"); 873 | 874 | if(module == NULL) 875 | return; 876 | 877 | Py_INCREF(&faststat_StatsType); 878 | PyModule_AddObject(module, "Stats", (PyObject*)&faststat_StatsType); 879 | nanotime_override = 0; 880 | } 881 | -------------------------------------------------------------------------------- /faststat/cache.py: -------------------------------------------------------------------------------- 1 | class LRUCache(object): 2 | ''' 3 | Implements an LRU cache based on a linked list. 
4 |     Performance is about 1.1 microseconds per set/get on a core i7
5 |     '''
6 |     def __init__(self, maxlen=10000):
7 |         self.map = {}
8 |         self.root = root = []
9 |         root[:] = [root, root]
10 |         self.maxlen = maxlen
11 | 
12 |     def __getitem__(self, key):
13 |         val = self.map[key][3]
14 |         self[key] = val
15 |         return val
16 | 
17 |     def add(self, key, val):
18 |         # node[0] = prev; node[1] = next
19 |         root = self.root
20 |         if key in self.map:
21 |             # remove from map
22 |             link = self.map.pop(key)
23 |             # remove from list, and update the stored value
24 |             link[0][1], link[1][0], link[3] = link[1], link[0], val
25 |         else:
26 |             link = [None, None, key, val]
27 |         discard = None
28 |         if len(self.map) >= self.maxlen:
29 |             # pop and discard the oldest item
30 |             discard = root[0]
31 |             discard[0][1], root[0] = root, discard[0]
32 |             self.map.pop(discard[2])
33 |         # insert into map
34 |         self.map[key] = link
35 |         # insert into list
36 |         link[0], link[1] = root, root[1]
37 |         root[1][0] = link
38 |         root[1] = link
39 |         if root[0] is root:
40 |             root[0] = root[1]
41 |         if discard:
42 |             return discard[2], discard[3]
43 | 
44 |     __setitem__ = add
45 | 
46 |     _unset = object()
47 | 
48 |     def pop(self, key=_unset):
49 |         # remove from map and list
50 |         if key is LRUCache._unset:
51 |             link = self.root[0]
52 |             self.map.pop(link[2])
53 |         else:
54 |             link = self.map.pop(key)
55 |         link[0][1], link[1][0] = link[1], link[0]
56 |         return link[3]
57 | 
58 |     def __contains__(self, key):
59 |         return key in self.map
60 | 
61 |     def __len__(self):
62 |         return len(self.map)
63 | 
64 |     def items(self):
65 |         return [(k, self.map[k][3]) for k in self.map]
66 | 
67 |     def keys(self):
68 |         return self.map.keys()
69 | 
70 |     def values(self):
71 |         return [self.map[k][3] for k in self.map]
72 | 
73 | 
74 | class SegmentedCache(object):
75 |     '''
76 |     Implements a Segmented LRU cache based on an LRU cache.
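    New keys enter the probationary segment; a hit on a probationary key
    promotes it to the protected segment (capped at maxlen / 2 entries),
    and keys evicted from protected fall back into probationary.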
77 |     '''
78 |     def __init__(self, maxlen=10000):
79 |         self.probationary = LRUCache(maxlen)
80 |         self.protected = LRUCache(maxlen / 2)
81 |         self.maxlen = maxlen
82 | 
83 |     def __getitem__(self, key):
84 |         if key in self.protected.map:
85 |             # already protected, nothing to do
86 |             return self.protected[key]
87 |         if key in self.probationary.map:
88 |             # promote to protected
89 |             val = self.probationary.pop(key)
90 |             discard = self.protected.add(key, val)
91 |             if discard:
92 |                 self.probationary.add(discard[0], discard[1])
93 |             return val
94 |         raise KeyError(key)
95 | 
96 |     def add(self, key, val):
97 |         if key in self.protected:
98 |             self.protected[key] = val
99 |         elif key in self.probationary:
100 |             self.probationary.pop(key)
101 |             discard = self.protected.add(key, val)
102 |             if discard:
103 |                 self.probationary.add(discard[0], discard[1])
104 |         else:  # totally brand new key being added
105 |             self.probationary.add(key, val)
106 |             if len(self.probationary.map) + len(self.protected.map) > self.maxlen:
107 |                 self.probationary.pop()
108 | 
109 |     __setitem__ = add
110 | 
111 |     def __contains__(self, key):
112 |         return key in self.probationary or key in self.protected
113 | 
114 |     def __len__(self):
115 |         return len(self.protected) + len(self.probationary)
116 | 
117 |     def items(self):
118 |         return self.protected.items() + self.probationary.items()
119 | 
120 |     def keys(self):
121 |         return self.protected.keys() + self.probationary.keys()
122 | 
123 |     def values(self):
124 |         return self.protected.values() + self.probationary.values()
125 | 
126 | 
127 | if __name__ == "__main__":
128 |     cache_size = 7
129 |     sg = SegmentedCache(cache_size)
130 |     r = range(10000)
131 |     for i in r:
132 |         sg[i] = i
133 |     for i in r[-cache_size:]:
134 |         assert i in sg
135 | 
136 |     import time
137 | 
138 |     r = range(int(5e5))
139 |     s = time.time()
140 |     for i in r:
141 |         sg[i] = i
142 |         sg[i] = i
143 |     print "{0:.2f}us".format(time.time() - s)  # 1e6 ops, so elapsed seconds == microseconds per op
144 | 
--------------------------------------------------------------------------------
/faststat/faststat.py:
--------------------------------------------------------------------------------
1 | '''
2 | faststat is a *streaming*, *light-weight* statistics library designed for embedding
3 | in other Python applications. *Streaming* means
4 | that faststat operates on data points as they arrive, without needing to store
5 | previous data points. *Light-weight* means that statistics do not take up a great
6 | deal of CPU or RAM. Adding a data point to a stat object is a 0.5 - 3 microsecond operation.
7 | Each stat object takes about 4kiB of memory.
8 | '''
9 | import array
10 | import random
11 | import math
12 | import collections
13 | import time
14 | import functools
15 | import json
16 | import weakref
17 | 
18 | import _faststat
19 | import cache
20 | import format
21 | 
22 | 
23 | class Sample(object):
24 |     '''
25 |     This class implements Reservoir Sampling to keep a random sample of an infinite stream.
26 |     See http://gregable.com/2007/10/reservoir-sampling.html for one description.
27 | 
28 |     This class is kept separate from the other stats, because its memory usage is far greater.
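    (With the default sample_size of 2**14 and the 4-byte 'f' array type,
    the reservoir alone occupies 64KiB, versus roughly 4kiB for an entire
    Stats object.)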
29 |     '''
30 |     def __init__(self, sample_size=2**14, type='f'):
31 |         self.sample = array.array(type)
32 |         self.sample_size = sample_size
33 |         self.num_vals = 0
34 | 
35 |     def add(self, val):
36 |         if self.num_vals < self.sample_size:
37 |             self.sample.append(val)
38 |         else:
39 |             pos = random.randint(0, self.num_vals)
40 |             if pos < self.sample_size:
41 |                 self.sample[pos] = val
42 |         self.num_vals += 1
43 | 
44 | DEFAULT_PERCENTILES = (0.25, 0.50, 0.75, 0.90, 0.95, 0.99)
45 | EXPO_AVGS = (1.0/2, 1.0/4, 1.0/8, 1.0/16, 1.0/32, 1.0/64)
46 | 
47 | 
48 | # keep buckets for intervals in size from 100ns to ~14 hours
49 | TIME_BUCKETS = sum(
50 |     [(1*10**x, 2*10**x, 5*10**x) for x in range(2, 13)], ())
51 | # useful buckets for unsigned integers up to 64 bits
52 | UINT_BUCKETS = (1, 2, 3, 4, 5, 6, 7, 8, 9) + sum(
53 |     [(1*10**x, 2*10**x, 5*10**x) for x in range(1, 20)], ())
54 | # useful buckets for signed integers up to 64 bits
55 | INT_BUCKETS = tuple(reversed([-e for e in UINT_BUCKETS[:-3]])) + (0,) + UINT_BUCKETS[:-3]
56 | 
57 | DEFAULT_BUCKETS = (0, 1e-5, 1e-4, 1e-3, 1e-2, 2e-2, 5e-2, 0.1, 0.2, 0.5, 1, 2, 3, 4, 5, 6, 7, 8, 9,
58 |                    10, 20, 50, 100, 200, 500, 1e3, 2e3, 5e3, 1e4, 1e5, 1e6)
59 | DEFAULT_BUCKETS = tuple(reversed([-e for e in DEFAULT_BUCKETS])) + (0,) + DEFAULT_BUCKETS
60 | 
61 | ONE_MIN_NS = int(60e9)
62 | ONE_HOUR_NS = 60 * ONE_MIN_NS
63 | 
64 | WINDOW_COUNTS = [(64, ONE_MIN_NS), (32, ONE_HOUR_NS)]
65 | 
66 | class _BaseStats(object):
67 |     'base class to avoid repeating code'
68 |     def __init__(self, buckets, lastN, percentiles, interval, expo_avgs,
69 |                  window_counts, num_top):
70 |         buckets = buckets + (float("inf"),)
71 |         lastN = int(2**math.ceil(math.log(lastN)/math.log(2)))
72 |         num_top = int(2**math.ceil(math.log(num_top)/math.log(2)))
73 |         if interval:
74 |             self.interval = Interval(window_counts=())
75 |             interval = self.interval._stats
76 |         else:
77 |             interval = None
78 |         self._stats = _faststat.Stats(buckets, lastN, percentiles, interval,
79 |                                       expo_avgs, window_counts, num_top)
80 | 
81 |     @property
82 |     def variance(self):
83 |         if self.n < 2:
84 |             return float("nan")
85 |         return self.m2 / (self.n - 1)
86 | 
87 |     @property
88 |     def trimean(self):
89 |         p = self.get_percentiles()
90 |         return (p[0.25] + 2 * p[0.5] + p[0.75]) / 4
91 | 
92 |     @property
93 |     def skewness(self):
94 |         if not self.m2:
95 |             return float("nan")
96 |         return self.n ** 0.5 * self.m3 / self.m2 ** 1.5
97 | 
98 |     @property
99 |     def kurtosis(self):
100 |         if not self.m2:
101 |             return float("nan")
102 |         return self.n * self.m4 / self.m2 ** 2 - 3
103 | 
104 |     @property
105 |     def percentiles(self):
106 |         return self.get_percentiles()
107 | 
108 |     @property
109 |     def buckets(self):
110 |         return self.get_buckets()
111 | 
112 |     @property
113 |     def geometric_mean(self):
114 |         'nth root of product of data points'
115 |         return math.exp(self.sum_of_logs / self.n)
116 | 
117 |     @property
118 |     def harmonic_mean(self):
119 |         'inverse of mean of inverses of data points'
120 |         return self.n / self.sum_of_inv
121 | 
122 |     @property
123 |     def window_median(self):
124 |         if self.num_prev:
125 |             prev = sorted([self.get_prev(i)[1] for i in range(self.num_prev)])
126 |             return prev[len(prev)/2]
127 | 
128 |     @property
129 |     def expo_avgs(self):
130 |         return self.get_expo_avgs()
131 | 
132 |     @property
133 |     def lag_avgs(self):
134 |         '''
135 |         same data as expo_avgs, but with keys as the average age
136 |         of the data -- assuming evenly spaced data points -- rather
137 |         than decay rates
138 |         '''
139 |         if not self.interval:
140 |             return
141 |         interval = self.interval.mean
142 |         return dict([(interval/alpha, val)
143 |                      for alpha, val in self.get_expo_avgs().items()])
144 | 
145 |     def __repr__(self):
146 |         p = self.percentiles
147 |         if self.n < len(p):
148 |             quartiles = "(n too small)"
149 |         else:
150 |             quartiles = (_sigfigs(p.get(0.25, -1)),
151 |                          _sigfigs(p.get(0.5, -1)), _sigfigs(p.get(0.75, -1)))
152 |         return '<faststat.{0} n={1} mean={2} quartiles={3}>'.format(
153 |             type(self).__name__, self.n, _sigfigs(self.mean), quartiles)
154 | 
155 | 
156 | class Stats(_BaseStats):
157 |     '''
158 |     Call add(value) to add a data point.
159 |     '''
160 |     def __init__(self, buckets=DEFAULT_BUCKETS, lastN=64, percentiles=DEFAULT_PERCENTILES,
161 |                  interval=True):
162 |         super(Stats, self).__init__(buckets, lastN, percentiles, interval, EXPO_AVGS,
163 |                                     WINDOW_COUNTS, num_top=64)
164 |         self.add = self._stats.add
165 | 
166 |     def __getattr__(self, name):
167 |         if name not in ("tick", "end"):
168 |             return getattr(self._stats, name)
169 | 
170 | 
171 | class _TimeStats(_BaseStats):
172 |     def get_percentiles(self):
173 |         data = self._stats.get_percentiles()
174 |         for k in data:
175 |             data[k] = ntd_float(data[k])
176 |         return data
177 | 
178 |     @property
179 |     def mean(self):
180 |         return ntd_float(self._stats.mean)
181 | 
182 |     @property
183 |     def max(self):
184 |         return ntd_float(self._stats.max)
185 | 
186 |     @property
187 |     def min(self):
188 |         return ntd_float(self._stats.min)
189 | 
190 | 
191 | class Interval(_TimeStats):
192 |     '''
193 |     Call tick() to register occurrences.
194 |     Note that calling tick() N times results in N-1 data points.
195 |     '''
196 |     def __init__(self, buckets=TIME_BUCKETS, lastN=64, percentiles=DEFAULT_PERCENTILES,
197 |                  window_counts=WINDOW_COUNTS):
198 |         super(Interval, self).__init__(buckets, lastN, percentiles, False, (), window_counts, num_top=64)
199 |         self.tick = self._stats.tick
200 | 
201 |     def __getattr__(self, name):
202 |         if name not in ("add", "end"):
203 |             return getattr(self._stats, name)
204 | 
205 | 
206 | class Duration(_TimeStats):
207 |     '''
208 |     Represents statistics for a duration.
209 |     Call end(start_time_nanos) to add a data point.
210 |     '''
211 |     def __init__(self, buckets=TIME_BUCKETS, lastN=64, percentiles=DEFAULT_PERCENTILES,
212 |                  interval=True):
213 |         super(Duration, self).__init__(buckets, lastN, percentiles, interval, EXPO_AVGS,
214 |                                        WINDOW_COUNTS, num_top=64)
215 |         self.end = self._stats.end
216 | 
217 |     def __getattr__(self, name):
218 |         if name not in ("add", "tick"):
219 |             return getattr(self._stats, name)
220 | 
221 | 
222 | class Markov(object):
223 |     '''
224 |     Represents the states of a Markov process. The transitions between states are
225 |     modeled as Intervals, and the time spent in a given state is modeled as a
226 |     Duration.
227 |     '''
228 |     def __init__(self):
229 |         self.state_durations = collections.defaultdict(Duration)
230 |         self.transition_intervals = collections.defaultdict(Interval)
231 |         self.transitor_states = collections.defaultdict(int)
232 |         self._weakref_holder = {}
233 |         self.state_counts = collections.defaultdict(functools.partial(Stats, interval=False))
234 | 
235 |     def _transition(self, nxt, cur=None, since=None):
236 |         '''
237 |         Register that a transition has taken place.
238 |         nxt is an identifier for the state being entered.
239 |         cur is an identifier for the state being left.
240 |         since is the time at which the previous state was entered.
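        Typical use goes through make_transitor() rather than calling this
        directly; times are integer nanoseconds as returned by
        faststat.nanotime(). A minimal sketch (mirroring test.py):

            m = faststat.Markov()
            t = m.make_transitor('foo')  # enters state 'foo'
            t.transition('bar')          # records the ('foo', 'bar') interval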
241 |         '''
242 |         self.transition_intervals[(cur, nxt)].tick()
243 |         if since:
244 |             self.state_durations[cur].end(since)
245 | 
246 |     def make_transitor(self, state):
247 |         '''
248 |         Creates and returns a new Markov.Transitor, in the passed state.
249 |         '''
250 |         return Markov.Transitor(self, state)
251 | 
252 |     def _cleanup(self, ref):
253 |         'cleanup after a transitor weakref fires'
254 |         self.transitor_states[self._weakref_holder[ref]] -= 1
255 |         del self._weakref_holder[ref]
256 | 
257 |     class Transitor(object):
258 |         '''
259 |         An extremely light-weight object that simply tracks a current
260 |         state and the time of the last transition.
261 |         '''
262 |         def __init__(self, markov, state):
263 |             self.markov = markov
264 |             self.state = state
265 |             self.markov._transition(state)
266 |             self.markov.transitor_states[state] += 1
267 |             state_count = self.markov.transitor_states[state]
268 |             self.markov.state_counts[state].add(state_count)
269 |             self.weakref = weakref.ref(self, markov._cleanup)
270 |             self.markov._weakref_holder[self.weakref] = state
271 |             self.last_transition = nanotime()
272 | 
273 |         def transition(self, state):
274 |             '''
275 |             Notify the parent Markov stats object of a transition
276 |             from the current state to the passed state.
277 |             '''
278 |             self.markov.transitor_states[self.state] -= 1
279 |             self.markov.transitor_states[state] += 1
280 |             old_state_count = self.markov.transitor_states[self.state]
281 |             new_state_count = self.markov.transitor_states[state]
282 |             self.markov.state_counts[self.state].add(old_state_count)
283 |             self.markov.state_counts[state].add(new_state_count)
284 |             self.markov._weakref_holder[self.weakref] = state
285 |             self.markov._transition(state, self.state, self.last_transition)
286 |             self.last_transition, self.state = nanotime(), state
287 | 
288 |         def __repr__(self):
289 |             return '<Markov.Transitor state={0}>'.format(self.state)
290 | 
291 | 
292 | class PathStats(object):
293 |     '''
294 |     Represents a set of paths taken. Unlike Markov, these states remember
295 |     their "history".
296 | 
297 |     Because there are likely to be many more paths than states, PathStats
298 |     is more aggressively memory-optimized, and also employs a SegmentedCache
299 |     internally to limit the number of unique path durations that will be
300 |     stored.
301 |     '''
302 |     def __init__(self, maxsize=2048):
303 |         self.path_stats = cache.SegmentedCache(maxsize)
304 |         self._weakref_path_map = {}
305 | 
306 |     def make_walker(self, start):
307 |         'returns a walker object that tracks a path'
308 |         return PathStats.Walker(self, start)
309 | 
310 |     def _commit(self, ref):
311 |         "commit a walker's data after it is collected"
312 |         path_times = self._weakref_path_map[ref]
313 |         path_times.append(nanotime())
314 |         del self._weakref_path_map[ref]
315 |         path = tuple(path_times[1::2])
316 |         times = path_times[::2]
317 |         if path not in self.path_stats:
318 |             # tuple to save a tiny bit of memory
319 |             self.path_stats[path] = tuple([
320 |                 Duration(interval=False) for i in range(len(path))])
321 |         path_stats = self.path_stats[path]
322 |         for i in range(1, len(times)):
323 |             path_stats[i - 1]._stats.add(times[i] - times[i - 1])
324 | 
325 |     def pformat(self, prefix=()):
326 |         '''
327 |         Makes a pretty ASCII format of the data, suitable for
328 |         displaying in a console or saving to a text file.
329 |         Returns a list of lines.
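        Each line is rendered via the FMT template in sformat() below,
        e.g. (illustrative values only):

            fetch: n=42, mean=1.5, p50/95=1.2/2.9, max=3.8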
330 |         '''
331 |         nan = float("nan")
332 | 
333 |         def sformat(segment, stat):
334 |             FMT = "n={0}, mean={1}, p50/95={2}/{3}, max={4}"
335 |             line_segs = [segment]
336 |             for s in [stat]:
337 |                 p = s.get_percentiles()
338 |                 p50, p95 = p.get(0.50, nan), p.get(0.95, nan)
339 |                 line_segs.append(FMT.format(s.n, s.mean, p50, p95, s.max))
340 |             return '{0}: {1}'.format(*line_segs)
341 | 
342 |         lines = []
343 |         for path in sorted(self.path_stats.keys()):
344 |             lines.append('=====================')
345 |             for seg, stat in zip(path, self.path_stats[path]):
346 |                 lines.append(sformat(seg, stat))
347 |         return lines
348 | 
349 |     def _finished_segment(self, path, since, start):
350 |         if path not in self.path_stats:
351 |             self.path_stats[path] = (
352 |                 Duration(interval=False), Duration(interval=False))
353 |         dur, offset = self.path_stats[path]
354 |         dur.end(since)
355 |         offset.end(start)
356 | 
357 |     class Walker(object):
358 |         '''
359 |         A light-weight object that tracks a current path and the time of
360 |         the last transition. Similar to Transitor for Markov.
361 |         '''
362 |         def __init__(self, pathstats, segment="NEW"):
363 |             self.pathstats = pathstats
364 |             self._commiter = weakref.ref(self, self.pathstats._commit)
365 |             self.path = self.pathstats._weakref_path_map[self._commiter] = []
366 |             self.push(segment)
367 | 
368 |         def push(self, segment):
369 |             '''
370 |             pushes a new segment onto the path, closing out the previous segment
371 |             '''
372 |             self.path.append(nanotime())
373 |             self.path.append(segment)
374 |             self.curseg = segment
375 | 
376 |         def pop(self):
377 |             self.push(PathStats.POP)
378 | 
379 |         def branch(self):
380 |             child = PathStats.Walker(
381 |                 self.pathstats, PathStats.BRANCH_C)
382 |             self.push(PathStats.BRANCH_P)
383 |             return child
384 | 
385 |         def join(self, walker):
386 |             self.push((PathStats.JOIN, tuple(walker.path)))
387 |             walker.push(PathStats.JOINED)
388 | 
389 |     BRANCH_P, BRANCH_C, JOIN, JOINED, POP = "BRANCH_P", "BRANCH_C", "JOIN", "JOINED", "POP"
390 | 
391 |     def __repr__(self):
392 |         return "<PathStats paths={0}>".format(len(self.path_stats))
393 | 
394 | 
395 | class ntd_float(float):
396 |     'a float which represents the difference of two timestamps in nanoseconds'
397 |     def __repr__(self):
398 |         return format.si_format(self / 1e9, "s")
399 | 
400 |     def __format__(self, format_spec):
401 |         return repr(self)
402 | 
403 | 
404 | TimeSeries = functools.partial(Stats, interval=False)
405 | nanotime = _faststat.nanotime
406 | 
407 | 
408 | def _sigfigs(n, sigfigs=3):
409 |     'helper function to round a number to significant figures'
410 |     n = float(n)
411 |     if n == 0 or math.isnan(n):  # avoid math domain errors
412 |         return n
413 |     return round(n, -int(math.floor(math.log10(abs(n))) - sigfigs + 1))
414 | 
415 | 
416 | def merge_moments(m_a, m_a2, m_a3, m_a4, n_a, m_b, m_b2, m_b3, m_b4, n_b):
417 |     '''
418 |     Merge moments of two samples A and B.
419 |     parameters are
420 |     m_a, ..., m_a4 = first through fourth moment of sample A
421 |     n_a = size of sample A
422 |     m_b, ..., m_b4 = first through fourth moment of sample B
423 |     n_b = size of sample B
424 |     '''
425 |     delta = m_b - m_a
426 |     delta_2 = delta * delta
427 |     delta_3 = delta * delta_2
428 |     delta_4 = delta * delta_3
429 |     n_x = n_a + n_b
430 |     m_x = m_a + delta * n_b / n_x
431 |     m_x2 = m_a2 + m_b2 + delta_2 * n_a * n_b / n_x
432 |     m_x3 = m_a3 + m_b3 + delta_3 * n_a * n_b * (n_a - n_b) / (n_x ** 2) + 3 * delta * (n_a * m_b2 - n_b * m_a2) / n_x
433 |     m_x4 = (m_a4 + m_b4 + delta_4 * (n_a * n_b * (n_a * n_a - n_a * n_b + n_b * n_b)) / (n_x ** 3) +
434 |             6 * delta_2 * (n_a * n_a * m_b2 + n_b * n_b * m_a2) / (n_x ** 2) +
435 |             4 * delta * (n_a * m_b3 - n_b * m_a3) / n_x)
436 |     return m_x, m_x2, m_x3, m_x4, n_x
437 | 
--------------------------------------------------------------------------------
/faststat/format.py:
--------------------------------------------------------------------------------
1 | import math
2 | import json
3 | 
4 | 
5 | def stat2json(stat):
6 |     prev = [stat.get_prev(i) for i in range(stat.num_prev)]
7 |     # timestamp = 0 means not yet valid
8 |     prev = [(p[0] / 1e6, p[1]) for p in prev if p[0]]
9 |     return json.dumps({
10 |         "n": stat.n,
11 |         "mean": stat.mean,
12 |         "max": stat.max,
13 |         "min": stat.min,
14 |         "percentiles": stat.get_percentiles(),
15 |         "prev": prev
16 |     })
17 | 
18 | 
19 | def stat2html(stat):
20 |     return TEMPLATE.replace('"==THE_STAT=="', stat2json(stat))
21 | 
22 | 
23 | def si_round(val):
24 |     '''
25 |     round to a "scientific notation" tuple of (factor, exponent)
26 |     such that 1 <= factor < 1000, and factor * 10 ** exponent == val
27 |     '''
28 |     if val < 0:
29 |         neg = True
30 |         val = -val
31 |     elif val == 0:
32 |         return 0, 0
33 |     else:
34 |         neg = False
35 |     exp = math.log(val) / math.log(1000)
36 |     if exp < 0:
37 |         exp = int(exp) - 1
38 |     else:
39 |         exp = int(exp)
40 |     val = val / 1000.0 ** exp
41 |     if neg:
42 |         val = -val
43 |     return val, 3 * exp
44 | 
45 | 
46 | def si_format(val, unit=''):
47 |     if math.isnan(val):
48 |         return "nan"
49 |     if math.isinf(val):
50 |         if val < 0:
51 |             return "-inf"
52 |         return "inf"
53 |     val, exp = si_round(val)
54 |     if exp:
55 |         if exp in _SCALES and unit:
56 |             exps = _SCALES[exp] + unit
57 |         elif exp == -3:
58 |             # special case, do 0.123 instead of 123e-3
59 |             return "{0:0.3f}".format(val / 1e3)
60 |         else:
61 |             exps = 'e' + str(exp) + unit
62 |     else:
63 |         exps = unit
64 |     if val >= 100 or val <= -100:
65 |         return '{0:0.0f}{1}'.format(val, exps)
66 |     if val >= 10 or val <= -10:
67 |         return '{0:0.1f}{1}'.format(val, exps)
68 |     return '{0:0.2f}{1}'.format(val, exps)
69 | 
70 | 
71 | def sib_round(val):
72 |     '''
73 |     round to a binary SI tuple of (factor, exponent)
74 |     such that 1 <= factor < 1024, and factor * 1024 ** exponent == val
75 |     '''
76 |     if val < 0:
77 |         neg = True
78 |         val = -val
79 |     elif val == 0:
80 |         return 0, 0
81 |     else:
82 |         neg = False
83 |     exp = math.log(val) / math.log(1024)
84 |     if exp < 0:
85 |         exp = int(exp) - 1
86 |     else:
87 |         exp = int(exp)
88 |     val = val / 1024.0 ** exp
89 |     if neg:
90 |         val = -val
91 |     return val, exp
92 | 
93 | 
94 | def sib_format(val):
95 |     if math.isnan(val):
96 |         return "nan"
97 |     if math.isinf(val):
98 |         if val < 0:
99 |             return "-inf"
100 |         return "inf"
101 |     val, exp = sib_round(val)
102 |     if exp < 0 or exp >= len(_BSCALES):
103 |         raise ValueError("{0} out of format range".format(val))
104 |     exps = _BSCALES[exp]
105 |     if not exps:
106 |         return str(int(val)) + 'B'
107 |     if val >= 100 or val <= -100:
108 |         return '{0:0.0f}{1}B'.format(val, exps)
109 |     if val >= 10 or val <= -10:
110 |         return '{0:0.1f}{1}B'.format(val, exps)
111 |     return '{0:0.2f}{1}B'.format(val, exps)
112 | 
113 | 
114 | def sigfigs(n, sigfigs=3):
115 |     'helper function to round a number to significant figures'
116 |     n = float(n)
117 |     if n == 0 or math.isnan(n):  # avoid math domain errors
118 |         return n
119 |     return round(n, -int(math.floor(math.log10(abs(n))) - sigfigs + 1))
120 | 
121 | 
122 | _SCALES = {
123 |     # Peta, Tera, Giga, Mega, kilo
124 |     15: 'P', 12: 'T', 9: 'G', 6: 'M', 3: 'k',
125 |     -3: 'm', -6: 'u', -9: 'n', -12: 'p', -15: 'f'
126 |     # milli, micro, nano, pico, femto
127 | }
128 | 
129 | 
130 | _BSCALES = ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi', 'Yi']
131 | 
132 | 
133 | JAVASCRIPT_HTML_HEAD = '''
134 | 
135 | 
136 | 
192 | '''
193 | 
194 | TEMPLATE = '''
195 | 
196 | 
197 | 
198 |
199 | 200 | 205 |
206 | 
207 | '''.replace('', JAVASCRIPT_HTML_HEAD)
208 | 
--------------------------------------------------------------------------------
/faststat/test.py:
--------------------------------------------------------------------------------
1 | '''
2 | test implementation from wikipedia
3 | http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Higher-order_statistics
4 | '''
5 | import random
6 | import time
7 | from faststat import Stats
8 | import faststat
9 | import format
10 | # TODO: fix these to absolute imports
11 | 
12 | 
13 | def p2_parabolic(l_v, l_n, c_v, c_n, r_v, r_n, d):
14 |     return (c_v + (d / (r_n - l_n)) * (
15 |         (c_n - l_n + d) * (r_v - c_v) / (r_n - c_n) +
16 |         (r_n - c_n - d) * (c_v - l_v) / (c_n - l_n)))
17 | 
18 | 
19 | def online_variance(data):
20 |     n = 0
21 |     mean = 0
22 |     M2 = 0
23 | 
24 |     for x in data:
25 |         n = n + 1
26 |         delta = x - mean
27 |         mean = mean + delta/n
28 |         M2 = M2 + delta*(x - mean)
29 | 
30 |     variance = M2/(n - 1)
31 |     return variance
32 | 
33 | 
34 | def online_kurtosis(data):
35 |     n = 0
36 |     mean = 0
37 |     M2 = 0
38 |     M3 = 0
39 |     M4 = 0
40 | 
41 |     for x in data:
42 |         n1 = n
43 |         n = n + 1
44 |         delta = x - mean
45 |         delta_n = delta / n
46 |         delta_n2 = delta_n * delta_n
47 |         term1 = delta * delta_n * n1
48 |         mean = mean + delta_n
49 |         M4 = M4 + term1 * delta_n2 * (n*n - 3*n + 3) + 6 * delta_n2 * M2 - 4 * delta_n * M3
50 |         M3 = M3 + term1 * delta_n * (n - 2) - 3 * delta_n * M2
51 |         M2 = M2 + term1
52 | 
53 |     kurtosis = (n*M4) / (M2*M2) - 3
54 |     return kurtosis
55 | 
56 | 
57 | def test():
58 |     # random.seed(103)  # make test repeatable
59 |     data = [random.normalvariate(1.0, 1.0) for i in range(int(1e6))]
60 |     stats = Stats()
61 |     start = time.time()
62 |     for d in data:
63 |         stats.add(d)
64 |     print time.time() - start, "microseconds per point"  # 1e6 points, so elapsed seconds == us/point
65 |     print "mean (should be 1)", stats.mean
66 |     print "kurtosis / reference kurtosis", stats.kurtosis, online_kurtosis(data)
67 |     print "variance / reference variance", stats.variance, online_variance(data)
68 |     print "skewness (should be 0)", stats.skewness
69 |     print "max, min, mintime, maxtime", stats.max, stats.min, stats.mintime, stats.maxtime
70 |     print "m2, m3, m4", stats.m2, stats.m3, stats.m4
71 |     print "geometric_mean, harmonic mean", stats.geometric_mean, stats.harmonic_mean
72 |     print "interval.min, interval.geometric_mean, interval.harmonic_mean",
73 |     print stats.interval.min, stats.interval.geometric_mean, stats.interval.harmonic_mean
74 |     print "expo_avgs (should be 1)", stats.expo_avgs
75 |     print "window_counts", stats.get_window_counts()
76 |     print "top 10", sorted(stats.get_topN())[-10:]
77 |     open('test_html.html', 'w').write(faststat.format.stat2html(stats))
78 |     return stats
79 | 
80 | 
81 | def test_breadth():
82 |     m = faststat.Markov()
83 |     t = m.make_transitor('foo')
84 |     t.transition('bar')
85 | 
86 |     p = faststat.PathStats()
87 |     w = p.make_walker('foo')
88 |     w.push('bar')
89 |     w.push('baz')
90 |     b = w.branch()
91 |     b.push('branch1')
92 |     b.push('branch2')
93 |     w.join(b)
94 |     w.push('end')
95 |     del w
96 | 
97 |     # import pprint
98 |     # pprint.pprint(p.path_stats.keys())
99 |     # pprint.pprint(p.unique_paths())
100 |     print '\n'.join(p.pformat())
101 | 
102 | 
103 | if __name__ == "__main__":
104 |     test_breadth()
105 |     test()
106 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import platform
2 | from setuptools import Extension, setup
3 | 
4 | extra_compile_args = []
5 | if platform.system() == 'Windows':
6 |     extra_compile_args = ['/MT',]
7 | libraries = []
8 | if platform.system() not in ('Windows', 'Darwin'):
9 |     libraries = ['rt',]
10 | 
11 | _faststat = Extension('_faststat', sources=['faststat/_faststat.c'],
12 |                       libraries=libraries, extra_compile_args=extra_compile_args)
13 | 
14 | setup(
15 |     name='faststat',
16 |     version='17.10.0',
17 |     author="Kurt Rose",
18 |     author_email="kurt@kurtrose.com",
19 |     description='fast online statistics collection',
20 |     license="MIT",
21 |     url="http://github.com/doublereedkurt/faststat",
22 |     long_description='...',
23 |     classifiers=[
24 |         'Development Status :: 4 - Beta',
25 |         'License :: OSI Approved :: MIT License',
26 |     ],
27 |     packages=['faststat'],
28 |     ext_modules=[_faststat])
29 | 
30 | 
31 | 
--------------------------------------------------------------------------------
/test/biased_quantile_stream.py:
--------------------------------------------------------------------------------
1 | import math
2 | 
3 | # This is a fairly naive implementation of the algorithm described in
4 | # "Effective Computation of Biased Quantiles over Data Streams"
5 | 
6 | class Quantiles(object):
7 |     def __init__(self, f=None):
8 |         self.points = None
9 |         self.n = 0
10 |         self.f = f or biased_quantiles_f
11 | 
12 |     def insert(self, val):
13 |         if self.points is None or val < self.points.val:
14 |             # less than first; count it before returning
15 |             self.points = _point(val, 1, 0, self.points)
16 |             self.n += 1; return
17 |         prev = self.points
18 |         cur = self.points.next
19 |         r = 0
20 |         while cur and cur.next:
21 |             if val < cur.val:
22 |                 break
23 |             r += cur.delta
24 |             prev = cur
25 |             cur = cur.next
26 |         else:  # ran off the end; count it before returning
27 |             (cur or prev).next = _point(val, 1, 0, None)
28 |             self.n += 1; return
29 |         new = _point(val, 1, max(int(self.f(r, self.n)) - 1, 0), cur)
30 |         prev.next = new
31 |         self.n += 1
32 |         if self.n % COMPRESS_INTERVAL == 0:
33 |             self.compress()
34 | 
35 |     def compress(self):
36 |         pointlist = []
37 |         cur = self.points
38 |         r = [0]
39 |         while cur:
40 |             pointlist.append(cur)
41 |             r.append(r[-1] + cur.delta)
42 |             cur = cur.next
43 |         for i in range(len(pointlist) - 2, 0, -1):
44 |             if pointlist[i].delta + pointlist[i + 1].delta + pointlist[i + 1].width <= self.f(r[i], self.n):
45 |                 # merge
46 |                 pointlist[i].next = pointlist[i + 1].next
47 |                 pointlist[i].val = pointlist[i + 1].val
48 |                 pointlist[i].delta += pointlist[i + 1].delta
49 |                 pointlist[i].width = pointlist[i + 1].width
50 | 
51 |     def query(self, p):
52 |         cur_r = 0
53 |         cur_point = self.points
54 |         while cur_point:
55 |             cur_r += cur_point.delta
56 |             if (cur_r + cur_point.next.delta + cur_point.next.width >
57 |                     self.n * p + self.f(self.n * p, self.n) / 2):
58 |                 return cur_point, (cur_r + cur_point.delta, cur_r + cur_point.delta + cur_point.width)
59 |             cur_point = cur_point.next
60 | 
61 |     def get_pointlist(self):
62 |         if not self.points:
63 |             return []
64 |         p = [self.points]
65 |         while p[-1].next:
66 |             p.append(p[-1].next)
67 |         return p
68 | 
69 | 
70 | import math
71 | 
72 | 
73 | class DistributedQuantiles(object):
74 |     def __init__(self, error):
75 |         self.size = math.ceil(1 / error)
76 |         self.inbuf = []
77 |         self.qs = []
78 |         self.n = 0
79 | 
80 |     def insert(self, val):
81 |         self.inbuf.append(val)
82 |         if len(self.inbuf) >= self.size:
83 |             self.inbuf.sort()
84 |             i, j = 0, 0
85 |             new_qs = []
86 |             if self.qs and self.inbuf[0] < self.qs[0][0]:
87 |                 while i < len(self.inbuf) and self.inbuf[i] < self.qs[j][0]:
88 |                     new_qs.append((self.inbuf[i], i, i)); i += 1
89 |             while i < len(self.inbuf) and j < len(self.qs):
90 |                 nxtmin = i + self.qs[j][1]
91 |                 nxtmax = i + self.qs[j][2] - 1
92 |                 if self.inbuf[i] < self.qs[j][0]:
93 |                     nxtval = self.inbuf[i]
94 |                     i += 1
95 |                 elif self.qs[j][0] < self.inbuf[i]:
96 |                     nxtval = self.qs[j][0]
97 |                     j += 1
98 |                 else:
99 |                     val = self.inbuf[i]
100 |                     if len(self.inbuf) > i + 1 and self.inbuf[i + 1] == val:
101 |                         if len(self.qs) > j + 1 and self.qs[j + 1] == val:
102 |                             new_qs.append((val, ))
103 |                             # the paper does not cover this equal-value case, so a
104 |                             # supplemental proof is provided below; this branch is unfinished
105 |                             pass
106 |                 new_qs.append((nxtval, nxtmin, nxtmax))
107 |             self.qs = new_qs
108 |             self.prune()
109 | 
110 |     def prune(self):
111 |         pass
112 | 
113 | 
114 | '''
115 | Supplemental proof of correctness of the equal-value element case.
116 | (notation is kept as close as possible to the original paper
117 | within the limitations of ASCII)
118 | 
119 | MOTIVATION:
120 | The error bound as given in the paper depends on the fact that,
121 | when merging any Q' and Q'', the next-lowest y_s of Q'' and the
122 | next-highest y_t of Q'' relative to any x of Q' must be consecutive.
123 | 
124 | However, in the general case there may be many elements whose
125 | values are equal. Consider
126 |     Q'  = [...,-1,0,1,...]
127 |     Q'' = [...,-1,0,0,1,...]
128 | 
129 | For x of Q' = 0, y_s = -1 and y_t = 1.
130 | y_s and y_t are not consecutive.
131 | The merge operation is no longer correct, because inductively
132 | rmax_Q''(y_t) - rmin_Q''(y_s) <= 3 eps n''; which is not <= 2 eps n''
133 | 
134 | We therefore define a new merge operation for elements of Q' and Q''
135 | which are equal.
136 | 
137 | This merge operation will also inductively guarantee that only
138 | two consecutive elements may share the same value.
139 | 
140 | 
141 | INTUITION:
142 | The intuitive explanation of the meaning of multiple elements
143 | with the same value is that it represents an unbroken
144 | sequence of observations with that same value. The first
145 | element represents the rank of the beginning of this sequence.
146 | The second element represents the rank of the end of this
147 | sequence.
148 | 
149 | Because of this, there should never be more than two elements
150 | with the same value. (If there is a third element, it can be
151 | discarded keeping only the maximum and minimum.)
152 | 
153 | This also means that if there are two elements with the same
154 | value, rmin may be set to rmax for the lower ranked element,
155 | and rmax may be set to rmin for the higher ranked element.
156 | 
157 | Because every rank between the two elements of the same
158 | value is known to also be of that value, the top-most rank
159 | in the range of the first value and bottom-most rank in the
160 | range of the second value are known to be the exact locations
161 | of an element of the given value.
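For example, two consecutive elements that both have value a, with
(rmin, rmax) ranges [3, 5] and [7, 9], can be tightened to [5, 5] and
[7, 7]: an unbroken run of a's covers every rank from 5 through 7, so
rank 5 (the top of the first range) and rank 7 (the bottom of the
second range) are each known to hold an a exactly.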
162 | 
163 | 
164 | NEW MERGE OPERATION:
165 | 
166 | The proposed new merge operation considers 4 points at a time:
167 | 
168 |     x_r, x_r+1, y_t, y_t+1
169 | 
170 | where x_r is the minimum un-merged element of Q'
171 |       x_r+1 is the next consecutive element of Q'
172 |       y_t is the minimum un-merged element of Q''
173 |       y_t+1 is the next consecutive element of Q''
174 | 
175 | in order to generate z_i of Q
176 | 
177 | all points whose values are equal are consumed in a single step
178 | 
179 | there are five cases:
180 | 
181 | for notation convenience,
182 | define the ADD operation of x of Q' and y of Q''
183 | to be z of Q:
184 |     rmax_Q(z) = rmax_Q'(x) + rmax_Q''(y)
185 |     rmin_Q(z) = rmin_Q'(x) + rmin_Q''(y)
186 | 
187 | CASE I:
188 |     x_r != y_t:
189 |     apply the merge operation as in the original paper
190 | CASE II:
191 |     x_r = y_t, and x_r != x_r+1, and y_t != y_t+1:
192 |     z_i = ADD(x_r, y_t)
193 |     move on to x_r+1, y_t+1
194 | CASE III:
195 |     x_r = y_t = x_r+1 != y_t+1:
196 |     generate two points, z_i and z_i'
197 |     z_i = ADD(x_r, y_t)
198 |     z_i' = ADD(x_r+1, y_t)
199 |     apply SHRINK to the pair of points,
200 |     append the result (either one or two points) to Q,
201 |     move on to x_r+2 and y_t+1
202 | CASE IV:
203 |     x_r = y_t = y_t+1 != x_r+1:
204 |     generate two points z_i and z_i'
205 |     z_i = ADD(x_r, y_t)
206 |     z_i' = ADD(x_r, y_t+1)
207 |     apply SHRINK to the pair of points,
208 |     append the result (either one or two points) to Q,
209 |     move on to x_r+1, y_t+2
210 | CASE V:
211 |     x_r = y_t = x_r+1 = y_t+1:
212 |     this means that there are two runs of the same value
213 |     in each of the merging sequences.
214 | 
215 |     generate two points z_i, z_i':
216 |     z_i = ADD(x_r, y_t)
217 |     z_i' = ADD(x_r+1, y_t+1)
218 |     append both to Q
219 |     move on to x_r+2, y_t+2
220 | 
221 | 
222 | PROOF:
223 | Let eps be the error bound.
224 | Let Q' and Q'' be two quantile summaries being merged, each of
225 | which inductively has error <= eps.
226 | Let x be an element of Q' and y be an element of Q''.
227 | Let n' be the number of observations covered by Q' and
228 | n'' be the number of observations covered by Q''.
229 | 
230 | The proposed merge operation is:
231 |     rmax_Q(z_i) = rmax_Q'(x) + rmax_Q''(y)
232 |     rmin_Q(z_i) = rmin_Q'(x) + rmin_Q''(y)
233 | 
234 | We wish to show that rmax_Q(z_i+1) - rmin_Q(z_i) <= 2 eps (n' + n'')
235 | 
236 | let r be the index of x in Q'
237 | let s be the index of y in Q''
238 | 
239 | Several things must be proven about the new MERGE:
240 | 1- that z_i and z_i+1 are within the error bounds when z_i is the
241 |    result of CASE II, III, IV, or V, and z_i+1 is the result of CASE I
242 | 2- that z_i and z_i+1 are within the error bounds when z_i is the
243 |    result of CASE I, and z_i+1 is the result of CASE II, III, IV, or V
244 | 3- that z_i and z_i+1 are within the error bounds when both are
245 |    results of CASE II, III, IV, or V
246 | 
247 | There are 3 cases:
248 | 
249 | 
250 | case I: z_i+1 was merged using the new merge operation
251 | case II: z_i+1 was merged using the merge operation defined in the paper
252 |     case IIA: z_i+1 came from x_r+1 of Q'
253 |         case IIAi: z_i+1 = z_i
254 |         case IIAii: z_i+1 > z_i
255 |     case IIB: z_i+1 came from y_s+1 of Q''
256 |         case IIBi: z_i+1 = z_i
257 |         case IIBii: z_i+1 > z_i
258 | 
259 | (Note, if x_r+1 = x_r and y_s+1 = y_s, this is back to case I)
260 | 
261 | case I:
262 |     rmax_Q(z_i+1) - rmin_Q(z_i) = rmax_Q'(x_r+1) + rmax_Q''(y_s+1)
263 |                                 - rmin_Q'(x_r) - rmin_Q''(y_s)
264 |                                 = rmax_Q'(x_r+1) - rmin_Q'(x_r)
265 |                                 + rmax_Q''(y_s+1) - rmin_Q''(y_s)
266 |     by inductive property of Q' and Q''
267 |     <= 2 n' eps + 2 n'' eps
268 |     <= 2 eps (n' + n'')
269 | 
270 | case IIAi:
271 |     if x_r = x_r+1:
272 |         by consecutive element contraction lemma:
273 |         if x_r and x_r+1 overlap, combine them
274 |         and there is no z_i, only z_i+1
275 | 
276 |         otherwise, rmin_S(x_r) = rmax_S(x_r)
277 |         and rmin_S(x_r+1) = rmax_S(x_r+1)
278 | 
279 |         rmax_Q(z_i+1) - rmin_Q(z_i)
280 |         =
281 | 
282 | 
283 | 
284 | 
285 | 
286 | 
287 | consecutive element contraction lemma:
288 | let s and s' be consecutive elements of summary S,
289 | such that s = s'
290 | CASE I:
291 |     if rmin_S(s') < rmax_S(s):
292 |         maximum rank of s' = rmax_S(s)
293 |         minimum rank of s = rmin_S(s')
294 |         [s and s' are now identical, discard one]
295 | CASE II:
296 |     if rmin_S(s') >= rmax_S(s):
297 |         maximum rank of s' = rmin_S(s')
298 |         minimum rank of s = rmax_S(s)
299 | 
300 | let '[' represent rmin_S, and ']' represent rmax_S:
301 |     [ x_r ]
302 |             [ x_r+1 ]
303 | let a = the value of x_r and x_r+1;
304 | We know that somewhere between rmax_S(x_r) and rmin_S(x_r+1)
305 | there is a sequence of consecutive 'a' values in the data
306 | set. Therefore, there is at least one a in that interval.
307 | 
308 | 
309 | 
310 | -----------------------
311 | Notation:
312 | for readability, everything after a "_" should be read as a subscript
313 | For example, z_i+1 should be read z sub(i+1), not z sub(i) + 1
314 | 
315 | x of Q should be read "x element of Q"
316 | 
317 | Also, the original paper assumes that every element is unique.
318 | Therefore, rmin(v) is a function that takes value v and returns
319 | the minimum rank. Because we allow two elements to have the same
320 | value, v must be considered to be a tuple. rmin(v) is selecting
321 | one item from that tuple, rmax(v) is selecting another element.
322 | 
323 | This does not affect the correctness of any of the proofs from the
324 | original paper.
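As a worked example of the ADD operation under this notation: if x of Q'
has (rmin, rmax) = (3, 5) and y of Q'' has (rmin, rmax) = (4, 6), then
z = ADD(x, y) has rmin_Q(z) = 3 + 4 = 7 and rmax_Q(z) = 5 + 6 = 11.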
325 | 
326 | '''
327 | 
328 | 
329 | 
330 | class _point(object):
331 |     def __init__(self, val, delta, width, next):
332 |         self.val, self.delta, self.width, self.next = val, delta, width, next
333 | 
334 |     def __repr__(self):
335 |         return "_point(val={0}, delta={1}, width={2})".format(self.val, self.delta, self.width)
336 | 
337 | 
338 | def biased_quantiles_f(r_i, n):
339 |     return 2 * ERROR_RATE * r_i
340 | 
341 | 
342 | def targeted_quantiles_f(percentiles_errors):
343 |     def f(r_i, n):
344 |         bounds = []
345 |         for p, e in percentiles_errors:
346 |             if r_i < p * n:
347 |                 bounds.append(2 * e * (n - r_i) / (1 - e))
348 |             else:
349 |                 bounds.append(2 * e * r_i / p)
350 |         return min(bounds)
351 |     return f
352 | 
353 | 
354 | ERROR_RATE = 0.001
355 | COMPRESS_INTERVAL = 10  # int(1 / ERROR_RATE)
356 | 
357 | # val is the current (approximate) value
358 | 
359 | # delta is the difference between the lowest possible rank of the current
360 | # point/value and the previous point
361 | 
362 | # width is the difference between the lowest and highest possible rank
363 | # of the current point/value
364 | 
365 | # this data structure ensures that new points can be inserted into
366 | # the middle of the linked list
367 | 
368 | 
369 | # performance of the naive algorithm is very bad -- 300 - 700 microseconds
370 | # (0.3 to 0.7 ms) per point; this is about 20-40x slower than the pure Python
371 | # piece-wise parabolic algorithm, and ~300x slower than the C piece-wise parabolic
372 | def test(q=None):
373 |     import random, time
374 | 
375 |     data = [random.normalvariate(1.0, 1.0) for i in range(int(1e4))]
376 |     q = q or Quantiles()
377 |     start = time.time()
378 |     for d in data:
379 |         q.insert(d)
380 |     print (time.time() - start) * 1e6 / len(data), "microseconds per point"
381 |     return q
382 | 
383 | # about 400 microseconds per point
384 | def test_targeted():
385 |     TARGETS = ((0.25, 0.001), (0.5, 0.001), (0.75, 0.001), (0.9, 0.001), (0.95, 0.001), (0.99, 0.001))
386 |     f = targeted_quantiles_f(TARGETS)
387 |     return test(q=Quantiles(f))
388 | 
389 | if __name__ == "__main__":
390 |     print 'biased quantile condition'
391 |     test()
392 |     print 'targeted quantile condition'
393 |     test_targeted()
394 | 
--------------------------------------------------------------------------------