├── .gitignore
├── Design.md
├── README.rst
├── docs
│   ├── Makefile
│   ├── make.bat
│   └── source
│       ├── _static
│       │   └── default.css
│       ├── api.rst
│       ├── conf.py
│       ├── getting-started.rst
│       ├── index.rst
│       ├── intro.rst
│       └── usage.rst
├── java
│   ├── bin
│   │   ├── DataStructures
│   │   │   ├── AVLNode.class
│   │   │   ├── AVLTree.class
│   │   │   ├── Distribution.class
│   │   │   ├── Stream$StreamIterator.class
│   │   │   ├── Stream.class
│   │   │   └── StreamItem.class
│   │   ├── Hash
│   │   │   └── TwoUniversal.class
│   │   └── StreamingLib
│   │       ├── CountSketch.class
│   │       ├── MG.class
│   │       └── Sketch.class
│   └── src
│       ├── DataStructures
│       │   ├── AVLNode.java
│       │   ├── AVLTree.java
│       │   ├── Distribution.java
│       │   ├── Stream.java
│       │   └── StreamItem.java
│       ├── Hash
│       │   └── TwoUniversal.java
│       └── StreamingLib
│           ├── CountSketch.java
│           ├── MG.java
│           └── Sketch.java
├── runtests.py
├── setup.py
├── streamlib
│   ├── __init__.py
│   ├── hashes.py
│   ├── hashes_bak
│   │   ├── __init__.py
│   │   ├── makefile
│   │   └── universalHashing.py
│   ├── info.py
│   ├── makefile
│   ├── sketch_bak
│   │   ├── BJKST.py
│   │   ├── F2.py
│   │   ├── Misra_Gries.py
│   │   ├── __init__.py
│   │   ├── countSketch.py
│   │   ├── makefile
│   │   ├── quantile.py
│   │   └── sketch.py
│   ├── summary.py
│   └── utils.py
└── tests
    └── test_summary.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Compiled Object files
2 | *.slo
3 | *.lo
4 | *.o
5 | *.obj
6 |
7 | # Compiled Dynamic libraries
8 | *.so
9 | *.dylib
10 | *.dll
11 |
12 | # Compiled Static libraries
13 | *.lai
14 | *.la
15 | *.a
16 | *.lib
17 |
18 | # Executables
19 | *.exe
20 | *.out
21 | *.app
22 |
23 | # Python
24 | *.pyc
25 |
26 | # Emacs Backup file
27 | *.*~
28 |
--------------------------------------------------------------------------------
/Design.md:
--------------------------------------------------------------------------------
1 | Principles of Design
2 | ==================
3 |
4 | ## Levels of Implementation
5 | + Use Cython or C/C++ to implement the low-level streaming algorithms, which operate only on data streams whose items are integers.
6 | + Any **iterable** object with **hashable** items can be treated as a data stream. Each item is converted to an integer on the fly with a suitable hash function and then fed into the low-level API (see the sketch below).
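
A minimal sketch of this two-level idea: arbitrary hashable items are hashed to integers on the fly (here with `mmh3`, which is already a streamlib dependency); the low-level call `low_level_sketch.processItem` is a placeholder name, not an existing API.

```python
import mmh3  # MurmurHash3; already listed as a streamlib dependency


def as_int_stream(stream, seed=42):
    """Map a stream of arbitrary hashable items to a stream of integers."""
    for item in stream:
        # mmh3.hash hashes strings; repr() gives a deterministic string
        # encoding for other hashable objects such as tuples or ints.
        yield mmh3.hash(repr(item), seed)


# The integer stream can then be fed into the low-level implementation,
# e.g. low_level_sketch.processItem(x) -- a placeholder, not an existing API.
for x in as_int_stream(['fix', 'the', 'bug', ('a', 1), 42]):
    print x
```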
7 |
8 |
9 |
10 | ## Unify the Implementation
11 | Popular streaming algorithms share many common aspects: for example, most of them use the **median trick** to boost the success probability, and many of them can be thought of as a type of sketch.
12 |
13 | The goal is to capture those common features and unify the implementation of the various streaming algorithms; the median trick, for instance, can be factored out once and reused (see the sketch below).
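
A minimal sketch of the shared median trick, assuming each estimator is a zero-argument callable that returns a numeric estimate (`median_boost` is illustrative, not an existing streamlib function):

```python
import random


def median_boost(make_estimate, k=9):
    """Run k independent copies of a randomized estimator and return the
    median of their answers.  If a single copy lands within the target
    error bound with probability strictly greater than 1/2, the median
    fails only with probability exponentially small in k (Chernoff bound)."""
    estimates = sorted(make_estimate() for _ in range(k))
    return estimates[k // 2]


# Toy estimator: a noisy, unbiased estimate of the true value 100.
print median_boost(lambda: 100 + random.gauss(0, 20))
```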
14 |
15 |
16 |
17 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | ==========
2 | StreamLib
3 | ==========
4 | -------------------------------------------
5 | Python library for streaming algorithms
6 | -------------------------------------------
7 |
8 | Documentation can be found at http://xmerge.me/StreamLib/.
9 |
10 | Overview
11 | -------------
12 | Algorithms included:
13 |
14 | * Sketch
15 |
16 | + Count Min Sketch [cm05]_ -- DONE
17 | + Count Median Sketch [cm05]_ -- DONE
18 | + Count Sketch [ccfc04]_ -- DONE
19 | + BJKST Sketch [bjkst]_
20 | + Misra-Gries Sketch [mg82]_
21 | + F2 Sketch [ams]_ -- DONE
22 | + Quantile Sketch [myblog]_
23 | + ...
24 |
25 | The algorithms above share several common features, so we can specify a set of
26 | common methods. Here are some of them, followed by a toy illustration.
27 |
28 | .. code-block:: python
29 | 
30 |     class Sketch(object):
31 |         """
32 |         Interface for Sketch.
33 |         """
34 |         @abstractmethod
35 |         def processBatch(self, *args, **kwargs):
36 |             """
37 |             Summarize data stream in batch mode.
38 |             """
39 |             raise NotImplementedError()
40 | 
41 |         @abstractmethod
42 |         def processItem(self, *args, **kwargs):
43 |             """
44 |             Summarize one item in a data stream.
45 |             """
46 |             raise NotImplementedError()
47 | 
48 |         @abstractmethod
49 |         def estimate(self, *args, **kwargs):
50 |             """
51 |             Estimate properties of given item/key.
52 |             """
53 |             raise NotImplementedError()
54 | 
55 |         @abstractmethod
56 |         def merge(self, *args, **kwargs):
57 |             """
58 |             Merge compatible sketches.
59 |             """
60 |             raise NotImplementedError()
61 | 
62 |         @abstractmethod
63 |         def __add__(self, other):
64 |             return self.merge(other)
65 | 
66 | 
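As a toy illustration of how these methods fit together, here is an exact counter that follows the interface above. It is not part of **streamlib**; it keeps full counts, which a real sketch avoids by design.

.. code-block:: python

    class ExactCounter(object):
        """Toy 'sketch' that keeps exact counts and follows the interface above."""
        def __init__(self):
            self._counts = {}

        def processItem(self, item, weight=1):
            self._counts[item] = self._counts.get(item, 0) + weight

        def processBatch(self, dataStream):
            for item in dataStream:
                self.processItem(item)

        def estimate(self, item):
            return self._counts.get(item, 0)

        def merge(self, other):
            # both operands keep exact counts, so merging just adds them up
            for item, count in other._counts.items():
                self.processItem(item, count)
            return self

        def __add__(self, other):
            return self.merge(other)

    ec = ExactCounter()
    ec.processBatch([1, 1, 2])
    print ec.estimate(1)   # prints 2
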
67 | Data Stream
68 | ------------
69 |
70 | Any **iterable** object with **hashable** elements can be considered as a data stream. Here are some examples.
71 |
72 | * a list of integers: :code:`[1, 10, 20, 1, 5]`
73 | * a generator that yields random integers, see the instance :code:`dataStream` below,
74 |
75 | .. code-block:: python
76 |
77 | import random
78 |
79 | def demoGen(N = 1000):
80 | i = 0
81 | while i < N:
82 | yield random.randint(0, 10);
83 | i += 1
84 |
85 | dataStream = demoGen()
86 |
87 | * a tuple of strings: :code:`('fix', 'the', 'bug', 'please', '...')`
88 | * a string: :code:`'abcdefgdahfahdfajkhfkahfsahfjksfhjk'`
89 | * many more
90 |
91 |
92 | Summarize the data stream
93 | -------------------------
94 | Many popular algorithms for summarizing data streams are included
95 | in the module **streamlib**. We give some examples to show their basic usage.
96 |
97 | Count-Min Sketch
98 | #################
99 | Count-Min sketch [cm05]_ summarizes a data stream and estimates the frequency of each element in it. The sketch gives highly accurate estimates for heavy hitters (elements with high frequencies), while relatively large errors may be incurred for light elements. See the following example for basic usage.
100 |
101 | .. code-block:: python
102 |
103 | from streamlib import CountMin
104 | cm = CountMin() # create a instance of CountMin, see document for more detail
105 | cm.processBatch([0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 3, 3, 4])
106 | for i in xrange(5):
107 | print 'Estimated frequency of', i, 'is', cm.estimate(i)
108 |
109 | The result of the above code::
110 |
111 | Estimated frequency of 0 is 4
112 | Estimated frequency of 1 is 6
113 | Estimated frequency of 2 is 1
114 | Estimated frequency of 3 is 2
115 | Estimated frequency of 4 is 1
116 |
117 |
118 | An instance of `CountMin` can be initialized with two parameters; see the docs for details.
119 |
120 |
121 | Documents
122 | ---------
123 | `Official Document <http://xmerge.me/StreamLib/>`_.
124 |
125 | Dependencies
126 | ------------------
127 | * Python 2.x (x >= 6).
128 | * mmh3 >= 2.0
129 |
130 |
131 | TODO
132 | ---------------
133 | * Use Cython to speed up the implementation.
134 | * Add more streaming algorithms.
135 | * Minimize dependencies.
136 |
137 | Bibliography
138 | -------------
139 | .. [ccfc04] Charikar, Moses, Kevin Chen, and Martin Farach-Colton. "Finding frequent items in data streams." Automata, Languages and Programming. Springer Berlin Heidelberg, 2002. 693-703.
140 |
141 | .. [ams] Alon, Noga, Yossi Matias, and Mario Szegedy. "The space complexity of approximating the frequency moments." Proceedings of the twenty-eighth annual ACM symposium on Theory of computing. ACM, 1996.
142 |
143 | .. [bjkst] Bar-Yossef, Ziv, et al. "Counting distinct elements in a data stream." Randomization and Approximation Techniques in Computer Science. Springer Berlin Heidelberg, 2002. 1-10.
144 |
145 | .. [cm05] Cormode, Graham, and S. Muthukrishnan. "An improved data stream summary: the count-min sketch and its applications." Journal of Algorithms 55.1 (2005): 58-75.
146 |
147 | .. [mg82] Misra, Jayadev, and David Gries. "Finding repeated elements." Science of computer programming 2.2 (1982): 143-152.
148 |
149 | .. [myblog] http://jiecchen.github.io/blog/2014/08/13/quantile-sketch/
150 |
151 |
152 | Maintainer
153 | -----------
154 | * `Jiecao Chen `_ (currently supported by NSF Grant CCF-1525024)
155 |
156 | Other contributors
157 | --------------------
158 | * `Qin Zhang `_
159 | * `Rachel Lowden `_
160 |
161 |
162 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | PAPER =
8 | BUILDDIR = build
9 |
10 | # User-friendly check for sphinx-build
11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
13 | endif
14 |
15 | # Internal variables.
16 | PAPEROPT_a4 = -D latex_paper_size=a4
17 | PAPEROPT_letter = -D latex_paper_size=letter
18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
19 | # the i18n builder cannot share the environment and doctrees with the others
20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
21 |
22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
23 |
24 | help:
25 | @echo "Please use \`make ' where is one of"
26 | @echo " html to make standalone HTML files"
27 | @echo " dirhtml to make HTML files named index.html in directories"
28 | @echo " singlehtml to make a single large HTML file"
29 | @echo " pickle to make pickle files"
30 | @echo " json to make JSON files"
31 | @echo " htmlhelp to make HTML files and a HTML help project"
32 | @echo " qthelp to make HTML files and a qthelp project"
33 | @echo " devhelp to make HTML files and a Devhelp project"
34 | @echo " epub to make an epub"
35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
36 | @echo " latexpdf to make LaTeX files and run them through pdflatex"
37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
38 | @echo " text to make text files"
39 | @echo " man to make manual pages"
40 | @echo " texinfo to make Texinfo files"
41 | @echo " info to make Texinfo files and run them through makeinfo"
42 | @echo " gettext to make PO message catalogs"
43 | @echo " changes to make an overview of all changed/added/deprecated items"
44 | @echo " xml to make Docutils-native XML files"
45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes"
46 | @echo " linkcheck to check all external links for integrity"
47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)"
48 |
49 | clean:
50 | rm -rf $(BUILDDIR)/*
51 |
52 | html:
53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
54 | @echo
55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
56 |
57 | dirhtml:
58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
59 | @echo
60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
61 |
62 | singlehtml:
63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
64 | @echo
65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
66 |
67 | pickle:
68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
69 | @echo
70 | @echo "Build finished; now you can process the pickle files."
71 |
72 | json:
73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
74 | @echo
75 | @echo "Build finished; now you can process the JSON files."
76 |
77 | htmlhelp:
78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
79 | @echo
80 | @echo "Build finished; now you can run HTML Help Workshop with the" \
81 | ".hhp project file in $(BUILDDIR)/htmlhelp."
82 |
83 | qthelp:
84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
85 | @echo
86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \
87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/StreamLib.qhcp"
89 | @echo "To view the help file:"
90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/StreamLib.qhc"
91 |
92 | devhelp:
93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
94 | @echo
95 | @echo "Build finished."
96 | @echo "To view the help file:"
97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/StreamLib"
98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/StreamLib"
99 | @echo "# devhelp"
100 |
101 | epub:
102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
103 | @echo
104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
105 |
106 | latex:
107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
108 | @echo
109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \
111 | "(use \`make latexpdf' here to do that automatically)."
112 |
113 | latexpdf:
114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
115 | @echo "Running LaTeX files through pdflatex..."
116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf
117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
118 |
119 | latexpdfja:
120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
121 | @echo "Running LaTeX files through platex and dvipdfmx..."
122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
124 |
125 | text:
126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
127 | @echo
128 | @echo "Build finished. The text files are in $(BUILDDIR)/text."
129 |
130 | man:
131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
132 | @echo
133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man."
134 |
135 | texinfo:
136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
137 | @echo
138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
139 | @echo "Run \`make' in that directory to run these through makeinfo" \
140 | "(use \`make info' here to do that automatically)."
141 |
142 | info:
143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
144 | @echo "Running Texinfo files through makeinfo..."
145 | make -C $(BUILDDIR)/texinfo info
146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
147 |
148 | gettext:
149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
150 | @echo
151 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
152 |
153 | changes:
154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
155 | @echo
156 | @echo "The overview file is in $(BUILDDIR)/changes."
157 |
158 | linkcheck:
159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
160 | @echo
161 | @echo "Link check complete; look for any errors in the above output " \
162 | "or in $(BUILDDIR)/linkcheck/output.txt."
163 |
164 | doctest:
165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
166 | @echo "Testing of doctests in the sources finished, look at the " \
167 | "results in $(BUILDDIR)/doctest/output.txt."
168 |
169 | xml:
170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
171 | @echo
172 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml."
173 |
174 | pseudoxml:
175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
176 | @echo
177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
178 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | REM Command file for Sphinx documentation
4 |
5 | if "%SPHINXBUILD%" == "" (
6 | set SPHINXBUILD=sphinx-build
7 | )
8 | set BUILDDIR=build
9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% source
10 | set I18NSPHINXOPTS=%SPHINXOPTS% source
11 | if NOT "%PAPER%" == "" (
12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
14 | )
15 |
16 | if "%1" == "" goto help
17 |
18 | if "%1" == "help" (
19 | :help
20 | echo.Please use `make ^` where ^ is one of
21 | echo. html to make standalone HTML files
22 | echo. dirhtml to make HTML files named index.html in directories
23 | echo. singlehtml to make a single large HTML file
24 | echo. pickle to make pickle files
25 | echo. json to make JSON files
26 | echo. htmlhelp to make HTML files and a HTML help project
27 | echo. qthelp to make HTML files and a qthelp project
28 | echo. devhelp to make HTML files and a Devhelp project
29 | echo. epub to make an epub
30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter
31 | echo. text to make text files
32 | echo. man to make manual pages
33 | echo. texinfo to make Texinfo files
34 | echo. gettext to make PO message catalogs
35 | echo. changes to make an overview over all changed/added/deprecated items
36 | echo. xml to make Docutils-native XML files
37 | echo. pseudoxml to make pseudoxml-XML files for display purposes
38 | echo. linkcheck to check all external links for integrity
39 | echo. doctest to run all doctests embedded in the documentation if enabled
40 | goto end
41 | )
42 |
43 | if "%1" == "clean" (
44 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
45 | del /q /s %BUILDDIR%\*
46 | goto end
47 | )
48 |
49 |
50 | %SPHINXBUILD% 2> nul
51 | if errorlevel 9009 (
52 | echo.
53 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
54 | echo.installed, then set the SPHINXBUILD environment variable to point
55 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
56 | echo.may add the Sphinx directory to PATH.
57 | echo.
58 | echo.If you don't have Sphinx installed, grab it from
59 | echo.http://sphinx-doc.org/
60 | exit /b 1
61 | )
62 |
63 | if "%1" == "html" (
64 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
65 | if errorlevel 1 exit /b 1
66 | echo.
67 | echo.Build finished. The HTML pages are in %BUILDDIR%/html.
68 | goto end
69 | )
70 |
71 | if "%1" == "dirhtml" (
72 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
73 | if errorlevel 1 exit /b 1
74 | echo.
75 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
76 | goto end
77 | )
78 |
79 | if "%1" == "singlehtml" (
80 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
81 | if errorlevel 1 exit /b 1
82 | echo.
83 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
84 | goto end
85 | )
86 |
87 | if "%1" == "pickle" (
88 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
89 | if errorlevel 1 exit /b 1
90 | echo.
91 | echo.Build finished; now you can process the pickle files.
92 | goto end
93 | )
94 |
95 | if "%1" == "json" (
96 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
97 | if errorlevel 1 exit /b 1
98 | echo.
99 | echo.Build finished; now you can process the JSON files.
100 | goto end
101 | )
102 |
103 | if "%1" == "htmlhelp" (
104 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
105 | if errorlevel 1 exit /b 1
106 | echo.
107 | echo.Build finished; now you can run HTML Help Workshop with the ^
108 | .hhp project file in %BUILDDIR%/htmlhelp.
109 | goto end
110 | )
111 |
112 | if "%1" == "qthelp" (
113 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
114 | if errorlevel 1 exit /b 1
115 | echo.
116 | echo.Build finished; now you can run "qcollectiongenerator" with the ^
117 | .qhcp project file in %BUILDDIR%/qthelp, like this:
118 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\StreamLib.qhcp
119 | echo.To view the help file:
120 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\StreamLib.qhc
121 | goto end
122 | )
123 |
124 | if "%1" == "devhelp" (
125 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
126 | if errorlevel 1 exit /b 1
127 | echo.
128 | echo.Build finished.
129 | goto end
130 | )
131 |
132 | if "%1" == "epub" (
133 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
134 | if errorlevel 1 exit /b 1
135 | echo.
136 | echo.Build finished. The epub file is in %BUILDDIR%/epub.
137 | goto end
138 | )
139 |
140 | if "%1" == "latex" (
141 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
142 | if errorlevel 1 exit /b 1
143 | echo.
144 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
145 | goto end
146 | )
147 |
148 | if "%1" == "latexpdf" (
149 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
150 | cd %BUILDDIR%/latex
151 | make all-pdf
152 | cd %BUILDDIR%/..
153 | echo.
154 | echo.Build finished; the PDF files are in %BUILDDIR%/latex.
155 | goto end
156 | )
157 |
158 | if "%1" == "latexpdfja" (
159 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
160 | cd %BUILDDIR%/latex
161 | make all-pdf-ja
162 | cd %BUILDDIR%/..
163 | echo.
164 | echo.Build finished; the PDF files are in %BUILDDIR%/latex.
165 | goto end
166 | )
167 |
168 | if "%1" == "text" (
169 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
170 | if errorlevel 1 exit /b 1
171 | echo.
172 | echo.Build finished. The text files are in %BUILDDIR%/text.
173 | goto end
174 | )
175 |
176 | if "%1" == "man" (
177 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
178 | if errorlevel 1 exit /b 1
179 | echo.
180 | echo.Build finished. The manual pages are in %BUILDDIR%/man.
181 | goto end
182 | )
183 |
184 | if "%1" == "texinfo" (
185 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
186 | if errorlevel 1 exit /b 1
187 | echo.
188 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
189 | goto end
190 | )
191 |
192 | if "%1" == "gettext" (
193 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
194 | if errorlevel 1 exit /b 1
195 | echo.
196 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
197 | goto end
198 | )
199 |
200 | if "%1" == "changes" (
201 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
202 | if errorlevel 1 exit /b 1
203 | echo.
204 | echo.The overview file is in %BUILDDIR%/changes.
205 | goto end
206 | )
207 |
208 | if "%1" == "linkcheck" (
209 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
210 | if errorlevel 1 exit /b 1
211 | echo.
212 | echo.Link check complete; look for any errors in the above output ^
213 | or in %BUILDDIR%/linkcheck/output.txt.
214 | goto end
215 | )
216 |
217 | if "%1" == "doctest" (
218 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
219 | if errorlevel 1 exit /b 1
220 | echo.
221 | echo.Testing of doctests in the sources finished, look at the ^
222 | results in %BUILDDIR%/doctest/output.txt.
223 | goto end
224 | )
225 |
226 | if "%1" == "xml" (
227 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml
228 | if errorlevel 1 exit /b 1
229 | echo.
230 | echo.Build finished. The XML files are in %BUILDDIR%/xml.
231 | goto end
232 | )
233 |
234 | if "%1" == "pseudoxml" (
235 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml
236 | if errorlevel 1 exit /b 1
237 | echo.
238 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml.
239 | goto end
240 | )
241 |
242 | :end
243 |
--------------------------------------------------------------------------------
/docs/source/_static/default.css:
--------------------------------------------------------------------------------
1 | /*
2 | * flasky.css_t
3 | * ~~~~~~~~~~~~
4 | *
5 | * :copyright: Copyright 2010 by Armin Ronacher.
6 | * :license: Flask Design License, see LICENSE for details.
7 | */
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 | @import url(http://fonts.googleapis.com/css?family=Open+Sans:400,700,400italic|Source+Code+Pro);
17 | @import url("basic.css");
18 |
19 | /* -- page layout ----------------------------------------------------------- */
20 |
21 | html {
22 | overflow-y: scroll;
23 | }
24 |
25 | body {
26 | font-family: 'Open Sans', sans-serif;
27 | font-size: 17px;
28 | background-color: white;
29 | color: #000;
30 | margin: 0;
31 | padding: 0;
32 | }
33 |
34 | div.document {
35 | width: 940px;
36 | margin: 30px auto 0 auto;
37 | }
38 |
39 | div.documentwrapper {
40 | float: left;
41 | width: 100%;
42 | }
43 |
44 | div.bodywrapper {
45 | margin: 0 0 0 220px;
46 | }
47 |
48 | div.sphinxsidebar {
49 | width: 220px;
50 | }
51 |
52 | hr {
53 | border: 1px solid #B1B4B6;
54 | }
55 |
56 | div.body {
57 | background-color: #ffffff;
58 | color: #3E4349;
59 | padding: 0 30px 0 30px;
60 | }
61 |
62 | img.floatingflask {
63 | padding: 0 0 10px 10px;
64 | float: right;
65 | }
66 |
67 | div.footer {
68 | width: 940px;
69 | margin: 20px auto 30px auto;
70 | font-size: 14px;
71 | color: #888;
72 | text-align: right;
73 | }
74 |
75 | div.footer a {
76 | color: #888;
77 | }
78 |
79 | div.related {
80 | display: none;
81 | }
82 |
83 | div.sphinxsidebar a {
84 | color: #444;
85 | text-decoration: none;
86 | border-bottom: 1px dotted #999;
87 | }
88 |
89 | div.sphinxsidebar a:hover {
90 | border-bottom: 1px solid #999;
91 | }
92 |
93 | div.sphinxsidebar {
94 | font-size: 14px;
95 | line-height: 1.5;
96 | }
97 |
98 | div.sphinxsidebarwrapper {
99 | padding: 18px 10px;
100 | }
101 |
102 | div.sphinxsidebarwrapper p.logo {
103 | padding: 0 0 20px 0;
104 | margin: 0;
105 | text-align: center;
106 | }
107 |
108 | div.sphinxsidebar h3,
109 | div.sphinxsidebar h4 {
110 | font-family: 'Open Sans', sans-serif;
111 | color: #444;
112 | font-size: 24px;
113 | font-weight: normal;
114 | margin: 0 0 5px 0;
115 | padding: 0;
116 | }
117 |
118 | div.sphinxsidebar h4 {
119 | font-size: 20px;
120 | }
121 |
122 | div.sphinxsidebar h3 a {
123 | color: #444;
124 | }
125 |
126 | div.sphinxsidebar p.logo a,
127 | div.sphinxsidebar h3 a,
128 | div.sphinxsidebar p.logo a:hover,
129 | div.sphinxsidebar h3 a:hover {
130 | border: none;
131 | }
132 |
133 | div.sphinxsidebar p {
134 | color: #555;
135 | margin: 10px 0;
136 | }
137 |
138 | div.sphinxsidebar ul {
139 | margin: 10px 0;
140 | padding: 0;
141 | color: #000;
142 | }
143 |
144 | div.sphinxsidebar input {
145 | border: 1px solid #ccc;
146 | font-family: 'Open Sans', sans-serif;
147 | font-size: 1em;
148 | }
149 |
150 | /* -- body styles ----------------------------------------------------------- */
151 |
152 | a {
153 | color: #2d4e84;
154 | text-decoration: underline;
155 | }
156 |
157 | a:hover {
158 | color: #2069e1;
159 | text-decoration: underline;
160 | }
161 |
162 | div.body h1,
163 | div.body h2,
164 | div.body h3,
165 | div.body h4,
166 | div.body h5,
167 | div.body h6 {
168 | font-family: 'Open Sans', sans-serif;
169 | font-weight: normal;
170 | margin: 30px 0px 10px 0px;
171 | padding: 0;
172 | }
173 |
174 |
175 | div.body h1 { margin-top: 0; padding-top: 0; font-size: 240%; }
176 | div.body h2 { font-size: 180%; }
177 | div.body h3 { font-size: 150%; }
178 | div.body h4 { font-size: 130%; }
179 | div.body h5 { font-size: 100%; }
180 | div.body h6 { font-size: 100%; }
181 |
182 | a.headerlink {
183 | color: #ddd;
184 | padding: 0 4px;
185 | text-decoration: none;
186 | }
187 |
188 | a.headerlink:hover {
189 | color: #444;
190 | background: #eaeaea;
191 | }
192 |
193 | div.body p, div.body dd, div.body li {
194 | line-height: 1.4em;
195 | }
196 |
197 | div.admonition {
198 | background: #fafafa;
199 | margin: 20px -30px;
200 | padding: 10px 30px;
201 | border-top: 1px solid #ccc;
202 | border-bottom: 1px solid #ccc;
203 | }
204 |
205 | div.admonition tt.xref, div.admonition a tt {
206 | border-bottom: 1px solid #fafafa;
207 | }
208 |
209 | dd div.admonition {
210 | margin-left: -60px;
211 | padding-left: 60px;
212 | }
213 |
214 | div.admonition p.admonition-title {
215 | font-family: 'Open Sans', sans-serif;
216 | font-weight: normal;
217 | font-size: 24px;
218 | margin: 0 0 10px 0;
219 | padding: 0;
220 | line-height: 1;
221 | }
222 |
223 | div.admonition p.last {
224 | margin-bottom: 0;
225 | }
226 |
227 | div.highlight {
228 | background-color: white;
229 | }
230 |
231 | dt:target, .highlight {
232 | background: #FAF3E8;
233 | }
234 |
235 | div.note {
236 | background-color: #eee;
237 | border: 1px solid #ccc;
238 | }
239 |
240 | div.seealso {
241 | background-color: #ffc;
242 | border: 1px solid #ff6;
243 | }
244 |
245 | div.topic {
246 | background-color: #eee;
247 | }
248 |
249 | p.admonition-title {
250 | display: inline;
251 | }
252 |
253 | p.admonition-title:after {
254 | content: ":";
255 | }
256 |
257 | pre, tt {
258 | font-family: 'Source Code Pro', 'Consolas', 'Menlo', 'Deja Vu Sans Mono', 'Bitstream Vera Sans Mono', monospace;
259 | font-size: 0.9em;
260 | }
261 |
262 | img.screenshot {
263 | }
264 |
265 | tt.descname, tt.descclassname {
266 | font-size: 0.95em;
267 | }
268 |
269 | tt.descname {
270 | padding-right: 0.08em;
271 | }
272 |
273 | img.screenshot {
274 | -moz-box-shadow: 2px 2px 4px #eee;
275 | -webkit-box-shadow: 2px 2px 4px #eee;
276 | box-shadow: 2px 2px 4px #eee;
277 | }
278 |
279 | table.docutils {
280 | border: 1px solid #888;
281 | -moz-box-shadow: 2px 2px 4px #eee;
282 | -webkit-box-shadow: 2px 2px 4px #eee;
283 | box-shadow: 2px 2px 4px #eee;
284 | }
285 |
286 | table.docutils td, table.docutils th {
287 | border: 1px solid #888;
288 | padding: 0.25em 0.7em;
289 | }
290 |
291 | table.field-list, table.footnote {
292 | border: none;
293 | -moz-box-shadow: none;
294 | -webkit-box-shadow: none;
295 | box-shadow: none;
296 | }
297 |
298 | table.footnote {
299 | margin: 15px 0;
300 | width: 100%;
301 | border: 1px solid #eee;
302 | background: #fdfdfd;
303 | font-size: 0.9em;
304 | }
305 |
306 | table.footnote + table.footnote {
307 | margin-top: -15px;
308 | border-top: none;
309 | }
310 |
311 | table.field-list th {
312 | padding: 0 0.8em 0 0;
313 | }
314 |
315 | table.field-list td {
316 | padding: 0;
317 | }
318 |
319 | table.footnote td.label {
320 | width: 0px;
321 | padding: 0.3em 0 0.3em 0.5em;
322 | }
323 |
324 | table.footnote td {
325 | padding: 0.3em 0.5em;
326 | }
327 |
328 | dl {
329 | margin: 0;
330 | padding: 0;
331 | }
332 |
333 | dl dd {
334 | margin-left: 30px;
335 | }
336 |
337 | blockquote {
338 | margin: 0 0 0 30px;
339 | padding: 0;
340 | }
341 |
342 | ul, ol {
343 | margin: 10px 0 10px 30px;
344 | padding: 0;
345 | }
346 |
347 | pre {
348 | background: #eee;
349 | padding: 7px 30px;
350 | margin: 15px -30px;
351 | line-height: 1.3em;
352 | }
353 |
354 | dl pre, blockquote pre, li pre {
355 | margin-left: -60px;
356 | padding-left: 60px;
357 | }
358 |
359 | dl dl pre {
360 | margin-left: -90px;
361 | padding-left: 90px;
362 | }
363 |
364 | tt {
365 | background-color: #ecf0f3;
366 | color: #222;
367 | /* padding: 1px 2px; */
368 | }
369 |
370 | tt.xref, a tt {
371 | background-color: #FBFBFB;
372 | border-bottom: 1px solid white;
373 | }
374 |
375 | a.reference {
376 | text-decoration: none;
377 | border-bottom: 1px dotted #2d4e84;
378 | }
379 |
380 | a.reference:hover {
381 | border-bottom: 1px solid #2069e1;
382 | }
383 |
384 | a.footnote-reference {
385 | text-decoration: none;
386 | font-size: 0.7em;
387 | vertical-align: top;
388 | border-bottom: 1px dotted #2d4e84;
389 | }
390 |
391 | a.footnote-reference:hover {
392 | border-bottom: 1px solid #2069e1;
393 | }
394 |
395 | a:hover tt {
396 | background: #EEE;
397 | }
398 |
399 |
400 | @media screen and (max-width: 870px) {
401 |
402 | div.sphinxsidebar {
403 | display: none;
404 | }
405 |
406 | div.document {
407 | width: 100%;
408 |
409 | }
410 |
411 | div.documentwrapper {
412 | margin-left: 0;
413 | margin-top: 0;
414 | margin-right: 0;
415 | margin-bottom: 0;
416 | }
417 |
418 | div.bodywrapper {
419 | margin-top: 0;
420 | margin-right: 0;
421 | margin-bottom: 0;
422 | margin-left: 0;
423 | }
424 |
425 | ul {
426 | margin-left: 0;
427 | }
428 |
429 | .document {
430 | width: auto;
431 | }
432 |
433 | .footer {
434 | width: auto;
435 | }
436 |
437 | .bodywrapper {
438 | margin: 0;
439 | }
440 |
441 | .footer {
442 | width: auto;
443 | }
444 |
445 | .github {
446 | display: none;
447 | }
448 |
449 |
450 |
451 | }
452 |
453 |
454 |
455 | @media screen and (max-width: 875px) {
456 |
457 | body {
458 | margin: 0;
459 | padding: 20px 30px;
460 | }
461 |
462 | div.documentwrapper {
463 | float: none;
464 | background: white;
465 | }
466 |
467 | div.sphinxsidebar {
468 | display: block;
469 | float: none;
470 | width: 102.5%;
471 | margin: 50px -30px -20px -30px;
472 | padding: 10px 20px;
473 | background: #333;
474 | color: white;
475 | }
476 |
477 | div.sphinxsidebar h3, div.sphinxsidebar h4, div.sphinxsidebar p,
478 | div.sphinxsidebar h3 a {
479 | color: white;
480 | }
481 |
482 | div.sphinxsidebar a {
483 | color: #aaa;
484 | }
485 |
486 | div.sphinxsidebar p.logo {
487 | display: none;
488 | }
489 |
490 | div.document {
491 | width: 100%;
492 | margin: 0;
493 | }
494 |
495 | div.related {
496 | display: block;
497 | margin: 0;
498 | padding: 10px 0 20px 0;
499 | }
500 |
501 | div.related ul,
502 | div.related ul li {
503 | margin: 0;
504 | padding: 0;
505 | }
506 |
507 | div.footer {
508 | display: none;
509 | }
510 |
511 | div.bodywrapper {
512 | margin: 0;
513 | }
514 |
515 | div.body {
516 | min-height: 0;
517 | padding: 0;
518 | }
519 |
520 | .rtd_doc_footer {
521 | display: none;
522 | }
523 |
524 | .document {
525 | width: auto;
526 | }
527 |
528 | .footer {
529 | width: auto;
530 | }
531 |
532 | .footer {
533 | width: auto;
534 | }
535 |
536 | .github {
537 | display: none;
538 | }
539 | }
540 |
541 |
542 | /* scrollbars */
543 |
544 | ::-webkit-scrollbar {
545 | width: 6px;
546 | height: 6px;
547 | }
548 |
549 | ::-webkit-scrollbar-button:start:decrement,
550 | ::-webkit-scrollbar-button:end:increment {
551 | display: block;
552 | height: 10px;
553 | }
554 |
555 | ::-webkit-scrollbar-button:vertical:increment {
556 | background-color: #fff;
557 | }
558 |
559 | ::-webkit-scrollbar-track-piece {
560 | background-color: #eee;
561 | -webkit-border-radius: 3px;
562 | }
563 |
564 | ::-webkit-scrollbar-thumb:vertical {
565 | height: 50px;
566 | background-color: #ccc;
567 | -webkit-border-radius: 3px;
568 | }
569 |
570 | ::-webkit-scrollbar-thumb:horizontal {
571 | width: 50px;
572 | background-color: #ccc;
573 | -webkit-border-radius: 3px;
574 | }
575 |
576 | /* misc. */
577 |
578 | .revsys-inline {
579 | display: none!important;
580 | }
581 |
582 |
583 | .admonition.warning {
584 | background-color: #F5CDCD;
585 | border-color: #7B1B1B;
586 | }
--------------------------------------------------------------------------------
/docs/source/api.rst:
--------------------------------------------------------------------------------
1 | API Documentation
2 | =====================
3 |
4 | ``streamlib.summary``
5 | ---------------------
6 |
7 | .. autoclass:: streamlib.summary.CountMin
8 | :members:
9 | :special-members:
10 | :exclude-members: __dict__, __weakref__
11 | :member-order: bysource
12 |
13 | .. autoclass:: streamlib.summary.CountMedian
14 | :members:
15 | :special-members:
16 | :exclude-members: __dict__, __weakref__
17 | :member-order: bysource
18 |
19 | .. autoclass:: streamlib.summary.CountSketch
20 | :members:
21 | :special-members:
22 | :exclude-members: __dict__, __weakref__
23 | :member-order: bysource
24 |
25 |
26 | .. autoclass:: streamlib.summary.F2
27 | :members:
28 | :special-members:
29 | :exclude-members: __dict__, __weakref__
30 | :member-order: bysource
31 |
32 |
33 |
34 |
35 | ``streamlib.hashes``
36 | ----------------------
37 |
38 | .. autoclass:: streamlib.hashes.MurmurHash
39 | :members:
40 | :special-members:
41 | :exclude-members: __dict__, __weakref__
42 | :member-order: bysource
43 |
--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # StreamLib documentation build configuration file, created by
4 | # sphinx-quickstart on Fri Jun 5 03:03:42 2015.
5 | #
6 | # This file is execfile()d with the current directory set to its
7 | # containing dir.
8 | #
9 | # Note that not all possible configuration values are present in this
10 | # autogenerated file.
11 | #
12 | # All configuration values have a default; values that are commented out
13 | # serve to show the default.
14 |
15 | import sys
16 | import os
17 |
18 | # If extensions (or modules to document with autodoc) are in another directory,
19 | # add these directories to sys.path here. If the directory is relative to the
20 | # documentation root, use os.path.abspath to make it absolute, like shown here.
21 | #sys.path.insert(0, os.path.abspath('.'))
22 |
23 | # -- General configuration ------------------------------------------------
24 |
25 | # If your documentation needs a minimal Sphinx version, state it here.
26 | #needs_sphinx = '1.0'
27 |
28 | # Add any Sphinx extension module names here, as strings. They can be
29 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
30 | # ones.
31 | extensions = [
32 | 'sphinx.ext.autodoc',
33 | 'sphinx.ext.doctest',
34 | 'sphinx.ext.todo',
35 | 'sphinx.ext.coverage',
36 | 'sphinx.ext.mathjax',
37 | 'sphinx.ext.viewcode',
38 | ]
39 |
40 | # Add any paths that contain templates here, relative to this directory.
41 | templates_path = ['_templates']
42 |
43 | # The suffix of source filenames.
44 | source_suffix = '.rst'
45 |
46 | # The encoding of source files.
47 | #source_encoding = 'utf-8-sig'
48 |
49 | # The master toctree document.
50 | master_doc = 'index'
51 |
52 | # General information about the project.
53 | project = u'StreamLib'
54 | copyright = u'2015, Jiecao Chen'
55 |
56 | # The version info for the project you're documenting, acts as replacement for
57 | # |version| and |release|, also used in various other places throughout the
58 | # built documents.
59 | #
60 | # The short X.Y version.
61 | version = '1.0'
62 | # The full version, including alpha/beta/rc tags.
63 | release = '1.0'
64 |
65 | # The language for content autogenerated by Sphinx. Refer to documentation
66 | # for a list of supported languages.
67 | #language = None
68 |
69 | # There are two options for replacing |today|: either, you set today to some
70 | # non-false value, then it is used:
71 | #today = ''
72 | # Else, today_fmt is used as the format for a strftime call.
73 | #today_fmt = '%B %d, %Y'
74 |
75 | # List of patterns, relative to source directory, that match files and
76 | # directories to ignore when looking for source files.
77 | exclude_patterns = []
78 |
79 | # The reST default role (used for this markup: `text`) to use for all
80 | # documents.
81 | #default_role = None
82 |
83 | # If true, '()' will be appended to :func: etc. cross-reference text.
84 | #add_function_parentheses = True
85 |
86 | # If true, the current module name will be prepended to all description
87 | # unit titles (such as .. function::).
88 | #add_module_names = True
89 |
90 | # If true, sectionauthor and moduleauthor directives will be shown in the
91 | # output. They are ignored by default.
92 | #show_authors = False
93 |
94 | # The name of the Pygments (syntax highlighting) style to use.
95 | pygments_style = 'sphinx'
96 |
97 | # A list of ignored prefixes for module index sorting.
98 | #modindex_common_prefix = []
99 |
100 | # If true, keep warnings as "system message" paragraphs in the built documents.
101 | #keep_warnings = False
102 |
103 |
104 | # -- Options for HTML output ----------------------------------------------
105 |
106 | # The theme to use for HTML and HTML Help pages. See the documentation for
107 | # a list of builtin themes.
108 | html_theme = 'default'
109 |
110 | # Theme options are theme-specific and customize the look and feel of a theme
111 | # further. For a list of options available for each theme, see the
112 | # documentation.
113 | #html_theme_options = {}
114 |
115 | # Add any paths that contain custom themes here, relative to this directory.
116 | #html_theme_path = []
117 |
118 | # The name for this set of Sphinx documents. If None, it defaults to
119 | # " v documentation".
120 | #html_title = None
121 |
122 | # A shorter title for the navigation bar. Default is the same as html_title.
123 | #html_short_title = None
124 |
125 | # The name of an image file (relative to this directory) to place at the top
126 | # of the sidebar.
127 | #html_logo = None
128 |
129 | # The name of an image file (within the static path) to use as favicon of the
130 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
131 | # pixels large.
132 | #html_favicon = None
133 |
134 | # Add any paths that contain custom static files (such as style sheets) here,
135 | # relative to this directory. They are copied after the builtin static files,
136 | # so a file named "default.css" will overwrite the builtin "default.css".
137 | html_static_path = ['_static']
138 |
139 | # Add any extra paths that contain custom files (such as robots.txt or
140 | # .htaccess) here, relative to this directory. These files are copied
141 | # directly to the root of the documentation.
142 | #html_extra_path = []
143 |
144 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
145 | # using the given strftime format.
146 | #html_last_updated_fmt = '%b %d, %Y'
147 |
148 | # If true, SmartyPants will be used to convert quotes and dashes to
149 | # typographically correct entities.
150 | #html_use_smartypants = True
151 |
152 | # Custom sidebar templates, maps document names to template names.
153 | #html_sidebars = {}
154 |
155 | # Additional templates that should be rendered to pages, maps page names to
156 | # template names.
157 | #html_additional_pages = {}
158 |
159 | # If false, no module index is generated.
160 | #html_domain_indices = True
161 |
162 | # If false, no index is generated.
163 | #html_use_index = True
164 |
165 | # If true, the index is split into individual pages for each letter.
166 | #html_split_index = False
167 |
168 | # If true, links to the reST sources are added to the pages.
169 | #html_show_sourcelink = True
170 |
171 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
172 | #html_show_sphinx = True
173 |
174 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
175 | #html_show_copyright = True
176 |
177 | # If true, an OpenSearch description file will be output, and all pages will
178 | # contain a tag referring to it. The value of this option must be the
179 | # base URL from which the finished HTML is served.
180 | #html_use_opensearch = ''
181 |
182 | # This is the file name suffix for HTML files (e.g. ".xhtml").
183 | #html_file_suffix = None
184 |
185 | # Output file base name for HTML help builder.
186 | htmlhelp_basename = 'StreamLibdoc'
187 |
188 |
189 | # -- Options for LaTeX output ---------------------------------------------
190 |
191 | latex_elements = {
192 | # The paper size ('letterpaper' or 'a4paper').
193 | #'papersize': 'letterpaper',
194 |
195 | # The font size ('10pt', '11pt' or '12pt').
196 | #'pointsize': '10pt',
197 |
198 | # Additional stuff for the LaTeX preamble.
199 | #'preamble': '',
200 | }
201 |
202 | # Grouping the document tree into LaTeX files. List of tuples
203 | # (source start file, target name, title,
204 | # author, documentclass [howto, manual, or own class]).
205 | latex_documents = [
206 | ('index', 'StreamLib.tex', u'StreamLib Documentation',
207 | u'Jiecao Chen', 'manual'),
208 | ]
209 |
210 | # The name of an image file (relative to this directory) to place at the top of
211 | # the title page.
212 | #latex_logo = None
213 |
214 | # For "manual" documents, if this is true, then toplevel headings are parts,
215 | # not chapters.
216 | #latex_use_parts = False
217 |
218 | # If true, show page references after internal links.
219 | #latex_show_pagerefs = False
220 |
221 | # If true, show URL addresses after external links.
222 | #latex_show_urls = False
223 |
224 | # Documents to append as an appendix to all manuals.
225 | #latex_appendices = []
226 |
227 | # If false, no module index is generated.
228 | #latex_domain_indices = True
229 |
230 |
231 | # -- Options for manual page output ---------------------------------------
232 |
233 | # One entry per manual page. List of tuples
234 | # (source start file, name, description, authors, manual section).
235 | man_pages = [
236 | ('index', 'streamlib', u'StreamLib Documentation',
237 | [u'Jiecao Chen'], 1)
238 | ]
239 |
240 | # If true, show URL addresses after external links.
241 | #man_show_urls = False
242 |
243 |
244 | # -- Options for Texinfo output -------------------------------------------
245 |
246 | # Grouping the document tree into Texinfo files. List of tuples
247 | # (source start file, target name, title, author,
248 | # dir menu entry, description, category)
249 | texinfo_documents = [
250 | ('index', 'StreamLib', u'StreamLib Documentation',
251 | u'Jiecao Chen', 'StreamLib', 'One line description of project.',
252 | 'Miscellaneous'),
253 | ]
254 |
255 | # Documents to append as an appendix to all manuals.
256 | #texinfo_appendices = []
257 |
258 | # If false, no module index is generated.
259 | #texinfo_domain_indices = True
260 |
261 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
262 | #texinfo_show_urls = 'footnote'
263 |
264 | # If true, do not generate a @detailmenu in the "Top" node's menu.
265 | #texinfo_no_detailmenu = False
266 | todo_include_todos = True
267 | sys.path.insert(0, os.path.abspath("../.."))
268 |
--------------------------------------------------------------------------------
/docs/source/getting-started.rst:
--------------------------------------------------------------------------------
1 | Getting Started
2 | =================
3 |
4 | Data Stream
5 | ------------
6 | Any **iterable** object with **hashable** elements can be considered as a data stream. Here are some examples.
7 |
8 | * a list of integers: :code:`[1, 10, 20, 1, 5]`
9 | * a generator that yields random integers, see the instance :code:`dataStream` below,
10 |
11 | .. code-block:: python
12 |
13 | import random
14 |
15 | def demoGen(N = 1000):
16 | i = 0
17 | while i < N:
18 | yield random.randint(0, 10);
19 | i += 1
20 |
21 | dataStream = demoGen()
22 |
23 | * a tuple of strings: :code:`('fix', 'the', 'bug', 'please', '...')`
24 | * a string: :code:`'abcdefgdahfahdfajkhfkahfsahfjksfhjk'`
25 | * many more
26 |
27 |
28 | Summarize the data stream
29 | -------------------------
30 | Many popular algorithms for summarizing data streams are included
31 | in the module **streamlib**. We give some examples to show their basic usage. Most sketches share similar methods, e.g. `processBatch`, `estimate`, `reproduce`, `merge`, etc.
32 |
33 | Count-Min Sketch
34 | #################
35 | Count-Min sketch [cm05]_ summarizes a data stream and estimates the frequency of each element in it. The sketch gives highly accurate estimates for heavy hitters (elements with high frequencies), while relatively large errors may be incurred for light elements. See the following example for basic usage.
36 |
37 | .. code-block:: python
38 |
39 | from streamlib import CountMin
40 | cm = CountMin() # create a instance of CountMin, see document for more detail
41 | cm.processBatch([0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 3, 3, 4])
42 | for i in xrange(5):
43 | print 'Estimated frequency of', i, 'is', cm.estimate(i)
44 |
45 | The result of the above code::
46 |
47 | Estimated frequency of 0 is 4
48 | Estimated frequency of 1 is 6
49 | Estimated frequency of 2 is 1
50 | Estimated frequency of 3 is 2
51 | Estimated frequency of 4 is 1
52 |
53 | One can also create multiple instances of CountMin, each handling a substream (so they can be run in multiple threads).
54 | By merging those instances, we obtain a summary of the joint stream of all substreams. See the following example.
55 |
56 | .. code-block:: python
57 |
58 |
59 | from streamlib import CountMin
60 | cm0 = CountMin() # create a instance of CountMin, see document for more detail
61 | cm1 = cm0.reproduce() # reproduce a compatible sketch of cm0
62 |
63 |
64 | cm0.processBatch([0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 3, 3, 4])
65 | cm1.processBatch([1, 2, 3, 4])
66 | cm = cm0.merge(cm1)
67 | for i in xrange(5):
68 | print 'Estimated frequency of', i, 'is', cm.estimate(i)
69 |
70 | which gives::
71 | 
72 |    Estimated frequency of 0 is 4
73 |    Estimated frequency of 1 is 7
74 |    Estimated frequency of 2 is 2
75 |    Estimated frequency of 3 is 3
76 |    Estimated frequency of 4 is 2
77 |
78 |
79 | Most sketches included in the `summary` module can also handle **weighted** data streams. Let's consider
80 | the following case,
81 |
82 | .. code-block:: python
83 |
84 | from streamlib import CountMin
85 | cm = CountMin()
86 | dataStream = [(0, 20), (1, 4), (2, 1), (3, 1), (4, 5), (1, 100), (0, 500)]
87 | cm.processBatch(dataStream, weighted=True) # set weighted=True
88 |
89 | for i in xrange(5):
90 | print 'Estimated frequency of', i, 'is', cm.estimate(i)
91 |
92 | it gives ::
93 |
94 | Estimated frequency of 0 is 520
95 | Estimated frequency of 1 is 104
96 | Estimated frequency of 2 is 1
97 | Estimated frequency of 3 is 1
98 | Estimated frequency of 4 is 5
99 |
100 |
101 |
102 |
103 | AMS F2 Sketch
104 | #############
105 | The AMS F2 sketch can be used to estimate the second moment of the frequency vector of a data stream.
106 | For example, when the data stream is `[1, 1, 1, 2, 2, 3]`, there are three 1s, two 2s, and one 3, so the second moment of its frequency vector is `3^2 + 2^2 + 1^2 = 14`. Here we show its most basic usage.
107 |
108 | .. code-block:: python
109 |
110 | from streamlib import F2
111 | # set the bucket size as w=100
112 | # the |EstimatedValue - TrueValue| <= TrueValue / sqrt(w)
113 | f2 = F2(w=100)
114 | dataStream = [1, 1, 1, 2, 2, 3]
115 | f2.processBatch(dataStream)
116 | print 'Estimated F2 =', f2.estimate()
117 |
118 |
119 | which gives, ::
120 |
121 | Estimated F2 = 13
122 |
123 |
124 | Bibliography
125 | -------------
126 | .. [ccfc04] Charikar, Moses, Kevin Chen, and Martin Farach-Colton. "Finding frequent items in data streams." Automata, Languages and Programming. Springer Berlin Heidelberg, 2002. 693-703.
127 |
128 | .. [ams] Alon, Noga, Yossi Matias, and Mario Szegedy. "The space complexity of approximating the frequency moments." Proceedings of the twenty-eighth annual ACM symposium on Theory of computing. ACM, 1996.
129 |
130 | .. [bjkst] Bar-Yossef, Ziv, et al. "Counting distinct elements in a data stream." Randomization and Approximation Techniques in Computer Science. Springer Berlin Heidelberg, 2002. 1-10.
131 |
132 | .. [cm05] Cormode, Graham, and S. Muthukrishnan. "An improved data stream summary: the count-min sketch and its applications." Journal of Algorithms 55.1 (2005): 58-75.
133 |
134 | .. [mg82] Misra, Jayadev, and David Gries. "Finding repeated elements." Science of computer programming 2.2 (1982): 143-152.
135 |
136 | .. [myblog] http://jiecchen.github.io/blog/2014/08/13/quantile-sketch/
137 |
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | .. StreamLib documentation master file, created by
2 | sphinx-quickstart on Fri Jun 5 03:03:42 2015.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | Welcome to StreamLib's documentation!
7 | =====================================
8 |
9 | User's Guide
10 | -------------------
11 |
12 | .. toctree::
13 | :maxdepth: 2
14 |
15 | intro
16 | getting-started
17 | usage
18 |
19 |
20 | API Reference
21 | --------------
22 |
23 | .. toctree::
24 | :maxdepth: 3
25 |
26 | api
27 |
28 |
--------------------------------------------------------------------------------
/docs/source/intro.rst:
--------------------------------------------------------------------------------
1 | Introduction
2 | ============
3 |
4 |
--------------------------------------------------------------------------------
/docs/source/usage.rst:
--------------------------------------------------------------------------------
1 | Usage
2 | =============
3 |
--------------------------------------------------------------------------------
/java/bin/DataStructures/AVLNode.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiecchen/StreamLib/1e538f14e9f8e88b7df3c3c8b81c48c251fc646d/java/bin/DataStructures/AVLNode.class
--------------------------------------------------------------------------------
/java/bin/DataStructures/AVLTree.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiecchen/StreamLib/1e538f14e9f8e88b7df3c3c8b81c48c251fc646d/java/bin/DataStructures/AVLTree.class
--------------------------------------------------------------------------------
/java/bin/DataStructures/Distribution.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiecchen/StreamLib/1e538f14e9f8e88b7df3c3c8b81c48c251fc646d/java/bin/DataStructures/Distribution.class
--------------------------------------------------------------------------------
/java/bin/DataStructures/Stream$StreamIterator.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiecchen/StreamLib/1e538f14e9f8e88b7df3c3c8b81c48c251fc646d/java/bin/DataStructures/Stream$StreamIterator.class
--------------------------------------------------------------------------------
/java/bin/DataStructures/Stream.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiecchen/StreamLib/1e538f14e9f8e88b7df3c3c8b81c48c251fc646d/java/bin/DataStructures/Stream.class
--------------------------------------------------------------------------------
/java/bin/DataStructures/StreamItem.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiecchen/StreamLib/1e538f14e9f8e88b7df3c3c8b81c48c251fc646d/java/bin/DataStructures/StreamItem.class
--------------------------------------------------------------------------------
/java/bin/Hash/TwoUniversal.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiecchen/StreamLib/1e538f14e9f8e88b7df3c3c8b81c48c251fc646d/java/bin/Hash/TwoUniversal.class
--------------------------------------------------------------------------------
/java/bin/StreamingLib/CountSketch.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiecchen/StreamLib/1e538f14e9f8e88b7df3c3c8b81c48c251fc646d/java/bin/StreamingLib/CountSketch.class
--------------------------------------------------------------------------------
/java/bin/StreamingLib/MG.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiecchen/StreamLib/1e538f14e9f8e88b7df3c3c8b81c48c251fc646d/java/bin/StreamingLib/MG.class
--------------------------------------------------------------------------------
/java/bin/StreamingLib/Sketch.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiecchen/StreamLib/1e538f14e9f8e88b7df3c3c8b81c48c251fc646d/java/bin/StreamingLib/Sketch.class
--------------------------------------------------------------------------------
/java/src/DataStructures/AVLNode.java:
--------------------------------------------------------------------------------
1 | package DataStructures;
2 |
3 | import java.lang.Comparable;
4 |
5 | public class AVLNode {
6 |
7 | private AVLNode parent;
8 | private AVLNode left;
9 | private AVLNode right;
10 | private Comparable key;
11 | private int frequency;
12 | private int height;
13 |
14 | public AVLNode() {
15 | this.parent = null;
16 | this.left = null;
17 | this.right = null;
18 | this.height = -1;
19 | }
20 |
21 | public AVLNode(Comparable key) {
22 | this();
23 | this.key = key;
24 | this.frequency = 1;
25 | this.height = 0;
26 | }
27 |
28 | public static int height(AVLNode node) {
29 | if(node == null) {
30 | return -1;
31 | } else return node.height;
32 | }
33 |
34 | public Comparable key() {
35 | return this.key;
36 | }
37 |
38 | protected void setKey(Comparable key) {
39 | this.key = key;
40 | }
41 |
42 | public int frequency() {
43 | return this.frequency;
44 | }
45 |
46 | protected void incFreq() {
47 | this.frequency++;
48 | }
49 |
50 | protected void decFreq() {
51 | this.frequency--;
52 | }
53 |
54 | protected void setFreq(int freq) {
55 | this.frequency = freq;
56 | }
57 |
58 | public AVLNode left() {
59 | return this.left;
60 | }
61 |
62 | public AVLNode right() {
63 | return this.right;
64 | }
65 |
66 | public AVLNode parent() {
67 | return this.parent;
68 | }
69 |
70 | protected void setHeight() {
71 | this.height = Math.max(height(left), height(right)) + 1;
72 | }
73 |
74 | protected void setLeft(AVLNode node) {
75 | this.left = node;
76 | }
77 |
78 | protected void setRight(AVLNode node) {
79 | this.right = node;
80 | }
81 |
82 | protected void setParent(AVLNode node) {
83 | this.parent = node;
84 | }
85 |
86 | public int balanceFactor() {
87 | return height(left) - height(right);
88 | }
89 |
90 | }
91 |
--------------------------------------------------------------------------------
/java/src/DataStructures/AVLTree.java:
--------------------------------------------------------------------------------
1 | package DataStructures;
2 |
3 | import java.lang.Comparable;
4 |
5 |
6 | public class AVLTree {
7 |
8 | private AVLNode root;
9 | private int size;
10 |
11 | public AVLTree() {
12 | this.root = null;
13 | this.size = 0;
14 | }
15 |
16 | public AVLNode find(Comparable key) {
17 | return find(this.root, key);
18 | }
19 |
20 | private AVLNode find(AVLNode node, Comparable key) {
21 | if(node == null) {
22 | return null;
23 | } else if(key.compareTo(node.key()) < 0) {
24 | return find(node.left(), key);
25 | } else if(key.compareTo(node.key()) > 0) {
26 | return find(node.right(), key);
27 | } else return node;
28 | }
29 |
30 | public void insert(Comparable key) {
31 | root = insert(this.root, key);
32 | }
33 |
34 | public void delete(Comparable key) {
35 | root = delete(this.root, key);
36 | }
37 |
38 | private AVLNode insert(AVLNode node, Comparable key) {
39 | AVLNode newRoot;
40 | /*A*/
41 | if(node == null) {
42 | newRoot = new AVLNode(key);
43 | this.size++;
44 | }
45 | /*B*/
46 | //search left subtree
47 | else if(key.compareTo(node.key()) < 0) {
48 | newRoot = insert(node.left(), key);
49 | node.setLeft(newRoot);
50 | newRoot.setParent(node);
51 |
52 | //rebalance
53 | if(node.balanceFactor() == 2) {
54 | if(node.left().balanceFactor() > 0) {
55 | newRoot = singleRightRotate(node);
56 | } else {
57 | newRoot = doubleRightRotate(node);
58 | }
59 | } else {
60 | newRoot = node;
61 | }
62 | }
63 | /*C*/
64 | //search right subtree
65 | else if(key.compareTo(node.key()) > 0) {
66 | newRoot = insert(node.right(), key);
67 | node.setRight(newRoot);
68 | newRoot.setParent(node);
69 |
70 | //rebalance
71 | if(node.balanceFactor() == -2) {
72 | if(node.right().balanceFactor() < 0) {
73 | newRoot = singleLeftRotate(node);
74 | } else {
75 | newRoot = doubleLeftRotate(node);
76 | }
77 | } else {
78 | newRoot = node;
79 | }
80 | }
81 | //key already in tree; increase frequency
82 | else {
83 | node.incFreq();
84 | newRoot = node;
85 | }
86 |
87 | /* Adjust Height */
88 | newRoot.setHeight();
89 |
90 | return newRoot;
91 | }
92 |
93 | private AVLNode delete(AVLNode node, Comparable key) {
94 | AVLNode newRoot;
95 | /*A*/
96 | if(node == null) {
97 | System.out.println("Key not in tree");
98 | return node;
99 | }
100 | /*B*/
101 | //search left subtree
102 | else if(key.compareTo(node.key()) < 0) {
103 | newRoot = delete(node.left(), key);
104 | node.setLeft(newRoot);
105 | if(newRoot != null) newRoot.setParent(node);
106 |
107 | //rebalance
108 | if(node.balanceFactor() == -2) {
109 | if(node.right().balanceFactor() <= 0) { //a balanced right child also needs a single rotation after deletion
110 | newRoot = singleLeftRotate(node);
111 | } else {
112 | newRoot = doubleLeftRotate(node);
113 | }
114 | } else {
115 | newRoot = node;
116 | }
117 | }
118 | /*C*/
119 | //search right subtree
120 | else if(key.compareTo(node.key()) > 0) {
121 | newRoot = delete(node.right(), key);
122 | node.setRight(newRoot);
123 | if(newRoot != null) newRoot.setParent(node);
124 |
125 | //rebalance
126 | if(node.balanceFactor() == 2) {
127 | if(node.left().balanceFactor() >= 0) { //a balanced left child also needs a single rotation after deletion
128 | newRoot = singleRightRotate(node);
129 | } else {
130 | newRoot = doubleRightRotate(node);
131 | }
132 | } else {
133 | newRoot = node;
134 | }
135 | }
136 | //key already in tree; decrease frequency or delete
137 | else {
138 | if(node.frequency() > 1) {
139 | node.decFreq();
140 | newRoot = node;
141 | } else {
142 | this.size--;
143 | //leaf
144 | if((node.left() == null) && (node.right() == null)) {
145 | return null; //WAS newRoot = null, but want to bypass setHeight();
146 | }
147 | //1 child
148 | else if((node.left() == null) ^ (node.right() == null)) {
149 | newRoot = (node.left() != null) ? node.left() : node.right();
150 | }
151 | //2 children
152 | else {
153 | AVLNode max = max(node.left()); //maximum of left subtree replaces the deleted key
154 | Comparable newKey = max.key();
155 | int newFreq = max.frequency(); //record before resetting, so the count is not lost
156 | max.setFreq(1); //force the recursive delete to remove the successor node
157 | newRoot = delete(node, max.key());
158 | node.setKey(newKey);
159 | node.setFreq(newFreq);
160 | }
161 | }
162 | }
163 |
164 | /* Adjust Height */
165 | newRoot.setHeight();
166 |
167 | return newRoot;
168 | }
169 |
170 | private AVLNode singleLeftRotate(AVLNode A) {
171 | AVLNode B = A.right();
172 | A.setRight(B.left());
173 | if(A.right() != null) A.right().setParent(A);
174 | B.setLeft(A);
175 | A.setParent(B);
176 |
177 | A.setHeight();
178 | B.setHeight(); //technically don't need
179 | return B;
180 | }
181 |
182 | private AVLNode doubleLeftRotate(AVLNode A) {
183 | AVLNode B = A.right();
184 | AVLNode C = B.left();
185 |
186 | B.setLeft(C.right());
187 | if(B.left() != null) B.left().setParent(B);
188 |
189 | A.setRight(C.left());
190 | if(A.right() != null) A.right().setParent(A);
191 |
192 | C.setLeft(A);
193 | A.setParent(C);
194 |
195 | C.setRight(B);
196 | B.setParent(C);
197 |
198 | A.setHeight();
199 | B.setHeight();
200 | C.setHeight(); //technically don't need
201 | return C;
202 | }
203 |
204 | private AVLNode singleRightRotate(AVLNode A) {
205 | AVLNode B = A.left();
206 | A.setLeft(B.right());
207 | if(A.left() != null) A.left().setParent(A);
208 | B.setRight(A);
209 | A.setParent(B);
210 |
211 | A.setHeight();
212 | B.setHeight(); //technically don't need
213 | return B;
214 | }
215 |
216 | private AVLNode doubleRightRotate(AVLNode A) {
217 | AVLNode B = A.left();
218 | AVLNode C = B.right();
219 |
220 | B.setRight(C.left());
221 | if(B.right() != null) B.right().setParent(B);
222 |
223 | A.setLeft(C.right());
224 | if(A.left() != null) A.left().setParent(A);
225 |
226 | C.setRight(A);
227 | A.setParent(C);
228 |
229 | C.setLeft(B);
230 | B.setParent(C);
231 |
232 | A.setHeight();
233 | B.setHeight();
234 | C.setHeight(); //technically don't need
235 | return C;
236 | }
237 |
238 | private AVLNode max(AVLNode node) {
239 | if(node.right() == null) return node;
240 | else return max(node.right());
241 | }
242 |
243 | public int size() {
244 | return this.size;
245 | }
246 |
247 | public void printTree() {
248 | printTree(this.root);
249 | System.out.println();
250 | }
251 |
252 | private void printTree(AVLNode node) {
253 | System.out.print("(");
254 | if(node != null) {
255 | System.out.print(node.key() + ", " + AVLNode.height(node) + " ");
256 | printTree(node.left());
257 | printTree(node.right());
258 | }
259 | System.out.print(")");
260 | }
261 |
262 |
263 | public static void main(String args[]) {
264 | AVLTree tree = new AVLTree();
265 | tree.insert(10);
266 | tree.insert(5);
267 | tree.insert(30);
268 | tree.insert(6);
269 | tree.insert(20);
270 | tree.insert(35);
271 | tree.insert(25);
272 | tree.insert(25);
273 | tree.printTree();
274 | tree.delete(25);
275 | tree.delete(25);
276 | tree.printTree();
277 | }
278 | }
279 |
--------------------------------------------------------------------------------
/java/src/DataStructures/Distribution.java:
--------------------------------------------------------------------------------
1 | package DataStructures;
2 |
3 | import java.util.Iterator;
4 | import java.util.List;
5 | import java.util.HashMap;
6 | import java.util.Map;
7 | import java.util.Random;
8 |
9 | import java.util.ArrayList;
10 | import java.util.LinkedList;
11 | import java.util.Arrays;
12 |
13 | public class Distribution<K> {
14 |
15 | private double cumm;
16 | private HashMap<Double, K> dict;
17 | private Random rand;
18 | private ArrayList<Double> index;
19 |
20 | public Distribution(List<K> stream) {
21 | HashMap<K, Double> tempDict = new HashMap<>();
22 | for(K key : stream) {
23 | tempDict.put(key, 1.0);
24 | }
25 |
26 | this.cumm = 0;
27 | this.index = new ArrayList<>();
28 | index.add(cumm);
29 | this.dict = buildDict(tempDict);
30 |
31 | this.rand = new Random();
32 |
33 | }
34 |
35 | private HashMap<Double, K> buildDict(HashMap<K, Double> map) {
36 | HashMap<Double, K> ans = new HashMap<>();
37 | Iterator<Map.Entry<K, Double>> iter = map.entrySet().iterator();
38 | while(iter.hasNext()) {
39 | Map.Entry<K, Double> entry = iter.next();
40 | ans.put(cumm, entry.getKey());
41 | cumm += entry.getValue();
42 | index.add(cumm);
43 | }
44 | return ans;
45 | }
46 |
47 | //NOT DONE!!!
48 | public K getSample() {
49 | double randNum = rand.nextDouble() * cumm;
50 | int pos = bisectLeft(randNum, index, 0, index.size());
51 | return null;
52 | }
53 |
54 | private int bisectLeft(double x, ArrayList<Double> list, int left, int right) {
55 | if(left >= right) return left;
56 | int mid = (left + right) / 2;
57 | //the search range shrinks on every recursive call, so the recursion terminates
58 | if(x > list.get(mid)) return bisectLeft(x, list, mid + 1, right);
59 | else if(x < list.get(mid)) return bisectLeft(x, list, left, mid);
60 | else return mid;
61 | }
62 |
63 | public static void main(String[] args) {
64 | /*ArrayList list = new ArrayList<>();
65 | list.add(5);
66 | list.add(4);
67 | list.add(3);
68 | list.add(3);
69 | list.add(5);
70 | Distribution dist = new Distribution(list); */
71 |
72 | }
73 |
74 | }
75 |
--------------------------------------------------------------------------------
/java/src/DataStructures/Stream.java:
--------------------------------------------------------------------------------
1 | package DataStructures;
2 |
3 | import java.util.Arrays;
4 | import java.util.Iterator;
5 | import java.util.ArrayList;
6 | import java.util.List;
7 | import java.util.Map;
8 | import java.util.HashMap;
9 | import java.util.ConcurrentModificationException;
10 | import java.lang.NoSuchMethodException;
11 |
12 | public class Stream<T> implements Iterable<StreamItem<T>> {
13 |
14 | private ArrayList<StreamItem<T>> stream;
15 |
16 | public Stream() {
17 | this.stream = new ArrayList<>();
18 | }
19 |
20 |
21 | public Stream(List<T> list) {
22 | this();
23 | for(T item : list) {
24 | stream.add(new StreamItem<T>(item, 1.0));
25 | }
26 | }
27 |
28 | public Stream(T[] array) {
29 | this(Arrays.asList(array));
30 | }
31 |
32 | public Stream(Map<T, ? extends Number> map) {
33 | this();
34 | Iterator<? extends Map.Entry<T, ? extends Number>> iter = map.entrySet().iterator();
35 | while(iter.hasNext()) {
36 | Map.Entry<T, ? extends Number> entry = iter.next();
37 | stream.add(new StreamItem<T>(entry.getKey(), entry.getValue().doubleValue()));
38 | }
39 | }
40 |
41 | //delete later
42 | public void put(StreamItem<T> streamItem) {
43 | this.stream.add(streamItem);
44 | }
45 |
46 | //delete later
47 | public void put(T item, int weight) {
48 | this.put(new StreamItem<T>(item, weight));
49 | }
50 |
51 | public StreamIterator iterator() {
52 | return new StreamIterator();
53 | }
54 |
55 | public static void main(String[] args) {
56 | HashMap<String, Integer> map = new HashMap<>();
57 | map.put("Hi", 2);
58 | map.put("Bye", 5);
59 | map.put("Why",-3);
60 | Integer[] array = {5, 6, 3, 4, 2, 2, 2, 2, 1, 2, 1, 3, 2, 3};
61 | //ArrayList arrayList = new ArrayList((ArrayList)Arrays.asList(array));
62 | Stream<String> stream = new Stream<>(map);
63 | for(StreamItem<String> item : stream) {
64 | System.out.println(item);
65 | }
66 | }
67 |
68 | private class StreamIterator implements Iterator<StreamItem<T>> {
69 |
70 | private int expectedSize = stream.size();
71 | private int index;
72 |
73 | public boolean hasNext() {
74 | if(index >= expectedSize) return false;
75 | return true;
76 | }
77 |
78 | public StreamItem<T> next() {
79 | if(stream.size() != expectedSize) {
80 | throw new ConcurrentModificationException();
81 | } else {
82 | return stream.get(index++);
83 | }
84 | }
85 |
86 | public void remove() {
87 | }
88 |
89 | }
90 |
91 | }
92 |
--------------------------------------------------------------------------------
/java/src/DataStructures/StreamItem.java:
--------------------------------------------------------------------------------
1 | package DataStructures;
2 |
3 | import java.util.Iterator;
4 |
5 | public class StreamItem<T> {
6 |
7 | private T item;
8 | private double weight;
9 |
10 | public StreamItem(T item, double weight) {
11 | this.item = item;
12 | this.weight = weight;
13 | }
14 |
15 | public T item() {
16 | return this.item;
17 | }
18 |
19 | public double weight() {
20 | return this.weight;
21 | }
22 |
23 | public String toString() {
24 | return "(" + item + ", " + weight + ")";
25 | }
26 |
27 | }
28 |
--------------------------------------------------------------------------------
/java/src/Hash/TwoUniversal.java:
--------------------------------------------------------------------------------
1 | package Hash;
2 |
3 | import java.util.Random;
4 |
5 | public class TwoUniversal<E> {
6 |
7 | private final int PRIME;
8 | private int a, b, m;
9 | private Random random;
10 |
11 | public TwoUniversal() {
12 | this(40127);
13 |
14 | }
15 |
16 | public TwoUniversal(int m) {
17 | this.random = new Random();
18 | this.m = m;
19 | this.PRIME = generatePrime();
20 | this.generateHash();
21 | }
22 |
23 | private static boolean isPrime(long n) {
24 | if(n < 2) return false;
25 | if(n == 2 || n == 3) return true;
26 | if(n%2 == 0 || n%3 == 0) return false;
27 | long sqrtN = (long) Math.sqrt(n)+1;
28 | for(long i = 6L; i <= sqrtN; i += 6) {
29 | if(n%(i-1) == 0 || n%(i+1) == 0) return false;
30 | }
31 | return true;
32 | }
33 |
34 | /* TODO:
35 | * Rewrite this terribly awful way of finding primes
36 | */
37 | private int findPrime() {
38 | return random.nextInt(m) + m;
39 | }
40 |
41 | /* TODO:
42 | * Rewrite this terribly awful way of finding primes
43 | */
44 | private int generatePrime() {
45 | int rand = 0;
46 | while(! isPrime(rand)) {
47 | rand = findPrime();
48 | }
49 | return rand;
50 | }
51 |
52 | public void generateHash() {
53 | this.a = random.nextInt(PRIME-1)+1;
54 | this.b = random.nextInt(PRIME);
55 | }
56 |
57 | public int hash(E e) {
58 | if(e instanceof Number) {
59 | return hashInt((Number) e);
60 | }
61 | else {
62 | return hashInt(e.hashCode());
63 | }
64 | }
65 |
66 | public int hashInt(Number x) {
67 | //System.out.println("\t\tUsing a = " + a + " and b = " + b + " and p = " + PRIME);
68 | int hash = (int)((((a * x.doubleValue()) + b) % PRIME) % m);
69 | return (hash < 0) ? (hash + m) : hash;
70 | }
71 |
72 | public boolean equals(TwoUniversal<E> other) {
73 | return (this.a == other.a) && (this.b == other.b) &&
74 | (this.PRIME == other.PRIME) && (this.m == other.m);
75 | }
76 |
77 | public String toString() {
78 | return "((((" + this.a + " * x) + " + this.b + ") % " + this.PRIME + ") % " + this.m + ")";
79 | }
80 |
81 | public static void main(String[] args) {
82 | TwoUniversal<Integer> u = new TwoUniversal<>(2);
83 | TwoUniversal<Integer> v = new TwoUniversal<>(2);
84 | /*u.generateHash();
85 | for(int i = 0; i < 100; i++) {
86 | System.out.println(u.hash(i));
87 | }*/
88 | System.out.println(u.equals(v));
89 | }
90 |
91 | }
92 |
--------------------------------------------------------------------------------
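TwoUniversal above implements the textbook 2-universal family h(x) = ((a*x + b) mod p) mod m, with the prime p drawn from [m, 2m) by rejection sampling and a fresh (a, b) pair per generateHash() call. The following Python sketch mirrors that construction for readers following the library from the Python side; the class and helper names are illustrative and are not part of streamlib.

import random

def _is_prime(n):
    # trial division; fine for the small ranges used here
    if n < 2:
        return False
    i = 2
    while i * i <= n:
        if n % i == 0:
            return False
        i += 1
    return True

class TwoUniversalHash(object):
    """Illustrative ((a*x + b) % p) % m hash, mirroring Hash/TwoUniversal.java."""
    def __init__(self, m):
        self.m = m
        p = 0
        while not _is_prime(p):            # pick a prime in [m, 2m), as the Java code does
            p = random.randint(m, 2 * m - 1)
        self.p = p
        self.a = random.randint(1, p - 1)  # a != 0
        self.b = random.randint(0, p - 1)

    def hash(self, key):
        x = key if isinstance(key, int) else hash(key)
        return ((self.a * x + self.b) % self.p) % self.m

# example: map a few keys into 8 buckets
h = TwoUniversalHash(8)
print([h.hash(k) for k in ("hi", "bye", 42, 42)])
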
/java/src/StreamingLib/CountSketch.java:
--------------------------------------------------------------------------------
1 | package StreamingLib;
2 |
3 | import java.util.ArrayList;
4 | import java.util.Arrays;
5 |
6 | import DataStructures.Stream;
7 | import DataStructures.StreamItem;
8 | import Hash.TwoUniversal;
9 |
10 | public class CountSketch<T> extends Sketch<T> {
11 |
12 | private double epsilon;
13 | private int[] C;
14 | private TwoUniversal<T> h;
15 | private TwoUniversal<T> g;
16 |
17 | public CountSketch() {
18 | this(0.01); //delegate so the counter array and hash functions are initialized
19 | }
20 |
21 | public CountSketch(double epsilon) {
22 | if(epsilon <= 0) {
23 | throw new IllegalArgumentException("Epsilon must be positive");
24 | }
25 | this.epsilon = epsilon;
26 | int capacity = (int)Math.ceil(3/Math.pow(epsilon, 2));
27 | this.C = new int[capacity];
28 | Arrays.fill(C, 0, capacity, 0);
29 | this.h = new TwoUniversal(capacity);
30 | this.g = new TwoUniversal(2); //will produce either 0 or 1
31 | }
32 |
33 | public void process(StreamItem<T> item) {
34 | int gInt = g.hash(item.item());
35 | gInt = (gInt == 0) ? -1 : gInt;
36 | int hInt = h.hash(item.item());
37 |
38 | C[hInt] = C[hInt] + ((int)item.weight())*gInt ;
39 | }
40 |
41 | public int getEstimation(T t) {
42 | return ((g.hash(t) == 0) ? -1 : g.hash(t)) * this.C[h.hash(t)];
43 | }
44 |
45 | public Sketch<T> merge(Sketch<T> other) {
46 | return null;
47 | }
48 |
49 | public static void main(String[] args) {
50 | CountSketch<String> cs = new CountSketch<>(.5);
51 | String[] array = {"hi", "hi", "what", "Rachel", "hi", "hi", "Jeremy", "Jeremy", "Jeremy", "there", "there", "Rachel"};
52 | Stream<String> stream = new Stream<>(array);
53 | cs.batchProcess(stream);
54 | System.out.println(cs.getEstimation("hi"));
55 |
56 | }
57 |
58 | }
59 |
--------------------------------------------------------------------------------
/java/src/StreamingLib/MG.java:
--------------------------------------------------------------------------------
1 | package StreamingLib;
2 |
3 | import java.util.HashMap;
4 | import java.util.Iterator;
5 | import java.util.Map;
6 | import DataStructures.Stream;
7 | import DataStructures.StreamItem;
8 |
9 | public class MG<T> extends Sketch<T> {
10 |
11 | private int k;
12 | private HashMap<T, Integer> A;
13 |
14 | public MG() {
15 | this.k = 2;
16 | this.A = new HashMap<>();
17 | }
18 |
19 | public MG(int k) {
20 | this();
21 | this.k = k;
22 | }
23 |
24 | public void process(StreamItem<T> item) {
25 | if(A.containsKey(item.item())) {
26 | A.put(item.item(), A.get(item.item()) + 1);
27 | } else if(A.size() < k-1) {
28 | A.put(item.item(), 1);
29 | } else {
30 | Iterator<Map.Entry<T, Integer>> iter = A.entrySet().iterator();
31 | while(iter.hasNext()) {
32 | Map.Entry<T, Integer> entry = iter.next();
33 | Integer freq = entry.getValue() - 1;
34 | if(freq == 0) iter.remove();
35 | else entry.setValue(freq);
36 | }
37 | }
38 | }
39 |
40 | public int getEstimation(T item) {
41 | if(A.containsKey(item)) return A.get(item);
42 | else return 0;
43 | }
44 |
45 | //Not defined for MG: return some error?
46 | public Sketch<T> merge(Sketch<T> other) {
47 | return null;
48 | }
49 |
50 | /**
51 | * @param args
52 | */
53 | public static void main(String[] args) {
54 | MG<Integer> mg = new MG<>(5);
55 | Integer[] list = {1, 1, 1, 1, 5, 6, 2, 5, 5, 5, 1, 5, 6, 6, 9, 3};
56 | Stream<Integer> stream = new Stream<>(list);
57 | //int[] list = {1, 2, 5, 6, 9};
58 | mg.batchProcess(stream);
59 | System.out.println(mg.getEstimation(6));
60 | }
61 |
62 | }
63 |
--------------------------------------------------------------------------------
/java/src/StreamingLib/Sketch.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | */
4 | package StreamingLib;
5 |
6 | import DataStructures.Stream;
7 | import DataStructures.StreamItem;
8 |
9 | /**
10 | * @author rachellowden
11 | *
12 | */
13 |
14 | public abstract class Sketch<T> {
15 |
16 | public abstract void process(StreamItem<T> t);
17 |
18 | public void batchProcess(Stream<T> s) {
19 | for(StreamItem<T> t : s) {
20 | this.process(t);
21 | }
22 | }
23 |
24 | public abstract int getEstimation(T t);
25 |
26 | public abstract Sketch<T> merge(Sketch<T> other);
27 |
28 | }
29 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from setuptools import setup, find_packages
3 | from codecs import open
4 | import os
5 | # from distutils.core import Command  # only needed if the PyTest command below is re-enabled
6 |
7 | def read(fname):
8 | path = os.path.join(os.path.dirname(__file__), fname)
9 | return open(path, encoding='utf-8').read()
10 |
11 | # class PyTest(Command):
12 | # user_options = []
13 | # def initialize_options(self):
14 | # pass
15 | # def finalize_options(self):
16 | # pass
17 | # def run(self):
18 | # import sys,subprocess
19 | # errno = subprocess.call([sys.executable, 'runtests.py'])
20 | # raise SystemExit(errno)
21 |
22 |
23 |
24 | setup(
25 | name="StreamLib",
26 | version="1.0.1",
27 | packages=find_packages(),
28 |
29 | # development metadata
30 | zip_safe=True,
31 |
32 | # metadata for upload to PyPI
33 | author="Jiecao Chen",
34 | author_email="chenjiecao@gmail.com",
35 | description="StreamLib: Library of streaming algorithms in python",
36 | license="MIT",
37 | keywords="Streaming Algorithms",
38 | url="https://github.com/jiecchen/StreamLib",
39 | classifiers=[
40 | # "Development Status :: 5 - Production/Stable",
41 | "Intended Audience :: Developers",
42 | "Intended Audience :: Researchers",
43 | "Intended Audience :: Data Scientists",
44 | "License :: OSI Approved :: MIT License",
45 | "Topic :: Algorithms",
46 | "Topic :: Algorithms :: Streaming Algorithms",
47 | "Topic :: Utilities",
48 | "Programming Language :: Python :: 2.6",
49 | "Programming Language :: Python :: 2.7",
50 | # "Programming Language :: Python :: 3.3",
51 | # "Programming Language :: Python :: 3.4",
52 | # "Programming Language :: Python :: Implementation :: PyPy",
53 | "Operating System :: OS Independent"
54 | ],
55 |
56 | long_description=read('README.rst'),
57 | install_requires=['mmh3'],
58 | # cmdclass = {'test': PyTest},
59 | )
60 |
--------------------------------------------------------------------------------
/streamlib/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "1.0.1"
2 |
3 |
4 |
5 | from streamlib.hashes import MurmurHash
6 | from streamlib.summary import CountMin, CountMedian, CountSketch, F2
7 |
8 |
9 |
10 | __all__ = ('MurmurHash', 'CountMin', 'CountMedian', 'CountSketch', 'F2')
11 |
--------------------------------------------------------------------------------
/streamlib/hashes.py:
--------------------------------------------------------------------------------
1 | """
2 | Interface and implementation for hash functions.
3 | """
4 |
5 | from abc import ABCMeta, abstractmethod
6 | import mmh3
7 | import random
8 |
9 | class _Hash(object):
10 | """
11 | Interface for Hash Object.
12 | """
13 | @abstractmethod
14 | def hash(self, key):
15 | """
16 | Map the given key to an integer.
17 |
18 | :param key: a hashable object
19 |
20 | :return:
21 | :rtype: int
22 | """
23 | raise NotImplementedError('To be overridden!')
24 |
25 | class MurmurHash(_Hash):
26 | """
27 | Murmur Hash Function.
28 | """
29 | def __init__(self):
30 | self._seed = random.randint(0, 1 << 31)
31 |
32 | def hash(self, key):
33 | """
34 | Return the hash value of key.
35 |
36 | :param key: can be any hashable object
37 |
38 | :return:
39 | :rtype: int
40 | """
41 | v = mmh3.hash(str(key.__hash__()), self._seed)
42 | return -(v + 1) if v < 0 else v
43 |
44 |
--------------------------------------------------------------------------------
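MurmurHash is the only hash object the exported sketches use: each instance fixes a random seed, so it maps a given key to the same non-negative integer every time, and the sketches reduce that value modulo their bucket count. A small usage sketch follows; the bucket width w = 16 is an arbitrary value chosen for illustration.

from streamlib.hashes import MurmurHash

w = 16                  # illustrative number of buckets
h = MurmurHash()        # each instance draws its own random seed

# a given instance is deterministic for a given key ...
assert h.hash("apple") == h.hash("apple")

# ... and any hashable key can be reduced to a bucket index in [0, w)
for key in ("apple", "banana", 42, (1, 2)):
    print((key, h.hash(key) % w))
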
/streamlib/hashes_bak/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/streamlib/hashes_bak/makefile:
--------------------------------------------------------------------------------
1 | clean:
2 | rm *.py~ *.pyc
3 |
--------------------------------------------------------------------------------
/streamlib/hashes_bak/universalHashing.py:
--------------------------------------------------------------------------------
1 | # Implementation of a Universal Hash Family.
2 | # Analysis of the algorithm can be found in (sec 10.6.1)
3 | # http://www.cs.cmu.edu/~avrim/451f11/lectures/lect1004.pdf
4 | # or
5 | # https://github.com/jiecchen/references/blob/master/lect1004.pdf
6 |
7 |
8 |
9 |
10 | from bisect import bisect_left
11 | import math
12 | import random
13 |
14 | # Primes = (3, 7, 13, 19, 29, 37, 43, 53, 61, 71, 79, 89, 101, 107, 113, 131,
15 | # 139, 151, 163, 173, 181, 193, 199, 223, 229, 239, 251, 263, 271, 281,
16 | # 293, 311, 317, 337, 349, 359, 373, 383, 397, 409, 421, 433, 443, 457,
17 | # 463, 479, 491, 503, 521, 541, 557, 569, 577, 593, 601, 613, 619, 641,
18 | # 647, 659, 673, 683, 701, 719, 733, 743, 757, 769, 787, 809, 821, 827, 839,
19 | # 857, 863, 881, 887, 911, 929, 941, 953, 971, 983, 997, 1013, 1021, 1033,
20 | # 1049, 1061, 1069, 1091, 1097, 1109, 1123, 1151, 1163, 1181, 1193, 1213, 1223,
21 | # 1289, 1999, 2551, 3023, 3469, 3851, 4217, 4561, 4909, 5197, 5501,
22 | # 5779, 6053, 6301, 6569, 6823, 7027, 7297, 7541, 7727, 7951, 8209,
23 | # 8419, 8629, 8807, 9007, 9203, 9397, 9547, 9743, 9907, 10111, 10273,
24 | # 10459, 10651, 10847, 11003, 11173, 11353, 11519, 11717, 11887, 12011,
25 | # 12163, 12343, 12487, 12611, 12757, 12917, 13043, 13183, 13337, 13499,
26 | # 13679, 13781, 13907, 14071, 14249, 14407, 14537, 14653, 14767, 14887,
27 | # 15053, 15173, 15287, 15391, 15527, 15649, 15767, 15889, 16033, 16139,
28 | # 16273, 16421, 16553, 16673, 16823, 16937, 17041, 17183, 17317, 17417,
29 | # 17509, 17627, 17761, 17891, 17981, 18097, 18199, 18301, 18413, 18517,
30 | # 18661, 18787, 18919, 19069, 19183, 19289, 19417)
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 | # # TODO: fix the bug that the returned hash value might be larger than M
39 | # # This bug is due to the gap between M and its nearest prime number
40 | # class _LinearHash:
41 | # """
42 | # hash function use linear combination % M,
43 | # for internal use only
44 | # """
45 | # def __init__(self, _M, _rd):
46 | # # make sure self._M is a prime number right rather than _M
47 | # _M = 19289 if _M > 19289 else _M
48 | # pos = bisect_left(Primes, _M)
49 | # self._M = _M #Primes[pos]
50 | # # make sure self._base < self._M
51 | # b = int(math.log(self._M, 2)) - 1
52 | # self._base = (1 << b) - 1
53 | # self._b = b
54 | # self._para = [_rd.choice(range(self._M)) for i in range(100 / self._b + 1)]
55 | # # print (self._M, self._base, self._b, self._para)
56 |
57 |
58 | # def hash(self, key):
59 | # """
60 | # Given a key (should be hashable type),
61 | # return the hashed value
62 | # """
63 | # try:
64 | # x_int = key.__hash__()
65 | # except AttributeError:
66 | # x_int = hash(key)
67 |
68 | # if x_int < 0:
69 | # x_int = -x_int - 1
70 |
71 | # i = 0
72 | # hash_value = 0
73 | # while (x_int):
74 | # current = x_int & self._base
75 | # x_int = (x_int >> self._b)
76 | # hash_value = (hash_value + self._para[i] * current) % self._M
77 | # i += 1
78 | # return hash_value
79 |
80 |
81 | class _hash:
82 | """ Interface for Hash Object """
83 | def __init__(self, _M, _random):
84 | pass
85 |
86 | def hash(self, key):
87 | """
88 | @args
89 | key : a hashable object
90 | @return
91 | an integer as hashed value, \in [0, M)
92 | """
93 | pass
94 |
95 |
96 | from streamlib.info import MachineBits
97 | from streamlib.utils import CountBits
98 | class _matrixHash(_hash):
99 | """ Hash class constructed by matrix method """
100 | def __init__(self, _M, _random):
101 | self.b = int(math.log(_M, 2)) # + 1 -------> pay attention here
102 | self.matrix = [int(_random.getrandbits(MachineBits)) for i in range(self.b)]
103 |
104 | def hash(self, key):
105 | try:
106 | x_int = key.__hash__()
107 | except AttributeError:
108 | x_int = hash(key)
109 |
110 | if x_int < 0:
111 | x_int = -(x_int + 1)
112 |
113 | value = 0
114 | for v in self.matrix:
115 | tmp = CountBits(v & x_int) & 1
116 | value = (value << 1) | tmp
117 | return value
118 |
119 | import mmh3
120 | class _murmurHash(_hash):
121 | """ Hash class constructed by MurmurHash """
122 | def __init__(self, _M, _random):
123 | self.seed = _random.randint(0, 1 << 31)
124 | self.M = _M
125 | if self.M < 1:
126 | raise ValueError('M should be larger than 0!')
127 |
128 | def hash(self, key):
129 | key = -(key + 1) if key < 0 else key
130 | return mmh3.hash(key.__str__(), self.seed) % self.M
131 |
132 |
133 | # there is an issue, given M, the return hash value
134 | # might never reach M - 1 due to the gap between M and its
135 | # its nearest 2^m.
136 | # Any idea to fix?
137 | class UniversalHash:
138 | """
139 | Example:
140 | ----------------
141 | # construct a universal hash family map: [?] -> [M]
142 | # here M should be a prime number
143 | uhash = UniversalHash(M)
144 | hs = uhash.pickHash()
145 | ----------------
146 | hs.hash(hashable_obj) will give the hash value of hashable_obj which is \in [M]
147 | """
148 | def __init__(self, _M):
149 | self._random = random.Random()
150 | self._random.seed()
151 | self._M = _M
152 |
153 | def pickHash(self, mode = 'murmur'): # mode from {'matrix', 'murmur'}
154 | """
155 | Randomly return a hash function belongs to
156 | this Universal Hash Family. Such hash function is
157 | an instance of _LinearHash hence has the method .hash(hashable_object)
158 | """
159 | if mode == 'matrix':
160 | return _matrixHash(self._M, self._random)
161 | elif mode == 'murmur':
162 | return _murmurHash(self._M, self._random)
163 | else:
164 | return None
165 |
166 |
167 |
168 |
169 |
170 |
171 |
--------------------------------------------------------------------------------
/streamlib/info.py:
--------------------------------------------------------------------------------
1 | """
2 | Machine related information
3 | or
4 | Universal constants
5 | """
6 |
7 | #
8 | MachineBits = 64
9 |
--------------------------------------------------------------------------------
/streamlib/makefile:
--------------------------------------------------------------------------------
1 | clean:
2 | rm *.py~ *.pyc
3 |
--------------------------------------------------------------------------------
/streamlib/sketch_bak/BJKST.py:
--------------------------------------------------------------------------------
1 |
2 | from streamlib.hashes.universalHashing import UniversalHash
3 | from streamlib.utils import zeros, median, unionDict
4 | from sketch import Sketch, BasicEstimator
5 | from streamlib.wrappers import inherit_docs
6 | import math
7 |
8 |
9 | @inherit_docs
10 | class _BJKST_Estimator(BasicEstimator):
11 | """
12 | Basic BJKST estimator to estimate the number of distinct elements in a data stream.
13 | It gives an (eps, O(1))-approximation, and the constant success probability can
14 | be amplified to high probability using the median trick.
15 | """
16 | def __init__(self, eps, thresh, uhash_h, uhash_g):
17 | """
18 | @args
19 | eps : control the quality of estimation
20 | n : the size of the universe
21 | uhash_x: a universal hash family to pick hash function x
22 | """
23 | self.h = uhash_h.pickHash()
24 | self.g = uhash_g.pickHash()
25 | self.z = 0
26 | self.B = {}
27 | self.thresh = thresh
28 | # print self.thresh
29 |
30 | def _shrinkB(self):
31 | while len(self.B) >= self.thresh:
32 | self.z += 1
33 | self.B = {k:v for k, v in self.B.items() if v >= self.z}
34 |
35 |
36 | def process(self, key):
37 | """ process the given item """
38 | hs = zeros(self.h.hash(key))
39 | if hs >= self.z:
40 | self.B[self.g.hash(key)] = hs
41 | self._shrinkB()  # keep the buffer below the threshold
42 |
43 | def getEstimation(self):
44 | """
45 | return a integer as an (eps, O(1))-approximation of #
46 | of distinct elements in the data stream
47 | """
48 | return len(self.B) * (2**self.z)
49 |
50 |
51 | def merge(self, skc):
52 | self.B = unionDict(self.B, skc.B)
53 | self._shrinkB()
54 |
55 |
56 |
57 | @inherit_docs
58 | class BJKST(Sketch):
59 | """
60 | BJKST sketch for estimation the distinct frequency.
61 | Algorithm and Analysis can be found in:
62 | http://www.cs.dartmouth.edu/~ac/Teach/CS49-Fall11/Notes/lecnotes.pdf
63 | or
64 | https://github.com/jiecchen/references/blob/master/lecnotes.pdf
65 |
66 | Usage:
67 | @args
68 | n : the size of universe
69 | eps, delta : control the quality of estimation
70 | @return
71 | BJKST(n, eps, delta) returns an (eps, delta) - BJKST sketch with para eps and delta.
72 |
73 | Example:
74 | ------------------
75 | d = DataStream(list("qwertyuiopasdfghjklzxcvbnm"), 1000)
76 | sketch = BJKST(26, 0.1, 0.001)
77 | for x in d:
78 | sketch.process(x)
79 | print sketch.getEstimation()
80 | ------------------
81 | """
82 | def __init__(self, n, eps, delta = 0.01):
83 | """
84 | @args
85 | n : the size of universe
86 | eps, delta: control the quality of estimation
87 | @return
88 | BJKST(n, eps, delta) returns an (eps, delta) - BJKST sketch with para eps and delta.
89 | """
90 | uhash_h = UniversalHash(n)
91 | uhash_g = UniversalHash(int(math.log(n, 2)**2 * eps**(-4)))
92 | thresh = eps**(-2)
93 | n_hash = int(math.log(1. / delta)) + 1
94 | # print "n_hash = ", n_hash
95 | self.estimators = [_BJKST_Estimator(eps, thresh, uhash_h, uhash_g) for i in range(n_hash)]
96 |
97 | def process(self, key):
98 | """ process the key """
99 | for est in self.estimators:
100 | est.process(key)
101 |
102 |
103 | def getEstimation(self):
104 | """ return the (eps, delta)-approximation """
105 | return median( [est.getEstimation() for est in self.estimators] )
106 |
107 | def merge(self, skc):
108 | """ merge another BJKST sketch into this one (to be checked) """
109 | for u, v in zip(self.estimators, skc.estimators):
110 | u.merge(v)
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
--------------------------------------------------------------------------------
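The estimator above relies on a zeros() helper imported from streamlib.utils, which is not present in the current utils.py; in the BJKST algorithm it is the number of trailing zero bits of the hash value, so zeros(h(x)) >= z holds with probability roughly 2^-z. The sketch below is a plausible stand-in written for this note, not the library's own implementation.

def zeros(x):
    """Number of trailing zero bits of x (with zeros(0) taken to be 0 here)."""
    if x == 0:
        return 0
    count = 0
    while x & 1 == 0:
        x >>= 1
        count += 1
    return count

# example: 8 = 0b1000 -> 3, 12 = 0b1100 -> 2, 7 = 0b111 -> 0
print([zeros(8), zeros(12), zeros(7)])
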
/streamlib/sketch_bak/F2.py:
--------------------------------------------------------------------------------
1 | from sketch import Sketch, BasicEstimator
2 | from streamlib.wrappers import inherit_docs
3 | from streamlib.hashes.universalHashing import UniversalHash
4 | from streamlib.utils import median
5 |
6 | @inherit_docs
7 | class _F2_estimator(BasicEstimator):
8 | """ Basic estimator for 2-frequencey moment """
9 |
10 | def __init__(self, uhash):
11 | self.h = uhash.pickHash()
12 | self.x = 0
13 |
14 | def process(self, itm):
15 | try:
16 | i, c = itm
17 | except (ValueError, TypeError):
18 | i = itm
19 | c = 1
20 | self.x += c * (1 - 2 * self.h.hash(i))
21 |
22 | def getEstimation(self):
23 | return self.x ** 2
24 |
25 | def merge(self, skc):
26 | self.x += skc.x
27 |
28 | import math
29 | @inherit_docs
30 | class F2(Sketch):
31 | def __init__(self, eps, delta = 0.001):
32 | n = int(1 + 1 / eps**2)
33 | m = int(math.log(1/delta, 2) + 1)
34 | uhash = UniversalHash(2)
35 | self.n = n
36 | self.m = m
37 | self.estimators = [ [ _F2_estimator(uhash) for j in range(n)] for i in range(m) ]
38 |
39 | def process(self, itm):
40 | for arr in self.estimators:
41 | for est in arr:
42 | est.process(itm)
43 |
44 | def _mean(self, arr):
45 | """ given an array of BsicEstimators,
46 | return the mean of their estimations """
47 | return sum([est.getEstimation() for est in arr]) / float(len(arr))
48 |
49 | def getEstimation(self):
50 | return median([self._mean(arr) for arr in self.estimators])
51 |
52 | def merge(self, skc):
53 | for i in range(self.m):
54 | for j in range(self.n):
55 | self.estimators[i][j].merge(skc.estimators[i][j])
56 |
--------------------------------------------------------------------------------
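_F2_estimator keeps a single counter x = sum_i s(i)*f_i with s(i) = 1 - 2*h(i) in {-1, +1}; x**2 is an unbiased estimate of F2 = sum_i f_i**2 because the cross terms s(i)*s(j) have zero expectation under a pairwise-independent sign hash, and averaging plus the median trick then controls the variance. The snippet below is a quick numeric sanity check of that claim, using plain independent random signs in place of the hash family.

import random
from collections import Counter

stream = [1, 2, 3, 4] * 5 + [1] * 10           # arbitrary small test stream
freq = Counter(stream)
exact_f2 = sum(f * f for f in freq.values())   # = 15**2 + 5**2 + 5**2 + 5**2 = 300

def one_estimate():
    # each distinct item gets an independent random sign, standing in for 1 - 2*h(i)
    sign = dict((i, random.choice((-1, 1))) for i in freq)
    x = sum(sign[i] * f for i, f in freq.items())
    return x * x

estimates = [one_estimate() for _ in range(20000)]
print((exact_f2, sum(estimates) / float(len(estimates))))  # the average should be close to exact_f2
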
/streamlib/sketch_bak/Misra_Gries.py:
--------------------------------------------------------------------------------
1 | from sketch import Sketch
2 | from streamlib.wrappers import inherit_docs
3 |
4 | class MG(Sketch):
5 | """
6 | Misra-Gries Algorithm for Frequency-Estimation.
7 |
8 | Usage:
9 | -------------------------
10 | mg = MG(k)
11 | for x in data_stream:
12 | mg.process(x)
13 | ------------------------
14 | mg.frequency(i) will give an estimation of the frequency of item i,
15 | with error at most m/k, here the m is the size of the data_stream.
16 | More detail can be found in
17 | http://www.cs.dartmouth.edu/~ac/Teach/CS49-Fall11/Notes/lecnotes.pdf
18 | or
19 | https://github.com/jiecchen/references/blob/master/lecnotes.pdf
20 | """
21 | def __init__(self, _k):
22 | self.A = {}
23 | self._k = _k
24 |
25 | def process(self, _item):
26 | """
27 | Process for each item,
28 | Worst-case processing time: O(k) per item
29 | Amortized processing time: O(1) per item
30 | """
31 | if _item in self.A.keys():
32 | self.A[_item] += 1
33 | elif len(self.A.keys()) < self._k - 1:
34 | self.A[_item] = 1
35 | else:
36 | self.A = {_k : (_v - 1) for _k, _v in self.A.items() if _v > 1}
37 |
38 | def getEstimation(self, _item):
39 | """ Return the estimation of the frequncy of _item """
40 | return self.A[_item] if _item in self.A.keys() else 0
41 |
42 | def frequency(self, _item):
43 | """ Return the estimation of the frequncy of _item """
44 | return self.getEstimation(_item)
45 |
46 | def merge(self, skc):
47 | """ MG sketch is not mergable """
48 | raise AttributeError(" MG sketch is not mergable ")
49 |
--------------------------------------------------------------------------------
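The docstring above states the Misra-Gries guarantee: with parameter k the reported count never exceeds the true frequency and undercounts it by at most m/k, where m is the stream length. Below is a self-contained sketch of the same counter-decrement idea together with a check of that bound; it is written independently of the backup module's import paths, so the function name is illustrative.

def misra_gries(stream, k):
    """Return at most k-1 candidate heavy hitters with lower-bound counts."""
    counters = {}
    for item in stream:
        if item in counters:
            counters[item] += 1
        elif len(counters) < k - 1:
            counters[item] = 1
        else:
            # decrement every counter and drop those that reach zero
            counters = dict((key, c - 1) for key, c in counters.items() if c > 1)
    return counters

stream = [1] * 60 + [2] * 25 + [3] * 10 + [4] * 5
true_freq = {1: 60, 2: 25, 3: 10, 4: 5}
k = 4
summary = misra_gries(stream, k)
m = len(stream)
# every estimate is a lower bound that is off by at most m/k
assert all(0 <= true_freq[x] - summary.get(x, 0) <= m / float(k) for x in true_freq)
print(summary)
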
/streamlib/sketch_bak/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/streamlib/sketch_bak/countSketch.py:
--------------------------------------------------------------------------------
1 | from streamlib.sketch.sketch import Sketch, BasicEstimator
2 | from streamlib.hashes.universalHashing import UniversalHash
3 | from streamlib.utils import zeros, median
4 | from streamlib.wrappers import inherit_docs
5 | import math
6 | import random
7 |
8 |
9 | @inherit_docs
10 | class _CountSketch_estimator(BasicEstimator):
11 | """ Basic estimator for Count Sketch """
12 | def __init__(self, k, uhash_h, uhash_g):
13 | """
14 | @args
15 | k : the number of buckets, which controls the accuracy
16 | uhash_h, uhash_g : instances of UniversalHash used to pick the hash functions
17 | """
18 | self.k = k
19 | self.C = [0 for i in range(self.k)]
20 | self.h = uhash_h.pickHash()
21 | self.g = uhash_g.pickHash()
22 |
23 | def process(self, key):
24 | # key has the form (i, c) or i
25 | try:
26 | i, c = key
27 | except (ValueError, TypeError):
28 | i = key
29 | c = 1
30 |
31 | self.C[self.h.hash(i)] += c * (1 - 2 * self.g.hash(i))
32 |
33 |
34 | def getEstimation(self, i):
35 | return (1 - 2 * self.g.hash(i)) * self.C[self.h.hash(i)]
36 |
37 |
38 | def merge(self, skc):
39 | pass
40 |
41 |
42 |
43 | @inherit_docs
44 | class CountSketch(Sketch):
45 | def __init__(self, eps, delta = 0.01):
46 | """
47 | @args
48 | @return
49 | """
50 | k = 3. * eps**(-2)
51 | # make sure self.k in the form 2^m
52 | self.k = 1 << (int(math.log(k, 2)) + 1)
53 | uhash_h = UniversalHash(self.k)
54 | uhash_g = UniversalHash(2)
55 | n_hash = int(math.log(1. / delta, 2)) + 1
56 | self.estimators = [_CountSketch_estimator(self.k, uhash_h, uhash_g) for i in range(n_hash)]
57 |
58 |
59 | def process(self, key):
60 | """ process the key """
61 | for est in self.estimators:
62 | est.process(key)
63 |
64 |
65 | def getEstimation(self, i):
66 | """ return the (eps, delta)-approximation """
67 | return median( [est.getEstimation(i) for est in self.estimators] )
68 |
69 |
70 | def merge(self, skc):
71 | pass
72 |
--------------------------------------------------------------------------------
/streamlib/sketch_bak/makefile:
--------------------------------------------------------------------------------
1 | clean:
2 | rm *.py~ *.pyc
3 |
--------------------------------------------------------------------------------
/streamlib/sketch_bak/quantile.py:
--------------------------------------------------------------------------------
1 | # Algorithm Designed by myself
2 |
3 | from sketch import Sketch
4 | from bisect import bisect_left, bisect_right
5 |
6 |
7 | class Quantile(Sketch):
8 | """
9 | Deterministic single pass approximate algorithm for quantile problem
10 | in streaming model. Mergable sketch.
11 |
12 | Usage:
13 | q = Quantile(eps, a, b) requires that every item appearing in the data stream lies in [a, b]
14 | q.getEstimation(k) then returns a (1 + eps)-approximation to the k-th smallest item
15 | in the data stream.
16 | Example:
17 | --------------
18 | >>> q = Quantile(0.001, 0, 100.0)
19 | >>> d = [1, 2, 3, 4, 1, 1, 100, 2, 5, 8]
20 | >>> q.batchProcess(d)
21 | >>> print q.getEstimation(4)
22 | 2.00251528703
23 | -------------
24 | """
25 | # when all items \in [a, b + n]
26 | # Space usage: O(log n / eps) to give (1 + eps) - approximation
27 | # Processing time per item: O(log log n - log eps)
28 | # Time per query: O(log n / eps) -- reasonable because # of queries << # of items
29 |
30 |
31 | def __init__(self, eps, a, b):
32 | t = a
33 | # a_i = a -1 + (1 + eps)^i
34 | # C_i : # of < a_i
35 | self.a = []
36 | self.C = []
37 | self.n = 0
38 | b = a + 2 * (b - a)
39 | while t <= b:
40 | self.a.append(t)
41 | self.C.append(0)
42 | t = (t - a + 1) * (1 + eps) + a - 1
43 |
44 | def process(self, itm):
45 | """
46 | process an item in the data stream
47 | """
48 | try:
49 | i, c = itm
50 | except (ValueError, TypeError):
51 | i = itm
52 | c = 1
53 | self.n += c
54 | pos = bisect_right(self.a, i)
55 | self.C[pos] += c
56 |
57 |
58 | def getEstimation(self, k):
59 | """ Given k, return k_th smallest number ever appeared """
60 | if k < 1 or k > self.n:
61 | raise ValueError("k should be an int and between [1, n]!")
62 | cum = 0
63 | i = 0
64 | while cum < k:
65 | cum += self.C[i]
66 | i += 1
67 | return self.a[i - 1]
68 |
69 |
70 | def merge(self, skc):
71 | for i in range(len(self.a)):
72 | self.C[i] += skc.C[i]
73 | self.n += skc.n
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
--------------------------------------------------------------------------------
/streamlib/sketch_bak/sketch.py:
--------------------------------------------------------------------------------
1 | from abc import ABCMeta, abstractmethod
2 |
3 | class BasicEstimator():
4 | """ Interface for basic sketch estimator """
5 | __metaclass__ = ABCMeta
6 |
7 | @abstractmethod
8 | def process(self, *args, **kwargs):
9 | """ process each item """
10 | pass
11 |
12 | # @abstractmethod
13 | def batchProcess(self, dataStream):
14 | """ process the dataStream in batch """
15 | for itm in dataStream:
16 | self.process(itm)
17 |
18 | @abstractmethod
19 | def getEstimation(self, *args, **kwargs):
20 | """ return the estimation """
21 | pass
22 |
23 | @abstractmethod
24 | def merge(self, estimator):
25 | """
26 | @args
27 | estimator : a basic estimator copied from self that has processed a different data stream
28 | @return
29 | Suppose est1.batchProcess(dataStream1), est2.batchProcess(dataStream2),
30 | est1.merge(est2) returns an estimator that has processed dataStream1 concatenated with dataStream2
31 | """
32 | pass
33 |
34 |
35 | class Sketch:
36 | """ Interface for sketch classes """
37 | __metaclass__ = ABCMeta
38 | # @abstractmethod
39 | # def __init__(self, *args, **kwargs):
40 | # pass
41 |
42 | @abstractmethod
43 | def process(self, *args, **kwargs):
44 | """ process each item """
45 | pass
46 |
47 | # @abstractmethod
48 | def batchProcess(self, dataStream):
49 | """ process the dataStream in batch """
50 | for itm in dataStream:
51 | self.process(itm)
52 |
53 |
54 | @abstractmethod
55 | def getEstimation(self, *args, **kwargs):
56 | """ return the estimation """
57 | pass
58 |
59 | @abstractmethod
60 | def merge(self, sketch):
61 | """
62 | @args
63 | sketch : a sketch copied from self that has processed a different data stream
64 | @return
65 | Suppose sketch1.batchProcess(dataStream1), sketch2.batchProcess(dataStream2),
66 | sketch1.merge(sketch2) returns a sketch that has processed dataStream1 concatenated with dataStream2
67 | """
68 | pass
69 |
--------------------------------------------------------------------------------
/streamlib/summary.py:
--------------------------------------------------------------------------------
1 | """
2 | Summaries (sketches) of data streams: CountMin, CountMedian, CountSketch and F2.
3 | """
4 | from streamlib import MurmurHash
5 | import copy
6 | from array import array
7 | from abc import ABCMeta, abstractmethod
8 | from random import randint
9 | from streamlib.utils import doc_inherit
10 | import streamlib.utils as utils
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 | class Sketch(object):
19 | """
20 | Interface for Sketch.
21 | """
22 | @abstractmethod
23 | def processBatch(self, *args, **kwargs):
24 | """
25 | Summarize data stream in batch mode.
26 | """
27 | raise NotImplementedError()
28 |
29 | @abstractmethod
30 | def processItem(self, *args, **kwargs):
31 | """
32 | Summarize one item in a data stream.
33 | """
34 | raise NotImplementedError()
35 |
36 | @abstractmethod
37 | def estimate(self, *args, **kwargs):
38 | """
39 | Estimate properties of given item/key.
40 | """
41 | raise NotImplementedError()
42 |
43 | @abstractmethod
44 | def merge(self, *args, **kwargs):
45 | """
46 | Merge compatible sketches.
47 | """
48 | raise NotImplementedError()
49 |
50 | @abstractmethod
51 | def __add__(self, other):
52 | return self.merge(other)
53 |
54 |
55 |
56 |
57 |
58 | class F2(Sketch):
59 | """
60 | AMS F2 sketch.
61 | Estimates the second frequency moment (F2)
62 | of the data stream.
63 | """
64 | def __init__(self, w=20, mu=5, typecode='i'):
65 | """
66 | Create a new instance.
67 |
68 | :param w: The number of buckets.
69 | :type w: int
70 |
71 | :param mu: The number of repeated copies. Used to control the
72 | failure probability ~= 2^{-mu}
73 | :type mu: int
74 |
75 | :param typecode: type to represent the frequencies, check
76 | docs.python.org for module `array`
77 |
78 | """
79 |
80 | self._w = w
81 | self._mu = mu
82 | self._sketch = [array(typecode, [0] * w) for i in xrange(mu)]
83 | self._hashes = [[MurmurHash() for j in xrange(w)] for i in xrange(mu)]
84 | self._hash = hash(self)
85 |
86 |
87 | def processBatch(self, dataStream, weighted=False):
88 | """
89 | Summarize the given data stream.
90 |
91 | :param dataStream: any iterable object with hashable elements.
92 | e.g. a list of integers.
93 | """
94 |
95 | for item in dataStream:
96 | self.processItem(item, weighted)
97 |
98 |
99 | def processItem(self, item, weighted=False):
100 | """
101 | Summarize the given data stream, but only process one
102 | item.
103 |
104 | :param item: hashable object to be processed
105 | e.g. an integer
106 | """
107 | if not weighted:
108 | for i in xrange(self._mu):
109 | for j in xrange(self._w):
110 | self._sketch[i][j] += self._hashes[i][j].hash(item) % 2 * 2 - 1
111 | else:
112 | itm, wt = item
113 | for i in xrange(self._mu):
114 | for j in xrange(self._w):
115 | self._sketch[i][j] += (self._hashes[i][j].hash(itm) % 2 * 2 - 1) * wt
116 |
117 | def estimate(self):
118 | """
119 | Estimate the F2 moment of the given stream
120 |
121 | :return: estimated F2 moment
122 | :rtype: int/real
123 | """
124 |
125 | return utils.median([utils.mean( map(lambda x: x**2, self._sketch[i]) )
126 | for i in xrange(self._mu)])
127 |
128 |
129 |
130 | def reproduce(self, num=1):
131 | """
132 | Reproduce F2 Sketch instance(s) to have the same
133 | internal status.
134 |
135 | :param num: number of instances to be reproduced
136 | :type num: int
137 |
138 | :return: reproduced instance. if num > 1, a list
139 | of instances will be returned
140 | """
141 | if type(num) is not int:
142 | raise TypeError('num should be int')
143 | if num < 1:
144 | raise ValueError('num should >= 1')
145 |
146 | if num == 1:
147 | return copy.deepcopy(self)
148 | else:
149 | return [copy.deepcopy(self) for _ in xrange(num)]
150 |
151 |
152 | def merge(self, other):
153 | """
154 | Merge two F2 Sketch instances if they are compatible.
155 |
156 | :param other: an instance of F2 Sketch,
157 | """
158 | if other._hash != self._hash:
159 | raise ValueError('two instances are not compatible')
160 |
161 | res = F2(w=1, mu=1)
162 | res._sketch = copy.deepcopy(self._sketch)
163 | res._hashes = copy.deepcopy(self._hashes)
164 | res._w = self._w
165 | res._mu = self._mu
166 | res._hash = self._hash
167 |
168 | for i in xrange(self._mu):
169 | for j in xrange(self._w):
170 | res._sketch[i][j] += other._sketch[i][j]
171 |
172 | return res
173 |
174 |
175 |
176 | class CountSketch(object):
177 | """
178 | Count Sketch.
179 | """
180 | def __init__(self, w=20, mu=5, typecode='i'):
181 | """
182 | Create a new instance.
183 |
184 | :param w: The number of buckets.
185 | :type w: int
186 |
187 | :param mu: The number of repeated copies. Used to control the
188 | failure probability ~= 2^{-mu}
189 | :type mu: int
190 |
191 | :param typecode: type to represent the frequencies, check
192 | docs.python.org for module `array`
193 |
194 | """
195 | self._w = w
196 | self._mu = mu
197 | self._sketch = [array(typecode, [0] * w) for i in xrange(mu)]
198 | self._sign = [MurmurHash() for i in xrange(mu)]
199 | self._hashes = [MurmurHash() for i in xrange(mu)]
200 | self._hash = hash(self)
201 |
202 |
203 | def processBatch(self, dataStream, weighted=False):
204 | """
205 | Summarize the given data stream.
206 |
207 | :param dataStream: any iterable object with hashable elements.
208 | e.g. a list of integers.
209 | :param weighted: if weighted, each item in dataStream should
210 | be (key, weight) pair
211 | """
212 |
213 | for item in dataStream:
214 | self.processItem(item, weighted)
215 |
216 | def processItem(self, item, weighted=False):
217 | """
218 | Summarize the given data stream, but only process one
219 | item.
220 |
221 | :param item: hashable object to be processed
222 | e.g. an integer
223 | :param weighted: if weighted, item should
224 | be a (key, weight) pair
225 | """
226 | if weighted:
227 | key, weight = item
228 | for i in xrange(self._mu):
229 | # where the item is mapped by the i_th hash
230 | pos = self._hashes[i].hash(key) % self._w
231 | # increment the bucket
232 | sg = (self._sign[i].hash(key) % 2) * 2 - 1
233 | self._sketch[i][pos] += weight * sg
234 | else:
235 | for i in xrange(self._mu):
236 | # where the item is mapped by the i_th hash
237 | pos = self._hashes[i].hash(item) % self._w
238 | # increment the bucket
239 | sg = (self._sign[i].hash(item) % 2) * 2 - 1
240 | self._sketch[i][pos] += sg
241 |
242 |
243 | def estimate(self, key):
244 | """
245 | Estimate the frequency of given item.
246 |
247 | :param key: key/item in the data stream
248 |
249 | :return: estimated frequency of the given key.
250 | :rtype: int/real
251 | """
252 | all_estimators = [(self._sign[i].hash(key) % 2 * 2 - 1) *
253 | self._sketch[i][self._hashes[i].hash(key) % self._w]
254 | for i in xrange(self._mu)]
255 | return utils.median(all_estimators)
256 |
257 |
258 |
259 | def reproduce(self, num=1):
260 | """
261 | Reproduce Count Sketch instance(s) to have the same
262 | internal status.
263 |
264 | :param num: number of instances to be reproduced
265 | :type num: int
266 |
267 | :return: reproduced instance. if num > 1, a list
268 | of instances will be returned
269 | """
270 | if type(num) is not int:
271 | raise TypeError('num should be int')
272 | if num < 1:
273 | raise ValueError('num should >= 1')
274 |
275 | if num == 1:
276 | return copy.deepcopy(self)
277 | else:
278 | return [copy.deepcopy(self) for _ in xrange(num)]
279 |
280 |
281 | def merge(self, other):
282 | """
283 | Merge two CountSketch instances if they are compatible.
284 |
285 | :param other: an instance of CountSketch,
286 | """
287 | if other._hash != self._hash:
288 | raise ValueError('two instances are not compatible')
289 |
290 | res = CountSketch(w=1, mu=1)
291 | res._sketch = copy.deepcopy(self._sketch)
292 | res._hashes = copy.deepcopy(self._hashes)
293 | res._sign = copy.deepcopy(self._sign)
294 | res._w = self._w
295 | res._mu = self._mu
296 | res._hash = self._hash
297 |
298 | for i in xrange(self._mu):
299 | for j in xrange(self._w):
300 | res._sketch[i][j] += other._sketch[i][j]
301 |
302 | return res
303 |
304 |
305 |
306 | def __add__(self, other):
307 | return self.merge(other)
308 |
309 |
310 |
311 | class CountMin(Sketch):
312 | """
313 | Count-Min sketch.
314 | Supports data streams with non-negative weights.
315 | """
316 | def __init__(self, w=20, mu=5, typecode='i'):
317 | """
318 | Create a new instance.
319 |
320 | :param w: The number of buckets.
321 | :type w: int
322 |
323 | :param mu: The number of repeated copies. Used to control the
324 | failure probability ~= 2^{-mu}
325 | :type mu: int
326 |
327 | :param typecode: type to represent the frequencies, check
328 | docs.python.org for module `array`
329 | """
330 | self._w = w
331 | self._mu = mu
332 | self._sketch = [array(typecode, [0] * w) for i in xrange(mu)]
333 | self._hashes = [MurmurHash() for i in xrange(mu)]
334 | self._hash = hash(self)
335 |
336 | def processBatch(self, dataStream, weighted=False):
337 | """
338 | Summarize the given data stream.
339 |
340 | :param dataStream: any iterable object with hashable elements.
341 | e.g. a list of integers.
342 | :param weighted: if weighted, each item in dataStream should
343 | be (key, weight) pair, where weight > 0
344 | """
345 |
346 | for item in dataStream:
347 | self.processItem(item, weighted)
348 |
349 | def processItem(self, item, weighted=False):
350 | """
351 | Summarize the given data stream, but only process one
352 | item.
353 |
354 | :param item: hashable object to be processed
355 | e.g. an integer
356 | :param weighted: if weighted, item should
357 | be a (key, weight) pair, where weight > 0
358 | """
359 | if weighted:
360 | key, weight = item
361 | for i in xrange(self._mu):
362 | # where the item is mapped by the i_th hash
363 | pos = self._hashes[i].hash(key) % self._w
364 | # increment the bucket
365 | self._sketch[i][pos] += weight
366 | else:
367 | for i in xrange(self._mu):
368 | # where the item is mapped by the i_th hash
369 | pos = self._hashes[i].hash(item) % self._w
370 | # increment the bucket
371 | self._sketch[i][pos] += 1
372 |
373 |
374 | def estimate(self, key):
375 | """
376 | Estimate the frequency of given item.
377 |
378 | :param key: key/item in the data stream
379 |
380 | :return: estimated frequency of the given key.
381 | :rtype: int/real
382 | """
383 | all_estimators = [self._sketch[i][self._hashes[i].hash(key) % self._w]
384 | for i in xrange(self._mu)]
385 | return min(all_estimators)
386 |
387 |
388 |
389 | def reproduce(self, num=1):
390 | """
391 | Reproduce CountMin instance(s) to have the same
392 | internal status.
393 |
394 | :param num: number of instances to be reproduced
395 | :type num: int
396 |
397 | :return: reproduced instance. if num > 1, a list
398 | of instances will be returned
399 | """
400 | if type(num) is not int:
401 | raise TypeError('num should be int')
402 | if num < 1:
403 | raise ValueError('num should >= 1')
404 |
405 | if num == 1:
406 | return copy.deepcopy(self)
407 | else:
408 | return [copy.deepcopy(self) for _ in xrange(num)]
409 |
410 |
411 |
412 |
413 | def merge(self, other):
414 | """
415 | Merge two CountMin instances if they are compatible.
416 |
417 | :param other: an instance of CountMin,
418 | """
419 | if other._hash != self._hash:
420 | raise ValueError('two instances are not compatible')
421 |
422 | res = CountMin(w=1, mu=1)
423 | res._sketch = copy.deepcopy(self._sketch)
424 | res._hashes = copy.deepcopy(self._hashes)
425 | res._w, res._mu = self._w, self._mu
426 | res._hash = self._hash
427 | for i in xrange(self._mu):
428 | for j in xrange(self._w):
429 | res._sketch[i][j] += other._sketch[i][j]
430 |
431 | return res
432 |
433 |
434 | def __add__(self, other):
435 | """
436 | Overload + for self.merge
437 | """
438 | return self.merge(other)
439 |
440 |
441 |
442 | class CountMedian(CountMin):
443 | """
444 | Count-Median sketch.
445 | Supports data streams with positive or negative weights.
446 | """
447 |
448 | @doc_inherit
449 | def __init__(self, w=20, mu=5, typecode='i'):
450 | super(CountMedian, self).__init__(w, mu, typecode)
451 |
452 |
453 |
454 | def processBatch(self, dataStream, weighted=False):
455 | """
456 | Summarize the given data stream.
457 |
458 | :param dataStream: any iterable object with hashable elements.
459 | e.g. a list of integers.
460 | :param weighted: if weighted, each item in dataStream should
461 | be (key, weight) pair, weight can be positive
462 | or negative
463 | """
464 |
465 | for item in dataStream:
466 | self.processItem(item, weighted)
467 |
468 |
469 | def processItem(self, item, weighted=False):
470 | """
471 | Summarize the given data stream, but only process one
472 | item.
473 |
474 | :param item: hashable object to be processed
475 | e.g. an integer
476 | :param weighted: if weighted, item should
477 | be a (key, weight) pair, weight can be positive
478 | or negative
479 | """
480 | super(CountMedian, self).processItem(item, weighted)
481 |
482 |
483 | def estimate(self, key):
484 | """
485 | Estimate the frequency of given item.
486 |
487 | :param key: key/item in the data stream
488 |
489 | :return: estimated frequency of the given key.
490 | :rtype: int/real
491 | """
492 | all_estimators = [self._sketch[i][self._hashes[i].hash(key) % self._w]
493 | for i in xrange(self._mu)]
494 | return utils.median(all_estimators)
495 |
496 |
497 |
498 |
499 |
500 |
501 |
--------------------------------------------------------------------------------
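summary.py is the module the package actually exports (see streamlib/__init__.py), and its classes are driven through the processBatch / estimate / merge interface, with + as a shorthand for merge. The sketch below shows the intended call pattern under Python 2, which is what the xrange-based code targets; the parameters w=50 and mu=7 are arbitrary illustrative choices.

from streamlib import CountMin, CountSketch, F2

data = [1, 1, 1, 2, 3, 3, 1, 2, 1]

# CountMin: upward-biased point queries on a non-negative stream
cm = CountMin(w=50, mu=7)
cm.processBatch(data)
print(cm.estimate(1))        # >= 5; equals 5 when no hash collisions occur

# sketches built by reproduce() share hash functions, so they can be merged with +
cm2 = cm.reproduce()
cm2.processBatch([1, 4, 4])
merged = cm + cm2
print(merged.estimate(4))

# CountSketch accepts signed weights; F2 estimates the second frequency moment
cs = CountSketch(w=50, mu=7)
cs.processBatch([(1, 3), (2, -1)], weighted=True)
print(cs.estimate(1))        # close to 3 with high probability

f2 = F2(w=50, mu=7)
f2.processBatch(data)
print(f2.estimate())         # close to 5**2 + 2**2 + 2**2 = 33
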
/streamlib/utils.py:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | def median(numbers):
7 | st = sorted(numbers)
8 | l = len(st)
9 | return (st[l // 2] + st[(l - 1) // 2]) / 2
10 |
11 | def mean(numbers):
12 | if len(numbers) == 0:
13 | return 0
14 | else:
15 | return sum(numbers) / len(numbers)
16 |
17 |
18 |
19 | from functools import wraps
20 |
21 | class DocInherit(object):
22 | """
23 | Docstring inheriting method descriptor
24 |
25 | The class itself is also used as a decorator
26 |
27 | credit: code received from http://code.activestate.com/recipes/576862/
28 | """
29 |
30 | def __init__(self, mthd):
31 | self.mthd = mthd
32 | self.name = mthd.__name__
33 |
34 | def __get__(self, obj, cls):
35 | if obj:
36 | return self.get_with_inst(obj, cls)
37 | else:
38 | return self.get_no_inst(cls)
39 |
40 | def get_with_inst(self, obj, cls):
41 |
42 | overridden = getattr(super(cls, obj), self.name, None)
43 |
44 | @wraps(self.mthd, assigned=('__name__','__module__'))
45 | def f(*args, **kwargs):
46 | return self.mthd(obj, *args, **kwargs)
47 |
48 | return self.use_parent_doc(f, overridden)
49 |
50 | def get_no_inst(self, cls):
51 |
52 | for parent in cls.__mro__[1:]:
53 | overridden = getattr(parent, self.name, None)
54 | if overridden: break
55 |
56 | @wraps(self.mthd, assigned=('__name__','__module__'))
57 | def f(*args, **kwargs):
58 | return self.mthd(*args, **kwargs)
59 |
60 | return self.use_parent_doc(f, overridden)
61 |
62 | def use_parent_doc(self, func, source):
63 | if source is None:
64 | raise NameError, ("Can't find '%s' in parents"%self.name)
65 | func.__doc__ = source.__doc__
66 | return func
67 |
68 | doc_inherit = DocInherit
69 |
--------------------------------------------------------------------------------
/tests/test_summary.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import math
3 |
4 | from streamlib import CountMin
5 | class Test_CountMin(object):
6 |
7 | def test_process(self):
8 | a = CountMin(w=10, mu=10)
9 | ls = [1, 1, 1, 2, 1, 1, 1]
10 | a.processBatch(ls)
11 | assert a.estimate(1) == 6
12 | assert a.estimate(2) == 1
13 |
14 |
15 | def test_merge(self):
16 | a = CountMin(w=10, mu=10)
17 | b = a.reproduce()
18 | c = CountMin()
19 |
20 | with pytest.raises(ValueError):
21 | a + c
22 |
23 | l1 = [1, 1, 1, 2, 1, 1, 1]
24 | l2 = [1, 1, 1, 3, 3]
25 | a.processBatch(l1)
26 | b.processBatch(l2)
27 |
28 | c = a + b
29 |
30 | assert c.estimate(1) == 9
31 | assert c.estimate(2) == 1
32 | assert c.estimate(3) == 2
33 |
34 |
35 |
36 | from streamlib import CountMedian
37 | class Test_CountMedian(object):
38 |
39 | def test_estimate(self):
40 | a = CountMedian(w=10, mu=10)
41 | ls = [1, 1, 1, 2, 1, 1, 1]
42 | a.processBatch(ls)
43 | assert a.estimate(1) == 6
44 | assert a.estimate(2) == 1
45 |
46 | def test_merge(self):
47 | a = CountMedian(w=10, mu=10)
48 | b = a.reproduce()
49 | c = CountMin()
50 |
51 | with pytest.raises(ValueError):
52 | a + c
53 |
54 | l1 = [1, 1, 1, 2, 1, 1, 1]
55 | l2 = [1, 1, 1, 3, 3]
56 | a.processBatch(l1)
57 | b.processBatch(l2)
58 |
59 | c = a + b
60 |
61 | assert c.estimate(1) == 9
62 | assert c.estimate(2) == 1
63 | assert c.estimate(3) == 2
64 |
65 |
66 | from streamlib import F2
67 | class Test_F2(object):
68 |
69 | def test(self):
70 | w = 300
71 | f2 = F2(w)
72 | items = [1, 2, 3, 4]
73 | weights = [5, 1, 1, 6]
74 | f2.processBatch(zip(items, weights), True)
75 | new_f2 = f2.merge(f2.reproduce())
76 |
77 | # items.extend(items)
78 | weights = map(lambda x: x * 2, weights)
79 | exact_f2 = sum(map(lambda x: x**2, weights))
80 |
81 | assert abs(new_f2.estimate() - exact_f2) <= exact_f2 / math.sqrt(w)
82 |
--------------------------------------------------------------------------------