├── csvec
    ├── __init__.py
    ├── test_csvec.py
    └── csvec.py
├── setup.py
├── README.md
├── .gitignore
└── LICENSE


/csvec/__init__.py:
--------------------------------------------------------------------------------
1 | from .csvec import CSVec
2 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | 
 3 | setup(name='csvec',
 4 |       version='0.0.1',
 5 |       description='Count Sketch Vector',
 6 |       url='https://github.com/nikitaivkin/csh',
 7 |       author='Nikita Ivkin',
 8 |       author_email='nikitasemail@nikita.com',
 9 |       license='MIT',
10 |       packages=['csvec'],
11 |       zip_safe=False)
12 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # CSVec: Count Sketch Vector
 2 | 
 3 | ## Installation
 4 | Dependencies: `pytorch` and `numpy`. Tested with `torch==1.0.1` and `numpy==1.15.3`, but this should work with a wide range of versions.
 5 | 
 6 | `git clone` the repository to your local machine, move to the directory containing `setup.py`, then run
 7 | ```
 8 | pip install -e .
 9 | ```
10 | to install this package.
11 | 
12 | ## Description
13 | 
14 | This package contains one main class, `CSVec`, which computes the Count Sketch of input vectors, and can extract heavy hitters from a Count Sketch.
15 | 
16 | Reference: the [Count Sketch paper](http://www.mathcs.emory.edu/~cheung/Courses/584-StreamDB/Syllabus/papers/Frequency-count/FrequentStream.pdf).
17 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | .pytest_cache/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | local_settings.py
 57 | db.sqlite3
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # SageMath parsed files
 82 | *.sage.py
 83 | 
 84 | # Environments
 85 | .env
 86 | .venv
 87 | env/
 88 | venv/
 89 | ENV/
 90 | env.bak/
 91 | venv.bak/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 


--------------------------------------------------------------------------------
/csvec/test_csvec.py:
--------------------------------------------------------------------------------
  1 | import unittest
  2 | import csvec
  3 | from csvec import CSVec
  4 | import torch
  5 | 
  6 | class Base:
  7 |     # use Base class to hide CSVecTestCase from the unittest runner
  8 |     # we only want the subclasses to actually be run
  9 | 
 10 |     class CSVecTestCase(unittest.TestCase):
 11 |         def testRandomness(self):
 12 |             # make sure two sketches get the same hashes and signs
 13 |             d = 100
 14 |             c = 20
 15 |             r = 5
 16 |             a = CSVec(d, c, r, **self.csvecArgs)
 17 |             b = CSVec(d, c, r, **self.csvecArgs)
 18 |             self.assertTrue(torch.allclose(a.signs, b.signs))
 19 |             self.assertTrue(torch.allclose(a.buckets, b.buckets))
 20 |             self.assertTrue(torch.allclose(a.signs, b.signs))
 21 | 
 22 |             if self.numBlocks > 1:
 23 |                 self.assertTrue(torch.allclose(a.blockOffsets,
 24 |                                                b.blockOffsets))
 25 |                 self.assertTrue(torch.allclose(a.blockSigns,
 26 |                                                b.blockSigns))
 27 | 
 28 |         def testInit(self):
 29 |             # make sure the table starts out zeroed
 30 |             d = 100
 31 |             c = 20
 32 |             r = 5
 33 |             a = CSVec(d, c, r, **self.csvecArgs)
 34 |             zeros = torch.zeros(r, c).to(self.device)
 35 |             self.assertTrue(torch.allclose(a.table, zeros))
 36 | 
 37 |         def testSketchVec(self):
 38 |             # sketch a vector with all zeros except a single 1
 39 |             # then the table should be zeros everywhere except a single
 40 |             # 1 in each row
 41 |             d = 100
 42 |             c = 1
 43 |             r = 5
 44 |             a = CSVec(d=d, c=c, r=r, **self.csvecArgs)
 45 |             vec = torch.zeros(d).to(self.device)
 46 |             vec[0] = 1
 47 |             a.accumulateVec(vec)
 48 |             # make sure the sketch only has one nonzero entry per row
 49 |             for i in range(r):
 50 |                 with self.subTest(row=i):
 51 |                     self.assertEqual(a.table[i,:].nonzero().numel(), 1)
 52 | 
 53 |             # make sure each row sums to +-1
 54 |             summed = a.table.abs().sum(dim=1).view(-1)
 55 |             ones = torch.ones(r).to(self.device)
 56 |             self.assertTrue(torch.allclose(summed, ones))
 57 | 
 58 |         def testZeroSketch(self):
 59 |             d = 100
 60 |             c = 20
 61 |             r = 5
 62 |             a = CSVec(d, c, r, **self.csvecArgs)
 63 |             vec = torch.rand(d).to(self.device)
 64 |             a.accumulateVec(vec)
 65 | 
 66 |             zeros = torch.zeros((r, c)).to(self.device)
 67 |             self.assertFalse(torch.allclose(a.table, zeros))
 68 | 
 69 |             a.zero()
 70 |             self.assertTrue(torch.allclose(a.table, zeros))
 71 | 
 72 |         def testUnsketch(self):
 73 |             # make sure heavy hitter recovery works correctly
 74 | 
 75 |             # use a gigantic sketch so there's no chance of collision
 76 |             d = 5
 77 |             c = 10000
 78 |             r = 20
 79 |             a = CSVec(d, c, r, **self.csvecArgs)
 80 |             vec = torch.rand(d).to(self.device)
 81 | 
 82 |             a.accumulateVec(vec)
 83 | 
 84 |             with self.subTest(method="topk"):
 85 |                 recovered = a.unSketch(k=d)
 86 |                 self.assertTrue(torch.allclose(recovered, vec))
 87 | 
 88 |             with self.subTest(method="epsilon"):
 89 |                 thr = vec.abs().min() * 0.9
 90 |                 recovered = a.unSketch(epsilon=thr / vec.norm())
 91 |                 self.assertTrue(torch.allclose(recovered, vec))
 92 | 
 93 |         def testSketchSum(self):
 94 |             d = 5
 95 |             c = 10000
 96 |             r = 20
 97 | 
 98 |             summed = CSVec(d, c, r, **self.csvecArgs)
 99 |             for i in range(d):
100 |                 vec = torch.zeros(d).to(self.device)
101 |                 vec[i] = 1
102 |                 sketch = CSVec(d, c, r, **self.csvecArgs)
103 |                 sketch.accumulateVec(vec)
104 |                 summed += sketch
105 | 
106 |             recovered = summed.unSketch(k=d)
107 |             trueSum = torch.ones(d).to(self.device)
108 |             self.assertTrue(torch.allclose(recovered, trueSum))
109 | 
110 |         def testL2(self):
111 |             d = 5
112 |             c = 10000
113 |             r = 20
114 | 
115 |             vec = torch.randn(d).to(self.device)
116 |             a = CSVec(d, c, r, **self.csvecArgs)
117 |             a.accumulateVec(vec)
118 | 
119 |             tol = 0.0001
120 |             self.assertTrue((a.l2estimate() - vec.norm()).abs() < tol)
121 | 
122 |         def testMedian(self):
123 |             d = 5
124 |             c = 10000
125 |             r = 20
126 | 
127 |             csvecs = [CSVec(d, c, r, **self.csvecArgs) for _ in range(3)]
128 |             for i, csvec in enumerate(csvecs):
129 |                 vec = torch.arange(d).float().to(self.device) + i
130 |                 csvec.accumulateVec(vec)
131 |             median = CSVec.median(csvecs)
132 |             recovered = median.unSketch(k=d)
133 |             trueMedian = torch.arange(d).float().to(self.device) + 1
134 |             self.assertTrue(torch.allclose(recovered, trueMedian))
135 | 
136 | class TestCaseCPU1(Base.CSVecTestCase):
137 |     def setUp(self):
138 |         # hack to reset csvec's global cache between tests
139 |         csvec.cache = {}
140 | 
141 |         self.device = "cpu"
142 |         self.numBlocks = 1
143 | 
144 |         self.csvecArgs = {"numBlocks": self.numBlocks,
145 |                           "device": self.device}
146 | 
147 | class TestCaseCPU2(Base.CSVecTestCase):
148 |     def setUp(self):
149 |         csvec.cache = {}
150 | 
151 |         self.device = "cpu"
152 |         self.numBlocks = 2
153 | 
154 |         self.csvecArgs = {"numBlocks": self.numBlocks,
155 |                           "device": self.device}
156 | 
157 | @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
158 | class TestCaseCUDA2(Base.CSVecTestCase):
159 |     def setUp(self):
160 |         csvec.cache = {}
161 | 
162 |         self.device = "cuda"
163 |         self.numBlocks = 2
164 | 
165 |         self.csvecArgs = {"numBlocks": self.numBlocks,
166 |                           "device": self.device}
167 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright 2020 Daniel Rothchild, Nikita Ivkin, Ashwinee Panda
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/csvec/csvec.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import numpy as np
  3 | import copy
  4 | import torch
  5 | 
  6 | LARGEPRIME = 2**61-1  # Mersenne prime: modulus and range of the hashes
  7 | 
  8 | cache = {}  # (d, c, r, numBlocks, device) -> precomputed hash tensors
  9 | 
 10 | #import line_profiler
 11 | #import atexit
 12 | #profile = line_profiler.LineProfiler()
 13 | #atexit.register(profile.print_stats)
 14 | 
 15 | class CSVec(object):
 16 |     """ Count Sketch of a vector
 17 | 
 18 |     Treating a vector as a stream of tokens with associated weights,
 19 |     this class computes the count sketch of an input vector, and
 20 |     supports operations on the resulting sketch.
 21 | 
 22 |     public methods: zero, unSketch, l2estimate, __add__, __iadd__
 23 |     """
 24 | 
 25 |     def __init__(self, d, c, r, doInitialize=True, device=None,
 26 |                  numBlocks=1):
 27 |         """ Constructor for CSVec
 28 | 
 29 |         Args:
 30 |             d: the cardinality of the sketched vector
 31 |             c: the number of columns (buckets) in the sketch
 32 |             r: the number of rows in the sketch
 33 |             doInitialize: if False, you are responsible for setting
 34 |                 self.table, self.signs, self.buckets, self.blockSigns,
 35 |                 and self.blockOffsets
 36 |             device: a torch.device, "cpu", or a string containing "cuda"
 37 |                 (else ValueError). If None, uses cuda if available, else cpu
 38 |             numBlocks: mechanism to reduce memory consumption. A value
 39 |                 of 1 leads to a normal sketch. Higher values reduce
 40 |                 peak memory consumption proportionally but decrease
 41 |                 randomness of the hashes
 42 |         Note:
 43 |             Since sketching a vector always requires the hash functions
 44 |             to be evaluated for all of 0..d-1, we precompute the
 45 |             hash values in the constructor. However, this takes d*r
 46 |             memory, which is sometimes too big. We therefore only
 47 |             compute hashes of 0..(d/numBlocks - 1), and we let the
 48 |             hash of all other tokens be the hash of that token modulo
 49 |             d/numBlocks. In order to recover some of the lost randomness,
 50 |             we add a random number to each "block" (self.blockOffsets)
 51 |             and multiply each block by a random sign (self.blockSigns)
 52 |         """
 53 | 
 54 |         # save random quantities in a module-level variable so we can
 55 |         # reuse them if someone else makes a sketch with the same d, c, r
 56 |         global cache
 57 | 
 58 |         self.r = r # num of rows
 59 |         self.c = c # num of columns
 60 |         # need int() here b/c annoying np returning np.int64...
 61 |         self.d = int(d) # vector dimensionality
 62 | 
 63 |         # reduce memory consumption of signs & buckets by constraining
 64 |         # them to be repetitions of a single block
 65 |         self.numBlocks = numBlocks
 66 | 
 67 |         # choose the device automatically if none was given
 68 |         if device is None:
 69 |             device = 'cuda' if torch.cuda.is_available() else 'cpu'
 70 |         else:
 71 |             if (not isinstance(device, torch.device) and
 72 |                     not ("cuda" in device or device == "cpu")):
 73 |                 msg = "Expected a valid device, got {}"
 74 |                 raise ValueError(msg.format(device))
 75 | 
 76 |         self.device = device
 77 | 
 78 |         # this flag indicates that the caller plans to set up
 79 |         # self.signs, self.buckets, self.blockSigns, and self.blockOffsets
 80 |         # itself (e.g. self.deepcopy does this)
 81 |         if not doInitialize:
 82 |             return
 83 | 
 84 |         # initialize the sketch to all zeros
 85 |         self.table = torch.zeros((r, c), device=self.device)
 86 | 
 87 |         # if we already have these, don't do the same computation
 88 |         # again (wasting memory storing the same data several times)
 89 |         cacheKey = (d, c, r, numBlocks, device)
 90 |         if cacheKey in cache:
 91 |             self.signs = cache[cacheKey]["signs"]
 92 |             self.buckets = cache[cacheKey]["buckets"]
 93 |             if self.numBlocks > 1:
 94 |                 self.blockSigns = cache[cacheKey]["blockSigns"]
 95 |                 self.blockOffsets = cache[cacheKey]["blockOffsets"]
 96 |             return
 97 | 
 98 |         # initialize hashing functions for each row:
 99 |         # 2 random numbers for bucket hashes + 4 random numbers for
100 |         # sign hashes
101 |         # seed with a constant so the hashes are reproducible, then restore
102 |         # the caller's RNG state below. NOTE(review): this presumably saves
103 |         # only the CPU generator; CUDA randint calls may not be covered
104 | 
105 |         # do all these computations on the CPU, since pytorch
106 |         # is incapable of in-place mod, and without that, this
107 |         # computation uses up too much GPU RAM
108 |         rand_state = torch.random.get_rng_state()
109 |         torch.random.manual_seed(42)
110 |         hashes = torch.randint(0, LARGEPRIME, (r, 6),
111 |                                dtype=torch.int64, device="cpu")
112 | 
113 |         # compute random blockOffsets and blockSigns
114 |         if self.numBlocks > 1:
115 |             nTokens = self.d // numBlocks
116 |             if self.d % numBlocks != 0:
117 |                 # so that we only need numBlocks repetitions
118 |                 nTokens += 1
119 |             self.blockSigns = torch.randint(0, 2, size=(self.numBlocks,),
120 |                                             device=self.device) * 2 - 1
121 |             self.blockOffsets = torch.randint(0, self.c,
122 |                                               size=(self.numBlocks,),
123 |                                               device=self.device)
124 |         else:
125 |             assert(numBlocks == 1)
126 |             nTokens = self.d
127 | 
128 |         torch.random.set_rng_state(rand_state)
129 | 
130 |         # tokens are the indices of the vector entries
131 |         tokens = torch.arange(nTokens, dtype=torch.int64, device="cpu")
132 |         tokens = tokens.reshape((1, nTokens))
133 | 
134 |         # computing sign hashes (4 wise independence)
135 |         h1 = hashes[:,2:3]
136 |         h2 = hashes[:,3:4]
137 |         h3 = hashes[:,4:5]
138 |         h4 = hashes[:,5:6]
139 |         self.signs = (((h1 * tokens + h2) * tokens + h3) * tokens + h4)
140 |         self.signs = ((self.signs % LARGEPRIME % 2) * 2 - 1).float()
141 | 
142 |         # only move to device now, since this computation takes too
143 |         # much memory if done on the GPU, and it can't be done
144 |         # in-place because pytorch (1.0.1) has no in-place modulo
145 |         # function that works on large numbers
146 |         self.signs = self.signs.to(self.device)
147 | 
148 |         # computing bucket hashes (2-wise independence)
149 |         h1 = hashes[:,0:1]
150 |         h2 = hashes[:,1:2]
151 |         self.buckets = ((h1 * tokens) + h2) % LARGEPRIME % self.c
152 | 
153 |         # only move to device now. See comment above.
154 |         # can't cast this to int, unfortunately, since we index with
155 |         # this below, and pytorch only lets us index with long
156 |         # tensors
157 |         self.buckets = self.buckets.to(self.device)
158 | 
159 |         cache[cacheKey] = {"signs": self.signs,
160 |                            "buckets": self.buckets}
161 |         if numBlocks > 1:
162 |             cache[cacheKey].update({"blockSigns": self.blockSigns,
163 |                                     "blockOffsets": self.blockOffsets})
164 | 
165 |     def zero(self):
166 |         """ Set all the entries of the sketch to zero """
167 |         self.table.zero_()
168 | 
169 |     def cpu_(self):
170 |         self.device = "cpu"
171 |         self.table = self.table.cpu()
172 | 
173 |     def cuda_(self, device="cuda"):
174 |         self.device = device
175 |         self.table = self.table.cuda()
176 | 
177 |     def half_(self):
178 |         self.table = self.table.half()
179 | 
180 |     def float_(self):
181 |         self.table = self.table.float()
182 | 
183 |     def __deepcopy__(self, memodict={}):
184 |         # don't initialize new CSVec, since that will calculate bc,
185 |         # which is slow, even though we can just copy it over
186 |         # directly without recomputing it
187 |         newCSVec = CSVec(d=self.d, c=self.c, r=self.r,
188 |                          doInitialize=False, device=self.device,
189 |                          numBlocks=self.numBlocks)
190 |         newCSVec.table = copy.deepcopy(self.table)
191 |         global cache
192 |         cachedVals = cache[(self.d, self.c, self.r, self.numBlocks, self.device)]
193 |         newCSVec.signs = cachedVals["signs"]
194 |         newCSVec.buckets = cachedVals["buckets"]
195 |         if self.numBlocks > 1:
196 |             newCSVec.blockSigns = cachedVals["blockSigns"]
197 |             newCSVec.blockOffsets = cachedVals["blockOffsets"]
198 |         return newCSVec
199 | 
200 |     def __imul__(self, other):
201 |         if isinstance(other, int) or isinstance(other, float):
202 |             self.table = self.table.mul_(other)
203 |         else:
204 |             raise ValueError(f"Can't multiply a CSVec by {other}")
205 |         return self
206 | 
207 |     def __truediv__(self, other):
208 |         if isinstance(other, int) or isinstance(other, float):
209 |             self.table = self.table.div_(other)
210 |         else:
211 |             raise ValueError(f"Can't divide a CSVec by {other}")
212 |         return self
213 | 
214 |     def __add__(self, other):
215 |         """ Returns the sum of self with other
216 | 
217 |         Args:
218 |             other: a CSVec with identical values of d, c, and r
219 |         """
220 |         # a bit roundabout in order to avoid initializing a new CSVec
221 |         returnCSVec = copy.deepcopy(self)
222 |         returnCSVec += other
223 |         return returnCSVec
224 | 
225 |     def __iadd__(self, other):
226 |         """ Accumulates another sketch
227 | 
228 |         Args:
229 |             other: a CSVec with identical values of d, c, r, device, numBlocks
230 |         """
231 |         if isinstance(other, CSVec):
232 |             # merges csh sketch into self
233 |             assert(self.d == other.d)
234 |             assert(self.c == other.c)
235 |             assert(self.r == other.r)
236 |             assert(self.device == other.device)
237 |             assert(self.numBlocks == other.numBlocks)
238 |             self.table += other.table
239 |         else:
240 |             raise ValueError("Can't add this to a CSVec: {}".format(other))
241 |         return self
242 | 
243 |     def accumulateTable(self, table):
244 |         """ Adds a CSVec.table to self
245 | 
246 |         Args:
247 |             table: the table to be added
248 | 
249 |         """
250 |         if table.size() != self.table.size():
251 |             msg = "Passed in table has size {}, expecting {}"
252 |             raise ValueError(msg.format(table.size(), self.table.size()))
253 | 
254 |         self.table += table
255 | 
256 |     def accumulateVec(self, vec):
257 |         """ Sketches a vector and adds the result to self
258 | 
259 |         Args:
260 |             vec: the vector to be sketched
261 |         """
262 |         assert(len(vec.size()) == 1 and vec.size()[0] == self.d)
263 | 
264 |         # the vector is sketched to each row independently
265 |         for r in range(self.r):
266 |             buckets = self.buckets[r,:].to(self.device)
267 |             signs = self.signs[r,:].to(self.device)
268 |             # the main computation here is the bincount below, but
269 |             # there's lots of index accounitng leading up to it due
270 |             # to numBlocks being potentially > 1
271 |             for blockId in range(self.numBlocks):
272 |                 start = blockId * buckets.size()[0]
273 |                 end = (blockId + 1) * buckets.size()[0]
274 |                 end = min(end, self.d)
275 |                 offsetBuckets = buckets[:end-start].clone()
276 |                 offsetSigns = signs[:end-start].clone()
277 |                 if self.numBlocks > 1:
278 |                     offsetBuckets += self.blockOffsets[blockId]
279 |                     offsetBuckets %= self.c
280 |                     offsetSigns *= self.blockSigns[blockId]
281 |                 # bincount computes the sum of all values in the vector
282 |                 # that correspond to each bucket
283 |                 self.table[r,:] += torch.bincount(
284 |                                     input=offsetBuckets,
285 |                                     weights=offsetSigns * vec[start:end],
286 |                                     minlength=self.c
287 |                                    )
288 | 
289 |     def _findHHK(self, k):
290 |         assert(k is not None)
291 |         #tokens = torch.arange(self.d, device=self.device)
292 |         #vals = self._findValues(tokens)
293 |         vals = self._findAllValues()
294 | 
295 |         # sort is faster than torch.topk...
296 |         #HHs = torch.sort(vals**2)[1][-k:]
297 | 
298 |         # topk on cuda returns what looks like uninitialized memory if
299 |         # vals has nan values in it
300 |         # saving to a zero-initialized output array instead of using the
301 |         # output of topk appears to solve this problem
302 |         outVals = torch.zeros(k, device=vals.device)
303 |         HHs = torch.zeros(k, device=vals.device).long()
304 |         torch.topk(vals**2, k, sorted=False, out=(outVals, HHs))
305 |         return HHs, vals[HHs]
306 | 
307 |     def _findHHThr(self, thr):
308 |         assert(thr is not None)
309 |         vals = self._findAllValues()
310 |         HHs = vals.abs() >= thr
311 |         return HHs, vals[HHs]
312 | 
313 |         """ this is a potentially faster way to compute the same thing,
314 |         but it doesn't play nicely with numBlocks > 1, so for now I'm
315 |         just using the slower code above
316 | 
317 |         # to figure out which items are heavy hitters, check whether
318 |         # self.table exceeds thr (in magnitude) in at least r/2 of
319 |         # the rows. These elements are exactly those for which the median
320 |         # exceeds thr, but computing the median is expensive, so only
321 |         # calculate it after we identify which ones are heavy
322 |         tablefiltered = (  (self.table >  thr).float()
323 |                          - (self.table < -thr).float())
324 |         est = torch.zeros(self.d, device=self.device)
325 |         for r in range(self.r):
326 |             est += tablefiltered[r, self.buckets[r,:]] * self.signs[r, :]
327 |         est = (  (est >=  math.ceil(self.r/2.)).float()
328 |                - (est <= -math.ceil(self.r/2.)).float())
329 | 
330 |         # HHs - heavy coordinates
331 |         HHs = torch.nonzero(est)
332 |         return HHs, self._findValues(HHs)
333 |         """
334 | 
335 |     def _findValues(self, coords):
336 |         # estimating frequency of input coordinates
337 |         assert(self.numBlocks == 1)
338 |         d = coords.size()[0]
339 |         vals = torch.zeros(self.r, self.d, device=self.device)
340 |         for r in range(self.r):
341 |             vals[r] = (self.table[r, self.buckets[r, coords]]
342 |                        * self.signs[r, coords])
343 |         return vals.median(dim=0)[0]
344 | 
345 |     def _findAllValues(self):
346 |         if self.numBlocks == 1:
347 |             vals = torch.zeros(self.r, self.d, device=self.device)
348 |             for r in range(self.r):
349 |                 vals[r] = (self.table[r, self.buckets[r,:]]
350 |                            * self.signs[r,:])
351 |             return vals.median(dim=0)[0]
352 |         else:
353 |             medians = torch.zeros(self.d, device=self.device)
354 |             for blockId in range(self.numBlocks):
355 |                 start = blockId * self.buckets.size()[1]
356 |                 end = (blockId + 1) * self.buckets.size()[1]
357 |                 end = min(end, self.d)
358 |                 vals = torch.zeros(self.r, end-start, device=self.device)
359 |                 for r in range(self.r):
360 |                     buckets = self.buckets[r, :end-start]
361 |                     signs = self.signs[r, :end-start]
362 |                     offsetBuckets = buckets + self.blockOffsets[blockId]
363 |                     offsetBuckets %= self.c
364 |                     offsetSigns = signs * self.blockSigns[blockId]
365 |                     vals[r] = (self.table[r, offsetBuckets]
366 |                                 * offsetSigns)
367 |                 medians[start:end] = vals.median(dim=0)[0]
368 |             return medians
369 | 
370 |     def _findHHs(self, k=None, thr=None):
371 |         assert((k is None) != (thr is None))
372 |         if k is not None:
373 |             return self._findHHK(k)
374 |         else:
375 |             return self._findHHThr(thr)
376 | 
377 |     def unSketch(self, k=None, epsilon=None):
378 |         """ Performs heavy-hitter recovery on the sketch
379 | 
380 |         Args:
381 |             k: if not None, the number of heavy hitters to recover
382 |             epsilon: if not None, the approximation error in the recovery.
383 |                 The returned heavy hitters are estimated to be greater
384 |                 than epsilon * self.l2estimate()
385 | 
386 |         Returns:
387 |             A vector containing the heavy hitters, with zero everywhere
388 |             else
389 | 
390 |         Note:
391 |             exactly one of k and epsilon must be non-None
392 |         """
393 | 
394 |         # either epsilon or k might be specified
395 |         # (but not both). Act accordingly
396 |         if epsilon is None:
397 |             thr = None
398 |         else:
399 |             thr = epsilon * self.l2estimate()
400 | 
401 |         hhs = self._findHHs(k=k, thr=thr)
402 | 
403 |         if k is not None:
404 |             assert(len(hhs[1]) == k)
405 |         if epsilon is not None:
406 |             assert((hhs[1] < thr).sum() == 0)
407 | 
408 |         # the unsketched vector is 0 everywhere except for HH
409 |         # coordinates, which are set to the HH values
410 |         unSketched = torch.zeros(self.d, device=self.device)
411 |         unSketched[hhs[0]] = hhs[1]
412 |         return unSketched
413 | 
414 |     def l2estimate(self):
415 |         """ Return an estimate of the L2 norm of the sketch """
416 |         # l2 norm esimation from the sketch
417 |         return np.sqrt(torch.median(torch.sum(self.table**2,1)).item())
418 | 
419 |     @classmethod
420 |     def median(cls, csvecs):
421 |         # make sure all CSVecs match
422 |         d = csvecs[0].d
423 |         c = csvecs[0].c
424 |         r = csvecs[0].r
425 |         device = csvecs[0].device
426 |         numBlocks = csvecs[0].numBlocks
427 |         for csvec in csvecs:
428 |             assert(csvec.d == d)
429 |             assert(csvec.c == c)
430 |             assert(csvec.r == r)
431 |             assert(csvec.device == device)
432 |             assert(csvec.numBlocks == numBlocks)
433 | 
434 |         tables = [csvec.table for csvec in csvecs]
435 |         med = torch.median(torch.stack(tables), dim=0)[0]
436 |         returnCSVec = copy.deepcopy(csvecs[0])
437 |         returnCSVec.table = med
438 |         return returnCSVec
439 | 


--------------------------------------------------------------------------------