├── csvec ├── __init__.py ├── test_csvec.py └── csvec.py ├── setup.py ├── README.md ├── .gitignore └── LICENSE /csvec/__init__.py: -------------------------------------------------------------------------------- 1 | from .csvec import CSVec 2 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup(name='csvec', 4 | version='0.0.1', 5 | description='Count Sketch Vector', 6 | url='https://github.com/nikitaivkin/csh', 7 | author='Nikita Ivkin', 8 | author_email='nikitasemail@nikita.com', 9 | license='MIT', 10 | packages=['csvec'], 11 | zip_safe=False) 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CSVec: Count Sketch Vector 2 | 3 | ## Installation 4 | Dependencies: `pytorch` and `numpy`. Tested with `torch==1.0.1` and `numpy==1.15.3`, but this should work with a wide range of versions. 5 | 6 | `git clone` the repository to your local machine, move to the directory containing `setup.py`, then run 7 | ``` 8 | pip install -e . 9 | ``` 10 | to install this package. 11 | 12 | ## Description 13 | 14 | This package contains one main class, `CSVec`, which computes the Count Sketch of input vectors, and can extract heavy hitters from a Count Sketch. 
15 | 16 | Link to the Count Sketch paper -> http://www.mathcs.emory.edu/~cheung/Courses/584-StreamDB/Syllabus/papers/Frequency-count/FrequentStream.pdf 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 
import unittest
import csvec
from csvec import CSVec
import torch


def _resetHashCache():
    """Empty the module-level hash cache used by CSVec.

    When the tests are imported through the package, ``csvec`` is the
    package and the cache lives in the ``csvec.csvec`` submodule; when
    they are run from inside the package directory, ``csvec`` is the
    module itself. Handle both so the reset actually takes effect.
    """
    module = getattr(csvec, "csvec", csvec)
    module.cache.clear()


class Base:
    # use Base class to hide CSVecTestCase from the unittest runner
    # we only want the subclasses to actually be run

    class CSVecTestCase(unittest.TestCase):
        # subclasses override these two attributes
        device = "cpu"
        numBlocks = 1

        def setUp(self):
            # reset csvec's global cache between tests
            _resetHashCache()
            self.csvecArgs = {"numBlocks": self.numBlocks,
                              "device": self.device}

        def testRandomness(self):
            # make sure two sketches get the same hashes and signs
            d = 100
            c = 20
            r = 5
            a = CSVec(d, c, r, **self.csvecArgs)
            b = CSVec(d, c, r, **self.csvecArgs)
            self.assertTrue(torch.allclose(a.signs, b.signs))
            # buckets are integer tensors, so compare them exactly
            self.assertTrue(torch.equal(a.buckets, b.buckets))

            if self.numBlocks > 1:
                self.assertTrue(torch.equal(a.blockOffsets,
                                            b.blockOffsets))
                self.assertTrue(torch.equal(a.blockSigns,
                                            b.blockSigns))

        def testInit(self):
            # make sure the table starts out zeroed
            d = 100
            c = 20
            r = 5
            a = CSVec(d, c, r, **self.csvecArgs)
            zeros = torch.zeros(r, c).to(self.device)
            self.assertTrue(torch.allclose(a.table, zeros))

        def testSketchVec(self):
            # sketch a vector with all zeros except a single 1
            # then the table should be zeros everywhere except a single
            # 1 in each row
            d = 100
            c = 1
            r = 5
            a = CSVec(d=d, c=c, r=r, **self.csvecArgs)
            vec = torch.zeros(d).to(self.device)
            vec[0] = 1
            a.accumulateVec(vec)
            # make sure the sketch only has one nonzero entry per row
            for i in range(r):
                with self.subTest(row=i):
                    self.assertEqual(a.table[i,:].nonzero().numel(), 1)

            # make sure each row sums to +-1
            summed = a.table.abs().sum(dim=1).view(-1)
            ones = torch.ones(r).to(self.device)
            self.assertTrue(torch.allclose(summed, ones))

        def testZeroSketch(self):
            # zero() must wipe a table that has had a vector sketched in
            d = 100
            c = 20
            r = 5
            a = CSVec(d, c, r, **self.csvecArgs)
            vec = torch.rand(d).to(self.device)
            a.accumulateVec(vec)

            zeros = torch.zeros((r, c)).to(self.device)
            self.assertFalse(torch.allclose(a.table, zeros))

            a.zero()
            self.assertTrue(torch.allclose(a.table, zeros))

        def testUnsketch(self):
            # make sure heavy hitter recovery works correctly

            # use a gigantic sketch so there's no chance of collision
            d = 5
            c = 10000
            r = 20
            a = CSVec(d, c, r, **self.csvecArgs)
            vec = torch.rand(d).to(self.device)

            a.accumulateVec(vec)

            with self.subTest(method="topk"):
                recovered = a.unSketch(k=d)
                self.assertTrue(torch.allclose(recovered, vec))

            with self.subTest(method="epsilon"):
                thr = vec.abs().min() * 0.9
                recovered = a.unSketch(epsilon=thr / vec.norm())
                self.assertTrue(torch.allclose(recovered, vec))

        def testSketchSum(self):
            # summing sketches of the basis vectors must recover all-ones
            d = 5
            c = 10000
            r = 20

            summed = CSVec(d, c, r, **self.csvecArgs)
            for i in range(d):
                vec = torch.zeros(d).to(self.device)
                vec[i] = 1
                sketch = CSVec(d, c, r, **self.csvecArgs)
                sketch.accumulateVec(vec)
                summed += sketch

            recovered = summed.unSketch(k=d)
            trueSum = torch.ones(d).to(self.device)
            self.assertTrue(torch.allclose(recovered, trueSum))

        def testL2(self):
            # the L2 estimate of a collision-free sketch matches the norm
            d = 5
            c = 10000
            r = 20

            vec = torch.randn(d).to(self.device)
            a = CSVec(d, c, r, **self.csvecArgs)
            a.accumulateVec(vec)

            tol = 0.0001
            # compare plain Python floats rather than mixing a numpy
            # scalar with a tensor
            self.assertTrue(abs(a.l2estimate() - vec.norm().item()) < tol)

        def testMedian(self):
            # the elementwise median of sketches of (v, v+1, v+2)
            # must unsketch to v+1
            d = 5
            c = 10000
            r = 20

            sketches = [CSVec(d, c, r, **self.csvecArgs) for _ in range(3)]
            # don't name the loop variable `csvec`: that would shadow
            # the imported module inside this method
            for i, sketch in enumerate(sketches):
                vec = torch.arange(d).float().to(self.device) + i
                sketch.accumulateVec(vec)
            median = CSVec.median(sketches)
            recovered = median.unSketch(k=d)
            trueMedian = torch.arange(d).float().to(self.device) + 1
            self.assertTrue(torch.allclose(recovered, trueMedian))


class TestCaseCPU1(Base.CSVecTestCase):
    device = "cpu"
    numBlocks = 1


class TestCaseCPU2(Base.CSVecTestCase):
    device = "cpu"
    numBlocks = 2


@unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
class TestCaseCUDA2(Base.CSVecTestCase):
    device = "cuda"
    numBlocks = 2
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
import math
import numpy as np
import copy
import torch

LARGEPRIME = 2**61-1

# module-level store of the precomputed hash tensors, keyed by
# (d, c, r, numBlocks, device), so sketches with identical parameters
# share one copy of signs/buckets
cache = {}

#import line_profiler
#import atexit
#profile = line_profiler.LineProfiler()
#atexit.register(profile.print_stats)

class CSVec(object):
    """ Count Sketch of a vector

    Treating a vector as a stream of tokens with associated weights,
    this class computes the count sketch of an input vector, and
    supports operations on the resulting sketch.

    public methods: zero, accumulateVec, accumulateTable, unSketch,
        l2estimate, median, __add__, __iadd__, __imul__, __truediv__,
        __itruediv__
    """

    def __init__(self, d, c, r, doInitialize=True, device=None,
                 numBlocks=1):
        """ Constructor for CSVec

        Args:
            d: the cardinality of the sketched vector
            c: the number of columns (buckets) in the sketch
            r: the number of rows in the sketch
            doInitialize: if False, you are responsible for setting
                self.table, self.signs, self.buckets, self.blockSigns,
                and self.blockOffsets
            device: which device to use (cuda or cpu). If None, chooses
                cuda if available, else cpu
            numBlocks: mechanism to reduce memory consumption. A value
                of 1 leads to a normal sketch. Higher values reduce
                peak memory consumption proportionally but decrease
                randomness of the hashes
        Raises:
            ValueError: if device is neither a torch.device nor a
                string that is "cpu" or contains "cuda"
        Note:
            Since sketching a vector always requires the hash functions
            to be evaluated for all of 0..d-1, we precompute the
            hash values in the constructor. However, this takes d*r
            memory, which is sometimes too big. We therefore only
            compute hashes of 0..(d/numBlocks - 1), and we let the
            hash of all other tokens be the hash of that token modulo
            d/numBlocks. In order to recover some of the lost randomness,
            we add a random number to each "block" (self.blockOffsets)
            and multiply each block by a random sign (self.blockSigns)
        """

        # save random quantities in a module-level variable so we can
        # reuse them if someone else makes a sketch with the same d, c, r
        global cache

        self.r = r # num of rows
        self.c = c # num of columns
        # need int() here b/c annoying np returning np.int64...
        self.d = int(d) # vector dimensionality

        # reduce memory consumption of signs & buckets by constraining
        # them to be repetitions of a single block
        self.numBlocks = numBlocks

        # choose the device automatically if none was given
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        else:
            if (not isinstance(device, torch.device) and
                    not ("cuda" in device or device == "cpu")):
                msg = "Expected a valid device, got {}"
                raise ValueError(msg.format(device))

        self.device = device

        # this flag indicates that the caller plans to set up
        # self.signs, self.buckets, self.blockSigns, and self.blockOffsets
        # itself (e.g. self.deepcopy does this)
        if not doInitialize:
            return

        # initialize the sketch to all zeros
        self.table = torch.zeros((r, c), device=self.device)

        # if we already have these, don't do the same computation
        # again (wasting memory storing the same data several times)
        cacheKey = (d, c, r, numBlocks, device)
        if cacheKey in cache:
            self.signs = cache[cacheKey]["signs"]
            self.buckets = cache[cacheKey]["buckets"]
            if self.numBlocks > 1:
                self.blockSigns = cache[cacheKey]["blockSigns"]
                self.blockOffsets = cache[cacheKey]["blockOffsets"]
            return

        # initialize hashing functions for each row:
        # 2 random numbers for bucket hashes + 4 random numbers for
        # sign hashes
        # maintain existing random state so we don't mess with
        # the main module trying to set the random seed but still
        # get reproducible hashes for the same value of r

        # do all these computations on the CPU, since pytorch
        # is incapable of in-place mod, and without that, this
        # computation uses up too much GPU RAM
        # NOTE(review): only the CPU generator state is saved and
        # restored here; confirm whether the CUDA generator should be
        # preserved as well when blockSigns/blockOffsets are drawn on
        # a cuda device
        rand_state = torch.random.get_rng_state()
        try:
            torch.random.manual_seed(42)
            hashes = torch.randint(0, LARGEPRIME, (r, 6),
                                   dtype=torch.int64, device="cpu")

            # compute random blockOffsets and blockSigns
            if self.numBlocks > 1:
                nTokens = self.d // numBlocks
                if self.d % numBlocks != 0:
                    # so that we only need numBlocks repetitions
                    nTokens += 1
                self.blockSigns = torch.randint(0, 2,
                                                size=(self.numBlocks,),
                                                device=self.device) * 2 - 1
                self.blockOffsets = torch.randint(0, self.c,
                                                  size=(self.numBlocks,),
                                                  device=self.device)
            else:
                assert(numBlocks == 1)
                nTokens = self.d
        finally:
            # always hand the caller's random state back, even if
            # something above raised
            torch.random.set_rng_state(rand_state)

        # tokens are the indices of the vector entries
        tokens = torch.arange(nTokens, dtype=torch.int64, device="cpu")
        tokens = tokens.reshape((1, nTokens))

        # computing sign hashes (4 wise independence)
        h1 = hashes[:,2:3]
        h2 = hashes[:,3:4]
        h3 = hashes[:,4:5]
        h4 = hashes[:,5:6]
        self.signs = (((h1 * tokens + h2) * tokens + h3) * tokens + h4)
        self.signs = ((self.signs % LARGEPRIME % 2) * 2 - 1).float()

        # only move to device now, since this computation takes too
        # much memory if done on the GPU, and it can't be done
        # in-place because pytorch (1.0.1) has no in-place modulo
        # function that works on large numbers
        self.signs = self.signs.to(self.device)

        # computing bucket hashes (2-wise independence)
        h1 = hashes[:,0:1]
        h2 = hashes[:,1:2]
        self.buckets = ((h1 * tokens) + h2) % LARGEPRIME % self.c

        # only move to device now. See comment above.
        # can't cast this to int, unfortunately, since we index with
        # this below, and pytorch only lets us index with long
        # tensors
        self.buckets = self.buckets.to(self.device)

        cache[cacheKey] = {"signs": self.signs,
                           "buckets": self.buckets}
        if numBlocks > 1:
            cache[cacheKey].update({"blockSigns": self.blockSigns,
                                    "blockOffsets": self.blockOffsets})

    def zero(self):
        """ Set all the entries of the sketch to zero """
        self.table.zero_()

    def cpu_(self):
        """ Move the table to the CPU and record "cpu" as the device

        Only self.table is moved; signs/buckets are left where they are.
        """
        self.device = "cpu"
        self.table = self.table.cpu()

    def cuda_(self, device="cuda"):
        """ Move the table to a CUDA device and record it as the device

        Args:
            device: the destination CUDA device

        Only self.table is moved; signs/buckets are left where they are.
        """
        self.device = device
        # pass the requested device through so that the table ends up
        # on the device recorded in self.device
        self.table = self.table.cuda(device)

    def half_(self):
        """ Convert the table to half precision """
        self.table = self.table.half()

    def float_(self):
        """ Convert the table to single precision """
        self.table = self.table.float()

    def __deepcopy__(self, memodict=None):
        """ Copy the table, sharing the (read-only) hash tensors """
        # don't initialize new CSVec, since that will calculate bc,
        # which is slow, even though we can just copy it over
        # directly without recomputing it
        newCSVec = CSVec(d=self.d, c=self.c, r=self.r,
                         doInitialize=False, device=self.device,
                         numBlocks=self.numBlocks)
        newCSVec.table = copy.deepcopy(self.table, memodict)
        # take the hash tensors from self rather than from the module
        # cache, so copying still works if the cache has been reset
        newCSVec.signs = self.signs
        newCSVec.buckets = self.buckets
        if self.numBlocks > 1:
            newCSVec.blockSigns = self.blockSigns
            newCSVec.blockOffsets = self.blockOffsets
        return newCSVec

    def __imul__(self, other):
        """ Scales the sketch in place by an int or float

        Raises:
            ValueError: if other is not an int or float
        """
        if isinstance(other, (int, float)):
            self.table = self.table.mul_(other)
        else:
            raise ValueError(f"Can't multiply a CSVec by {other}")
        return self

    def __truediv__(self, other):
        """ Returns a new sketch equal to self divided by other

        self is left untouched, mirroring __add__ vs. __iadd__

        Raises:
            ValueError: if other is not an int or float
        """
        if not isinstance(other, (int, float)):
            raise ValueError(f"Can't divide a CSVec by {other}")
        returnCSVec = copy.deepcopy(self)
        returnCSVec /= other
        return returnCSVec

    def __itruediv__(self, other):
        """ Divides the sketch in place by an int or float

        Raises:
            ValueError: if other is not an int or float
        """
        if isinstance(other, (int, float)):
            self.table = self.table.div_(other)
        else:
            raise ValueError(f"Can't divide a CSVec by {other}")
        return self

    def __add__(self, other):
        """ Returns the sum of self with other

        Args:
            other: a CSVec with identical values of d, c, and r
        """
        # a bit roundabout in order to avoid initializing a new CSVec
        returnCSVec = copy.deepcopy(self)
        returnCSVec += other
        return returnCSVec

    def __iadd__(self, other):
        """ Accumulates another sketch

        Args:
            other: a CSVec with identical values of d, c, r, device, numBlocks

        Raises:
            ValueError: if other is not a CSVec
        """
        if isinstance(other, CSVec):
            # merges csh sketch into self
            assert(self.d == other.d)
            assert(self.c == other.c)
            assert(self.r == other.r)
            assert(self.device == other.device)
            assert(self.numBlocks == other.numBlocks)
            self.table += other.table
        else:
            raise ValueError("Can't add this to a CSVec: {}".format(other))
        return self

    def accumulateTable(self, table):
        """ Adds a CSVec.table to self

        Args:
            table: the table to be added

        Raises:
            ValueError: if table's size differs from self.table's
        """
        if table.size() != self.table.size():
            msg = "Passed in table has size {}, expecting {}"
            raise ValueError(msg.format(table.size(), self.table.size()))

        self.table += table

    def accumulateVec(self, vec):
        """ Sketches a vector and adds the result to self

        Args:
            vec: the vector to be sketched
        """
        assert(len(vec.size()) == 1 and vec.size()[0] == self.d)

        # the vector is sketched to each row independently
        for r in range(self.r):
            buckets = self.buckets[r,:].to(self.device)
            signs = self.signs[r,:].to(self.device)
            # the main computation here is the bincount below, but
            # there's lots of index accounting leading up to it due
            # to numBlocks being potentially > 1
            for blockId in range(self.numBlocks):
                start = blockId * buckets.size()[0]
                end = (blockId + 1) * buckets.size()[0]
                end = min(end, self.d)
                offsetBuckets = buckets[:end-start].clone()
                offsetSigns = signs[:end-start].clone()
                if self.numBlocks > 1:
                    offsetBuckets += self.blockOffsets[blockId]
                    offsetBuckets %= self.c
                    offsetSigns *= self.blockSigns[blockId]
                # bincount computes the sum of all values in the vector
                # that correspond to each bucket
                self.table[r,:] += torch.bincount(
                                    input=offsetBuckets,
                                    weights=offsetSigns * vec[start:end],
                                    minlength=self.c
                                   )

    def _findHHK(self, k):
        """ Returns (indices, values) of the k largest-magnitude estimates """
        assert(k is not None)
        vals = self._findAllValues()

        # topk on cuda returns what looks like uninitialized memory if
        # vals has nan values in it
        # saving to a zero-initialized output array instead of using the
        # output of topk appears to solve this problem
        outVals = torch.zeros(k, device=vals.device)
        HHs = torch.zeros(k, device=vals.device).long()
        torch.topk(vals**2, k, sorted=False, out=(outVals, HHs))
        return HHs, vals[HHs]

    def _findHHThr(self, thr):
        """ Returns (mask, values) of estimates with magnitude >= thr """
        assert(thr is not None)
        vals = self._findAllValues()
        HHs = vals.abs() >= thr
        return HHs, vals[HHs]

        # NOTE: a potentially faster approach would first count, per
        # coordinate, in how many rows the table entry exceeds thr in
        # magnitude, and only compute medians for coordinates that
        # pass in at least r/2 rows. It doesn't play nicely with
        # numBlocks > 1, so the simpler code above is used instead

    def _findValues(self, coords):
        """ Estimates the values of the given coordinates

        Args:
            coords: 1-d long tensor of coordinates to estimate

        Returns:
            the median-over-rows estimate for each entry of coords

        Only supported for numBlocks == 1
        """
        # estimating frequency of input coordinates
        assert(self.numBlocks == 1)
        d = coords.size()[0]
        # one estimate per row for each *requested* coordinate
        vals = torch.zeros(self.r, d, device=self.device)
        for r in range(self.r):
            vals[r] = (self.table[r, self.buckets[r, coords]]
                       * self.signs[r, coords])
        return vals.median(dim=0)[0]

    def _findAllValues(self):
        """ Estimates all d coordinates as the median over the rows """
        if self.numBlocks == 1:
            vals = torch.zeros(self.r, self.d, device=self.device)
            for r in range(self.r):
                vals[r] = (self.table[r, self.buckets[r,:]]
                           * self.signs[r,:])
            return vals.median(dim=0)[0]
        else:
            medians = torch.zeros(self.d, device=self.device)
            for blockId in range(self.numBlocks):
                # each block reuses the same hashes, shifted by the
                # block's offset and flipped by the block's sign
                start = blockId * self.buckets.size()[1]
                end = (blockId + 1) * self.buckets.size()[1]
                end = min(end, self.d)
                vals = torch.zeros(self.r, end-start, device=self.device)
                for r in range(self.r):
                    buckets = self.buckets[r, :end-start]
                    signs = self.signs[r, :end-start]
                    offsetBuckets = buckets + self.blockOffsets[blockId]
                    offsetBuckets %= self.c
                    offsetSigns = signs * self.blockSigns[blockId]
                    vals[r] = (self.table[r, offsetBuckets]
                                * offsetSigns)
                medians[start:end] = vals.median(dim=0)[0]
            return medians

    def _findHHs(self, k=None, thr=None):
        """ Dispatches to top-k or threshold recovery (exactly one given) """
        assert((k is None) != (thr is None))
        if k is not None:
            return self._findHHK(k)
        else:
            return self._findHHThr(thr)

    def unSketch(self, k=None, epsilon=None):
        """ Performs heavy-hitter recovery on the sketch

        Args:
            k: if not None, the number of heavy hitters to recover
            epsilon: if not None, the approximation error in the recovery.
                The returned heavy hitters are estimated to be greater
                (in magnitude) than epsilon * self.l2estimate()

        Returns:
            A vector containing the heavy hitters, with zero everywhere
            else

        Note:
            exactly one of k and epsilon must be non-None
        """

        # either epsilon or k might be specified
        # (but not both). Act accordingly
        if epsilon is None:
            thr = None
        else:
            thr = epsilon * self.l2estimate()

        hhs = self._findHHs(k=k, thr=thr)

        if k is not None:
            assert(len(hhs[1]) == k)
        if epsilon is not None:
            # heavy hitters are selected by magnitude, so the sanity
            # check has to look at magnitudes too; otherwise every
            # negative heavy hitter would trip it
            assert((hhs[1].abs() < thr).sum() == 0)

        # the unsketched vector is 0 everywhere except for HH
        # coordinates, which are set to the HH values
        unSketched = torch.zeros(self.d, device=self.device)
        unSketched[hhs[0]] = hhs[1]
        return unSketched

    def l2estimate(self):
        """ Return an estimate of the L2 norm of the sketch """
        # l2 norm estimation from the sketch: the median over rows of
        # each row's sum of squares, square-rooted
        return np.sqrt(torch.median(torch.sum(self.table**2,1)).item())

    @classmethod
    def median(cls, csvecs):
        """ Returns a sketch whose table is the elementwise median

        Args:
            csvecs: a non-empty list of CSVecs with identical d, c, r,
                device and numBlocks
        """
        # make sure all CSVecs match
        d = csvecs[0].d
        c = csvecs[0].c
        r = csvecs[0].r
        device = csvecs[0].device
        numBlocks = csvecs[0].numBlocks
        for csvec in csvecs:
            assert(csvec.d == d)
            assert(csvec.c == c)
            assert(csvec.r == r)
            assert(csvec.device == device)
            assert(csvec.numBlocks == numBlocks)

        tables = [csvec.table for csvec in csvecs]
        med = torch.median(torch.stack(tables), dim=0)[0]
        returnCSVec = copy.deepcopy(csvecs[0])
        returnCSVec.table = med
        return returnCSVec