├── .gitignore ├── LICENSE ├── LICENSE-MIT ├── NOTICE ├── README.md ├── cpp ├── Makefile ├── MurmurHash3.cpp ├── MurmurHash3.h ├── README.md ├── chain.cpp ├── chain.h ├── docopt.cpp ├── docopt.h ├── docopt_private.h ├── docopt_util.h ├── docopt_value.h ├── hash.h ├── main.cpp ├── param.h ├── streamhash.cpp ├── streamhash.h ├── synDataNoisy.svm ├── test_xstream_static.py └── util.h └── python ├── Chains.py ├── README.md ├── StreamhashProjection.py ├── synData.png ├── synDataNoisy.tsv └── test_xstream.py /.gitignore: -------------------------------------------------------------------------------- 1 | xstream 2 | *.o 3 | tex/ 4 | *.png 5 | *.swp 6 | .DS_Store 7 | data/ 8 | env* 9 | *.synctex.gz 10 | 11 | # Byte-compiled / optimized / DLL files 12 | __pycache__/ 13 | *.py[cod] 14 | *$py.class 15 | 16 | # C extensions 17 | *.so 18 | 19 | # Distribution / packaging 20 | .Python 21 | env/ 22 | build/ 23 | develop-eggs/ 24 | dist/ 25 | downloads/ 26 | eggs/ 27 | .eggs/ 28 | lib/ 29 | lib64/ 30 | parts/ 31 | sdist/ 32 | var/ 33 | wheels/ 34 | *.egg-info/ 35 | .installed.cfg 36 | *.egg 37 | 38 | # PyInstaller 39 | # Usually these files are written by a python script from a template 40 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
41 | *.manifest 42 | *.spec 43 | 44 | # Installer logs 45 | pip-log.txt 46 | pip-delete-this-directory.txt 47 | 48 | # Unit test / coverage reports 49 | htmlcov/ 50 | .tox/ 51 | .coverage 52 | .coverage.* 53 | .cache 54 | nosetests.xml 55 | coverage.xml 56 | *.cover 57 | .hypothesis/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | *.log 65 | local_settings.py 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # pyenv 84 | .python-version 85 | 86 | # celery beat schedule file 87 | celerybeat-schedule 88 | 89 | # SageMath parsed files 90 | *.sage.py 91 | 92 | # dotenv 93 | .env 94 | 95 | # virtualenv 96 | .venv 97 | venv/ 98 | ENV/ 99 | 100 | # Spyder project settings 101 | .spyderproject 102 | .spyproject 103 | 104 | # Rope project settings 105 | .ropeproject 106 | 107 | # mkdocs documentation 108 | /site 109 | 110 | # mypy 111 | .mypy_cache/ 112 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. 
For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright 2018 Emaad Ahmed Manzoor 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012 Vladimir Keleshev, 2 | 3 | Permission is hereby granted, free of charge, to any person 4 | obtaining a copy of this software and associated 5 | documentation files (the "Software"), to deal in the Software 6 | without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, 8 | and/or sell copies of the Software, and to permit persons to 9 | whom the Software is furnished to do so, subject to the 10 | following conditions: 11 | 12 | The above copyright notice and this permission notice shall 13 | be included in all copies or substantial portions of the 14 | Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY 17 | KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 18 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 19 | PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR 20 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 22 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 23 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 24 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | xstream 2 | 3 | Unless otherwise noted in this file, all software is copyright 2018 4 | Emaad Ahmed Manzoor, licensed under the Apache License 2.0 (see LICENSE). 5 | 6 | docopt.cpp 7 | 8 | Copyright (c) 2012 Vladimir Keleshev, 9 | Licensed under the MIT license (see LICENSE-MIT). 10 | https://github.com/docopt/docopt.cpp 11 | 12 | Files: docopt.cpp docopt.h docopt_private.h docopt_util.h docopt_value.h 13 | Modifications: Remove "#pragma mark" directives. 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # xStream Core 2 | 3 | 4 | 5 | [https://cmuxstream.github.io](https://cmuxstream.github.io) 6 | 7 | Provided in this repository are two implementations of xStream: 8 | 9 | 1. [Python](https://github.com/cmuxstream/cmuxstream-core/tree/master/python): For static data. 10 | 2. [C++](https://github.com/cmuxstream/cmuxstream-core/tree/master/cpp): For static, row-stream and evolving-stream data. 11 | 12 | For further information, see README.md's in the respective folders for each implementation. 
13 | 14 | # Contact 15 | 16 | * emaad@cmu.edu 17 | * hlamba@andrew.cmu.edu 18 | * lakoglu@andrew.cmu.edu 19 | -------------------------------------------------------------------------------- /cpp/Makefile: -------------------------------------------------------------------------------- 1 | CC=g++ 2 | CFLAGS=-Wall --std=c++11 -g 3 | SOURCES := $(wildcard *.cpp) 4 | OBJS := $(SOURCES:.cpp=.o) 5 | 6 | .PHONY: clean 7 | 8 | optimized: CFLAGS += -Ofast -march=native -mtune=native -lpthread -pthread 9 | optimized: xstream 10 | 11 | debug: CFLAGS += -DDEBUG -fsanitize=address -fsanitize=undefined -g 12 | debug: xstream 13 | 14 | profile: CFLAGS += -L/home/localdirs/stufs1/emanzoor/.local/lib -Wl,--no-as-needed,-lprofiler,--as-needed -g 15 | profile: xstream 16 | 17 | xstream: $(OBJS) 18 | $(CC) $(CFLAGS) $^ -o $@ 19 | 20 | .cpp.o: 21 | $(CC) $(CFLAGS) -c $< 22 | 23 | clean: 24 | rm -f xstream $(OBJS) 25 | -------------------------------------------------------------------------------- /cpp/MurmurHash3.cpp: -------------------------------------------------------------------------------- 1 | //----------------------------------------------------------------------------- 2 | // MurmurHash3 was written by Austin Appleby, and is placed in the public 3 | // domain. The author hereby disclaims copyright to this source code. 4 | 5 | // Note - The x86 and x64 versions do _not_ produce the same results, as the 6 | // algorithms are optimized for their respective platforms. You can still 7 | // compile and run any of them on any platform, but your performance with the 8 | // non-native version will be less than optimal. 
9 | 10 | #include "MurmurHash3.h" 11 | 12 | //----------------------------------------------------------------------------- 13 | // Platform-specific functions and macros 14 | 15 | // Microsoft Visual Studio 16 | 17 | #if defined(_MSC_VER) 18 | 19 | #define FORCE_INLINE __forceinline 20 | 21 | #include 22 | 23 | #define ROTL32(x,y) _rotl(x,y) 24 | #define ROTL64(x,y) _rotl64(x,y) 25 | 26 | #define BIG_CONSTANT(x) (x) 27 | 28 | // Other compilers 29 | 30 | #else // defined(_MSC_VER) 31 | 32 | #define FORCE_INLINE inline __attribute__((always_inline)) 33 | 34 | inline uint32_t rotl32 ( uint32_t x, int8_t r ) 35 | { 36 | return (x << r) | (x >> (32 - r)); 37 | } 38 | 39 | inline uint64_t rotl64 ( uint64_t x, int8_t r ) 40 | { 41 | return (x << r) | (x >> (64 - r)); 42 | } 43 | 44 | #define ROTL32(x,y) rotl32(x,y) 45 | #define ROTL64(x,y) rotl64(x,y) 46 | 47 | #define BIG_CONSTANT(x) (x##LLU) 48 | 49 | #endif // !defined(_MSC_VER) 50 | 51 | //----------------------------------------------------------------------------- 52 | // Block read - if your platform needs to do endian-swapping or can only 53 | // handle aligned reads, do the conversion here 54 | 55 | FORCE_INLINE uint32_t getblock32 ( const uint32_t * p, int i ) 56 | { 57 | return p[i]; 58 | } 59 | 60 | FORCE_INLINE uint64_t getblock64 ( const uint64_t * p, int i ) 61 | { 62 | return p[i]; 63 | } 64 | 65 | //----------------------------------------------------------------------------- 66 | // Finalization mix - force all bits of a hash block to avalanche 67 | 68 | FORCE_INLINE uint32_t fmix32 ( uint32_t h ) 69 | { 70 | h ^= h >> 16; 71 | h *= 0x85ebca6b; 72 | h ^= h >> 13; 73 | h *= 0xc2b2ae35; 74 | h ^= h >> 16; 75 | 76 | return h; 77 | } 78 | 79 | //---------- 80 | 81 | FORCE_INLINE uint64_t fmix64 ( uint64_t k ) 82 | { 83 | k ^= k >> 33; 84 | k *= BIG_CONSTANT(0xff51afd7ed558ccd); 85 | k ^= k >> 33; 86 | k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); 87 | k ^= k >> 33; 88 | 89 | return k; 90 | } 91 | 92 | 
//----------------------------------------------------------------------------- 93 | 94 | void MurmurHash3_x86_32 ( const void * key, int len, 95 | uint32_t seed, void * out ) 96 | { 97 | const uint8_t * data = (const uint8_t*)key; 98 | const int nblocks = len / 4; 99 | 100 | uint32_t h1 = seed; 101 | 102 | const uint32_t c1 = 0xcc9e2d51; 103 | const uint32_t c2 = 0x1b873593; 104 | 105 | //---------- 106 | // body 107 | 108 | const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); 109 | 110 | for(int i = -nblocks; i; i++) 111 | { 112 | uint32_t k1 = getblock32(blocks,i); 113 | 114 | k1 *= c1; 115 | k1 = ROTL32(k1,15); 116 | k1 *= c2; 117 | 118 | h1 ^= k1; 119 | h1 = ROTL32(h1,13); 120 | h1 = h1*5+0xe6546b64; 121 | } 122 | 123 | //---------- 124 | // tail 125 | 126 | const uint8_t * tail = (const uint8_t*)(data + nblocks*4); 127 | 128 | uint32_t k1 = 0; 129 | 130 | switch(len & 3) 131 | { 132 | case 3: k1 ^= tail[2] << 16; 133 | case 2: k1 ^= tail[1] << 8; 134 | case 1: k1 ^= tail[0]; 135 | k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; 136 | }; 137 | 138 | //---------- 139 | // finalization 140 | 141 | h1 ^= len; 142 | 143 | h1 = fmix32(h1); 144 | 145 | *(uint32_t*)out = h1; 146 | } 147 | 148 | //----------------------------------------------------------------------------- 149 | 150 | void MurmurHash3_x86_128 ( const void * key, const int len, 151 | uint32_t seed, void * out ) 152 | { 153 | const uint8_t * data = (const uint8_t*)key; 154 | const int nblocks = len / 16; 155 | 156 | uint32_t h1 = seed; 157 | uint32_t h2 = seed; 158 | uint32_t h3 = seed; 159 | uint32_t h4 = seed; 160 | 161 | const uint32_t c1 = 0x239b961b; 162 | const uint32_t c2 = 0xab0e9789; 163 | const uint32_t c3 = 0x38b34ae5; 164 | const uint32_t c4 = 0xa1e38b93; 165 | 166 | //---------- 167 | // body 168 | 169 | const uint32_t * blocks = (const uint32_t *)(data + nblocks*16); 170 | 171 | for(int i = -nblocks; i; i++) 172 | { 173 | uint32_t k1 = getblock32(blocks,i*4+0); 174 | uint32_t 
k2 = getblock32(blocks,i*4+1); 175 | uint32_t k3 = getblock32(blocks,i*4+2); 176 | uint32_t k4 = getblock32(blocks,i*4+3); 177 | 178 | k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; 179 | 180 | h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b; 181 | 182 | k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; 183 | 184 | h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747; 185 | 186 | k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; 187 | 188 | h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35; 189 | 190 | k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; 191 | 192 | h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17; 193 | } 194 | 195 | //---------- 196 | // tail 197 | 198 | const uint8_t * tail = (const uint8_t*)(data + nblocks*16); 199 | 200 | uint32_t k1 = 0; 201 | uint32_t k2 = 0; 202 | uint32_t k3 = 0; 203 | uint32_t k4 = 0; 204 | 205 | switch(len & 15) 206 | { 207 | case 15: k4 ^= tail[14] << 16; 208 | case 14: k4 ^= tail[13] << 8; 209 | case 13: k4 ^= tail[12] << 0; 210 | k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; 211 | 212 | case 12: k3 ^= tail[11] << 24; 213 | case 11: k3 ^= tail[10] << 16; 214 | case 10: k3 ^= tail[ 9] << 8; 215 | case 9: k3 ^= tail[ 8] << 0; 216 | k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; 217 | 218 | case 8: k2 ^= tail[ 7] << 24; 219 | case 7: k2 ^= tail[ 6] << 16; 220 | case 6: k2 ^= tail[ 5] << 8; 221 | case 5: k2 ^= tail[ 4] << 0; 222 | k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; 223 | 224 | case 4: k1 ^= tail[ 3] << 24; 225 | case 3: k1 ^= tail[ 2] << 16; 226 | case 2: k1 ^= tail[ 1] << 8; 227 | case 1: k1 ^= tail[ 0] << 0; 228 | k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; 229 | }; 230 | 231 | //---------- 232 | // finalization 233 | 234 | h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; 235 | 236 | h1 += h2; h1 += h3; h1 += h4; 237 | h2 += h1; h3 += h1; h4 += h1; 238 | 239 | h1 = fmix32(h1); 240 | h2 = fmix32(h2); 241 | h3 = fmix32(h3); 242 | h4 = fmix32(h4); 243 | 244 | h1 += h2; h1 += h3; h1 += h4; 245 | h2 
+= h1; h3 += h1; h4 += h1; 246 | 247 | ((uint32_t*)out)[0] = h1; 248 | ((uint32_t*)out)[1] = h2; 249 | ((uint32_t*)out)[2] = h3; 250 | ((uint32_t*)out)[3] = h4; 251 | } 252 | 253 | //----------------------------------------------------------------------------- 254 | 255 | void MurmurHash3_x64_128 ( const void * key, const int len, 256 | const uint32_t seed, void * out ) 257 | { 258 | const uint8_t * data = (const uint8_t*)key; 259 | const int nblocks = len / 16; 260 | 261 | uint64_t h1 = seed; 262 | uint64_t h2 = seed; 263 | 264 | const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); 265 | const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); 266 | 267 | //---------- 268 | // body 269 | 270 | const uint64_t * blocks = (const uint64_t *)(data); 271 | 272 | for(int i = 0; i < nblocks; i++) 273 | { 274 | uint64_t k1 = getblock64(blocks,i*2+0); 275 | uint64_t k2 = getblock64(blocks,i*2+1); 276 | 277 | k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; 278 | 279 | h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; 280 | 281 | k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; 282 | 283 | h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; 284 | } 285 | 286 | //---------- 287 | // tail 288 | 289 | const uint8_t * tail = (const uint8_t*)(data + nblocks*16); 290 | 291 | uint64_t k1 = 0; 292 | uint64_t k2 = 0; 293 | 294 | switch(len & 15) 295 | { 296 | case 15: k2 ^= ((uint64_t)tail[14]) << 48; 297 | case 14: k2 ^= ((uint64_t)tail[13]) << 40; 298 | case 13: k2 ^= ((uint64_t)tail[12]) << 32; 299 | case 12: k2 ^= ((uint64_t)tail[11]) << 24; 300 | case 11: k2 ^= ((uint64_t)tail[10]) << 16; 301 | case 10: k2 ^= ((uint64_t)tail[ 9]) << 8; 302 | case 9: k2 ^= ((uint64_t)tail[ 8]) << 0; 303 | k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; 304 | 305 | case 8: k1 ^= ((uint64_t)tail[ 7]) << 56; 306 | case 7: k1 ^= ((uint64_t)tail[ 6]) << 48; 307 | case 6: k1 ^= ((uint64_t)tail[ 5]) << 40; 308 | case 5: k1 ^= ((uint64_t)tail[ 4]) << 32; 309 | case 4: k1 ^= ((uint64_t)tail[ 3]) << 
24; 310 | case 3: k1 ^= ((uint64_t)tail[ 2]) << 16; 311 | case 2: k1 ^= ((uint64_t)tail[ 1]) << 8; 312 | case 1: k1 ^= ((uint64_t)tail[ 0]) << 0; 313 | k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; 314 | }; 315 | 316 | //---------- 317 | // finalization 318 | 319 | h1 ^= len; h2 ^= len; 320 | 321 | h1 += h2; 322 | h2 += h1; 323 | 324 | h1 = fmix64(h1); 325 | h2 = fmix64(h2); 326 | 327 | h1 += h2; 328 | h2 += h1; 329 | 330 | ((uint64_t*)out)[0] = h1; 331 | ((uint64_t*)out)[1] = h2; 332 | } 333 | 334 | //----------------------------------------------------------------------------- 335 | 336 | -------------------------------------------------------------------------------- /cpp/MurmurHash3.h: -------------------------------------------------------------------------------- 1 | //----------------------------------------------------------------------------- 2 | // MurmurHash3 was written by Austin Appleby, and is placed in the public 3 | // domain. The author hereby disclaims copyright to this source code. 
4 | 5 | #ifndef _MURMURHASH3_H_ 6 | #define _MURMURHASH3_H_ 7 | 8 | //----------------------------------------------------------------------------- 9 | // Platform-specific functions and macros 10 | 11 | // Microsoft Visual Studio 12 | 13 | #if defined(_MSC_VER) && (_MSC_VER < 1600) 14 | 15 | typedef unsigned char uint8_t; 16 | typedef unsigned int uint32_t; 17 | typedef unsigned __int64 uint64_t; 18 | 19 | // Other compilers 20 | 21 | #else // defined(_MSC_VER) 22 | 23 | #include 24 | 25 | #endif // !defined(_MSC_VER) 26 | 27 | //----------------------------------------------------------------------------- 28 | 29 | void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out ); 30 | 31 | void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out ); 32 | 33 | void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out ); 34 | 35 | //----------------------------------------------------------------------------- 36 | 37 | #endif // _MURMURHASH3_H_ 38 | -------------------------------------------------------------------------------- /cpp/README.md: -------------------------------------------------------------------------------- 1 | # xStream - C++ 2 | 3 | [https://cmuxstream.github.io](https://cmuxstream.github.io) 4 | 5 | This implementation is in C++ 11 and works on static, row-stream and evolving-stream data. 6 | 7 | ## Format 8 | 9 | Static and row-stream data needs to be in the SVM-Light format (see `synDataNoisy.svm`, the 10 | SVM-Light formatted version of `synDataNoisy.tsv` used to test the 11 | [Python implementation](https://github.com/cmuxstream/cmuxstream-core/tree/master/python)). 12 | Evolving-stream data needs to be in the modified SVM-Light format described in the 13 | [datasets README](https://github.com/cmuxstream/cmuxstream-data/tree/master/evolving). 
14 | 15 | ## Build 16 | 17 | ``` 18 | make clean 19 | make optimized 20 | ``` 21 | 22 | ## Arguments and Examples 23 | 24 | Help with arguments can be displayed by running `./xstream --help`: 25 | ``` 26 | Usage: 27 | xstream [--k=<projection size>] 28 | [--c=<number of chains>] 29 | [--d=<depth>] 30 | [--rowstream] 31 | [--nwindows=<windowed>] 32 | [--initsample=<initial sample size>] 33 | [--scoringbatch=<scoring batch size>] 34 | [--cosine] 35 | 36 | xstream (-h | --help) 37 | 38 | Options: 39 | -h, --help Show this screen. 40 | --k=<projection size> Projection size [default: 100]. 41 | --c=<number of chains> Number of chains [default: 100]. 42 | --d=<depth> Depth [default: 15]. 43 | --rowstream Row stream (each row starts with a label). 44 | --nwindows=<windowed> > 0 if windowed [default: 1]. 45 | --initsample=<initial sample size> Initial sample size [default: 256]. 46 | --scoringbatch=<scoring batch size> Print scores at regular intervals [default: 1000]. 47 | --cosine Work in cosine space instead of Euclidean. 48 | ``` 49 | 50 | If the data is static or a row-stream in SVM-Light format, specify the `--rowstream` option. 51 | 52 | An example of running on 3082 rows of the [synthetic data](https://github.com/cmuxstream/cmuxstream-core/tree/master/python) 53 | without windows (mimics the static Python implementation) in Euclidean space, scoring just once at the end: 54 | ``` 55 | cat synDataNoisy.svm | ./xstream --k 50 --c 50 --d 10 --rowstream --nwindows 0 --initsample `wc -l < synDataNoisy.svm` --scoringbatch 100000 > scores 56 | python test_xstream_static.py 57 | ``` 58 | 59 | An example of evaluating these scores is in `test_xstream_static.py`. 
60 | 61 | # Contact 62 | 63 | * emaad@cmu.edu 64 | * hlamba@andrew.cmu.edu 65 | * lakoglu@andrew.cmu.edu 66 | -------------------------------------------------------------------------------- /cpp/chain.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "chain.h" 4 | #include 5 | #include "hash.h" 6 | #include 7 | #include 8 | #include 9 | #include "streamhash.h" 10 | #include 11 | #include 12 | /* NOTE(review): the functions below are defined inside namespace std, matching the declarations in chain.h. The C++ standard reserves that namespace for the library (only certain specializations may be added), so a project namespace would be safer - confirm before changing. */ 13 | namespace std { 14 | /* chains_init_features: overwrites every fs[c_i][d_i] with a value drawn from uniform_int_distribution<>(0, k-1) using prng. The chain count is fs.size() and the per-chain length is taken from fs[0], so fs must be non-empty. NOTE(review): k is unsigned, so k == 0 makes k-1 wrap - confirm callers always pass k >= 1. */ 15 | void 16 | chains_init_features(vector>& fs, uint k, mt19937_64& prng) { 17 | uint c = fs.size(); 18 | uint d = fs[0].size(); 19 | uniform_int_distribution<> dis(0, k-1); 20 | 21 | for (uint c_i = 0; c_i < c; c_i++) { 22 | for (uint d_i = 0; d_i < d; d_i++) { 23 | int feature = dis(prng); 24 | fs[c_i][d_i] = feature; 25 | } 26 | } 27 | } 28 | /* chains_add: scores the projected point xp against every chain. Returns the average over chains of the per-chain minimum of log2(count + 1) + (d + 1), where count is the sketch entry for the bin key built at level d. When update is true that entry is incremented before it is read. nchains is cmsketches.size() and depth is cmsketches[0].size(). NOTE(review): cmsketches[0] is read and the sum is divided by nchains without any check, so an empty cmsketches is not handled. */ 29 | float 30 | chains_add(vector& xp, vector>& deltamax, vector>& shift, 31 | vector,int>>>& cmsketches, 32 | vector>& fs, bool update) { 33 | 34 | uint k = xp.size(); 35 | uint nchains = cmsketches.size(); 36 | uint depth = cmsketches[0].size(); 37 | 38 | vector> scaled_bincount(nchains, vector(depth)); 39 | /* Per chain: prebin keeps one running bin coordinate per projected feature, used marks features already selected at an earlier level. */ 40 | for (uint c = 0; c < nchains; c++) { 41 | vector prebin(k, 0.0); 42 | vector used(k, false); 43 | for (uint d = 0; d < depth; d++) { 44 | uint f = fs[c][d]; 45 | /* First selection of feature f: shift xp[f] and divide by deltamax for this chain. Any later selection: double the stored coordinate and subtract shift/deltamax. */ if (used[f] == false) { 46 | prebin[f] = (xp[f] + shift[c][f])/deltamax[c][f]; 47 | used[f] = true; 48 | } else { 49 | prebin[f] = 2.0 * prebin[f] - shift[c][f]/deltamax[c][f]; 50 | } 51 | /* The key for this level is the floor of every coordinate in prebin, including features not selected yet (those are still 0). A fresh k-sized key is built at each level. */ 52 | vector bin(k); 53 | for (uint i = 0; i < k; i++) { 54 | bin[i] = static_cast(floor(prebin[i])); 55 | } 56 | 57 | if (update) { 58 | cmsketches[c][d][bin]++; 59 | } 60 | /* NOTE(review): this read goes through operator[] on the sketch. If the sketch is a standard map type (its template arguments are not visible here) a missing key is inserted with count 0 even when update is false - confirm whether that growth is intended. */ scaled_bincount[c][d] = log2(cmsketches[c][d][bin] + 1) + (d + 1); 61 | } 62 | } 63 | /* Reduce: smallest scaled count over the levels of each chain, then the mean across chains. */ 64 | float avg_anomalyscore = 0.0; 65 | for (uint c = 0; c < nchains; c++) { 66 | float score_c = scaled_bincount[c][0]; 67 | for (uint d = 1; d < depth; d++) { 68 | if (scaled_bincount[c][d] < score_c) { 69 | score_c = 
scaled_bincount[c][d]; 70 | } 71 | } 72 | avg_anomalyscore += score_c; 73 | } 74 | avg_anomalyscore /= nchains; 75 | 76 | return avg_anomalyscore; 77 | } 78 | /* chains_add_cosine: same reduction as chains_add, but the bin key is the sequence of signbit(xp[fs[c][d]]) values collected so far along the chain, so the key grows by one entry per level. Returns the average over chains of the per-chain minimum of log2(count + 1) + (d + 1). NOTE(review): it also reads cmsketches[0] and divides by nchains unchecked, and the read via operator[] may insert a zero entry when update is false - confirm. */ 79 | float 80 | chains_add_cosine(vector& xp, 81 | vector,int>>>& cmsketches, 82 | vector>& fs, bool update) { 83 | 84 | uint nchains = cmsketches.size(); 85 | uint depth = cmsketches[0].size(); 86 | 87 | vector> scaled_bincount(nchains, vector(depth)); 88 | 89 | for (uint c = 0; c < nchains; c++) { 90 | /* bin is created once per chain and only appended to, so the key at level d has d + 1 entries. */ vector bin; 91 | for (uint d = 0; d < depth; d++) { 92 | bin.push_back(signbit(xp[fs[c][d]])); 93 | if (update) { 94 | cmsketches[c][d][bin]++; 95 | } 96 | scaled_bincount[c][d] = log2(cmsketches[c][d][bin] + 1) + (d + 1); 97 | } 98 | } 99 | /* Reduce: smallest scaled count over the levels of each chain, then the mean across chains. */ 100 | float avg_anomalyscore = 0.0; 101 | for (uint c = 0; c < nchains; c++) { 102 | float score_c = scaled_bincount[c][0]; 103 | for (uint d = 1; d < depth; d++) { 104 | if (scaled_bincount[c][d] < score_c) { 105 | score_c = scaled_bincount[c][d]; 106 | } 107 | } 108 | avg_anomalyscore += score_c; 109 | } 110 | avg_anomalyscore /= nchains; 111 | 112 | return avg_anomalyscore; 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /cpp/chain.h: -------------------------------------------------------------------------------- 1 | #ifndef XSTREAM_CHAIN_H_ 2 | #define XSTREAM_CHAIN_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | namespace std { 11 | /* Fills every chain level in fs with a feature index drawn uniformly from [0, k-1] using prng. */ 12 | void 13 | chains_init_features(vector>& fs, uint k, mt19937_64& prng); 14 | /* Returns the chain-averaged score of xp using shifted and scaled bin keys; sketch counts are incremented only when update is true. */ 15 | float 16 | chains_add(vector& xp, vector>& deltamax, vector>& shift, 17 | vector,int>>>& cmsketches, 18 | vector>& fs, bool update); 19 | /* Variant of chains_add whose bin keys are built from the sign bits of the selected xp entries; sketch counts are incremented only when update is true. */ 20 | float 21 | chains_add_cosine(vector& xp, 22 | vector,int>>>& cmsketches, 23 | vector>& fs, bool update); 24 | } 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /cpp/docopt.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // 
docopt.cpp 3 | // docopt 4 | // 5 | // Created by Jared Grubb on 2013-11-03. 6 | // Copyright (c) 2013 Jared Grubb. All rights reserved. 7 | // 8 | 9 | #include "docopt.h" 10 | #include "docopt_util.h" 11 | #include "docopt_private.h" 12 | 13 | #include "docopt_value.h" 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | using namespace docopt; 26 | 27 | DocoptExitHelp::DocoptExitHelp() 28 | : std::runtime_error("Docopt --help argument encountered") 29 | {} 30 | 31 | DocoptExitVersion::DocoptExitVersion() 32 | : std::runtime_error("Docopt --version argument encountered") 33 | {} 34 | 35 | const char* value::kindAsString(Kind kind) 36 | { 37 | switch (kind) { 38 | case Kind::Empty: return "empty"; 39 | case Kind::Bool: return "bool"; 40 | case Kind::Long: return "long"; 41 | case Kind::String: return "string"; 42 | case Kind::StringList: return "string-list"; 43 | } 44 | return "unknown"; 45 | } 46 | 47 | void value::throwIfNotKind(Kind expected) const 48 | { 49 | if (kind == expected) 50 | return; 51 | 52 | std::string error = "Illegal cast to "; 53 | error += kindAsString(expected); 54 | error += "; type is actually "; 55 | error += kindAsString(kind); 56 | throw std::runtime_error(std::move(error)); 57 | } 58 | 59 | std::ostream& docopt::operator<<(std::ostream& os, value const& val) 60 | { 61 | if (val.isBool()) { 62 | bool b = val.asBool(); 63 | os << (b ? 
"true" : "false"); 64 | } else if (val.isLong()) { 65 | long v = val.asLong(); 66 | os << v; 67 | } else if (val.isString()) { 68 | std::string const& str = val.asString(); 69 | os << '"' << str << '"'; 70 | } else if (val.isStringList()) { 71 | auto const& list = val.asStringList(); 72 | os << "["; 73 | bool first = true; 74 | for(auto const& el : list) { 75 | if (first) { 76 | first = false; 77 | } else { 78 | os << ", "; 79 | } 80 | os << '"' << el << '"'; 81 | } 82 | os << "]"; 83 | } else { 84 | os << "null"; 85 | } 86 | return os; 87 | } 88 | 89 | std::vector Pattern::leaves() { 90 | std::vector ret; 91 | collect_leaves(ret); 92 | return ret; 93 | } 94 | 95 | bool Required::match(PatternList& left, std::vector>& collected) const 96 | { 97 | auto l = left; 98 | auto c = collected; 99 | 100 | for(auto const& pattern : fChildren) { 101 | bool ret = pattern->match(l, c); 102 | if (!ret) { 103 | // leave (left, collected) untouched 104 | return false; 105 | } 106 | } 107 | 108 | left = std::move(l); 109 | collected = std::move(c); 110 | return true; 111 | } 112 | 113 | bool LeafPattern::match(PatternList& left, std::vector>& collected) const 114 | { 115 | auto match = single_match(left); 116 | if (!match.second) { 117 | return false; 118 | } 119 | 120 | left.erase(left.begin()+static_cast(match.first)); 121 | 122 | auto same_name = std::find_if(collected.begin(), collected.end(), [&](std::shared_ptr const& p) { 123 | return p->name()==name(); 124 | }); 125 | if (getValue().isLong()) { 126 | long val = 1; 127 | if (same_name == collected.end()) { 128 | collected.push_back(match.second); 129 | match.second->setValue(value{val}); 130 | } else if ((**same_name).getValue().isLong()) { 131 | val += (**same_name).getValue().asLong(); 132 | (**same_name).setValue(value{val}); 133 | } else { 134 | (**same_name).setValue(value{val}); 135 | } 136 | } else if (getValue().isStringList()) { 137 | std::vector val; 138 | if (match.second->getValue().isString()) { 139 | 
val.push_back(match.second->getValue().asString()); 140 | } else if (match.second->getValue().isStringList()) { 141 | val = match.second->getValue().asStringList(); 142 | } else { 143 | /// cant be!? 144 | } 145 | 146 | if (same_name == collected.end()) { 147 | collected.push_back(match.second); 148 | match.second->setValue(value{val}); 149 | } else if ((**same_name).getValue().isStringList()) { 150 | std::vector const& list = (**same_name).getValue().asStringList(); 151 | val.insert(val.begin(), list.begin(), list.end()); 152 | (**same_name).setValue(value{val}); 153 | } else { 154 | (**same_name).setValue(value{val}); 155 | } 156 | } else { 157 | collected.push_back(match.second); 158 | } 159 | return true; 160 | } 161 | 162 | Option Option::parse(std::string const& option_description) 163 | { 164 | std::string shortOption, longOption; 165 | int argcount = 0; 166 | value val { false }; 167 | 168 | auto double_space = option_description.find(" "); 169 | auto options_end = option_description.end(); 170 | if (double_space != std::string::npos) { 171 | options_end = option_description.begin() + static_cast(double_space); 172 | } 173 | 174 | static const std::regex pattern {"(-{1,2})?(.*?)([,= ]|$)"}; 175 | for(std::sregex_iterator i {option_description.begin(), options_end, pattern, std::regex_constants::match_not_null}, 176 | e{}; 177 | i != e; 178 | ++i) 179 | { 180 | std::smatch const& match = *i; 181 | if (match[1].matched) { // [1] is optional. 182 | if (match[1].length()==1) { 183 | shortOption = "-" + match[2].str(); 184 | } else { 185 | longOption = "--" + match[2].str(); 186 | } 187 | } else if (match[2].length() > 0) { // [2] always matches. 188 | std::string m = match[2]; 189 | argcount = 1; 190 | } else { 191 | // delimeter 192 | } 193 | 194 | if (match[3].length() == 0) { // [3] always matches. 195 | // Hit end of string. For some reason 'match_not_null' will let us match empty 196 | // at the end, and then we'll spin in an infinite loop. 
So, if we hit an empty 197 | // match, we know we must be at the end. 198 | break; 199 | } 200 | } 201 | 202 | if (argcount) { 203 | std::smatch match; 204 | if (std::regex_search(options_end, option_description.end(), 205 | match, 206 | std::regex{"\\[default: (.*)\\]", std::regex::icase})) 207 | { 208 | val = match[1].str(); 209 | } 210 | } 211 | 212 | return {std::move(shortOption), 213 | std::move(longOption), 214 | argcount, 215 | std::move(val)}; 216 | } 217 | 218 | bool OneOrMore::match(PatternList& left, std::vector>& collected) const 219 | { 220 | assert(fChildren.size() == 1); 221 | 222 | auto l = left; 223 | auto c = collected; 224 | 225 | bool matched = true; 226 | size_t times = 0; 227 | 228 | decltype(l) l_; 229 | bool firstLoop = true; 230 | 231 | while (matched) { 232 | // could it be that something didn't match but changed l or c? 233 | matched = fChildren[0]->match(l, c); 234 | 235 | if (matched) 236 | ++times; 237 | 238 | if (firstLoop) { 239 | firstLoop = false; 240 | } else if (l == l_) { 241 | break; 242 | } 243 | 244 | l_ = l; 245 | } 246 | 247 | if (times == 0) { 248 | return false; 249 | } 250 | 251 | left = std::move(l); 252 | collected = std::move(c); 253 | return true; 254 | } 255 | 256 | bool Either::match(PatternList& left, std::vector>& collected) const 257 | { 258 | using Outcome = std::pair>>; 259 | 260 | std::vector outcomes; 261 | 262 | for(auto const& pattern : fChildren) { 263 | // need a copy so we apply the same one for every iteration 264 | auto l = left; 265 | auto c = collected; 266 | bool matched = pattern->match(l, c); 267 | if (matched) { 268 | outcomes.emplace_back(std::move(l), std::move(c)); 269 | } 270 | } 271 | 272 | auto min = std::min_element(outcomes.begin(), outcomes.end(), [](Outcome const& o1, Outcome const& o2) { 273 | return o1.first.size() < o2.first.size(); 274 | }); 275 | 276 | if (min == outcomes.end()) { 277 | // (left, collected) unchanged 278 | return false; 279 | } 280 | 281 | std::tie(left, 
collected) = std::move(*min); 282 | return true; 283 | } 284 | 285 | std::pair> Argument::single_match(PatternList const& left) const 286 | { 287 | std::pair> ret {}; 288 | 289 | for(size_t i = 0, size = left.size(); i < size; ++i) 290 | { 291 | auto arg = dynamic_cast(left[i].get()); 292 | if (arg) { 293 | ret.first = i; 294 | ret.second = std::make_shared(name(), arg->getValue()); 295 | break; 296 | } 297 | } 298 | 299 | return ret; 300 | } 301 | 302 | std::pair> Command::single_match(PatternList const& left) const 303 | { 304 | std::pair> ret {}; 305 | 306 | for(size_t i = 0, size = left.size(); i < size; ++i) 307 | { 308 | auto arg = dynamic_cast(left[i].get()); 309 | if (arg) { 310 | if (name() == arg->getValue()) { 311 | ret.first = i; 312 | ret.second = std::make_shared(name(), value{true}); 313 | } 314 | break; 315 | } 316 | } 317 | 318 | return ret; 319 | } 320 | 321 | std::pair> Option::single_match(PatternList const& left) const 322 | { 323 | std::pair> ret {}; 324 | 325 | for(size_t i = 0, size = left.size(); i < size; ++i) 326 | { 327 | auto leaf = std::dynamic_pointer_cast(left[i]); 328 | if (leaf && name() == leaf->name()) { 329 | ret.first = i; 330 | ret.second = leaf; 331 | break; 332 | } 333 | } 334 | 335 | return ret; 336 | } 337 | 338 | static std::vector transform(PatternList pattern); 339 | 340 | void BranchPattern::fix_repeating_arguments() 341 | { 342 | std::vector either = transform(children()); 343 | for(auto const& group : either) { 344 | // use multiset to help identify duplicate entries 345 | std::unordered_multiset, PatternHasher> group_set {group.begin(), group.end()}; 346 | for(auto const& e : group_set) { 347 | if (group_set.count(e) == 1) 348 | continue; 349 | 350 | LeafPattern* leaf = dynamic_cast(e.get()); 351 | if (!leaf) continue; 352 | 353 | bool ensureList = false; 354 | bool ensureInt = false; 355 | 356 | if (dynamic_cast(leaf)) { 357 | ensureInt = true; 358 | } else if (dynamic_cast(leaf)) { 359 | ensureList = true; 360 | } 
else if (Option* o = dynamic_cast(leaf)) { 361 | if (o->argCount()) { 362 | ensureList = true; 363 | } else { 364 | ensureInt = true; 365 | } 366 | } 367 | 368 | if (ensureList) { 369 | std::vector newValue; 370 | if (leaf->getValue().isString()) { 371 | newValue = split(leaf->getValue().asString()); 372 | } 373 | if (!leaf->getValue().isStringList()) { 374 | leaf->setValue(value{newValue}); 375 | } 376 | } else if (ensureInt) { 377 | leaf->setValue(value{0}); 378 | } 379 | } 380 | } 381 | } 382 | 383 | static std::vector transform(PatternList pattern) 384 | { 385 | std::vector result; 386 | 387 | std::vector groups; 388 | groups.emplace_back(std::move(pattern)); 389 | 390 | while(!groups.empty()) { 391 | // pop off the first element 392 | auto children = std::move(groups[0]); 393 | groups.erase(groups.begin()); 394 | 395 | // find the first branch node in the list 396 | auto child_iter = std::find_if(children.begin(), children.end(), [](std::shared_ptr const& p) { 397 | return dynamic_cast(p.get()); 398 | }); 399 | 400 | // no branch nodes left : expansion is complete for this grouping 401 | if (child_iter == children.end()) { 402 | result.emplace_back(std::move(children)); 403 | continue; 404 | } 405 | 406 | // pop the child from the list 407 | auto child = std::move(*child_iter); 408 | children.erase(child_iter); 409 | 410 | // expand the branch in the appropriate way 411 | if (Either* either = dynamic_cast(child.get())) { 412 | // "[e] + children" for each child 'e' in Either 413 | for(auto const& eitherChild : either->children()) { 414 | PatternList group = { eitherChild }; 415 | group.insert(group.end(), children.begin(), children.end()); 416 | 417 | groups.emplace_back(std::move(group)); 418 | } 419 | } else if (OneOrMore* oneOrMore = dynamic_cast(child.get())) { 420 | // child.children * 2 + children 421 | auto const& subchildren = oneOrMore->children(); 422 | PatternList group = subchildren; 423 | group.insert(group.end(), subchildren.begin(), 
subchildren.end()); 424 | group.insert(group.end(), children.begin(), children.end()); 425 | 426 | groups.emplace_back(std::move(group)); 427 | } else { // Required, Optional, OptionsShortcut 428 | BranchPattern* branch = dynamic_cast(child.get()); 429 | 430 | // child.children + children 431 | PatternList group = branch->children(); 432 | group.insert(group.end(), children.begin(), children.end()); 433 | 434 | groups.emplace_back(std::move(group)); 435 | } 436 | } 437 | 438 | return result; 439 | } 440 | 441 | class Tokens { 442 | public: 443 | Tokens(std::vector tokens, bool isParsingArgv = true) 444 | : fTokens(std::move(tokens)), 445 | fIsParsingArgv(isParsingArgv) 446 | {} 447 | 448 | explicit operator bool() const { 449 | return fIndex < fTokens.size(); 450 | } 451 | 452 | static Tokens from_pattern(std::string const& source) { 453 | static const std::regex re_separators { 454 | "(?:\\s*)" // any spaces (non-matching subgroup) 455 | "(" 456 | "[\\[\\]\\(\\)\\|]" // one character of brackets or parens or pipe character 457 | "|" 458 | "\\.\\.\\." // elipsis 459 | ")" }; 460 | 461 | static const std::regex re_strings { 462 | "(?:\\s*)" // any spaces (non-matching subgroup) 463 | "(" 464 | "\\S*<.*?>" // strings, but make sure to keep "< >" strings together 465 | "|" 466 | "[^<>\\s]+" // string without <> 467 | ")" }; 468 | 469 | // We do two stages of regex matching. The '[]()' and '...' are strong delimeters 470 | // and need to be split out anywhere they occur (even at the end of a token). We 471 | // first split on those, and then parse the stuff between them to find the string 472 | // tokens. This is a little harder than the python version, since they have regex.split 473 | // and we dont have anything like that. 
474 | 475 | std::vector tokens; 476 | std::for_each(std::sregex_iterator{ source.begin(), source.end(), re_separators }, 477 | std::sregex_iterator{}, 478 | [&](std::smatch const& match) 479 | { 480 | // handle anything before the separator (this is the "stuff" between the delimeters) 481 | if (match.prefix().matched) { 482 | std::for_each(std::sregex_iterator{match.prefix().first, match.prefix().second, re_strings}, 483 | std::sregex_iterator{}, 484 | [&](std::smatch const& m) 485 | { 486 | tokens.push_back(m[1].str()); 487 | }); 488 | } 489 | 490 | // handle the delimter token itself 491 | if (match[1].matched) { 492 | tokens.push_back(match[1].str()); 493 | } 494 | }); 495 | 496 | return Tokens(tokens, false); 497 | } 498 | 499 | std::string const& current() const { 500 | if (*this) 501 | return fTokens[fIndex]; 502 | 503 | static std::string const empty; 504 | return empty; 505 | } 506 | 507 | std::string the_rest() const { 508 | if (!*this) 509 | return {}; 510 | return join(fTokens.begin()+static_cast(fIndex), 511 | fTokens.end(), 512 | " "); 513 | } 514 | 515 | std::string pop() { 516 | return std::move(fTokens.at(fIndex++)); 517 | } 518 | 519 | bool isParsingArgv() const { return fIsParsingArgv; } 520 | 521 | struct OptionError : std::runtime_error { using runtime_error::runtime_error; }; 522 | 523 | private: 524 | std::vector fTokens; 525 | size_t fIndex = 0; 526 | bool fIsParsingArgv; 527 | }; 528 | 529 | // Get all instances of 'T' from the pattern 530 | template 531 | std::vector flat_filter(Pattern& pattern) { 532 | std::vector flattened = pattern.flat([](Pattern const* p) -> bool { 533 | return dynamic_cast(p) != nullptr; 534 | }); 535 | 536 | // now, we're guaranteed to have T*'s, so just use static_cast 537 | std::vector ret; 538 | std::transform(flattened.begin(), flattened.end(), std::back_inserter(ret), [](Pattern* p) { 539 | return static_cast(p); 540 | }); 541 | return ret; 542 | } 543 | 544 | static std::vector parse_section(std::string const& 
name, std::string const& source) { 545 | // ECMAScript regex only has "?=" for a non-matching lookahead. In order to make sure we always have 546 | // a newline to anchor our matching, we have to avoid matching the final newline of each grouping. 547 | // Therefore, our regex is adjusted from the docopt Python one to use ?= to match the newlines before 548 | // the following lines, rather than after. 549 | std::regex const re_section_pattern { 550 | "(?:^|\\n)" // anchored at a linebreak (or start of string) 551 | "(" 552 | "[^\\n]*" + name + "[^\\n]*(?=\\n?)" // a line that contains the name 553 | "(?:\\n[ \\t].*?(?=\\n|$))*" // followed by any number of lines that are indented 554 | ")", 555 | std::regex::icase 556 | }; 557 | 558 | std::vector ret; 559 | std::for_each(std::sregex_iterator(source.begin(), source.end(), re_section_pattern), 560 | std::sregex_iterator(), 561 | [&](std::smatch const& match) 562 | { 563 | ret.push_back(trim(match[1].str())); 564 | }); 565 | 566 | return ret; 567 | } 568 | 569 | static bool is_argument_spec(std::string const& token) { 570 | if (token.empty()) 571 | return false; 572 | 573 | if (token[0]=='<' && token[token.size()-1]=='>') 574 | return true; 575 | 576 | if (std::all_of(token.begin(), token.end(), &::isupper)) 577 | return true; 578 | 579 | return false; 580 | } 581 | 582 | template 583 | std::vector longOptions(I iter, I end) { 584 | std::vector ret; 585 | std::transform(iter, end, 586 | std::back_inserter(ret), 587 | [](typename I::reference opt) { return opt->longOption(); }); 588 | return ret; 589 | } 590 | 591 | static PatternList parse_long(Tokens& tokens, std::vector