├── .flake8 ├── .gitignore ├── .gitlab-ci.yml ├── LICENSE ├── MANIFEST.in ├── README.md ├── csrc ├── Array.cc ├── Array.h ├── ClassInfo.cc ├── ClassInfo.h ├── Env.h ├── EqGrNode.cc ├── EqGrNode.h ├── Eqclass.cc ├── Eqclass.h ├── FreqIt.cc ├── FreqIt.h ├── InvertDatabase.cc ├── InvertDatabase.h ├── Itemset.cc ├── Itemset.h ├── Lists.cc ├── Lists.h ├── Partition.cc ├── Partition.h ├── Sequence.cc ├── Sequence.h ├── SpadeArguments.cc ├── SpadeArguments.h ├── TransArray.cc ├── TransArray.h ├── argh.h ├── argv_parser.cc ├── argv_parser.h ├── calcdb.cc ├── calcdb.h ├── common.cc ├── common.h ├── dirent-win.h ├── exttpose.cc ├── exttpose.h ├── exttpose_main.cc ├── getconf.cc ├── getconf.h ├── getconf_main.cc ├── main.cc ├── makebin.cc ├── makebin.h ├── makebin_main.cc ├── spade_main.cc ├── test.cc ├── wrappers.cc └── wrappers.h ├── pycspade ├── __init__.py ├── cspade.cpp ├── cspade.pyx ├── helpers.py └── shortcuts.py ├── requirements.txt ├── setup.cfg ├── setup.py ├── test-global.sh ├── test-local.sh ├── tests ├── __init__.py ├── bb-tmi.txt ├── example.py ├── simplest.txt ├── test.ascii.data ├── test1.ascii.data ├── test_cspade.py ├── zaki.conf ├── zaki.data ├── zaki.idx ├── zaki.tpose └── zaki.txt ├── uppypi.sh └── utilssrc ├── Array.cc ├── Array.h ├── b2a.cc ├── calcdb.cc ├── calcdb.h ├── exttpose.cc ├── getconf.cc └── makebin.cc /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | ignore = D100,D101,D102,D103,D105,D106,D107,D200,D205,D400,D401,D413,F403,F405 4 | exclude = .venv,__init__.py,./build/ 5 | 6 | putty-ignore = 7 | test_*.py : E501 8 | 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | .venv 3 | build 4 | *.pyc 5 | *.so 6 | 7 | # Created by https://www.gitignore.io/api/c++,python,pycharm 8 | 9 | ### C++ ### 10 | # Prerequisites 11 | *.d 12 
| 13 | # Compiled Object files 14 | *.slo 15 | *.lo 16 | *.o 17 | *.obj 18 | 19 | # Precompiled Headers 20 | *.gch 21 | *.pch 22 | 23 | # Compiled Dynamic libraries 24 | *.so 25 | *.dylib 26 | *.dll 27 | 28 | # Fortran module files 29 | *.mod 30 | *.smod 31 | 32 | # Compiled Static libraries 33 | *.lai 34 | *.la 35 | *.a 36 | *.lib 37 | 38 | # Executables 39 | *.exe 40 | *.out 41 | *.app 42 | 43 | ### PyCharm ### 44 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 45 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 46 | 47 | # User-specific stuff 48 | .idea/**/workspace.xml 49 | .idea/**/tasks.xml 50 | .idea/**/usage.statistics.xml 51 | .idea/**/dictionaries 52 | .idea/**/shelf 53 | 54 | # Sensitive or high-churn files 55 | .idea/**/dataSources/ 56 | .idea/**/dataSources.ids 57 | .idea/**/dataSources.local.xml 58 | .idea/**/sqlDataSources.xml 59 | .idea/**/dynamic.xml 60 | .idea/**/uiDesigner.xml 61 | .idea/**/dbnavigator.xml 62 | 63 | # Gradle 64 | .idea/**/gradle.xml 65 | .idea/**/libraries 66 | 67 | # Gradle and Maven with auto-import 68 | # When using Gradle or Maven with auto-import, you should exclude module files, 69 | # since they will be recreated, and may cause churn. Uncomment if using 70 | # auto-import. 
71 | # .idea/modules.xml 72 | # .idea/*.iml 73 | # .idea/modules 74 | 75 | # CMake 76 | cmake-build-*/ 77 | 78 | # Mongo Explorer plugin 79 | .idea/**/mongoSettings.xml 80 | 81 | # File-based project format 82 | *.iws 83 | 84 | # IntelliJ 85 | out/ 86 | 87 | # mpeltonen/sbt-idea plugin 88 | .idea_modules/ 89 | 90 | # JIRA plugin 91 | atlassian-ide-plugin.xml 92 | 93 | # Cursive Clojure plugin 94 | .idea/replstate.xml 95 | 96 | # Crashlytics plugin (for Android Studio and IntelliJ) 97 | com_crashlytics_export_strings.xml 98 | crashlytics.properties 99 | crashlytics-build.properties 100 | fabric.properties 101 | 102 | # Editor-based Rest Client 103 | .idea/httpRequests 104 | 105 | ### PyCharm Patch ### 106 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 107 | 108 | # *.iml 109 | # modules.xml 110 | # .idea/misc.xml 111 | # *.ipr 112 | 113 | # Sonarlint plugin 114 | .idea/sonarlint 115 | 116 | ### Python ### 117 | # Byte-compiled / optimized / DLL files 118 | __pycache__/ 119 | *.py[cod] 120 | *$py.class 121 | 122 | # C extensions 123 | 124 | # Distribution / packaging 125 | .Python 126 | build/ 127 | develop-eggs/ 128 | dist/ 129 | downloads/ 130 | eggs/ 131 | .eggs/ 132 | lib/ 133 | lib64/ 134 | parts/ 135 | sdist/ 136 | var/ 137 | wheels/ 138 | *.egg-info/ 139 | .installed.cfg 140 | *.egg 141 | MANIFEST 142 | 143 | # PyInstaller 144 | # Usually these files are written by a python script from a template 145 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
146 | *.manifest 147 | *.spec 148 | 149 | # Installer logs 150 | pip-log.txt 151 | pip-delete-this-directory.txt 152 | 153 | # Unit test / coverage reports 154 | htmlcov/ 155 | .tox/ 156 | .coverage 157 | .coverage.* 158 | .cache 159 | nosetests.xml 160 | coverage.xml 161 | *.cover 162 | .hypothesis/ 163 | .pytest_cache/ 164 | 165 | # Translations 166 | *.mo 167 | *.pot 168 | 169 | # Django stuff: 170 | *.log 171 | local_settings.py 172 | db.sqlite3 173 | 174 | # Flask stuff: 175 | instance/ 176 | .webassets-cache 177 | 178 | # Scrapy stuff: 179 | .scrapy 180 | 181 | # Sphinx documentation 182 | docs/_build/ 183 | 184 | # PyBuilder 185 | target/ 186 | 187 | # Jupyter Notebook 188 | .ipynb_checkpoints 189 | 190 | # pyenv 191 | .python-version 192 | 193 | # celery beat schedule file 194 | celerybeat-schedule 195 | 196 | # SageMath parsed files 197 | *.sage.py 198 | 199 | # Environments 200 | .env 201 | .venv 202 | env/ 203 | venv/ 204 | ENV/ 205 | env.bak/ 206 | venv.bak/ 207 | 208 | # Spyder project settings 209 | .spyderproject 210 | .spyproject 211 | 212 | # Rope project settings 213 | .ropeproject 214 | 215 | # mkdocs documentation 216 | /site 217 | 218 | # mypy 219 | .mypy_cache/ 220 | 221 | ### Python Patch ### 222 | .venv/ 223 | 224 | 225 | # End of https://www.gitignore.io/api/c++,python,pycharm 226 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | image: agileware/python-3.6.1-node-6.11 2 | 3 | before_script: 4 | - pip install pycodestyle==2.0.0 flake8==2.6.2 flake8-docstrings==1.3.0 flake8-polyfill==1.0.2 flake8-putty==0.4.0 5 | 6 | check-coding-standard-compliance: 7 | script: 8 | - flake8 9 | allow_failure: false 10 | 11 | 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | 
Copyright (c) 2018-2022 Mohammed J. Zaki and Yukio Fukuzawa 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice (including the next paragraph) shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 10 | 11 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include csrc/ *.c *.cpp *.cc *.h *.py *.pyx 2 | recursive-include pycspade/ *.c *.cpp *.cc *.h *.py *.pyx 3 | include *.c *.cpp *.h *.py *.pyx 4 | include README.md 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Pycspade 2 | === 3 | 4 | #### What is this? 5 | This is a python wrapper for the C++ implementation of C-SPADE algorithm by the author, Mohammed J. 
Zaki 6 | Original code was downloaded from http://www.cs.rpi.edu/~zaki/www-new/pmwiki.php/Software/Software#toc11 7 | Since this is just a wrapper, it is as fast as the C++ code. 8 | 9 | #### How to install? 10 | Compatible with Python 2 and 3. 11 | On Windows, Visual Studio 2015 Build Tools is also required. 12 | 13 | ```bash 14 | pip install Cython pycspade 15 | ``` 16 | 17 | #### How to use? 18 | Your data needs to be in a particular format similar to the following: 19 | ```text 20 | 1 1 3 8 37 42 21 | 1 2 4 4 11 37 42 22 | 2 1 2 10 73 23 | 2 2 1 72 24 | 2 3 3 4 24 77 25 | ... 26 | ``` 27 | 28 | The first number is the sequence index, the second is the event index, the third is the number of elements, 29 | followed by the elements themselves, space-separated. 30 | 31 | Let's call this file `data.txt`. You can then call cspade as follows: 32 | ```python 33 | from pycspade.helpers import spade, print_result 34 | 35 | # To get raw SPADE output 36 | result = spade(filename='tests/zaki.txt', support=0.3, parse=False) 37 | print(result['mined']) 38 | ``` 39 | ```bash 40 | 1 -- 4 4 41 | 2 -- 4 4 42 | 4 -- 2 2 43 | 6 -- 4 4 44 | 4 -> 6 -- 2 2 45 | 4 -> 2 -- 2 2 46 | 2 -> 1 -- 2 2 47 | 4 -> 1 -- 2 2 48 | 6 -> 1 -- 2 2 49 | 4 -> 6 -> 1 -- 2 2 50 | 4 -> 2 -> 1 -- 2 2 51 | ``` 52 | ```python 53 | print(result['logger']) 54 | ``` 55 | ```bash 56 | CONF 4 9 2.7 2.5 57 | args.MINSUPPORT 2 4 58 | MINMAX 1 4 59 | 1 SUPP 4 60 | 2 SUPP 4 61 | 4 SUPP 2 62 | 6 SUPP 4 63 | numfreq 4 : SUMSUP SUMDIFF = 0 0 64 | EXTRARYSZ 2465792 65 | OPENED /tmp/cspade-WWv9bQWBYdDyH85T.idx 66 | OFF 9 38 67 | Wrote Offt 68 | BOUNDS 1 5 69 | WROTE INVERT 70 | Cleaned up successful: /tmp/cspade-WWv9bQWBYdDyH85T.tpose 71 | Cleaned up successful: /tmp/cspade-WWv9bQWBYdDyH85T.idx 72 | Cleaned up successful: /tmp/cspade-WWv9bQWBYdDyH85T.data 73 | Cleaned up successful: /tmp/cspade-WWv9bQWBYdDyH85T.conf 74 | ``` 75 | ```python 76 | print(result['summary']) 77 | ``` 78 | ```bash 79 | CONF 4 9 2.5 2.7 10 1 4 0.781025 4 80 | 
TPOSE SEQ NOF2 /tmp/cspade-WWv9bQWBYdDyH85T.data 0.3 4 2 1 81 | F1stats = [ 4 0 0 ] 82 | SPADE /tmp/cspade-WWv9bQWBYdDyH85T.tpose 0.3 2 7 0 0 0 0 0 -1 1 100 100 4 5 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 83 | ``` 84 | 85 | ```python 86 | # To also get other sequence mining's measures, incl. lift, support, confidence: 87 | result = spade(filename='tests/zaki.txt', support=0.3, parse=True) 88 | # Pretty print result: 89 | print_result(result) 90 | ``` 91 | ```bash 92 | Occurs Accum Support Confid Lift Sequence 93 | 4 14 1.0000000 N/A N/A (1) 94 | 4 6 1.0000000 N/A N/A (2) 95 | 2 4 0.5000000 0.5000000 0.5000000 (2)->(1) 96 | 2 2 0.5000000 N/A N/A (4) 97 | 2 2 0.5000000 1.0000000 1.0000000 (4)->(1) 98 | 2 2 0.5000000 1.0000000 1.0000000 (4)->(2) 99 | 2 2 0.5000000 1.0000000 1.0000000 (4)->(2)->(1) 100 | 2 2 0.5000000 1.0000000 1.0000000 (4)->(6) 101 | 2 2 0.5000000 1.0000000 1.0000000 (4)->(6)->(1) 102 | 4 6 1.0000000 N/A N/A (6) 103 | 2 4 0.5000000 0.5000000 0.5000000 (6)->(1) 104 | ``` 105 | 106 | ##### You can provide cspade with list of sequences instead of a file: 107 | ```python 108 | data = [ 109 | [1, 10, [3, 4]], 110 | [1, 15, [1, 2, 3]], 111 | [1, 20, [1, 2, 6]], 112 | [1, 25, [1, 3, 4, 6]], 113 | [2, 15, [1, 2, 6]], 114 | [2, 20, [5]], 115 | [3, 10, [1, 2, 6]], 116 | [4, 10, [4, 7, 8]], 117 | [4, 20, [2, 6]], 118 | [4, 25, [1, 7, 8]] 119 | ] 120 | 121 | result = spade(data=data, support=0.01) 122 | print_result(result) 123 | ``` 124 | 125 | The result `seq` is a string, that have multiple rows and looks like this: 126 | 127 | ```text 128 | 22 80 -> 72 -> 42 -> 22 -- 2 2 129 | 22 -> 45 71 -> 42 -- 1 1 130 | 80 -> 45 71 -> 42 -- 1 1 131 | 22 80 -> 45 71 -> 42 -- 1 1 132 | ``` 133 | Let's decipher the first row: 134 | ```bash 135 | 22 80 -> 72 -> 42 -> 22 -- 2 2 136 | ``` 137 | 138 | It gives you the frequent sequence followed by support (the 
last two numbers, which will be the same in this application). 139 | The row reads: the itemset (22 80) is followed by (72) followed by (42) followed by (22). 140 | 141 | 142 | There're a lot of parameters that can be passed to this function. most important ones are: 143 | 144 | - `support`: this is the minimum support level, default to 0 (not excluding anything) 145 | - `max_gap`: The max number of itemset that can be skipped in a sequence 146 | - `min_gap`: The min number of itemset that must be skipped in a sequence 147 | 148 | Read the original paper and the C++ implementation for more details 149 | 150 | #### How to contribute? 151 | - Fork this repo 152 | - Make change 153 | - Pull request 154 | 155 | #### How to recompile to use in IDE? 156 | - `rm cspade.cpp; python setup.py build_ext --inplace` 157 | 158 | #### Licence 159 | - MIT -------------------------------------------------------------------------------- /csrc/Array.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "Array.h" 4 | 5 | Array::Array(int sz) { 6 | totSize = sz; 7 | theSize = 0; 8 | if (sz > 0) { 9 | theArray = make_shared(sz); 10 | } 11 | } 12 | 13 | Array::~Array() = default; 14 | 15 | ostream &operator<<(ostream &outputStream, Array &arr) { 16 | for (int i = 0; i < arr.theSize; i++) 17 | outputStream << arr[i] << " "; 18 | return outputStream; 19 | } 20 | // Checks whether the shorter of (*this, ar) occurs as an ordered subsequence of the longer one; returns 1 when *this is the contained array, -1 when ar is, and 0 when neither holds. 21 | int Array::subsequence(Array_S ar) { 22 | int i, j; 23 | int sz1, sz2; 24 | Array_S ar1, ar2; 25 | int retval; 26 | 27 | if (theSize <= ar->theSize) { 28 | sz1 = theSize; 29 | sz2 = ar->theSize; 30 | ar1 = Array_S(this, [](Array *) {}); // non-owning alias with a no-op deleter: a plain shared_ptr(this) would delete *this when ar1 goes out of scope 31 | ar2 = ar; 32 | retval = 1; 33 | } else { 34 | sz1 = ar->theSize; 35 | sz2 = theSize; 36 | ar1 = ar; 37 | ar2 = Array_S(this, [](Array *) {}); // non-owning alias with a no-op deleter, same reason as above 38 | retval = -1; 39 | } 40 | int start = 0; 41 | for (i = 0; i < sz1; i++) { 42 | for (j = start; j < sz2; j++) { 43 | if (ar1->theArray->at(i) == ar2->theArray->at(j)) { 44 | start = j + 1; 45 | break; 46 
| } 47 | } 48 | if (j >= ar2->theSize) return 0; 49 | } 50 | return retval; 51 | } 52 | 53 | 54 | int Array::compare(Array &ar2) { 55 | int len; 56 | if (size() <= ar2.size()) len = size(); 57 | else len = ar2.size(); 58 | for (int i = 0; i < len; i++) { 59 | if (theArray->at(i) > ar2.theArray->at(i)) return 1; 60 | else if (theArray->at(i) < ar2.theArray->at(i)) return -1; 61 | } 62 | if (size() < ar2.size()) return -1; 63 | else if (size() > ar2.size()) return 1; 64 | else return 0; 65 | } 66 | 67 | 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /csrc/Array.h: -------------------------------------------------------------------------------- 1 | #ifndef __ARRAY_H 2 | #define __ARRAY_H 3 | 4 | #include "common.h" 5 | 6 | class Array; 7 | typedef shared_ptr Array_S; 8 | typedef shared_ptr> Array_SS; 9 | 10 | class Array { 11 | private: 12 | shared_ptr theArray = nullptr; 13 | int theSize; // DD 14 | int totSize; // DD 15 | //unsigned int theIncr; 16 | public: 17 | 18 | //Array (int sz, int incr); 19 | explicit Array(int sz); 20 | 21 | ~Array(); 22 | 23 | int subsequence(Array_S ar); 24 | 25 | //void add (int, unsigned int); 26 | void add_ext(int val, int off, int *ary) { 27 | ary[off + theSize] = val; 28 | theSize++; 29 | } 30 | 31 | int operator[](unsigned int index) { 32 | return theArray->at(index); 33 | }; 34 | 35 | void setitem(int pos, int val) { 36 | theArray->at(pos) = val; 37 | }; 38 | 39 | int totsize() { 40 | return totSize; 41 | } 42 | 43 | void set_totsize(int sz) { 44 | totSize = sz; 45 | } 46 | 47 | void set_size(int sz) { 48 | theSize = sz; 49 | } 50 | 51 | void reset() { 52 | theSize = 0; 53 | } 54 | 55 | shared_ptr array() { 56 | return theArray; 57 | } 58 | 59 | void set_array(shared_ptr ary) { 60 | theArray = ary; 61 | } 62 | 63 | //int subsequence(Array&); 64 | //int compare(Array&); 65 | friend ostream &operator<<(ostream &outputStream, Array &arr); 66 | 67 | int compare(Array &ar2); 68 | 69 
| int item(unsigned int index) { 70 | return theArray->at(index); 71 | } 72 | 73 | int size() // DD 74 | { 75 | return theSize; 76 | } 77 | 78 | void resize(int newsz) { 79 | totSize = newsz; 80 | theArray->resize(totSize); 81 | } 82 | 83 | void compact() { 84 | theArray->resize(theSize); 85 | } 86 | 87 | void optadd(int item) { 88 | add(item); 89 | } 90 | 91 | void add(int item) { 92 | if (theSize + 1 > totSize) { 93 | resize((int) (totSize * 1.5)); 94 | } 95 | theArray->at(theSize) = (item); 96 | theSize++; 97 | } 98 | }; 99 | 100 | #endif //__ARRAY_H 101 | 102 | 103 | -------------------------------------------------------------------------------- /csrc/ClassInfo.cc: -------------------------------------------------------------------------------- 1 | #include "ClassInfo.h" 2 | 3 | void ClassInfo::init() { 4 | int i, maxval = 0; 5 | long numtrans; // DD 6 | 7 | const string& classfn = args->classf; 8 | const double min_support_per_class = args->min_support_per_class; 9 | has_class = args->use_class; 10 | 11 | if (has_class) { 12 | fstream classf(classfn, ios::binary); 13 | if (!classf.is_open()) { 14 | throw runtime_error("Unable to read file " + classfn); 15 | } 16 | 17 | long fdlen = file_size(classf); 18 | int *clsaddr = read_file(classf, fdlen); 19 | if (! 
classf) { 20 | throw runtime_error("Error reading file " + classfn); 21 | } 22 | 23 | // first entry contains num classes 24 | num_class = clsaddr[0]; 25 | //input is num_class followed by pairs 26 | numtrans = (fdlen / INT_SIZE - 1) / 2; 27 | maxval = clsaddr[numtrans * 2 - 1] + 1; 28 | classes.resize(maxval); 29 | for (i = 0; i < maxval; i++) classes[i] = NOCLASS; 30 | for (i = 1; i < (int) (fdlen / INT_SIZE); i += 2) { // DD 31 | classes[clsaddr[i]] = clsaddr[i + 1]; 32 | } 33 | delete [] clsaddr; 34 | } 35 | else { 36 | classes.resize(num_class); 37 | } 38 | 39 | class_count.resize(num_class, 0); 40 | tmpe.resize(num_class); 41 | tmpm.resize(num_class); 42 | tmpl.resize(num_class); 43 | min_supports.resize(num_class); 44 | 45 | if (has_class) { 46 | // class frequency 47 | for (i = 0; i < maxval; i++) 48 | if (classes[i] != NOCLASS) 49 | class_count[classes[i]]++; 50 | } 51 | else { 52 | class_count[0] = args->total_trans_count; 53 | } 54 | 55 | for (i = 0; i < num_class; i++) { 56 | min_supports[i] = (int) ceil(min_support_per_class * class_count[i]); 57 | if (min_supports[i] < 1) min_supports[i] = 1; 58 | } 59 | } 60 | 61 | int ClassInfo::get_num_class() const { 62 | return num_class; 63 | } 64 | 65 | int ClassInfo::get_min_support(int idx) const { 66 | return min_supports[idx]; 67 | } 68 | 69 | const vint &ClassInfo::get_tmpe() const { 70 | return tmpe; 71 | } 72 | 73 | const vint &ClassInfo::get_tmpm() const { 74 | return tmpm; 75 | } 76 | 77 | const vint &ClassInfo::get_tmpl() const { 78 | return tmpl; 79 | } 80 | 81 | void ClassInfo::setArgs(const shared_ptr &args) { 82 | ClassInfo::args = args; 83 | } 84 | 85 | ClassInfo::ClassInfo() { 86 | 87 | } 88 | -------------------------------------------------------------------------------- /csrc/ClassInfo.h: -------------------------------------------------------------------------------- 1 | #ifndef CSPADE_CLASSINFO_H 2 | #define CSPADE_CLASSINFO_H 3 | 4 | 5 | #include "common.h" 6 | #include "SpadeArguments.h" 
7 | 8 | #define NOCLASS -1 9 | 10 | class ClassInfo { 11 | private: 12 | bool has_class; 13 | vint clsaddr; 14 | vint classes; 15 | vint class_count; 16 | vint min_supports; 17 | vint tmpe; // temporary variables to keep support 18 | vint tmpm; // counts during intersections 19 | vint tmpl; 20 | int num_class = 1; 21 | shared_ptr args; 22 | public: 23 | ClassInfo(); 24 | 25 | void setArgs(const shared_ptr &args); 26 | 27 | void reset_temps() { 28 | std::fill(tmpe.begin(), tmpe.end(), 0); 29 | std::fill(tmpm.begin(), tmpm.end(), 0); 30 | std::fill(tmpl.begin(), tmpl.end(), 0); 31 | } 32 | 33 | void increase_tmpl(int i) { 34 | tmpl[getcls(i)]++; 35 | } 36 | 37 | void init(); 38 | 39 | void increase_tmpm(int i) { 40 | tmpm[getcls(i)]++; 41 | } 42 | 43 | void increase_tmpe(int i) { 44 | tmpe[getcls(i)]++; 45 | } 46 | 47 | void set_tmpe_item(int i, int val) { 48 | tmpe[i] = val; 49 | } 50 | 51 | int get_tmpe_item(int i) { 52 | return tmpe[i]; 53 | } 54 | 55 | bool strong_lsupport(int i) { 56 | return tmpl[i] >= min_supports[i]; 57 | } 58 | 59 | bool strong_esupport(int i) { 60 | return tmpe[i] >= min_supports[i]; 61 | } 62 | 63 | bool strong_msupport(int i) { 64 | return tmpm[i] >= min_supports[i]; 65 | } 66 | 67 | const vint &get_tmpe() const; 68 | 69 | const vint &get_tmpm() const; 70 | 71 | const vint &get_tmpl() const; 72 | 73 | int get_num_class() const; 74 | 75 | int get_min_support(int idx) const; 76 | 77 | int getcnt(int cls = -1) { 78 | if (cls == -1) { 79 | int sum = 0; 80 | for (int i = 0; i < num_class; i++) 81 | sum += class_count[i]; 82 | return sum; 83 | } else return class_count[cls]; 84 | } 85 | 86 | int getcls(int idx) { 87 | if (has_class) 88 | return classes[idx]; 89 | else 90 | return 0; 91 | } 92 | }; 93 | 94 | #endif //CSPADE_CLASSINFO_H 95 | -------------------------------------------------------------------------------- /csrc/Env.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Yukio 
Fukuzawa on 10/12/18. 3 | // 4 | 5 | #ifndef UTILITIES_ENV_H 6 | #define UTILITIES_ENV_H 7 | 8 | #include "common.h" 9 | 10 | class Env { 11 | public: 12 | ostringstream seqstrm; // Print the sequences 13 | ostringstream logger; 14 | ostringstream summary; 15 | ostringstream idlstrm; // To print ID list 16 | }; 17 | 18 | 19 | #endif //UTILITIES_ENV_H 20 | -------------------------------------------------------------------------------- /csrc/EqGrNode.cc: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Yukio Fukuzawa on 8/12/18. 3 | // 4 | 5 | #include "EqGrNode.h" 6 | #include "Array.h" 7 | #include "FreqIt.h" 8 | 9 | // Builds a node for sz elements with one support array per class; for sz <= 0 the element arrays stay null and the support lists are empty. 10 | EqGrNode::EqGrNode(int sz, int num_class_) { 11 | num_class = num_class_; 12 | if (sz > 0) { 13 | theElements.reset(new Array(sz)); 14 | stheElements.reset(new Array(sz)); 15 | 16 | _set_sup = make_shared>(); 17 | _seq_sup = make_shared>(); 18 | 19 | _set_sup->reserve(num_class); 20 | _seq_sup->reserve(num_class); 21 | 22 | for (int i = 0; i < num_class; i++) { 23 | _set_sup->push_back(make_shared(sz)); 24 | _seq_sup->push_back(make_shared(sz)); 25 | } 26 | } else { 27 | theElements = nullptr; 28 | stheElements = nullptr; 29 | _set_sup = make_shared<Array_SS::element_type>(); // allocate empty lists: the members are still null here, so calling resize(0) through them dereferenced a null pointer 30 | _seq_sup = make_shared<Array_SS::element_type>(); 31 | } 32 | 33 | freqArray = nullptr; 34 | freqArraySz = 0; 35 | theFlg = 0; 36 | } 37 | 38 | EqGrNode::~EqGrNode() = default; 39 | 40 | 41 | //assume that elements are sorted in descending order 42 | int EqGrNode::bsearch(int min, int max, FreqIt_SS freqArray, FreqIt_S fit, int recursive) { 43 | int mid = (max + min) / 2; 44 | if (max < min) return -1; 45 | 46 | int res = freqArray->at(mid)->compare(fit, recursive); 47 | if (res == 0) return mid; 48 | else if (res < 0) return bsearch(min, mid - 1, freqArray, fit, recursive); 49 | else return bsearch(mid + 1, max, freqArray, fit, recursive); 50 | } 51 | 52 | 53 | int EqGrNode::bsearch(int min, int max, shared_ptr itary, int it) { 54 | int mid = (max + min) / 2; 55 | if 
(max < min) return -1; 56 | 57 | if (it == itary->at(mid)) return mid; 58 | else if (it < itary->at(mid)) return bsearch(min, mid - 1, itary, it); 59 | else return bsearch(mid + 1, max, itary, it); 60 | } 61 | 62 | 63 | int EqGrNode::find_freqarray(FreqIt_S fit, int recursive) { 64 | if (freqArraySz > 0) 65 | return bsearch(0, freqArraySz - 1, freqArray, fit, recursive); 66 | else return 0; 67 | } 68 | 69 | 70 | ostream &operator<<(ostream &outputStream, EqGrNode &EQ) { 71 | int i; 72 | if (EQ.theElements) { 73 | outputStream << "SET " << *EQ.theElements << endl; 74 | for (i = 0; i < EQ.num_class; i++) 75 | outputStream << "Sup" << i << " : " << *EQ._set_sup->at(i) << endl; 76 | outputStream << "Tot"; 77 | for (i = 0; i < EQ.theElements->size(); i++) 78 | outputStream << " " << EQ.get_sup(i); 79 | outputStream << endl; 80 | } 81 | if (EQ.stheElements) { 82 | outputStream << "SEQ " << *EQ.stheElements << endl; 83 | for (i = 0; i < EQ.num_class; i++) 84 | outputStream << "SSup" << i << " : " << *EQ._seq_sup->at(i) << endl; 85 | outputStream << "Tot"; 86 | for (i = 0; i < EQ.stheElements->size(); i++) 87 | outputStream << " " << EQ.get_seqsup(i); 88 | outputStream << endl; 89 | } 90 | 91 | return outputStream; 92 | } 93 | -------------------------------------------------------------------------------- /csrc/EqGrNode.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Yukio Fukuzawa on 8/12/18. 
3 | // 4 | 5 | #ifndef CSPADE_EQGRNODE_H 6 | #define CSPADE_EQGRNODE_H 7 | 8 | #include "common.h" 9 | #include "Array.h" 10 | #include "FreqIt.h" 11 | 12 | class EqGrNode { 13 | private: 14 | Array_S theElements; 15 | Array_S stheElements; 16 | 17 | Array_SS _set_sup; // support in different classes 18 | Array_SS _seq_sup; // support in different classes 19 | 20 | FreqIt_SS freqArray; //frequent seq from this class 21 | int freqArraySz; 22 | 23 | int theFlg; //indicates if class is in memory 24 | 25 | int num_class; 26 | 27 | public: 28 | static int bsearch(int min, int max, FreqIt_SS freqArray, FreqIt_S fit, int recursive); 29 | 30 | static int bsearch(int min, int max, shared_ptr itary, int it); 31 | 32 | EqGrNode(int sz, int num_class); 33 | 34 | ~EqGrNode(); 35 | 36 | FreqIt_SS freqarray() { 37 | return freqArray; 38 | } 39 | 40 | int freqarraysz() { 41 | return freqArraySz; 42 | } 43 | 44 | void set_freqarray(FreqIt_SS fit, int sz) { 45 | freqArray = fit; 46 | freqArraySz = sz; 47 | } 48 | 49 | int find_freqarray(FreqIt_S fit, int recursive); 50 | 51 | int getflg() { 52 | return theFlg; 53 | } 54 | 55 | void setflg(int val) { 56 | theFlg = val; 57 | } 58 | 59 | void add_sup(int sup, int clas) { 60 | _set_sup->at(clas)->add(sup); 61 | } 62 | 63 | void add_seqsup(int sup, int clas) { 64 | _seq_sup->at(clas)->add(sup); 65 | } 66 | 67 | int get_sup(int idx, int clas = -1) { 68 | if (clas == -1) { 69 | int sum = 0; 70 | //return sup in all classes 71 | for (int i = 0; i < num_class; i++) 72 | sum += (*_set_sup->at(i))[idx]; 73 | return sum; 74 | } else return (*_set_sup->at(clas))[idx]; //return sup in class only 75 | } 76 | 77 | int get_seqsup(int idx, int clas = -1) { 78 | if (clas == -1) { 79 | int sum = 0; 80 | //return sup in all classes 81 | for (int i = 0; i < num_class; i++) 82 | sum += (*_seq_sup->at(i))[idx]; 83 | return sum; 84 | } else return (*_seq_sup->at(clas))[idx]; //return sup in class only 85 | } 86 | 87 | Array_S elements() { 88 | return 
theElements; 89 | } 90 | 91 | int num_elements() { 92 | if (theElements) 93 | return theElements->size(); 94 | else return 0; 95 | } 96 | 97 | void add_element(int el) { 98 | //theElements[numElements] = el; 99 | //numElements++; 100 | theElements->add(el); 101 | } 102 | 103 | void add_element(int el, int pos) { 104 | theElements->setitem(pos, el); 105 | } 106 | 107 | int get_element(int pos) { 108 | return (*theElements)[pos]; 109 | } 110 | 111 | void seqsetelements(Array_S ary) { 112 | stheElements = ary; 113 | } 114 | 115 | Array_S seqelements() { 116 | return stheElements; 117 | } 118 | 119 | int seqnum_elements() { 120 | if (stheElements) 121 | return stheElements->size(); 122 | else return 0; 123 | } 124 | 125 | void seqadd_element(int el) { 126 | stheElements->add(el); 127 | //snumElements++; 128 | } 129 | 130 | void seqadd_element(int el, int pos) { 131 | stheElements->setitem(pos, el); 132 | } 133 | 134 | int seqget_element(int pos) { 135 | return (*stheElements)[pos]; 136 | } 137 | 138 | 139 | int find(int it) { 140 | if (theElements) { 141 | //for (int i=0; i size(); i++) 142 | // if ((*theElements)[i] == it) return 1; 143 | return bsearch(0, theElements->size() - 1, theElements->array(), it); 144 | } 145 | return -1; 146 | } 147 | 148 | int seqfind(int it) { 149 | if (stheElements) { 150 | //for (int i=0; i size(); i++) 151 | // if ((*stheElements)[i] == it) return 1; 152 | return bsearch(0, stheElements->size() - 1, stheElements->array(), it); 153 | } 154 | return -1; 155 | } 156 | 157 | friend ostream &operator<<(ostream &outputStream, EqGrNode &EQ); 158 | }; 159 | 160 | typedef shared_ptr EqGrNode_S; 161 | typedef shared_ptr> EqGrNode_SS; 162 | 163 | #endif //CSPADE_EQGRNODE_H 164 | -------------------------------------------------------------------------------- /csrc/Eqclass.cc: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include "Eqclass.h" 3 | 4 | Eqclass::Eqclass(int iset_sz, int 
eqt) { 5 | Iset_size = iset_sz; 6 | Eqtype = eqt; 7 | theList.reset(new Lists>()); 8 | seqTemplate = seqTemplate2 = 0; 9 | 10 | if (Eqtype == EQCTYP1) { 11 | theList2.reset(new Lists>()); 12 | } 13 | } 14 | 15 | Eqclass::~Eqclass() = default; 16 | 17 | 18 | int Eqclass::templ_sz() { 19 | return Iset_size; 20 | } 21 | 22 | int Eqclass::eqtype() { 23 | return Eqtype; 24 | } 25 | 26 | shared_ptr>> &Eqclass::list() { 27 | return theList; 28 | } 29 | 30 | shared_ptr>> &Eqclass::list2() { 31 | return theList2; 32 | } 33 | 34 | unsigned int Eqclass::templ() { 35 | return seqTemplate; 36 | } 37 | 38 | unsigned int Eqclass::templ2() { 39 | return seqTemplate2; 40 | } 41 | 42 | void Eqclass::set_templ(unsigned int val) { 43 | seqTemplate = val; 44 | } 45 | 46 | void Eqclass::set_templ2(unsigned int val) { 47 | seqTemplate2 = val; 48 | } 49 | 50 | void Eqclass::append(Itemset_S it) { 51 | theList->append(std::move(it)); 52 | } 53 | 54 | void Eqclass::append2(Itemset_S it) { 55 | theList2->append(std::move(it)); 56 | } 57 | 58 | void Eqclass::prepend(Itemset_S it) { 59 | theList->prepend(std::move(it)); 60 | } 61 | 62 | void Eqclass::prepend2(Itemset_S it) { 63 | theList2->prepend(std::move(it)); 64 | } 65 | -------------------------------------------------------------------------------- /csrc/Eqclass.h: -------------------------------------------------------------------------------- 1 | #ifndef _EQCLASS_H 2 | #define _EQCLASS_H 3 | 4 | #include "common.h" 5 | #include "Lists.h" 6 | #include "Itemset.h" 7 | 8 | #define EQCTYP1 1 9 | #define EQCTYP2 2 10 | #define EQCTYP3 3 11 | 12 | class Eqclass; 13 | typedef shared_ptr Eqclass_S; 14 | typedef shared_ptr> Eqclass_SS; 15 | 16 | class Eqclass { 17 | private: 18 | shared_ptr>> theList; 19 | int Iset_size; 20 | unsigned int seqTemplate; 21 | shared_ptr>> theList2; 22 | unsigned int seqTemplate2; 23 | int Eqtype; 24 | public: 25 | Eqclass(int iset_sz, int eqt); 26 | 27 | ~Eqclass(); 28 | 29 | int templ_sz(); 30 | 31 | int 
eqtype(); 32 | 33 | shared_ptr>>& list(); 34 | 35 | shared_ptr>>& list2(); 36 | 37 | unsigned int templ(); 38 | 39 | unsigned int templ2(); 40 | 41 | void set_templ(unsigned int val); 42 | 43 | void set_templ2(unsigned int val); 44 | 45 | void append(Itemset_S it); 46 | 47 | void append2(Itemset_S it); 48 | 49 | void prepend(Itemset_S it); 50 | 51 | void prepend2(Itemset_S it); 52 | }; 53 | 54 | #endif 55 | 56 | -------------------------------------------------------------------------------- /csrc/FreqIt.cc: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Yukio Fukuzawa on 8/12/18. 3 | // 4 | 5 | #include "FreqIt.h" 6 | 7 | int FreqIt::compare(shared_ptr fit, unsigned int itpl) { 8 | int i; 9 | 10 | //first compare seqsz, one with larger seqsz is smaller 11 | if (seqsz > fit->size()) return -1; 12 | else if (seqsz < fit->size()) return 1; 13 | 14 | int *seq_data = seq->data(); 15 | 16 | //compare items & template bits 17 | if (seq_data[0] < (*fit)[0]) return -1; 18 | else if (seq_data[0] > (*fit)[0]) return 1; 19 | 20 | int bpos = seqsz - 1; 21 | int b1, b2; 22 | for (i = 1; i < seqsz; i++) { 23 | b1 = GETBIT(templ, bpos - i); 24 | b2 = GETBIT(itpl, bpos - i); 25 | if (b1 < b2) return -1; 26 | else if (b1 > b2) return 1; 27 | 28 | if (seq_data[i] < (*fit)[i]) return -1; 29 | else if (seq_data[i] > (*fit)[i]) return 1; 30 | } 31 | return 0; 32 | } 33 | 34 | 35 | int FreqIt::compare(FreqIt_S fit, int recursive) { 36 | int i; 37 | int *seq_data = seq->data(); 38 | int *fit_data = fit->seq->data(); 39 | 40 | //first compare seqsz, one with larger seqsz is smaller 41 | if (seqsz > fit->seqsz) return -1; 42 | else if (seqsz < fit->seqsz) return 1; 43 | 44 | //compare items & template bits 45 | if (seq_data[seqsz - 1] < fit_data[fit->seqsz - 1]) return -1; 46 | else if (seq_data[seqsz - 1] > fit_data[fit->seqsz - 1]) return 1; 47 | 48 | int bpos = 0; 49 | int b1, b2; 50 | for (i = seqsz - 2; i >= 0; i--, bpos++) { 
51 | b1 = GETBIT(templ, bpos); 52 | b2 = GETBIT(fit->templ, bpos); 53 | if (b1 < b2) return -1; 54 | else if (b1 > b2) return 1; 55 | 56 | if (seq_data[i] < fit_data[i]) return -1; 57 | else if (seq_data[i] > fit_data[i]) return 1; 58 | } 59 | return 0; 60 | } 61 | 62 | ostream &operator<<(ostream &outputStream, FreqIt_S freq) { 63 | int *freq_data = freq->seq->data(); 64 | 65 | outputStream << "FREQ : "; 66 | for (int i = 0; i < freq->seqsz; i++) 67 | outputStream << " " << freq_data[i]; 68 | outputStream << " --- " << freq->templ << endl; 69 | return outputStream; 70 | } 71 | 72 | int FreqIt::size() { 73 | return seqsz; 74 | } 75 | 76 | FreqIt::FreqIt(int sz, unsigned int tpl) { 77 | templ = tpl; 78 | seqsz = sz; 79 | seq = make_shared(sz); 80 | } 81 | 82 | FreqIt::FreqIt(shared_ptr ary, int sz, unsigned int tpl) { 83 | templ = tpl; 84 | seqsz = sz; 85 | seq = make_shared(); 86 | seq->reserve(sz); 87 | auto ary_data = ary->data(); 88 | for (int i = 0; i < sz; i++) { 89 | seq->push_back(ary_data[i]); 90 | } 91 | } -------------------------------------------------------------------------------- /csrc/FreqIt.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Yukio Fukuzawa on 8/12/18. 
3 | // 4 | 5 | #ifndef CSPADE_FREQIT_H 6 | #define CSPADE_FREQIT_H 7 | 8 | #include "common.h" 9 | #include "Itemset.h" 10 | 11 | class FreqIt; 12 | typedef shared_ptr FreqIt_S; 13 | typedef shared_ptr> FreqIt_SS; 14 | 15 | class FreqIt { 16 | public: 17 | shared_ptr seq; 18 | int seqsz; 19 | unsigned int templ; 20 | 21 | FreqIt(int sz, unsigned int tpl); 22 | 23 | FreqIt(shared_ptr ary, int sz, unsigned int tpl); 24 | 25 | ~FreqIt() = default; 26 | 27 | int size(); 28 | 29 | int compare(Itemset_S iset, unsigned int itpl); 30 | 31 | int compare(FreqIt_S fit, int recursive); 32 | 33 | friend ostream &operator<<(ostream &outputStream, FreqIt_S freq); 34 | }; 35 | 36 | 37 | 38 | #endif //CSPADE_FREQIT_H 39 | -------------------------------------------------------------------------------- /csrc/InvertDatabase.cc: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include "InvertDatabase.h" 3 | 4 | void InvertDatabase::incr(int sz) { 5 | int oldsz = numcust; 6 | numcust = sz; 7 | 8 | curits.resize(numcust); 9 | curcnts.resize(numcust); 10 | curcids.resize(numcust); 11 | curitszs.resize(numcust); 12 | 13 | int i; 14 | int ttval = (int) (args->avg_cust_size * args->avg_trans_count); 15 | for (i = oldsz; i < numcust; i++) { 16 | curitszs[i] = ttval; 17 | curits[i].resize(curitszs[i]); 18 | curcnts[i] = 0; 19 | curcids[i] = NOCLASS; 20 | } 21 | } 22 | 23 | void InvertDatabase::incr_curit(int midx) { 24 | curitszs[midx] *= 2; 25 | curits[midx].resize(curitszs[midx]); 26 | } 27 | 28 | 29 | void InvertDatabase::print_idlist(ostream& idlstrm, shared_ptr ival, int supsz) { 30 | int i, cid, cnt; 31 | int *ival_data = ival->data(); 32 | 33 | if (supsz > 0) { 34 | cid = ival_data[0]; 35 | cnt = 0; 36 | for (i = 0; i < supsz;) { 37 | if (cid == ival_data[i]) { 38 | cnt++; 39 | i += 2; 40 | } else { 41 | idlstrm << cid << " " << cnt << " "; 42 | cid = ival_data[i]; 43 | cnt = 0; 44 | } 45 | } 46 | idlstrm << cid << " " << 
cnt; 47 | } 48 | } 49 | 50 | void InvertDatabase::get_l2file(const string& fname, char use_seq, int &l2cnt) { 51 | fstream file(fname.c_str(), ios::binary); 52 | if (!file.is_open()) { 53 | throw runtime_error("can't open l2 file"); 54 | } 55 | unsigned long flen = file_size(file); 56 | if (flen > 0) { 57 | int *cntary = read_file(file, flen); 58 | 59 | if (!file) { 60 | throw runtime_error("Error reading file " + fname); 61 | } 62 | file.close(); 63 | 64 | // build eqgraph -- large 2-itemset relations 65 | int lim = flen / INT_SIZE; 66 | char lflg = 0; 67 | int i, j; 68 | for (i = 0; i < lim; i += 3) { 69 | lflg = 0; 70 | for (j = 0; j < cls->get_num_class(); j++) { 71 | if (cntary[i + 2] >= cls->get_min_support(j)) { 72 | lflg = 1; 73 | break; 74 | } 75 | } 76 | if (lflg) { 77 | if (!extl2_pre_pruning(cntary[i + 2], cntary[i + 1], cntary[i], use_seq, vuint_null)) { 78 | suffix_add_item_eqgraph(use_seq, cntary[i], cntary[i + 1]); 79 | l2cnt++; 80 | //assign sup to a single class, sice we don't know breakup 81 | if (use_seq) eqgraph[cntary[i + 1]]->add_seqsup(cntary[i + 2], 0); 82 | else eqgraph[cntary[i + 1]]->add_sup(cntary[i + 2], 0); 83 | for (j = 1; j < cls->get_num_class(); j++) 84 | if (use_seq) eqgraph[cntary[i + 1]]->add_seqsup(0, j); 85 | else eqgraph[cntary[i + 1]]->add_sup(0, j); 86 | } 87 | } 88 | } 89 | 90 | delete [] cntary; 91 | } 92 | } 93 | 94 | void InvertDatabase::suffix_add_item_eqgraph(char use_seq, int it1, int it2) { 95 | if (eqgraph[it2] == nullptr) { 96 | eqgraph[it2].reset(new EqGrNode(2, cls->get_num_class())); 97 | } 98 | if (use_seq) eqgraph[it2]->seqadd_element(it1); 99 | else eqgraph[it2]->add_element(it1); 100 | } 101 | 102 | int InvertDatabase::make_l1_pass() { 103 | int i, j; 104 | int supsz; 105 | int bsz = 100; 106 | 107 | ostringstream &seqstrm = env.seqstrm; 108 | ostringstream &idlstrm = env.idlstrm; 109 | 110 | backidx.resize(bsz); 111 | fidx.resize(args->dbase_max_item); 112 | 113 | numfreq = 0; 114 | int ivalsz = 100; 115 
| shared_ptr ival = make_shared(ivalsz); 116 | // int tt=0; 117 | for (i = 0; i < args->dbase_max_item; i++) { 118 | supsz = partition->partition_get_idxsup(i); 119 | if (ivalsz < supsz) { 120 | ivalsz = supsz; 121 | ival->resize(ivalsz); 122 | } 123 | int *ival_data = ival->data(); 124 | partition->partition_read_item(ival, i); 125 | for (j = 0; j < cls->get_num_class(); j++) { 126 | cls->set_tmpe_item(j, 0); 127 | } 128 | // walk the list two ints at a time; the first int of each pair is the cid 129 | int cid = -1; 130 | for (j = 0; j < supsz; j += 2) { 131 | if (cid != ival_data[j]) { 132 | cls->increase_tmpe(j); // NOTE(review): j is a position in ival here, while set_tmpe_item/get_tmpe_item/strong_esupport take a class index -- confirm whether the class of ival_data[j] was intended 133 | } 134 | cid = ival_data[j]; 135 | } 136 | // the item is kept if any class reports strong_esupport; kept items get a dense index (fidx <-> backidx) 137 | char lflg = 0; 138 | fidx[i] = -1; // default init 139 | for (j = 0; j < cls->get_num_class(); j++) { 140 | if (cls->strong_esupport(j)) { 141 | lflg = 1; 142 | if (numfreq + 1 > bsz) { 143 | bsz = 2 * bsz; 144 | backidx.resize(bsz); 145 | } 146 | backidx[numfreq] = i; 147 | fidx[i] = numfreq; 148 | // cls->TMPE[j] << endl; 149 | numfreq++; 150 | break; 151 | } 152 | } 153 | // store the per-class counts and, when requested, emit the item (id list goes to idlstrm, the rest to seqstrm) 154 | if (lflg) { 155 | for (j = 0; j < cls->get_num_class(); j++) 156 | add_sup(cls->get_tmpe_item(j), j); 157 | if (args->outputfreq) { 158 | seqstrm << i << " --"; 159 | seqstrm << " " << get_sup(i); 160 | for (j = 0; j < cls->get_num_class(); j++) 161 | seqstrm << " " << cls->get_tmpe_item(j); 162 | seqstrm << " "; 163 | if (args->print_tidlist) print_idlist(idlstrm, ival, supsz); 164 | seqstrm << endl; 165 | } 166 | } 167 | } 168 | 169 | backidx.resize(numfreq); 170 | ival = nullptr; 171 | 172 | return numfreq; 173 | } 174 | // Appends sup to the support list of class clsidx. 175 | void InvertDatabase::add_sup(int sup, int clsidx) { 176 | itsup[clsidx].add(sup); 177 | } 178 | // Support of item it in class clsidx, or the sum over all classes when clsidx == -1. 179 | int InvertDatabase::get_sup(int it, int clsidx) { 180 | if (clsidx == -1) { 181 | int sum = 0; 182 | auto num_class = cls->get_num_class(); 183 | for (int i = 0; i < num_class; i++) 184 | sum += itsup[i][fidx[it]]; 185 | return sum; 186 | } else return itsup[clsidx][fidx[it]]; 187 | } 188 | // Updates the seq_sup / set_sup pair counts from the entries buffered for one customer slot. 189 | void InvertDatabase::process_cust_invert(int custidx) { 190 | int cid =
curcids[custidx]; 191 | int curcnt = curcnts[custidx]; 192 | const vint &curit = curits[custidx]; // read-only view; avoids copying the whole buffer for every customer 193 | // curit is read as pairs: [even] item index, [odd] the value the gap limits are applied to 194 | int i, j, k, l; 195 | int nv1, nv2, diff; 196 | int it1, it2; 197 | // [i, nv1) and [j, nv2) are the runs of pairs belonging to items it1 and it2 198 | for (i = 0; i < curcnt; i = nv1) { 199 | nv1 = i; 200 | it1 = curit[i]; 201 | while (nv1 < curcnt && it1 == curit[nv1]) nv1 += 2; 202 | for (j = i; j < curcnt; j = nv2) { 203 | nv2 = j; 204 | it2 = curit[j]; 205 | while (nv2 < curcnt && it2 == curit[nv2]) nv2 += 2; 206 | if (!seq_sup[it1].empty() && curit[i + 1] + args->min_gap <= curit[nv2 - 1]) { 207 | for (k = i, l = j; k < nv1 && l < nv2;) { 208 | diff = curit[l + 1] - curit[k + 1]; 209 | if (diff < args->min_gap) l += 2; 210 | else if (diff > args->max_gap) k += 2; 211 | else { 212 | seq_sup[it1][it2][cls->getcls(cid)]++; 213 | break; 214 | } 215 | } 216 | } 217 | // for distinct items also count the reverse order and the same-value (itemset) case 218 | if (j > i) { 219 | if (!seq_sup[it2].empty() && curit[j + 1] + args->min_gap <= curit[nv1 - 1]) { 220 | for (k = j, l = i; k < nv2 && l < nv1;) { 221 | diff = curit[l + 1] - curit[k + 1]; 222 | if (diff < args->min_gap) l += 2; 223 | else if (diff > args->max_gap) k += 2; 224 | else { 225 | seq_sup[it2][it1][cls->getcls(cid)]++; 226 | break; 227 | } 228 | } 229 | } 230 | 231 | if (!set_sup[it1].empty()) { 232 | for (k = i, l = j; k < nv1 && l < nv2;) { 233 | if (curit[k + 1] > curit[l + 1]) l += 2; 234 | else if (curit[k + 1] < curit[l + 1]) k += 2; 235 | else { 236 | set_sup[it1][it2 - it1 - 1][cls->getcls(cid)]++; 237 | break; 238 | } 239 | } 240 | } 241 | } 242 | } 243 | } 244 | } 245 | 246 | void InvertDatabase::process_invert(int pnum) { 247 | int i, k; 248 | int minv, maxv; 249 | partition->partition_get_minmaxcustid(backidx, numfreq, pnum, minv, maxv); 250 | if (numcust < maxv - minv + 1) 251 | incr(maxv - minv + 1); 252 | 253 | int supsz; 254 | int ivalsz = 0; 255 | shared_ptr ival = make_shared(); 256 | for (i = 0; i < numfreq; i++) { 257 | supsz = partition->partition_get_lidxsup(pnum, backidx[i]); 258 | if (ivalsz < supsz) { 259 | ivalsz = supsz; 260 |
ival->resize(ivalsz); 261 | } 262 | partition->partition_lclread_item(ival, pnum, backidx[i]); 263 | // append (frequent-item index i, second int of the pair) to the buffer of the customer slot cid - minv 264 | int cid; 265 | int midx; 266 | int *ival_data = ival->data(); 267 | for (int pos = 0; pos < supsz; pos += 2) { 268 | cid = ival_data[pos]; 269 | midx = cid - minv; 270 | if (curcnts[midx] + 2 > curitszs[midx]) { 271 | incr_curit(midx); 272 | } 273 | curcids[midx] = cid; 274 | curits[midx][curcnts[midx]++] = i; 275 | curits[midx][curcnts[midx]++] = ival_data[pos + 1]; 276 | 277 | } 278 | } 279 | for (k = 0; k < maxv - minv + 1; k++) { 280 | if (curcnts[k] > 0) { 281 | process_cust_invert(k); 282 | } 283 | curcnts[k] = 0; 284 | curcids[k] = NOCLASS; 285 | } 286 | } 287 | // Returns true when the 2-itemset (pit, it) is pruned by the follow-confidence test; sequences and NOPRUNING are never pruned. 288 | bool InvertDatabase::extl2_pre_pruning(int totsup, int it, int pit, char use_seq, vuint& clsup) { 289 | ostringstream &logger = env.logger; 290 | 291 | float conf, conf2; 292 | int itsup; 293 | if (args->pruning_type == NOPRUNING) return false; 294 | if (use_seq) return false; 295 | if (GETBIT(args->pruning_type, FOLLOWPRUNING - 1)) { // NOTE(review): with FOLLOWPRUNING == 4 this tests mask 1 << 3 -- confirm the intended flag value 296 | itsup = get_sup(it); 297 | conf = (1.0f * totsup) / itsup; 298 | conf2 = (1.0f * totsup) / get_sup(pit); 299 | if (conf >= args->follow_thresh || conf2 >= args->follow_thresh) { 300 | if (args->outputfreq && !clsup.empty()) { 301 | logger << "PRUNE_EXT " << pit << (use_seq ?
" -2 " : " ") 302 | << it << " -1 " << totsup; 303 | for (int i = 0; i < cls->get_num_class(); i++) 304 | logger << " " << clsup[i]; 305 | logger << endl; 306 | } 307 | args->prepruning++; 308 | return true; 309 | } 310 | } 311 | return false; 312 | } 313 | // Turns the accumulated pair counts into eqgraph entries for every pair that meets min support in at least one class; l2cnt counts the pairs added. 314 | void InvertDatabase::get_F2(int &l2cnt) { 315 | int i, j, k; 316 | int fcnt; 317 | char lflg; 318 | char use_seq; 319 | 320 | for (j = 0; j < numfreq; j++) { 321 | if (!set_sup[j].empty()) { // itemset counts exist only for rows that make_l2_pass allocated 322 | use_seq = 0; 323 | for (k = j + 1; k < numfreq; k++) { 324 | lflg = 0; 325 | for (i = 0; i < cls->get_num_class(); i++) { 326 | fcnt = set_sup[j][k - j - 1][i]; 327 | if (fcnt >= cls->get_min_support(i)) { 328 | lflg = 1; 329 | break; 330 | } 331 | } 332 | if (lflg) { 333 | fcnt = 0; 334 | for (i = 0; i < cls->get_num_class(); i++) { 335 | fcnt += set_sup[j][k - j - 1][i]; 336 | } 337 | if (!extl2_pre_pruning(fcnt, backidx[k], backidx[j], use_seq, set_sup[j][k - j - 1])) { 338 | suffix_add_item_eqgraph(use_seq, backidx[j], backidx[k]); 339 | for (i = 0; i < cls->get_num_class(); i++) { 340 | int ffcnt = set_sup[j][k - j - 1][i]; 341 | eqgraph[backidx[k]]->add_sup(ffcnt, i); 342 | } 343 | l2cnt++; 344 | } 345 | } 346 | } 347 | } 348 | if (!seq_sup[j].empty()) { 349 | use_seq = 1; 350 | for (k = 0; k < numfreq; k++) { 351 | lflg = 0; 352 | for (i = 0; i < cls->get_num_class(); i++) { 353 | fcnt = seq_sup[j][k][i]; 354 | if (fcnt >= cls->get_min_support(i)) { 355 | lflg = 1; 356 | break; 357 | } 358 | } 359 | if (lflg) { 360 | fcnt = 0; 361 | for (i = 0; i < cls->get_num_class(); i++) { 362 | fcnt += seq_sup[j][k][i]; 363 | } 364 | if (!extl2_pre_pruning(fcnt, backidx[k], backidx[j], use_seq, seq_sup[j][k])) { 365 | suffix_add_item_eqgraph(use_seq, backidx[j], backidx[k]); 366 | l2cnt++; 367 | for (i = 0; i < cls->get_num_class(); i++) { 368 | int ffcnt = seq_sup[j][k][i]; 369 | eqgraph[backidx[k]]->add_seqsup(ffcnt, i); 370 | } 371 | } 372 | } 373 | } 374 | } 375 | } 376 | } 377 | 378 | int
InvertDatabase::make_l2_pass() { 379 | int i, j; 380 | 381 | int l2cnt = 0; 382 | int num_class = cls->get_num_class(); 383 | 384 | // support for 2-itemsets 385 | set_sup.resize(numfreq); 386 | seq_sup.resize(numfreq); 387 | 388 | int low, high; 389 | 390 | for (low = 0; low < numfreq; low = high) { 391 | for (high = low; high < numfreq; high++) { 392 | if (args->max_iset_len > 1 && numfreq - high - 1 > 0) { 393 | set_sup[high].resize(numfreq - high - 1); 394 | for (i = 0; i < numfreq - high - 1; i++) { 395 | set_sup[high][i].resize(num_class, 0); 396 | } 397 | } 398 | if (args->max_seq_len > 1) { 399 | seq_sup[high].resize(numfreq); 400 | for (i = 0; i < numfreq; i++) { 401 | seq_sup[high][i].resize(num_class, 0); 402 | } 403 | } 404 | } 405 | for (int p = 0; p < args->num_partitions; p++) { 406 | process_invert(p); 407 | } 408 | get_F2(l2cnt); 409 | } 410 | 411 | return l2cnt; 412 | } 413 | 414 | void InvertDatabase::init(int sz) { 415 | int i; 416 | numcust = 0; 417 | incr(sz); 418 | eqgraph.resize(args->dbase_max_item); 419 | 420 | auto num_class = cls->get_num_class(); 421 | itsup.reserve(num_class); 422 | for (i = 0; i &cls) { 430 | InvertDatabase::cls = cls; 431 | } 432 | 433 | void InvertDatabase::setPartition(const shared_ptr &partition) { 434 | InvertDatabase::partition = partition; 435 | } 436 | 437 | void InvertDatabase::setArgs(const shared_ptr &args) { 438 | InvertDatabase::args = args; 439 | } 440 | -------------------------------------------------------------------------------- /csrc/InvertDatabase.h: -------------------------------------------------------------------------------- 1 | #ifndef __EXT_H_ 2 | #define __EXT_H_ 3 | 4 | #include "common.h" 5 | #include "Partition.h" 6 | #include "Eqclass.h" 7 | #include "EqGrNode.h" 8 | #include "ClassInfo.h" 9 | #include "Env.h" 10 | 11 | class InvertDatabase { 12 | int numcust; 13 | vvint curits; 14 | vint curcnts; 15 | vint curcids; 16 | vint curitszs; 17 | Env& env; 18 | shared_ptr cls; 19 | 
shared_ptr partition; 20 | shared_ptr args; 21 | vector eqgraph; 22 | vint backidx; 23 | vint fidx; 24 | int numfreq; 25 | vector itsup; 26 | vector> _items; 27 | vector>> set_sup, seq_sup; 28 | public: 29 | InvertDatabase(Env& env); 30 | 31 | const EqGrNode_S &get_eqgraph_item(int i) { 32 | return eqgraph[i]; 33 | } 34 | 35 | void set_cls(const shared_ptr &cls); 36 | 37 | void setPartition(const shared_ptr &partition); 38 | 39 | void setArgs(const shared_ptr &args); 40 | 41 | void init(int sz); 42 | 43 | void incr(int sz); 44 | 45 | void incr_curit(int midx); 46 | 47 | void process_invert(int pnum); 48 | 49 | void process_cust_invert(int custidx); 50 | 51 | int make_l1_pass(); 52 | 53 | int make_l2_pass(); 54 | 55 | void get_l2file(const string &fname, char use_seq, int &l2cnt); 56 | 57 | bool extl2_pre_pruning(int totsup, int it, int pit, char use_seq, vuint& clsup); 58 | 59 | void suffix_add_item_eqgraph(char use_seq, int it1, int it2); 60 | 61 | void get_F2(int &l2cnt); 62 | 63 | void print_idlist(ostream& idlstrm, shared_ptr ival, int supsz); 64 | 65 | void add_sup(int sup, int cls); 66 | 67 | int get_sup(int it, int cls = -1); 68 | 69 | int in_mem(int it) { 70 | if (_items[fidx[it]]->ival()->array() == nullptr) return 0; 71 | else return 1; 72 | } 73 | 74 | Itemset_S get_item(int it) { 75 | if (!in_mem(it)) get_ext_item(it); 76 | return _items[fidx[it]]; 77 | } 78 | 79 | void get_ext_item(int it) { 80 | int supsz = partition->partition_get_idxsup(it); 81 | shared_ptr newit = make_shared(supsz); 82 | partition->partition_read_item(newit, it); 83 | _items[fidx[it]]->set_support(supsz); 84 | _items[fidx[it]]->ival()->set_size(supsz); 85 | _items[fidx[it]]->ival()->set_array(newit); 86 | } 87 | 88 | void init_buffer(int num_class, int size) { 89 | _items.reserve(size); 90 | bool print_idlist = args->print_tidlist; 91 | 92 | for (int i = 0; i < size; i++) { 93 | Itemset_S tmp = make_shared(1, 0, num_class, print_idlist); 94 | tmp->setitem(0, backidx[i]); 95 | 
_items.push_back(tmp); 96 | } 97 | } 98 | 99 | }; 100 | 101 | #endif //__EXT_H_ 102 | -------------------------------------------------------------------------------- /csrc/Itemset.cc: -------------------------------------------------------------------------------- 1 | #include "Itemset.h" 2 | 3 | 4 | Itemset::Itemset(int it_sz, int ival_sz, int nclass, bool print) { 5 | num_class = nclass; 6 | do_print = print; 7 | theItemset.reset(new Array(it_sz)); 8 | theIval.reset(new Array(ival_sz)); 9 | theSupport = 0; 10 | clsSup.resize(num_class, 0); 11 | } 12 | 13 | Itemset::~Itemset() = default; 14 | // Lexicographic comparison of the item arrays (-1 / 0 / 1); when one is a prefix of the other the shorter sorts first. 15 | int Itemset::compare(Itemset &ar2) { 16 | int len; 17 | if (size() <= ar2.size()) len = size(); 18 | else len = ar2.size(); 19 | for (int i = 0; i < len; i++) { 20 | if ((*theItemset)[i] > (*ar2.theItemset)[i]) return 1; 21 | else if ((*theItemset)[i] < (*ar2.theItemset)[i]) return -1; 22 | } 23 | if (size() < ar2.size()) return -1; 24 | else if (size() > ar2.size()) return 1; 25 | else return 0; 26 | } 27 | // Returns 1 when this itemset's items occur, in order, within ar; otherwise 0. 28 | int Itemset::subsequence(Itemset_S ar) { 29 | int i, j; 30 | if (size() > ar->size()) return 0; 31 | int start = 0; 32 | for (i = 0; i < size(); i++) { 33 | for (j = start; j < ar->size(); j++) { 34 | if ((*theItemset)[i] == (*ar->theItemset)[j]) { 35 | start = j + 1; 36 | break; 37 | } 38 | } 39 | if (j >= ar->size()) return 0; 40 | } 41 | return 1; 42 | } 43 | 44 | ostream &operator<<(ostream &outputStream, Itemset &itemset) { 45 | outputStream << "ITEM: "; 46 | outputStream << *itemset.theItemset; 47 | outputStream << "(" << itemset.theSupport << ")"; 48 | outputStream << "\n"; 49 | return outputStream; 50 | } 51 | 52 | void Itemset::print_seq(ostream& seqstrm, int itempl) { 53 | int i; 54 | int sz = size(); 55 | seqstrm << (*theItemset)[0] << " "; 56 | for (i = 1; i < sz - 1; i++) { 57 | if (GETBIT(itempl, sz - 1 - i)) 58 | seqstrm << "-> "; 59 | seqstrm << (*theItemset)[i] << " "; 60 | } 61 | if (GETBIT(itempl, sz - 1 - i)) 62 | seqstrm << "-> "; 63 | seqstrm <<
(*theItemset)[sz - 1] << " "; 64 | seqstrm << "-- " << theSupport; 65 | for (i = 0; i < num_class; i++) 66 | seqstrm << " " << clsSup[i]; 67 | seqstrm << " "; 68 | // if (do_print) print_idlist(); 69 | seqstrm << endl; 70 | } 71 | // Writes "cid count" pairs: for each run of equal cids in theIval (read two ints at a time), the number of entries in the run. 72 | void Itemset::print_idlist(ostream& idlstrm) { 73 | int i, cid, cnt; 74 | 75 | if (theIval && theIval->size() > 0) { 76 | cid = (*theIval)[0]; 77 | cnt = 0; 78 | for (i = 0; i < theIval->size();) { 79 | if (cid == (*theIval)[i]) { 80 | cnt++; 81 | i += 2; 82 | } else { 83 | idlstrm << cid << " " << cnt << " "; 84 | cid = (*theIval)[i]; 85 | cnt = 0; 86 | } 87 | } 88 | idlstrm << cid << " " << cnt; 89 | } 90 | } 91 | 92 | Itemset_S placeholder = make_shared(true); -------------------------------------------------------------------------------- /csrc/Itemset.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef __ITEMSET_H 3 | #define __ITEMSET_H 4 | 5 | #include "common.h" 6 | #include "Array.h" 7 | #include "Lists.h" 8 | #include "SpadeArguments.h" 9 | #include "Partition.h" 10 | 11 | #define SETBIT(a, v, b) (((v) != 0) ?
((a) | (01 << (b))): ((a) & ~(01 << (b)))) 12 | #define GETBIT(a, b) ((a) & (01 << (b))) 13 | 14 | class Itemset; 15 | typedef shared_ptr Itemset_S; 16 | 17 | class Itemset { 18 | protected: 19 | Array_S theItemset; 20 | Array_S theIval; 21 | int theSupport; 22 | vint clsSup; 23 | int num_class; 24 | bool do_print; 25 | // shared_ptr cls; 26 | // shared_ptr args; 27 | public: 28 | explicit Itemset(bool x){} 29 | 30 | Itemset(int it_sz, int ival_sz, int nclass, bool print); 31 | 32 | ~Itemset(); 33 | 34 | friend ostream &operator<<(ostream &outputStream, Itemset &itemset); 35 | 36 | int compare(Itemset &ar2); 37 | 38 | int subsequence(Itemset_S ar); 39 | 40 | void print_seq(ostream& seqstrm, int itempl); 41 | 42 | void print_idlist(ostream& idlstrm); 43 | 44 | Array_S &ival() { 45 | return theIval; 46 | } 47 | 48 | int ival(int pos) { 49 | return (*theIval)[pos]; 50 | } 51 | 52 | int ivalsize() { 53 | return theIval->size(); 54 | } 55 | 56 | void reallocival() { 57 | theIval->resize(ivalsize()); 58 | } 59 | 60 | int operator[](int pos) { 61 | return (*theItemset)[pos]; 62 | }; 63 | 64 | void setitem(int pos, int val) { 65 | theItemset->setitem(pos, val); 66 | }; 67 | 68 | Array_S itemset() { 69 | return theItemset; 70 | }; 71 | 72 | void add_item(int val) { 73 | theItemset->add(val); 74 | }; 75 | 76 | int size() { 77 | return theItemset->size(); 78 | }; 79 | 80 | int support() { 81 | return theSupport; 82 | }; 83 | 84 | void set_support(int sup) { 85 | theSupport = sup; 86 | } 87 | 88 | void increment_support() { 89 | theSupport++; 90 | }; 91 | 92 | int cls_support(int cls) { 93 | return clsSup[cls]; 94 | } 95 | 96 | void increment_cls_support(int cls) { 97 | clsSup[cls]++; 98 | } 99 | 100 | void set_cls_support(int sup, int cls) { 101 | clsSup[cls] = sup; 102 | } 103 | 104 | static int Itemcompare(void *iset1, void *iset2) { 105 | Itemset* it1 = (Itemset* ) iset1; 106 | Itemset* it2 = (Itemset* ) iset2; 107 | return it1->compare(*it2); 108 | } 109 | }; 110 | 111 | 
extern Itemset_S placeholder; 112 | 113 | #endif //__ITEMSET_H 114 | 115 | -------------------------------------------------------------------------------- /csrc/Lists.cc: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include "Lists.h" 3 | #include "Array.h" 4 | #include "Eqclass.h" 5 | #include "Itemset.h" 6 | 7 | template 8 | ListNodes::ListNodes(T item, shared_ptr> next) { 9 | theItem = item; 10 | theNext = next; 11 | } 12 | 13 | template 14 | ListNodes::~ListNodes() { 15 | theNext = nullptr; 16 | theItem = nullptr; 17 | } 18 | 19 | template 20 | Lists::Lists() { 21 | theHead = 0; 22 | theLast = 0; 23 | theSize = 0; 24 | } 25 | 26 | //only listnodes are deleted, if node->item() is a pointer to some object 27 | //that object is *not* deleted 28 | template 29 | Lists::~Lists() = default; 30 | 31 | //listnodes are deleted, if node->item() is a pointer to some object 32 | //that object is *also* deleted 33 | template 34 | void Lists::clear() { 35 | theHead = nullptr; 36 | theLast = nullptr; 37 | theSize = 0; 38 | } 39 | 40 | 41 | template 42 | void Lists::append(T item) { 43 | shared_ptr> node; 44 | 45 | theSize++; 46 | node.reset(new ListNodes(item, 0)); 47 | 48 | if (theHead == nullptr) { 49 | theHead = node; 50 | theLast = node; 51 | } else { 52 | theLast->set_next(node); 53 | theLast = node; 54 | } 55 | } 56 | 57 | 58 | template 59 | void Lists::prepend(T item) { 60 | shared_ptr> node; 61 | 62 | theSize++; 63 | node.reset(new ListNodes(item, 0)); 64 | 65 | if (theHead == 0) { 66 | theHead = node; 67 | theLast = node; 68 | } else { 69 | node->set_next(theHead); 70 | theHead = node; 71 | } 72 | } 73 | 74 | template 75 | class Lists; 76 | 77 | template 78 | class Lists; 79 | 80 | template 81 | class Lists>; 82 | 83 | template 84 | class Lists; 85 | -------------------------------------------------------------------------------- /csrc/Lists.h: 
-------------------------------------------------------------------------------- 1 | #ifndef __LISTS_H 2 | #define __LISTS_H 3 | 4 | #include "common.h" 5 | 6 | typedef int (*CMP_FUNC)(void *, void *); 7 | 8 | template 9 | class ListNodes { 10 | private: 11 | shared_ptr> theNext; 12 | T theItem; 13 | 14 | public: 15 | ListNodes(T item, shared_ptr> next); 16 | 17 | ~ListNodes(); 18 | 19 | shared_ptr> next() { 20 | return theNext; 21 | } 22 | 23 | void set_next(shared_ptr> next) { 24 | theNext = next; 25 | } 26 | 27 | T& item() { 28 | return theItem; 29 | } 30 | 31 | void set_item(T item) { 32 | theItem = item; 33 | } 34 | }; 35 | 36 | template 37 | class Lists { 38 | private: 39 | shared_ptr> theHead; 40 | shared_ptr> theLast; 41 | int theSize; 42 | 43 | public: 44 | 45 | Lists(); 46 | 47 | ~Lists(); 48 | 49 | void clear(); 50 | 51 | shared_ptr> head() { 52 | return theHead; 53 | }; 54 | 55 | shared_ptr> last() { 56 | return theLast; 57 | }; 58 | 59 | int size() { 60 | return theSize; 61 | }; 62 | 63 | void append(T item); 64 | 65 | void prepend(T item); 66 | 67 | T find(T item, CMP_FUNC cmpare); 68 | }; 69 | 70 | #endif// __LISTS_H 71 | 72 | 73 | -------------------------------------------------------------------------------- /csrc/Partition.cc: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include "Partition.h" 3 | #include "Sequence.h" 4 | 5 | 6 | Partition::Partition() {} 7 | 8 | void Partition::init() { 9 | DATAFD.resize(args->num_partitions); 10 | ITEMIDX.reserve(args->num_partitions); 11 | } 12 | 13 | void Partition::partition_alloc() { 14 | ostringstream datafnstream; 15 | ostringstream idxfnstream; 16 | 17 | for (int i = 0; i < args->num_partitions; i++) { 18 | datafnstream.str(string()); 19 | idxfnstream.str(string()); 20 | datafnstream << args->dataf; 21 | idxfnstream << args->idxf; 22 | 23 | if (args->num_partitions > 1) { 24 | datafnstream << ".P" << i; 25 | idxfnstream << ".P" << i; 26 | } 
27 | fstream &datafstream = DATAFD[i]; 28 | datafstream.open(datafnstream.str(), ios::binary | ios::in); 29 | if (!datafstream.is_open()) { 30 | throw runtime_error("can't open data file: " + datafnstream.str()); 31 | } 32 | 33 | fstream idxfstream(idxfnstream.str().c_str(), ios::binary | ios::in); 34 | if (!idxfstream.is_open()) { 35 | throw runtime_error("can't open idx file: " + idxfnstream.str()); 36 | } 37 | 38 | auto idxflen = file_size(idxfstream); 39 | vint chunk; 40 | chunk.reserve(idxflen); 41 | 42 | int *chunk_buf = read_file(idxfstream, idxflen); 43 | 44 | for (int j=0; jnum_partitions; i++) { 57 | int* data = ITEMIDX[i].data(); 58 | supsz += data[it + 1] - data[it]; 59 | } 60 | return supsz; 61 | } 62 | 63 | int Partition::partition_get_lidxsup(int idx, int it) { 64 | int* data = ITEMIDX[idx].data(); 65 | return (data[it + 1] - data[it]); 66 | } 67 | 68 | void Partition::partition_read_item(shared_ptr ival, int it) { 69 | int ipos = 0; 70 | int supsz; 71 | for (int i = 0; i < args->num_partitions; i++) { 72 | supsz = ITEMIDX[i][it + 1] - ITEMIDX[i][it]; 73 | if (supsz > 0) { 74 | fstream& f = DATAFD[i]; 75 | f.seekg(ITEMIDX[i][it] * INT_SIZE, ios::beg); 76 | f.read((char *) &ival->at(ipos), supsz * INT_SIZE); 77 | if (!f) { 78 | throw runtime_error("Error reading item"); 79 | } 80 | ipos += supsz; 81 | } 82 | } 83 | } 84 | 85 | void Partition::partition_lclread_item(shared_ptr ival, int pnum, int it) { 86 | int supsz; 87 | supsz = ITEMIDX[pnum][it + 1] - ITEMIDX[pnum][it]; 88 | if (supsz > 0) { 89 | fstream& f = DATAFD[pnum]; 90 | f.seekg(ITEMIDX[pnum][it] * INT_SIZE, ios::beg); 91 | f.read((char *) ival->data(), supsz * INT_SIZE); 92 | 93 | if (!f) { 94 | throw runtime_error("Error reading item"); 95 | } 96 | } 97 | } 98 | 99 | 100 | void Partition::partition_get_minmaxcustid(vint& backidx, int numit, int pnum, int &minv, int &maxv) { 101 | int custid, it, i, supsz; 102 | minv = INT_MAX; 103 | maxv = 0; 104 | for (i = 0; i < numit; i++) { 105 | it = 
backidx[i]; 106 | supsz = ITEMIDX[pnum][it + 1] - ITEMIDX[pnum][it]; 107 | if (supsz > 0) { 108 | fstream& f = DATAFD[pnum]; 109 | f.seekg(ITEMIDX[pnum][it] * INT_SIZE, ios::beg); 110 | f.read((char *) &custid, INT_SIZE); 111 | if (minv > custid) { 112 | minv = custid; 113 | } 114 | 115 | f.seekg((supsz - 3) * INT_SIZE, ios::cur); 116 | f.read((char *) &custid, INT_SIZE); 117 | 118 | if (maxv < custid) { 119 | maxv = custid; 120 | } 121 | } 122 | } 123 | } 124 | 125 | void Partition::set_args(const shared_ptr &args) { 126 | Partition::args = args; 127 | } -------------------------------------------------------------------------------- /csrc/Partition.h: -------------------------------------------------------------------------------- 1 | #ifndef __PARTITION_H_ 2 | #define __PARTITION_H_ 3 | 4 | //#include "spade.h" 5 | //#include "sequence.h" 6 | 7 | #include "common.h" 8 | #include "SpadeArguments.h" 9 | #include "Array.h" 10 | 11 | class Partition { 12 | private: 13 | shared_ptr args; 14 | vector DATAFD; 15 | vvint ITEMIDX; 16 | public: 17 | Partition(); 18 | 19 | void init(); 20 | 21 | void set_args(const shared_ptr &args); 22 | 23 | void partition_alloc(); 24 | 25 | int partition_get_idxsup(int it); 26 | 27 | int partition_get_lidxsup(int idx, int it); 28 | 29 | void partition_read_item(shared_ptr ival, int it); 30 | 31 | void partition_lclread_item(shared_ptr ival, int pnum, int it); 32 | 33 | void partition_get_minmaxcustid(vint& backidx, int numit, int pnum, int &minv, int &maxv); 34 | }; 35 | 36 | #endif// __PARTITION_H_ 37 | -------------------------------------------------------------------------------- /csrc/Sequence.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Yukio Fukuzawa on 7/12/18. 
3 | // 4 | 5 | #ifndef CSPADE_SEQUENCE_H 6 | #define CSPADE_SEQUENCE_H 7 | 8 | #include "common.h" 9 | #include "SpadeArguments.h" 10 | #include "argv_parser.h" 11 | #include "Partition.h" 12 | #include "InvertDatabase.h" 13 | #include "FreqIt.h" 14 | 15 | result_t sequenceFunc(Env& env, const shared_ptr& args); 16 | 17 | /** 18 | * Call spade given the argument list as string 19 | * @param args e.g 'spade -i zaki -s 0.3 -Z 10 -z 10 -u 1 -r -e 1 -o' 20 | */ 21 | result_t sequenceWrapper(const string &s, shared_ptr& envptr); 22 | 23 | class Sequence { 24 | private: 25 | Env& env; 26 | shared_ptr args; 27 | shared_ptr partition; 28 | shared_ptr cls; 29 | shared_ptr invdb; 30 | 31 | Array_S l_array, e_array, m_array; 32 | FreqIt_SS FreqArray; 33 | unsigned long FreqArraySz = 100; 34 | int FreqArrayPos = 0; 35 | shared_ptr numLargeItemset; 36 | public: 37 | 38 | explicit Sequence(Env& env_); 39 | 40 | void set_args(const shared_ptr &args); 41 | 42 | void set_partition(const shared_ptr &partition); 43 | 44 | void set_cls(const shared_ptr &cls); 45 | 46 | void set_num_large_dataset(const shared_ptr &numLargeItemset); 47 | 48 | void set_invdb(const shared_ptr &invdb); 49 | 50 | void read_files(); 51 | 52 | int get_file_l2() { 53 | int l2cnt = 0; 54 | 55 | if (args->max_iset_len > 1) { 56 | invdb->get_l2file(args->it2f, 0, l2cnt); 57 | } 58 | if (args->max_seq_len > 1) { 59 | invdb->get_l2file(args->seqf, 1, l2cnt); 60 | } 61 | 62 | cerr << "L2 : " << l2cnt << endl; 63 | return l2cnt; 64 | } 65 | 66 | void get_tmpnewf_intersect(Itemset_S &ljoin, Itemset_S &ejoin, Itemset_S &mjoin, 67 | int &lcnt, int &ecnt, int &mcnt, 68 | Itemset_S& it1, Itemset_S& it2, int iter); 69 | 70 | void make_itemset(Itemset_S& it, Array_S& ary, int cnt, const vint &clscnt); 71 | 72 | void pre_pruning(Itemset_S &join, unsigned int ptempl, Itemset_S& clas, Itemset_S& prefix, char use_seq); 73 | 74 | void post_pruning(Itemset_S &iset, unsigned int templ); 75 | 76 | void newSeq(); 77 | 78 | void 
process_class(int it); 79 | 80 | Eqclass_S get_ext_eqclass(int it); 81 | 82 | void get_2newf_intersect(Itemset_S& ljoin, Itemset_S& ejoin, shared_ptr vit1, shared_ptr vit2, int sup1, 83 | int sup2); 84 | 85 | void add_freq(Itemset_S &it, int templ); 86 | 87 | Itemset_S prune_decision(Itemset_S& it1, Itemset_S& it2, unsigned int ptempl, int jflg); 88 | 89 | void find_large(Eqclass_S cluster, int it); 90 | 91 | void insert_freqarray(shared_ptr>& LargeL); 92 | 93 | int get_valid_el(int it, vector &ibvec, vector &sbvec); 94 | 95 | void process_itemset(Itemset_S iset, unsigned int templ, int iter); 96 | 97 | void process_maxgap(Eqclass_S L2); 98 | 99 | void process_cluster1(Eqclass_S cluster, shared_ptr> LargeL, int iter); 100 | 101 | void process_cluster_list1(shared_ptr>>& hdr1, 102 | shared_ptr>>& cluster1, 103 | shared_ptr>>& cluster2, 104 | shared_ptr>& LargeL, 105 | int iter, int eqtype, Eqclass_S& parent); 106 | 107 | void process_cluster_list2(shared_ptr>>& hdr1, int i, Eqclass_SS& EQ, 108 | shared_ptr>>& cluster, 109 | shared_ptr>& LargeL, 110 | int iter, int eqtype, Eqclass_S& parent); 111 | 112 | void fill_join(Itemset_S& join, Itemset_S& hdr1, Itemset_S& hdr2); 113 | }; 114 | 115 | #endif //CSPADE_SEQUENCE_H 116 | -------------------------------------------------------------------------------- /csrc/SpadeArguments.cc: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Yukio Fukuzawa on 8/12/18. 3 | // 4 | 5 | #include "SpadeArguments.h" 6 | 7 | void SpadeArguments::parse_args(int argc, char **argv) { 8 | string name; 9 | 10 | auto cmdl = argh::parser(argc, argv, argh::parser::PREFER_PARAM_FOR_UNREG_OPTION); 11 | cmdl("i") >> name; 12 | 13 | if (name.empty() || !cmdl("s")) { 14 | cerr << "usage: usage: spade [OPTION]... 
-i -s\n"; 15 | throw runtime_error("spade needs valid value of -i and -s"); 16 | } 17 | cmdl("s") >> min_support_per_class; 18 | cmdl("a") >> use_ascending; 19 | cmdl("c") >> use_class; 20 | if (cmdl("e")) { 21 | ext_l2_pass = 1; 22 | cmdl("e") >> num_partitions; 23 | } 24 | if (cmdl["h"]) use_hash = 1; 25 | if (cmdl["o"]) outputfreq = 1; 26 | if (cmdl["r"]) recursive = 1; 27 | cmdl("l") >> min_gap; 28 | if (cmdl("u")) { 29 | use_maxgap = 1; 30 | use_hash = 0; 31 | cmdl("u") >> max_gap; 32 | } 33 | cmdl("t") >> pruning_type; 34 | cmdl("v") >> min_support; 35 | if (cmdl("w") || cmdl["w"]) { 36 | if (!cmdl("u")) { 37 | cerr << "-u is required when -w is enabled" << endl; 38 | throw runtime_error("-u is required when -w is enabled"); 39 | } 40 | use_window = 1; 41 | } 42 | cmdl("y") >> print_tidlist; 43 | cmdl("z") >> max_seq_len; 44 | cmdl("Z") >> max_iset_len; 45 | 46 | 47 | dataf = name + ".tpose"; 48 | idxf = name + ".idx"; 49 | conf = name + ".conf"; 50 | it2f = name + ".2it"; 51 | seqf = name + ".2seq"; 52 | classf = name + ".class"; 53 | 54 | ifstream conff(conf, ios::binary); 55 | if (!conff.is_open()) { 56 | throw runtime_error("File " + string(conf) + " doesn\'t exist."); 57 | } 58 | 59 | conff.read((char *) &total_trans_count, INT_SIZE); 60 | if (min_support == -1) 61 | min_support = (int) ceil(min_support_per_class * total_trans_count); 62 | //ensure that support is at least 2 63 | if (min_support < 1) { 64 | min_support = 1; 65 | } 66 | 67 | conff.read((char *) &dbase_max_item, INT_SIZE); 68 | conff.read((char *) &avg_cust_size, FLOAT_SIZE); 69 | conff.read((char *) &avg_trans_count, FLOAT_SIZE); 70 | conff.read((char *) &dbase_total_trans, INT_SIZE); 71 | conff.close(); 72 | } -------------------------------------------------------------------------------- /csrc/SpadeArguments.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Yukio Fukuzawa on 8/12/18. 
3 | // 4 | 5 | #ifndef CSPADE_SPADE_ARGS_T_H 6 | #define CSPADE_SPADE_ARGS_T_H 7 | 8 | #include "common.h" 9 | #include "argv_parser.h" 10 | 11 | //join type 12 | #define LJOIN 0 13 | #define EJOIN 1 14 | #define MJOIN 2 15 | 16 | //pruning types 17 | #define NOPRUNING 0 // no pruning 18 | #define L2PRUNING 1 19 | #define ZEROPRUNING 2 // when sup goes to zero in other classes, prune 20 | #define FOLLOWPRUNING 4 // 21 | 22 | class SpadeArguments { 23 | public: 24 | int pruning_type = NOPRUNING; 25 | string dataf; 26 | string idxf; 27 | string conf; 28 | string it2f; 29 | string seqf; 30 | string classf; 31 | int ext_l2_pass = 0; 32 | int use_hash = 0; 33 | int num_intersect = 0; 34 | int recursive = 0; 35 | int maxiter = 2; 36 | int min_gap = 1; 37 | int max_gap = INT_MAX; 38 | char use_maxgap = 0; 39 | char use_window = 0; 40 | int use_ascending = -2; 41 | bool use_class = false; 42 | char outputfreq = 0; 43 | char print_tidlist = 0; 44 | 45 | int L2pruning = 0; 46 | int prepruning = 0; 47 | int postpruning = 0; 48 | 49 | int max_seq_len = 100; 50 | int max_iset_len = 100; 51 | 52 | int total_trans_count; 53 | int dbase_max_item; 54 | float avg_trans_count; 55 | float avg_cust_size; 56 | int dbase_total_trans; 57 | 58 | double min_support_per_class; 59 | float follow_thresh = 1.0; 60 | float zero_thresh = 0.0; 61 | int min_support = -1; 62 | 63 | int num_partitions; 64 | 65 | void parse_args(int argc, char **argv); 66 | }; 67 | 68 | 69 | #endif //CSPADE_SPADE_ARGS_T_H 70 | -------------------------------------------------------------------------------- /csrc/TransArray.cc: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include "TransArray.h" 3 | 4 | TransArray::TransArray(long sz, int npart) { 5 | totSize = sz; 6 | theSize = 0; 7 | lastPos = 0; 8 | theFlg = 0; 9 | offset.reserve(npart); 10 | for (int i = 0; i < npart; i++) offset[i] = 0; 11 | if (sz > 0) { 12 | theArray.reserve(totSize); 13 | } 14 | } 
15 | 16 | TransArray::~TransArray() { 17 | theArray.clear(); 18 | offset.clear(); 19 | } 20 | 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /csrc/TransArray.h: -------------------------------------------------------------------------------- 1 | #ifndef __TRANS_ARRAY_H 2 | #define __TRANS_ARRAY_H 3 | 4 | #include "common.h" 5 | 6 | class TransArray { 7 | private: 8 | vector theArray; 9 | char theFlg; 10 | int lastPos; 11 | // DD: remove unsigned to avoid gcc warning signed vs non-signed cmp 12 | int theSize; 13 | // DD: remove unsigned to avoid gcc warning signed vs non-signed cmp 14 | long totSize; 15 | vector offset; 16 | public: 17 | 18 | explicit TransArray(long sz, int npart = 1); 19 | 20 | ~TransArray(); 21 | 22 | int operator[](unsigned int index) { 23 | return theArray[index]; 24 | }; 25 | 26 | char flg() { 27 | return theFlg; 28 | } 29 | 30 | void setflg(char flg) { 31 | theFlg = flg; 32 | } 33 | 34 | int lastpos() { 35 | return lastPos; 36 | } 37 | 38 | //to be used ony for use_seq 39 | void setlastpos() { 40 | theArray[lastPos + 1] = theSize - lastPos - 2; 41 | lastPos = theSize; 42 | } 43 | 44 | long get_offset(int pos = 0) { 45 | return offset[pos]; 46 | } 47 | 48 | void set_offset(long off, int pos = 0) { 49 | offset[pos] = off; 50 | } 51 | 52 | int totsize() { 53 | return totSize; 54 | } 55 | 56 | void reset() { 57 | theSize = 0; 58 | lastPos = 0; 59 | theFlg = 0; 60 | } 61 | 62 | vector& array() { 63 | return theArray; 64 | } 65 | 66 | int size() { 67 | return theSize; 68 | } 69 | 70 | void setsize(int size) { 71 | theSize = size; 72 | } 73 | 74 | void setitem(int pos, int item) { 75 | theArray[pos] = item; 76 | } 77 | 78 | void additem(int item) { 79 | theArray[theSize] = item; 80 | theSize++; 81 | } 82 | 83 | void flushbuf(int fd, int use_seq, int pos = 0) { 84 | lseek(fd, offset[pos] * sizeof(int), SEEK_SET); 85 | int wblk = theSize; 86 | if (wblk > 0) { 87 | auto res = write(fd, 
(char *) theArray.data(), wblk * sizeof(int)); 88 | if (res < wblk * sizeof(int)) { 89 | throw runtime_error("Error writing"); 90 | } 91 | offset[pos] += wblk; 92 | } 93 | theSize = 0; 94 | } 95 | 96 | void add(int fd, int item, int use_seq, int pos, int custid = -1) { 97 | if (use_seq) { 98 | if (theSize + 2 > totSize) { 99 | flushbuf(fd, use_seq, pos); 100 | } 101 | theArray[theSize++] = custid; 102 | } else { 103 | if (theSize + 1 > totSize) { 104 | flushbuf(fd, use_seq, pos); 105 | } 106 | } 107 | theArray[theSize++] = item; 108 | } 109 | }; 110 | 111 | #endif //__TRANS_ARRAY_H 112 | 113 | 114 | -------------------------------------------------------------------------------- /csrc/argh.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Yukio Fukuzawa on 11/12/18. 3 | // 4 | 5 | #ifndef UTILITIES_ARGH_H 6 | #define UTILITIES_ARGH_H 7 | 8 | #pragma once 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | namespace argh { 19 | // Terminology: 20 | // A command line is composed of 2 types of args: 21 | // 1. Positional args, i.e. free standing values 22 | // 2. Options: args beginning with '-'. We identify two kinds: 23 | // 2.1: Flags: boolean options => (exist ? true : false) 24 | // 2.2: Parameters: a name followed by a non-option value 25 | 26 | #if !defined(__GNUC__) || (__GNUC__ >= 5) 27 | using string_stream = std::istringstream; 28 | #else 29 | 30 | // Until GCC 5, istringstream did not have a move constructor. 31 | // stringstream_proxy is used instead, as a workaround. 32 | class stringstream_proxy { 33 | public: 34 | stringstream_proxy() = default; 35 | 36 | // Construct with a value. 37 | stringstream_proxy(std::string const &value) : 38 | stream_(value) {} 39 | 40 | // Copy constructor. 
41 | stringstream_proxy(const stringstream_proxy &other) : 42 | stream_(other.stream_.str()) { 43 | stream_.setstate(other.stream_.rdstate()); 44 | } 45 | 46 | void setstate(std::ios_base::iostate state) { stream_.setstate(state); } 47 | 48 | // Stream out the value of the parameter. 49 | // If the conversion was not possible, the stream will enter the fail state, 50 | // and operator bool will return false. 51 | template 52 | stringstream_proxy &operator>>(T &thing) { 53 | stream_ >> thing; 54 | return *this; 55 | } 56 | 57 | 58 | // Get the string value. 59 | std::string str() const { return stream_.str(); } 60 | 61 | std::stringbuf *rdbuf() const { return stream_.rdbuf(); } 62 | 63 | // Check the state of the stream. 64 | // False when the most recent stream operation failed 65 | operator bool() const { return !!stream_; } 66 | 67 | ~stringstream_proxy() = default; 68 | 69 | private: 70 | std::istringstream stream_; 71 | }; 72 | 73 | using string_stream = stringstream_proxy; 74 | #endif 75 | 76 | class parser { 77 | public: 78 | enum Mode { 79 | PREFER_FLAG_FOR_UNREG_OPTION = 1 << 0, 80 | PREFER_PARAM_FOR_UNREG_OPTION = 1 << 1, 81 | NO_SPLIT_ON_EQUALSIGN = 1 << 2, 82 | SINGLE_DASH_IS_MULTIFLAG = 1 << 3, 83 | }; 84 | 85 | parser() = default; 86 | 87 | parser(std::initializer_list pre_reg_names) { add_params(pre_reg_names); } 88 | 89 | parser(char **argv, int mode = PREFER_FLAG_FOR_UNREG_OPTION) { parse(argv, mode); } 90 | 91 | parser(int argc, char **argv, int mode = PREFER_FLAG_FOR_UNREG_OPTION) { parse(argc, argv, mode); } 92 | 93 | void add_param(std::string const &name); 94 | 95 | void add_params(std::initializer_list init_list); 96 | 97 | void parse(char **argv, int mode = PREFER_FLAG_FOR_UNREG_OPTION); 98 | 99 | void parse(int argc, char **argv, int mode = PREFER_FLAG_FOR_UNREG_OPTION); 100 | 101 | std::multiset const &flags() const { return flags_; } 102 | 103 | std::map const ¶ms() const { return params_; } 104 | 105 | std::vector const &pos_args() const 
{ return pos_args_; } 106 | 107 | // begin() and end() for using range-for over positional args. 108 | std::vector::const_iterator begin() const { return pos_args_.cbegin(); } 109 | 110 | std::vector::const_iterator end() const { return pos_args_.cend(); } 111 | 112 | size_t size() const { return pos_args_.size(); } 113 | 114 | ////////////////////////////////////////////////////////////////////////// 115 | // Accessors 116 | 117 | // flag (boolean) accessors: return true if the flag appeared, otherwise false. 118 | bool operator[](std::string const &name) const; 119 | 120 | // multiple flag (boolean) accessors: return true if at least one of the flag appeared, otherwise false. 121 | bool operator[](std::initializer_list init_list) const; 122 | 123 | // returns positional arg string by order. Like argv[] but without the options 124 | std::string const &operator[](size_t ind) const; 125 | 126 | // returns a std::istream that can be used to convert a positional arg to a typed value. 127 | string_stream operator()(size_t ind) const; 128 | 129 | // same as above, but with a default value in case the arg is missing (index out of range). 130 | template 131 | string_stream operator()(size_t ind, T &&def_val) const; 132 | 133 | // parameter accessors, give a name get an std::istream that can be used to convert to a typed value. 134 | // call .str() on result to get as string 135 | string_stream operator()(std::string const &name) const; 136 | 137 | // accessor for a parameter with multiple names, give a list of names, get an std::istream that can be used to convert to a typed value. 138 | // call .str() on result to get as string 139 | // returns the first value in the list to be found. 140 | string_stream operator()(std::initializer_list init_list) const; 141 | 142 | // same as above, but with a default value in case the param was missing. 
143 | // Non-string def_val types must have an operator<<() (output stream operator) 144 | // If T only has an input stream operator, pass the string version of the type as in "3" instead of 3. 145 | template 146 | string_stream operator()(std::string const &name, T &&def_val) const; 147 | 148 | // same as above but for a list of names. returns the first value to be found. 149 | template 150 | string_stream operator()(std::initializer_list init_list, T &&def_val) const; 151 | 152 | private: 153 | string_stream bad_stream() const; 154 | 155 | std::string trim_leading_dashes(std::string const &name) const; 156 | 157 | bool is_number(std::string const &arg) const; 158 | 159 | bool is_option(std::string const &arg) const; 160 | 161 | bool got_flag(std::string const &name) const; 162 | 163 | bool is_param(std::string const &name) const; 164 | 165 | private: 166 | std::vector args_; 167 | std::map params_; 168 | std::vector pos_args_; 169 | std::multiset flags_; 170 | std::set registeredParams_; 171 | std::string empty_; 172 | }; 173 | 174 | 175 | ////////////////////////////////////////////////////////////////////////// 176 | 177 | inline void parser::parse(char **argv, int mode) { 178 | int argc = 0; 179 | for (auto argvp = argv; *argvp; ++argc, ++argvp); 180 | parse(argc, argv, mode); 181 | } 182 | 183 | ////////////////////////////////////////////////////////////////////////// 184 | 185 | inline void parser::parse(int argc, char **argv, int mode /*= PREFER_FLAG_FOR_UNREG_OPTION*/) { 186 | // convert to strings 187 | args_.resize(argc); 188 | std::transform(argv, argv + argc, args_.begin(), [](const char *const arg) { return arg; }); 189 | 190 | // parse line 191 | for (auto i = 0u; i < args_.size(); ++i) { 192 | if (!is_option(args_[i])) { 193 | pos_args_.emplace_back(args_[i]); 194 | continue; 195 | } 196 | 197 | auto name = trim_leading_dashes(args_[i]); 198 | 199 | if (!(mode & NO_SPLIT_ON_EQUALSIGN)) { 200 | auto equalPos = name.find('='); 201 | if (equalPos != 
std::string::npos) { 202 | params_.insert({name.substr(0, equalPos), name.substr(equalPos + 1)}); 203 | continue; 204 | } 205 | } 206 | 207 | // if the option is unregistered and should be a multi-flag 208 | if (1 == (args_[i].size() - name.size()) && // single dash 209 | argh::parser::SINGLE_DASH_IS_MULTIFLAG & mode && // multi-flag mode 210 | !is_param(name)) // unregistered 211 | { 212 | std::string keep_param; 213 | 214 | if (!name.empty() && is_param(std::string(1ul, name.back()))) // last char is param 215 | { 216 | keep_param += name.back(); 217 | name.resize(name.size() - 1); 218 | } 219 | 220 | for (auto const &c : name) { 221 | flags_.emplace(std::string{c}); 222 | } 223 | 224 | if (!keep_param.empty()) { 225 | name = keep_param; 226 | } else { 227 | continue; // do not consider other options for this arg 228 | } 229 | } 230 | 231 | // any potential option will get as its value the next arg, unless that arg is an option too 232 | // in that case it will be determined a flag. 233 | if (i == args_.size() - 1 || is_option(args_[i + 1])) { 234 | flags_.emplace(name); 235 | continue; 236 | } 237 | 238 | // if 'name' is a pre-registered option, then the next arg cannot be a free parameter to it is skipped 239 | // otherwise we have 2 modes: 240 | // PREFER_FLAG_FOR_UNREG_OPTION: a non-registered 'name' is determined a flag. 241 | // The following value (the next arg) will be a free parameter. 242 | // 243 | // PREFER_PARAM_FOR_UNREG_OPTION: a non-registered 'name' is determined a parameter, the next arg 244 | // will be the value of that option. 
245 | 246 | assert(!(mode & argh::parser::PREFER_FLAG_FOR_UNREG_OPTION) 247 | || !(mode & argh::parser::PREFER_PARAM_FOR_UNREG_OPTION)); 248 | 249 | bool preferParam = mode & argh::parser::PREFER_PARAM_FOR_UNREG_OPTION; 250 | 251 | if (is_param(name) || preferParam) { 252 | params_.insert({name, args_[i + 1]}); 253 | ++i; // skip next value, it is not a free parameter 254 | continue; 255 | } else { 256 | flags_.emplace(name); 257 | } 258 | }; 259 | } 260 | 261 | ////////////////////////////////////////////////////////////////////////// 262 | 263 | inline string_stream parser::bad_stream() const { 264 | string_stream bad; 265 | bad.setstate(std::ios_base::failbit); 266 | return bad; 267 | } 268 | 269 | ////////////////////////////////////////////////////////////////////////// 270 | 271 | inline bool parser::is_number(std::string const &arg) const { 272 | // inefficient but simple way to determine if a string is a number (which can start with a '-') 273 | std::istringstream istr(arg); 274 | double number; 275 | istr >> number; 276 | return !(istr.fail() || istr.bad()); 277 | } 278 | 279 | ////////////////////////////////////////////////////////////////////////// 280 | 281 | inline bool parser::is_option(std::string const &arg) const { 282 | assert(0 != arg.size()); 283 | if (is_number(arg)) 284 | return false; 285 | return '-' == arg[0]; 286 | } 287 | 288 | ////////////////////////////////////////////////////////////////////////// 289 | 290 | inline std::string parser::trim_leading_dashes(std::string const &name) const { 291 | auto pos = name.find_first_not_of('-'); 292 | return std::string::npos != pos ? 
name.substr(pos) : name; 293 | } 294 | 295 | ////////////////////////////////////////////////////////////////////////// 296 | 297 | inline bool argh::parser::got_flag(std::string const &name) const { 298 | return flags_.end() != flags_.find(trim_leading_dashes(name)); 299 | } 300 | 301 | ////////////////////////////////////////////////////////////////////////// 302 | 303 | inline bool argh::parser::is_param(std::string const &name) const { 304 | return registeredParams_.count(name); 305 | } 306 | 307 | ////////////////////////////////////////////////////////////////////////// 308 | 309 | inline bool parser::operator[](std::string const &name) const { 310 | return got_flag(name); 311 | } 312 | 313 | ////////////////////////////////////////////////////////////////////////// 314 | 315 | inline bool parser::operator[](std::initializer_list init_list) const { 316 | return std::any_of(init_list.begin(), init_list.end(), [&](char const *const name) { return got_flag(name); }); 317 | } 318 | 319 | ////////////////////////////////////////////////////////////////////////// 320 | 321 | inline std::string const &parser::operator[](size_t ind) const { 322 | if (ind < pos_args_.size()) 323 | return pos_args_[ind]; 324 | return empty_; 325 | } 326 | 327 | ////////////////////////////////////////////////////////////////////////// 328 | 329 | inline string_stream parser::operator()(std::string const &name) const { 330 | auto optIt = params_.find(trim_leading_dashes(name)); 331 | if (params_.end() != optIt) 332 | return string_stream(optIt->second); 333 | return bad_stream(); 334 | } 335 | 336 | ////////////////////////////////////////////////////////////////////////// 337 | 338 | inline string_stream parser::operator()(std::initializer_list init_list) const { 339 | for (auto &name : init_list) { 340 | auto optIt = params_.find(trim_leading_dashes(name)); 341 | if (params_.end() != optIt) 342 | return string_stream(optIt->second); 343 | } 344 | return bad_stream(); 345 | } 346 | 347 
| ////////////////////////////////////////////////////////////////////////// 348 | 349 | template 350 | string_stream parser::operator()(std::string const &name, T &&def_val) const { 351 | auto optIt = params_.find(trim_leading_dashes(name)); 352 | if (params_.end() != optIt) 353 | return string_stream(optIt->second); 354 | 355 | std::ostringstream ostr; 356 | ostr << def_val; 357 | return string_stream(ostr.str()); // use default 358 | } 359 | 360 | ////////////////////////////////////////////////////////////////////////// 361 | 362 | // same as above but for a list of names. returns the first value to be found. 363 | template 364 | string_stream parser::operator()(std::initializer_list init_list, T &&def_val) const { 365 | for (auto &name : init_list) { 366 | auto optIt = params_.find(trim_leading_dashes(name)); 367 | if (params_.end() != optIt) 368 | return string_stream(optIt->second); 369 | } 370 | std::ostringstream ostr; 371 | ostr << def_val; 372 | return string_stream(ostr.str()); // use default 373 | } 374 | 375 | ////////////////////////////////////////////////////////////////////////// 376 | 377 | inline string_stream parser::operator()(size_t ind) const { 378 | if (pos_args_.size() <= ind) 379 | return bad_stream(); 380 | 381 | return string_stream(pos_args_[ind]); 382 | } 383 | 384 | ////////////////////////////////////////////////////////////////////////// 385 | 386 | template 387 | string_stream parser::operator()(size_t ind, T &&def_val) const { 388 | if (pos_args_.size() <= ind) { 389 | std::ostringstream ostr; 390 | ostr << def_val; 391 | return string_stream(ostr.str()); 392 | } 393 | 394 | return string_stream(pos_args_[ind]); 395 | } 396 | 397 | ////////////////////////////////////////////////////////////////////////// 398 | 399 | inline void parser::add_param(std::string const &name) { 400 | registeredParams_.insert(trim_leading_dashes(name)); 401 | } 402 | 403 | ////////////////////////////////////////////////////////////////////////// 404 | 
/**
 * Deep-copy a string into a freshly allocated, NUL-terminated char array.
 *
 * @param str text to copy (embedded NUL characters are copied as-is)
 * @return buffer of str.length() + 1 chars; the caller owns it and must
 *         release it with delete[]
 */
char *string2chars(const string &str) {
    // Keep the length in size_t: narrowing it to int truncated very long
    // strings and made the copy loop compare signed against unsigned.
    const size_t len = str.length();
    auto *chars = new char[len + 1];
    std::copy(str.begin(), str.end(), chars);
    chars[len] = '\0';
    return chars;
}
83 | in_container = true; 84 | container_start = c; 85 | continue; 86 | } 87 | 88 | if (in_container) { 89 | if (c == container_start) { 90 | in_container = false; 91 | in_token = false; 92 | argv.push_back(token.str()); 93 | token.str(string()); 94 | token.clear(); 95 | continue; 96 | } else { 97 | token << c; 98 | continue; 99 | } 100 | } 101 | 102 | /* XXX in this case, we: 103 | * 1. have a quote 104 | * 2. are in a token 105 | * 3. and not in a container 106 | * e.g. 107 | * hell"o 108 | * 109 | * what'str done here appears shell-dependent, 110 | * but overall, it'str an error.... i *think* 111 | */ 112 | throw runtime_error("Parse Error! Bad quotes"); 113 | case '\\': 114 | 115 | if (in_container && str[i + 1] != container_start) { 116 | token << c; 117 | continue; 118 | } 119 | 120 | if (escaped) { 121 | token << c; 122 | continue; 123 | } 124 | 125 | escaped = true; 126 | break; 127 | 128 | default: 129 | if (!in_token) { 130 | in_token = true; 131 | } 132 | 133 | token << c; 134 | } 135 | } 136 | 137 | if (in_container) 138 | throw runtime_error("Parse Error! Still in container\n"); 139 | 140 | if (escaped) 141 | throw runtime_error("Parse Error! Unused escape (\\)\n"); 142 | 143 | return args_t(argv); 144 | } 145 | -------------------------------------------------------------------------------- /csrc/argv_parser.h: -------------------------------------------------------------------------------- 1 | #ifndef ARGV_PARSER_H 2 | #define ARGV_PARSER_H 3 | 4 | #include "common.h" 5 | 6 | /** 7 | * Same as strdup. 
Make a deep copy of a string and return char* 8 | * @param s 9 | * @return 10 | */ 11 | char *string2chars(const string &str); 12 | 13 | /** 14 | * Tuple containing argc and argv (char**) 15 | */ 16 | struct args_t { 17 | int argc; 18 | char **argv; 19 | 20 | args_t(std::list args) { 21 | argc = static_cast(args.size()); 22 | argv = new char *[argc]; 23 | std::list::const_iterator iterator; 24 | int idx = 0; 25 | for (iterator = args.begin(); iterator != args.end(); ++iterator) { 26 | argv[idx] = string2chars((*iterator)); 27 | idx++; 28 | } 29 | } 30 | 31 | ~args_t() { 32 | for (int i = 0; i < argc; i++) { 33 | delete[] argv[i]; 34 | } 35 | delete[] argv; 36 | } 37 | }; 38 | 39 | /** 40 | * Parse a string of arguments into char** (such that can be used by main()) 41 | * This function is string aware, e.g. "hello world" is one arg, not two 42 | * @param s 43 | * @return 44 | */ 45 | args_t parse(const string &s); 46 | 47 | #endif //ARGV_PARSER_H 48 | -------------------------------------------------------------------------------- /csrc/calcdb.cc: -------------------------------------------------------------------------------- 1 | #include "calcdb.h" 2 | #include 3 | 4 | void DbaseCtrlBlk::init(const string &infilename, int buf_sz) { 5 | fd = open(infilename.c_str(), O_RDONLY | O_BINARY); 6 | if (fd < 0) { 7 | throw runtime_error("ERROR: InvalidFile -- Dbase_Ctrl_Blk()"); 8 | } 9 | 10 | buf_size = buf_sz; 11 | buf = new int[buf_sz]; 12 | cur_buf_pos = 0; 13 | cur_blk_size = 0; 14 | readall = 0; 15 | endpos = lseek(fd, 0, SEEK_END); 16 | } 17 | 18 | DbaseCtrlBlk::~DbaseCtrlBlk() { 19 | delete[] buf; 20 | close(fd); 21 | } 22 | 23 | void DbaseCtrlBlk::get_next_trans_ext() { 24 | // Need to get more items from file 25 | auto res = cur_blk_size - cur_buf_pos; 26 | if (res > 0) { 27 | // First copy partial transaction to beginning of buffer 28 | memcpy((void *) buf, 29 | (void *) (buf + cur_buf_pos), 30 | res * INT_SIZE); 31 | cur_blk_size = res; 32 | } else { 33 | // No 
partial transaction in buffer 34 | cur_blk_size = 0; 35 | } 36 | 37 | res = read(fd, (void *) (buf + cur_blk_size), ((buf_size - cur_blk_size) * INT_SIZE)); 38 | 39 | if (res < 0) { 40 | throw runtime_error("reading in database"); 41 | } 42 | cur_blk_size += res / INT_SIZE; 43 | cur_buf_pos = 0; 44 | } 45 | 46 | void DbaseCtrlBlk::get_first_blk() { 47 | readall = 0; 48 | lseek(fd, 0, SEEK_SET); 49 | cur_blk_size = (read(fd, (void *) buf, (buf_size * INT_SIZE))) / INT_SIZE; 50 | if (cur_blk_size < 0) { 51 | throw runtime_error("get_first_blk"); 52 | } 53 | cur_buf_pos = 0; 54 | } 55 | 56 | void DbaseCtrlBlk::get_next_trans(int *&lbuf, int &nitems, int &tid, int &cid) { 57 | if (cur_buf_pos + TRANSOFF >= cur_blk_size || 58 | cur_buf_pos + buf[cur_buf_pos + TRANSOFF - 1] + TRANSOFF > cur_blk_size) { 59 | if (lseek(fd, 0, SEEK_CUR) == endpos) readall = 1; 60 | if (!readall) { 61 | // Need to get more items from file 62 | get_next_trans_ext(); 63 | } 64 | } 65 | 66 | if (!readall) { 67 | cid = buf[cur_buf_pos]; 68 | tid = buf[cur_buf_pos + TRANSOFF - 2]; 69 | nitems = buf[cur_buf_pos + TRANSOFF - 1]; 70 | lbuf = buf + cur_buf_pos + TRANSOFF; 71 | cur_buf_pos += nitems + TRANSOFF; 72 | } 73 | } 74 | 75 | 76 | -------------------------------------------------------------------------------- /csrc/calcdb.h: -------------------------------------------------------------------------------- 1 | #ifndef __DATABASE_H 2 | #define __DATABASE_H 3 | 4 | #include "common.h" 5 | 6 | #define DCBBUFSZ 2048 7 | #define TRANSOFF 3 8 | 9 | #ifndef O_BINARY 10 | #define O_BINARY 0 11 | #endif 12 | 13 | class DbaseCtrlBlk { 14 | public: 15 | DbaseCtrlBlk() = default; 16 | 17 | void init(const string& infilename, int buf_sz = DCBBUFSZ); 18 | 19 | ~DbaseCtrlBlk(); 20 | 21 | void get_next_trans_ext(); 22 | 23 | void get_first_blk(); 24 | 25 | void get_next_trans(int *&lbuf, int &nitems, int &tid, int &cid); 26 | 27 | int eof() { 28 | return (readall == 1); 29 | } 30 | 31 | int fd; 32 | int 
/**
 * Number of bytes between the stream's current get position and its end.
 *
 * Side effect: the stream's state flags are cleared and the get position is
 * moved to the beginning of the file before returning.
 */
unsigned long file_size(fstream& file) {
    const std::streampos start = file.tellg();
    file.seekg(0, std::ios::end);
    const std::streamoff remaining = file.tellg() - start;

    file.clear();
    file.seekg(0, std::ios::beg);
    return static_cast<unsigned long>(remaining);
}
/**
 * List the entries of a directory whose names start with a given prefix.
 *
 * @param folder directory to scan
 * @param prefix required leading text of the entry name; an empty prefix
 *               matches every entry returned by readdir
 * @return matching entry names (names only, not full paths); an empty list
 *         when the directory cannot be opened
 */
list<string> list_files(const string &folder, const string &prefix) {
    list<string> retval;

    DIR *dir = opendir(folder.c_str());
    if (dir == nullptr) {
        return retval;
    }

    struct dirent *entry;
    while ((entry = readdir(dir)) != nullptr) {
        const string filename = entry->d_name;
        // An empty prefix compares equal to the (empty) head of any name, so
        // it selects everything. The previous "check_prefix && ..." test
        // rejected every entry in that case, which is the declared default.
        if (filename.compare(0, prefix.length(), prefix) == 0) {
            retval.push_back(filename);
        }
    }
    closedir(dir);

    return retval;
}
UTILS_H 2 | #define UTILS_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #if defined(_MSC_VER) || defined(MS_WINDOWS) || defined(WIN32) 20 | #include 21 | #include 22 | #include "dirent-win.h" 23 | #else 24 | #include 25 | #include 26 | #include 27 | #endif 28 | #include 29 | #include 30 | #include "argh.h" 31 | 32 | #ifndef INT_MAX 33 | #define INT_MAX 2147483647 34 | #endif 35 | 36 | using std::cout; 37 | using std::cerr; 38 | using std::endl; 39 | using std::flush; 40 | 41 | using std::ostringstream; 42 | using std::string; 43 | using std::ifstream; 44 | using std::ofstream; 45 | using std::ostream; 46 | using std::ios; 47 | using std::fstream; 48 | 49 | using std::list; 50 | using std::vector; 51 | 52 | 53 | using std::shared_ptr; 54 | using std::unique_ptr; 55 | using std::make_shared; 56 | 57 | using std::runtime_error; 58 | using std::exception; 59 | 60 | #define min(a, b) ((a) < (b) ? 
(a) : (b)) 61 | 62 | struct result_t { 63 | int nsequences; 64 | string seqstrm; 65 | string logger; 66 | string summary; 67 | }; 68 | 69 | #define ulong int 70 | extern const int INT_SIZE; 71 | extern const int FLOAT_SIZE; 72 | 73 | typedef vector vint; 74 | typedef vector> vvint; 75 | typedef vector vuint; 76 | 77 | extern vuint vuint_null; 78 | 79 | bool file_exists(const string &name); 80 | 81 | unsigned long file_size(fstream& file); 82 | 83 | int* read_file(fstream& file, unsigned long flen); 84 | 85 | int num_lines(const string &filename); 86 | 87 | list list_files(const string& folder, const string& prefix = ""); 88 | 89 | string random_id(const int len); 90 | 91 | string get_temp_folder(); 92 | 93 | extern const string TMPDIR; 94 | 95 | #endif 96 | -------------------------------------------------------------------------------- /csrc/exttpose.cc: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include "exttpose.h" 3 | #include "calcdb.h" 4 | #include "Env.h" 5 | #include "TransArray.h" 6 | 7 | 8 |
/**
 * qsort comparator for records made of two consecutive ints.
 * Orders by the first int, then by the second.
 */
int cmp2it(const void *a, const void *b) {
    const int *lhs = static_cast<const int *>(a);
    const int *rhs = static_cast<const int *>(b);
    if (lhs[0] != rhs[0]) return lhs[0] < rhs[0] ? -1 : 1;
    if (lhs[1] != rhs[1]) return lhs[1] < rhs[1] ? -1 : 1;
    return 0;
}

/**
 * Write the frequent 2-sequences or 2-itemsets to `ofd`.
 *
 * @param l2cnt   incremented once for every pair written
 * @param file    spill file holding (item, item) int pairs; each occurrence stands for 256 counts
 * @param ofd     destination stream
 * @param cntary  in-memory counters: a numfreq x numfreq square when `use_seq` is true,
 *                otherwise the upper triangle addressed through `offsets`
 * @param use_seq selects the layout of `cntary` and whether pairs (j, k) with k <= j are visited
 *
 * NOTE(review): the element type of `cntary` was lost in extraction; unsigned char is
 * inferred from the 256-per-overflow arithmetic -- confirm against exttpose.h.
 */
void Exttpose::sort_get_l2(int &l2cnt, fstream &file, ofstream &ofd, vector<unsigned char> &cntary, bool use_seq) {
    int j, k;
    int fcnt;
    int lit;
    int itbuf[3];
    int *sortary = nullptr;

    unsigned long filesize = file_size(file);

    if (filesize > 0) {
        sortary = read_file(file, filesize);

        if (!file) {
            // Do not leak the buffer when bailing out.
            delete[] sortary;
            throw runtime_error("Error reading file.");
        }
        // Sort the pairs so they can be consumed in the same (j, k) order as the loops below.
        qsort(sortary, (filesize / sizeof(int)) / 2, 2 * INT_SIZE, cmp2it);
    }

    unsigned long numel = filesize / INT_SIZE;
    unsigned long pos = 0;  // read cursor into sortary, advances one pair at a time
    for (j = 0; j < numfreq; j++) {
        // The layout must follow the caller's `use_seq` argument, not args.use_seq:
        // tpose() calls this once per layout (seq2 with true, itcnt2 with false), and using
        // the global flag for the itemset call would index itcnt2 with the square layout.
        if (use_seq) k = 0;
        else k = j + 1;
        for (; k < numfreq; k++) {
            fcnt = 0;
            // Each spilled pair stands for one wrap-around of the 8-bit counter.
            // `pos + 1 < numel` keeps the second element of the pair in bounds.
            while (pos + 1 < numel &&
                   j == freqidx[sortary[pos]] && k == freqidx[sortary[pos + 1]]) {
                fcnt += 256;
                pos += 2;
            }
            if (use_seq) {
                fcnt += (int) cntary[j * numfreq + k];
            } else {
                lit = offsets[j] - j - 1;
                fcnt += (int) cntary[lit + k];
            }

            if (fcnt >= args.MINSUPPORT) {
                if (args.write_only_fcnt) {
                    ofd.write((char *) &fcnt, INT_SIZE);
                } else {
                    itbuf[0] = backidx[j];
                    itbuf[1] = backidx[k];
                    itbuf[2] = fcnt;
                    ofd.write((char *) itbuf, 3 * sizeof(int));
                }
                l2cnt++;
            }
        }
    }
    delete[] sortary;
}
77 | 78 | 79 | void Exttpose::process_cust(int fcnt, fstream &seqfd, fstream &isetfd) { 80 | int j, k; 81 | int ii1, ii2, lit; 82 | 83 | for (k = 0; k < fcnt; k++) { 84 | for (j = k; j < fcnt; j++) { 85 | if (args.use_seq && extary[fidx[j]].size() > 0) { 86 | lit = extary[fidx[j]].size() - 1; 87 | if (extary[fidx[k]][0] < extary[fidx[j]][lit]) { 88 | if ((++seq2[fidx[k] * numfreq + fidx[j]]) == 0) { 89 | seqfd.write((char *) &backidx[fidx[k]], INT_SIZE); 90 | seqfd.write((char *) &backidx[fidx[j]], INT_SIZE); 91 | } 92 | } 93 | } 94 | if (j > k) { 95 | if (fidx[k] < fidx[j]) { 96 | ii1 = fidx[k]; 97 | ii2 = fidx[j]; 98 | } else { 99 | ii2 = fidx[k]; 100 | ii1 = fidx[j]; 101 | } 102 | lit = offsets[ii1] - ii1 - 1; 103 | if (ocust[lit + ii2] == 1) { 104 | if ((++itcnt2[lit + ii2]) == 0) { 105 | isetfd.write((char *) &backidx[ii1], INT_SIZE); 106 | isetfd.write((char *) &backidx[ii2], INT_SIZE); 107 | //itcnt2[lit+ii2] = 0; 108 | } 109 | ocust[lit + ii2] = 0; 110 | } 111 | 112 | if (extary[fidx[k]].size() > 0) { 113 | lit = extary[fidx[k]].size() - 1; 114 | if (extary[fidx[j]][0] < extary[fidx[k]][lit]) { 115 | if ((++seq2[fidx[j] * numfreq + fidx[k]]) == 0) { 116 | 
seqfd.write((char *) &backidx[fidx[j]], INT_SIZE); 117 | seqfd.write((char *) &backidx[fidx[k]], INT_SIZE); 118 | } 119 | } 120 | } 121 | } 122 | } 123 | extary[fidx[k]].reset(); 124 | } 125 | } 126 | 127 | void Exttpose::do_invert_db(int pblk, int mincustid, int maxcustid) { 128 | int numitem = 0, tid = 0, custid = 0; // DD: to avoid gcc warning uninitialized var 129 | int *buf = nullptr; // DD: to avoid gcc warning uninitialized var 130 | ostringstream tmpname; 131 | int fd; 132 | int i, j, k, idx; 133 | 134 | dcb.get_first_blk(); 135 | dcb.get_next_trans(buf, numitem, tid, custid); 136 | int ocid;// = -1; 137 | for (int p = 0; p < args.num_partitions; p++) { 138 | tmpname << args.output; 139 | 140 | if (args.num_partitions > 1) { 141 | tmpname << ".P" << p; 142 | } 143 | 144 | string tmpnam = tmpname.str(); 145 | tmpname.str(string()); 146 | 147 | if ((fd = open(tmpnam.c_str(), (O_WRONLY | O_CREAT | O_TRUNC | O_BINARY), 0666)) < 0) { 148 | throw runtime_error("Can't open out file"); 149 | } 150 | 151 | for (i = 0; i < numfreq; i++) { 152 | extary[i].reset(); 153 | } 154 | //count 2-itemsets 155 | int plb = p * pblk + mincustid; 156 | int pub = plb + pblk; 157 | if (pub >= maxcustid) pub = maxcustid + 1; 158 | env.logger << "BOUNDS " << plb << " " << pub << endl; 159 | int fcnt; 160 | for (; !dcb.eof() && custid < pub;) { 161 | fcnt = 0; 162 | ocid = custid; 163 | //env.logger << "TID " << custid << " " << tid << " " << numitem << endl; 164 | while (!dcb.eof() && ocid == custid && custid < pub) { 165 | //for (k=0; k < numitem; k++){ 166 | 167 | // } 168 | 169 | if (args.use_diff) { 170 | //add this tid to all items not in the trans 171 | k = 0; 172 | for (j = 0; j < numitem; j++) { 173 | if (freqidx[buf[j]] == -1) continue; 174 | 175 | while (backidx[k] < buf[j]) { 176 | //if ((idx = freqidx[backidx[k]]) != -1){ 177 | idx = k; 178 | if (!args.use_newformat) 179 | extary[idx].add(fd, tid, args.use_seq, p); 180 | else extary[idx].add(fd, tid, args.use_seq, p, 
custid); 181 | //} 182 | k++; 183 | } 184 | k++; //skip over buf[j] 185 | } 186 | for (; k < numfreq; k++) { 187 | //if ((idx = freqidx[backidx[k]]) != -1){ 188 | idx = k; 189 | if (!args.use_newformat) 190 | extary[idx].add(fd, tid, args.use_seq, p); 191 | else extary[idx].add(fd, tid, args.use_seq, p, custid); 192 | //} 193 | } 194 | } else { 195 | // add this tid to all items in the trans 196 | for (j = 0; j < numitem; j++) { 197 | idx = freqidx[buf[j]]; 198 | if (idx != -1) { 199 | if (!args.use_newformat) { 200 | if (args.use_seq && extary[idx].flg() == 0) { 201 | fidx[fcnt] = idx; 202 | fcnt++; 203 | extary[idx].setflg(1); 204 | extary[idx].add(fd, tid, args.use_seq, p, custid); 205 | } else { 206 | extary[idx].add(fd, tid, args.use_seq, p); 207 | } 208 | } else { 209 | extary[idx].add(fd, tid, args.use_seq, p, custid); 210 | } 211 | } 212 | } 213 | } 214 | 215 | dcb.get_next_trans(buf, numitem, tid, custid); 216 | } 217 | if (!args.use_newformat && args.use_seq) { 218 | for (k = 0; k < fcnt; k++) { 219 | extary[fidx[k]].setlastpos(); 220 | extary[fidx[k]].setflg(0); 221 | } 222 | fcnt = 0; 223 | } 224 | } 225 | 226 | for (i = 0; i < numfreq; i++) { 227 | //env.logger << "FLUSH " << i << " " << extary[i].lastPos << " " << 228 | // extary[i].theSize << endl; 229 | extary[i].flushbuf(fd, args.use_seq, p); 230 | } 231 | close(fd); 232 | } 233 | env.logger << "WROTE INVERT " << endl; 234 | } 235 | 236 | Exttpose::Exttpose(Env &env_, ExttposeArgument &args_) : env(env_), args(args_) { 237 | dcb.init(args.input); 238 | itcnt.resize(args.DBASE_MAXITEM); 239 | ocnt.resize(args.DBASE_MAXITEM, -1); 240 | itlen.resize(args.DBASE_MAXITEM); 241 | freqidx.resize(args.DBASE_MAXITEM); 242 | } 243 |
/**
 * Run the transposition in up to four passes over the database:
 *   1. count the support of every item and number the frequent ones (freqidx / backidx);
 *   2. when args.do_invert is set, write the per-item offset table(s) to args.idxfn;
 *   3. when args.do_l2 is set, count 2-itemsets (and 2-sequences when args.use_seq is set)
 *      and write the frequent ones through sort_get_l2();
 *   4. when args.do_invert is set, write the inverted database through do_invert_db().
 *
 * Progress goes to env.logger and statistics to env.summary.
 * Throws runtime_error when an output or temporary file cannot be opened.
 */
void Exttpose::tpose() {
    int i, j, l;
    int idx;
    int custid, tid, numitem, fcnt;
    ofstream ofd;
    int sumsup = 0, sumdiff = 0;

    //count 1 items
    int *buf = nullptr;  // set by get_next_trans(); initialised like in do_invert_db()
    dcb.get_first_blk();
    dcb.get_next_trans(buf, numitem, tid, custid);
    int mincustid = custid;
    while (!dcb.eof()) {
        for (j = 0; j < numitem; j++) {
            itlen[buf[j]]++;
            // ocnt remembers the last customer seen per item, so itcnt counts customers once.
            if (args.use_seq && ocnt[buf[j]] != custid) {
                itcnt[buf[j]]++;
                ocnt[buf[j]] = custid;
            }
        }
        dcb.get_next_trans(buf, numitem, tid, custid);
    }
    int maxcustid = static_cast<int>(custid);
    env.logger << "MINMAX " << mincustid << " " << maxcustid << endl;

    // Number the frequent items consecutively; infrequent ones get -1.
    for (i = 0; i < args.DBASE_MAXITEM; i++) {
        if (args.use_seq) {
            if (itcnt[i] >= args.MINSUPPORT) {
                env.logger << i << " SUPP " << itcnt[i] << endl;
                freqidx[i] = numfreq;
                numfreq++;
            } else freqidx[i] = static_cast<int>(-1);
        } else {
            if (itlen[i] >= args.MINSUPPORT) {
                freqidx[i] = numfreq;
                numfreq++;
                sumsup += itlen[i];
                sumdiff += (args.DBASE_NUM_TRANS - itlen[i]);
            } else freqidx[i] = static_cast<int>(-1);
        }
    }

    // backidx is the inverse mapping: frequent index -> item id.
    backidx.resize(numfreq);

    numfreq = 0;
    for (i = 0; i < args.DBASE_MAXITEM; i++) {
        if (args.use_seq) {
            if (itcnt[i] >= args.MINSUPPORT)
                backidx[numfreq++] = i;
        } else {
            if (itlen[i] >= args.MINSUPPORT)
                backidx[numfreq++] = i;
        }
    }

    env.logger << "numfreq " << numfreq << " : " << " SUMSUP SUMDIFF = " << sumsup << " " << sumdiff << endl;

    env.summary << " F1stats = [ " << numfreq << " " << sumsup << " " << sumdiff << " ]";

    if (numfreq == 0) return;

    // Split the memory budget evenly between the frequent items (in ints, at least 2 each).
    long extarysz = args.AMEM / numfreq;
    extarysz /= INT_SIZE;
    env.logger << "EXTRARYSZ " << extarysz << endl;
    if (extarysz < 2) extarysz = 2;

    extary.reserve(numfreq);

    for (i = 0; i < numfreq; i++) {
        extary.emplace_back(extarysz, args.num_partitions);
    }

    int plb, pub, pblk;
    // pblk: number of customer ids covered by each partition.
    pblk = static_cast<int>(ceil(((double) (maxcustid - mincustid + 1)) / args.num_partitions));
    if (args.do_invert) {
        if (args.num_partitions > 1) {
            dcb.get_first_blk();
            dcb.get_next_trans(buf, numitem, tid, custid);
        }
        for (j = 0; j < args.num_partitions; j++) {
            //construct offsets for 1-itemsets
            ostringstream tmpnamestrm;
            tmpnamestrm << args.idxfn;
            if (args.num_partitions > 1) {
                tmpnamestrm << ".P" << j;
                plb = j * pblk + mincustid;
                pub = plb + pblk;
                if (pub > maxcustid) pub = maxcustid + 1;
                // Recount the item statistics for this partition only.
                std::fill(itcnt.begin(), itcnt.end(), 0);
                std::fill(ocnt.begin(), ocnt.end(), -1);
                std::fill(itlen.begin(), itlen.end(), 0);

                for (; !dcb.eof() && custid < pub;) {
                    for (i = 0; i < numitem; i++) {
                        itlen[buf[i]]++;
                        if (args.use_seq && ocnt[buf[i]] != custid) {
                            itcnt[buf[i]]++;
                            ocnt[buf[i]] = custid;
                        }
                    }
                    dcb.get_next_trans(buf, numitem, tid, custid);
                }
            }
            string tmpnam = tmpnamestrm.str();
            env.logger << "OPENED " << tmpnam << endl;
            ofd.open(tmpnam, ios::binary);
            if (!ofd) {
                throw runtime_error("Can't open file " + tmpnam);
            }

            int file_offset = 0;
            int null = -1;
            for (i = 0; i < args.DBASE_MAXITEM; i++) {
                if (freqidx[i] != -1) {
                    ofd.write((char *) &file_offset, INT_SIZE);
                    extary[freqidx[i]].set_offset(file_offset, j);
                    if (args.use_seq) {
                        if (args.use_newformat) file_offset += (2 * itlen[i]);
                        else file_offset += (2 * itcnt[i] + itlen[i]);
                    } else {
                        if (args.use_diff) file_offset += (args.DBASE_NUM_TRANS - itlen[i]);
                        else file_offset += itlen[i];
                    }
                } else if (args.no_minus_off) {
                    ofd.write((char *) &file_offset, INT_SIZE);
                } else ofd.write((char *) &null, INT_SIZE);
            }
            env.logger << "OFF " << i << " " << file_offset << endl;
            ofd.write((char *) &file_offset, INT_SIZE);
            ofd.close();
        }
    }

    env.logger << "Wrote Offt " << endl;

    fidx.resize(numfreq);

    int ocid = -1;
    if (args.do_l2) {
        fstream isetfd;
        fstream seqfd;
        string tmpseq, tmpiset;

        // The spill files are written here and read back by sort_get_l2(), so they need
        // in|out. binary|trunc on its own is not a valid open mode and always fails.
        const ios::openmode spill_mode = ios::in | ios::out | ios::binary | ios::trunc;

        if (args.use_seq) {
            tmpseq = args.tmpfn;
            seqfd.open(tmpseq, spill_mode);
            if (!seqfd.is_open()) {
                throw runtime_error("Can't open out file");
            }
        }
        tmpiset = args.tmpfn + "iset";
        isetfd.open(tmpiset, spill_mode);

        if (!isetfd.is_open()) {
            throw runtime_error("Can't open out file");
        }

        if (args.use_seq) {
            // resize, not reserve: the counters are indexed directly and must exist as
            // zero-initialised elements.
            seq2.resize(numfreq * numfreq);
        }

        itcnt2.resize(numfreq * (numfreq - 1) / 2);
        ocust.resize(numfreq * (numfreq - 1) / 2);
        offsets.resize(numfreq);

        // offsets[r] is the start of row r in the packed upper-triangular pair arrays.
        int offt = 0;
        int start = static_cast<int>(numfreq);
        for (i = start - 1; i >= 0; i--) {
            offsets[numfreq - i - 1] = offt;
            offt += i;
        }

        ocid = -1;
        int lit;
        //count 2-itemsets
        dcb.get_first_blk();
        dcb.get_next_trans(buf, numitem, tid, custid);
        while (!dcb.eof()) {
            fcnt = 0;
            ocid = custid;
            // Consume all transactions of one customer.
            while (!dcb.eof() && ocid == custid) {
                for (j = 0; j < numitem; j++) {
                    idx = freqidx[buf[j]];
                    if (idx != -1) {
                        if (args.use_seq) {
                            // extary[idx] keeps the first (slot 0) and last (slot 1) tid of
                            // this item for the current customer.
                            if (extary[idx].size() == 0) {
                                fidx[fcnt] = idx;
                                fcnt++;
                                extary[idx].setitem(0, tid);
                                extary[idx].setitem(1, tid);
                                extary[idx].setsize(2);
                            } else {
                                extary[idx].setitem(1, tid);
                            }

                            lit = offsets[idx] - idx - 1;
                            for (l = j + 1; l < numitem; l++) {
                                if (freqidx[buf[l]] != -1) {
                                    ocust[lit + freqidx[buf[l]]] = 1;
                                }
                            }
                        } else {
                            lit = offsets[idx] - idx - 1;
                            for (l = j + 1; l < numitem; l++) {
                                if (freqidx[buf[l]] != -1) {
                                    // A counter wrapping to 0 is spilled to the temp file.
                                    if ((++itcnt2[lit + freqidx[buf[l]]]) == 0) {
                                        isetfd.write((char *) &buf[j], INT_SIZE);
                                        isetfd.write((char *) &buf[l], INT_SIZE);
                                    }
                                }
                            }
                        }
                    }
                }
                dcb.get_next_trans(buf, numitem, tid, custid);
            }

            if (args.use_seq) {
                process_cust(fcnt, seqfd, isetfd);
            }
        }
        ocust.clear();
        env.logger << "2-IT " << " " << endl;

        //write 2-itemsets counts to file
        int l2cnt = 0;
        if (args.use_seq) {
            ofd.open(args.seqfn, ios::binary);
            if (ofd.fail()) {
                throw runtime_error("Can't open seq file");
            }
            sort_get_l2(l2cnt, seqfd, ofd, seq2, true);

            ofd.close();
            env.logger << "SEQ2 cnt " << l2cnt << endl;
            env.summary << " " << l2cnt;
        }
        int seqs = l2cnt;

        ofd.open(args.it2fn, ios::binary);
        if (ofd.fail()) {
            throw runtime_error("Can't open it2 file");
        }
        sort_get_l2(l2cnt, isetfd, ofd, itcnt2, false);
        ofd.close();
        env.logger << "SORT " << l2cnt << " " << endl;

        env.summary << " F2stats = [" << l2cnt << " " << seqs << " ]";
        offsets.clear();
        itcnt2.clear();
        seq2.clear();
    }

    if (args.do_invert) {
        do_invert_db(pblk, mincustid, maxcustid);
    }

    freqidx.clear();
    backidx.clear();
}
515 | 516 | 517 | result_t exttposeFunc(Env &env, ExttposeArgument &args) { 518 | env.logger << "CONF " << args.DBASE_NUM_TRANS << " " << args.DBASE_MAXITEM << " " 519 | << args.DBASE_AVG_TRANS_SZ << " " << args.DBASE_AVG_CUST_SZ << endl; 520 | 521 | 
if (args.use_diff) { 522 | env.logger << "SEQ TURNED OFF and PARTITIONS = 1\n"; 523 | } 524 | 525 | args.MINSUPPORT = lround(args.MINSUP_PER * args.DBASE_NUM_TRANS + 0.5); 526 | 527 | //ensure that support is at least 2 528 | if (!args.write_only_fcnt && args.MINSUPPORT < 1) args.MINSUPPORT = 1; 529 | env.logger << "args.MINSUPPORT " << args.MINSUPPORT << " " << args.DBASE_NUM_TRANS << endl; 530 | 531 | env.summary << "TPOSE"; 532 | if (args.use_diff) env.summary << " DIFF"; 533 | if (args.use_seq) env.summary << " SEQ"; 534 | if (!args.do_invert) env.summary << " NOINVERT"; 535 | if (!args.do_l2) env.summary << " NOF2"; 536 | env.summary << " " << args.input << " " << args.MINSUP_PER << " " << args.DBASE_NUM_TRANS << " " 537 | << args.MINSUPPORT << " " << args.num_partitions; 538 | 539 | Exttpose exttpose(env, args); 540 | exttpose.tpose(); 541 | 542 | result_t result; 543 | result.logger = env.logger.str(); 544 | result.summary = env.summary.str(); 545 | return result; 546 | } 547 | 548 | result_t exttposeWrapper(const string &s, shared_ptr& envptr) { 549 | args_t args_ = parse(s); 550 | ExttposeArgument args; 551 | args.parse_args(args_.argc, args_.argv); 552 | if (envptr == nullptr) { 553 | Env env; 554 | return exttposeFunc(env, args); 555 | } 556 | else { 557 | return exttposeFunc(*envptr, args); 558 | } 559 | } -------------------------------------------------------------------------------- /csrc/exttpose.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Yukio Fukuzawa on 7/12/18. 
3 | // 4 | 5 | #ifndef UTILITIES_EXTTPOSE_H 6 | #define UTILITIES_EXTTPOSE_H 7 | 8 | #include "common.h" 9 | #include "Env.h" 10 | #include "argv_parser.h" 11 | #include "TransArray.h" 12 | #include "calcdb.h" 13 | 14 | #define MEG (1024*1204) 15 | 16 | 17 | class ExttposeArgument { 18 | public: 19 | string input; //input file name 20 | string output; //output file name 21 | string idxfn; 22 | string inconfn; 23 | string it2fn; 24 | string seqfn; 25 | string tmpfn; // template for temporary files 26 | double MINSUP_PER = 0.0; 27 | long MINSUPPORT; 28 | int do_invert = 1; 29 | int do_l2 = 1; 30 | int use_seq = 1; 31 | int write_only_fcnt = 1; 32 | char use_newformat = 1; 33 | int num_partitions = 1; 34 | char no_minus_off = 0; 35 | 36 | char use_diff = 0; 37 | int DBASE_NUM_TRANS; //tot trans for assoc, num cust for sequences 38 | int DBASE_MAXITEM; //number of items 39 | float DBASE_AVG_TRANS_SZ; //avg trans size 40 | float DBASE_AVG_CUST_SZ = 0; //avg cust size for sequences 41 | int DBASE_TOT_TRANS; //tot trans for sequences 42 | 43 | long AMEM = 32 * MEG; 44 | 45 | string name; 46 | void parse_args(int argc, char **argv) { 47 | auto cmdl = argh::parser(argc, argv, argh::parser::PREFER_PARAM_FOR_UNREG_OPTION); 48 | if (!cmdl({"s", "i", "o"})) { 49 | cerr << "usage: exttpose [OPTION]... 
-i -o -s\n"; 50 | throw runtime_error("exttpose needs valid value of -i -o and -s"); 51 | } 52 | cmdl("i") >> name; 53 | input = name + ".data"; 54 | inconfn = name + ".conf"; 55 | cmdl("o") >> name; 56 | output = name + ".tpose"; 57 | idxfn = name + ".idx"; 58 | it2fn = name + ".2it"; 59 | seqfn = name + ".2seq"; 60 | tmpfn = name + ".tmp"; 61 | cmdl("p") >> num_partitions; 62 | cmdl("s") >> MINSUP_PER; 63 | if (cmdl("a")) { 64 | use_seq = 0; 65 | cmdl("a") >> write_only_fcnt; 66 | } 67 | if (cmdl["d"]) use_diff = 1; 68 | if (cmdl["l"]) do_l2 = 0; 69 | if (cmdl["v"]) do_invert = 0; 70 | if (cmdl["f"]) use_newformat = 0; 71 | if (cmdl("m")) { 72 | cmdl("m") >> AMEM; 73 | AMEM *= MEG; 74 | } 75 | if (cmdl["x"]) no_minus_off = 1; 76 | 77 | ifstream inconff(inconfn, ios::binary); 78 | if (!inconff) { 79 | throw runtime_error("ERROR: Can\'t read conf file: " + inconfn); 80 | } 81 | 82 | if (use_seq) { 83 | inconff.read((char *) &DBASE_NUM_TRANS, INT_SIZE); 84 | inconff.read((char *) &DBASE_MAXITEM, INT_SIZE); 85 | inconff.read((char *) &DBASE_AVG_CUST_SZ, sizeof(float)); 86 | inconff.read((char *) &DBASE_AVG_TRANS_SZ, sizeof(float)); 87 | inconff.read((char *) &DBASE_TOT_TRANS, INT_SIZE); 88 | } else { 89 | inconff.read((char *) &DBASE_NUM_TRANS, INT_SIZE); 90 | inconff.read((char *) &DBASE_MAXITEM, INT_SIZE); 91 | inconff.read((char *) &DBASE_AVG_TRANS_SZ, sizeof(float)); 92 | } 93 | 94 | if (use_diff) { 95 | use_seq = 0; 96 | num_partitions = 1; 97 | } 98 | if (use_seq) { 99 | write_only_fcnt = 0; 100 | } 101 | } 102 | }; 103 | 104 | class Exttpose { 105 | Env& env; 106 | ExttposeArgument& args; 107 | DbaseCtrlBlk dcb; 108 | unsigned long numfreq = 0; 109 | 110 | vector itcnt; 111 | vector ocnt; 112 | vector itlen; 113 | vector freqidx; 114 | vector backidx; 115 | vector extary; 116 | vector seq2; 117 | vector itcnt2; 118 | vector ocust; 119 | vector offsets; 120 | vector fidx; 121 | public: 122 | void sort_get_l2(int &l2cnt, fstream& file, ofstream &ofd, vector 
&cntary, bool use_seq); 123 | 124 | void process_cust(int fcnt, fstream &seqfd, fstream &isetfd); 125 | 126 | void do_invert_db(int pblk, int mincustid, int maxcustid); 127 | void tpose(); 128 | 129 | Exttpose(Env& env_, ExttposeArgument& args_); 130 | }; 131 | 132 | result_t exttposeFunc(Env &env, ExttposeArgument &args); 133 | 134 | /** 135 | * Call exttpose given the argument list as string 136 | * @param args e.g. 'exttpose -i zaki -o zaki -p 1 -l -x -s 0.3' 137 | */ 138 | result_t exttposeWrapper(const string& args, shared_ptr& envptr); 139 | 140 | #endif //UTILITIES_MAKEBIN_H 141 | -------------------------------------------------------------------------------- /csrc/exttpose_main.cc: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include "exttpose.h" 3 | 4 | int main(int argc, char **argv) { 5 | try { 6 | Env env; 7 | ExttposeArgument args; 8 | args.parse_args(argc, argv); 9 | exttposeFunc(env, args); 10 | cout << env.logger.str() << endl; 11 | cout << env.summary.str() << endl; 12 | return 0; 13 | } 14 | catch (exception &e) { 15 | cerr << "exttpose: Caught exception: " << e.what() << endl; 16 | } 17 | catch (...) 
{ 18 | cerr << "exttpose: Caught unknown exception" << endl; 19 | } 20 | return 1; 21 | } -------------------------------------------------------------------------------- /csrc/getconf.cc: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include "argv_parser.h" 3 | #include "calcdb.h" 4 | #include "Env.h" 5 | #include "getconf.h" 6 | 7 | 8 | result_t getconfFunc(Env& env, const GetconfArgument& args) { 9 | int DBASE_NUM_TRANS = 0; 10 | int DBASE_MAXITEM = 0; 11 | int DBASE_NUM_CUST = 0; 12 | int DBASE_MINTRANS = 0; 13 | int DBASE_MAXTRANS = 0; 14 | float DBASE_AVG_TRANS_SZ = 0; 15 | float DBASE_AVG_CUST_SZ = 0; 16 | 17 | int i; 18 | 19 | int custid = 0, tid = 0, nitem = 0; 20 | int *buf = nullptr; 21 | int oldcustid = -1; 22 | int oldtcnt = 0; 23 | int tsizesum = 0; 24 | int tcustsum = 0; 25 | int tsizesq = 0; 26 | int maxnitem = 0; 27 | 28 | DbaseCtrlBlk dcb; 29 | dcb.init(args.input); 30 | 31 | dcb.get_first_blk(); 32 | dcb.get_next_trans(buf, nitem, tid, custid); 33 | DBASE_MINTRANS = custid; 34 | while (!dcb.eof()) { 35 | //printf ("%d %d %d\n", custid, tid, nitem); 36 | DBASE_MAXTRANS = custid; 37 | if (args.use_seq) { 38 | if (oldcustid != custid) { 39 | tcustsum += DBASE_NUM_TRANS - oldtcnt; 40 | oldtcnt = DBASE_NUM_TRANS; 41 | DBASE_NUM_CUST++; 42 | oldcustid = custid; 43 | } 44 | } 45 | DBASE_NUM_TRANS++; 46 | tsizesum += nitem; 47 | if (nitem > maxnitem) maxnitem = nitem; 48 | 49 | tsizesq += (nitem * nitem); 50 | for (i = 0; i < nitem; i++) 51 | if (buf[i] > DBASE_MAXITEM) DBASE_MAXITEM = buf[i]; 52 | dcb.get_next_trans(buf, nitem, tid, custid); 53 | } 54 | tcustsum += DBASE_NUM_TRANS - oldtcnt; 55 | DBASE_MAXITEM++; 56 | 57 | if (args.use_seq) { 58 | DBASE_AVG_CUST_SZ = (1.0 * tcustsum) / DBASE_NUM_CUST; 59 | } 60 | DBASE_AVG_TRANS_SZ = (1.0 * tsizesum) / DBASE_NUM_TRANS; 61 | double trans_sq_avg = (1.0 * tsizesq) / DBASE_NUM_TRANS; 62 | double stddev = sqrt(trans_sq_avg - (DBASE_AVG_TRANS_SZ 
* DBASE_AVG_TRANS_SZ)); 63 | 64 | 65 | //write config info to new file 66 | ofstream conffd(args.confn.c_str(), ios::binary | ios::out); 67 | 68 | if (!conffd) { 69 | string error_message = "can't open file: " + args.confn; 70 | throw runtime_error(error_message); 71 | } 72 | 73 | if (args.use_seq) { 74 | conffd.write((char *) &DBASE_NUM_CUST, INT_SIZE); 75 | conffd.write((char *) &DBASE_MAXITEM, INT_SIZE); 76 | conffd.write((char *) &DBASE_AVG_CUST_SZ, sizeof(float)); 77 | conffd.write((char *) &DBASE_AVG_TRANS_SZ, sizeof(float)); 78 | conffd.write((char *) &DBASE_NUM_TRANS, INT_SIZE); 79 | conffd.write((char *) &DBASE_MINTRANS, INT_SIZE); 80 | conffd.write((char *) &DBASE_MAXTRANS, INT_SIZE); 81 | } else { 82 | conffd.write((char *) &DBASE_NUM_TRANS, INT_SIZE); 83 | conffd.write((char *) &DBASE_MAXITEM, INT_SIZE); 84 | conffd.write((char *) &DBASE_AVG_TRANS_SZ, sizeof(float)); 85 | conffd.write((char *) &DBASE_MINTRANS, INT_SIZE); 86 | conffd.write((char *) &DBASE_MAXTRANS, INT_SIZE); 87 | } 88 | 89 | conffd.close(); 90 | env.summary << "CONF " << DBASE_NUM_CUST << " " << DBASE_MAXITEM << " " << DBASE_AVG_CUST_SZ 91 | << " " << DBASE_AVG_TRANS_SZ << " " << DBASE_NUM_TRANS << " " << DBASE_MINTRANS 92 | << " " << DBASE_MAXTRANS << " " << stddev << " " << maxnitem << endl; 93 | 94 | result_t result; 95 | result.logger = env.logger.str(); 96 | result.summary = env.summary.str(); 97 | return result; 98 | } 99 | 100 | result_t getconfWrapper(const string &s, shared_ptr& envptr) { 101 | args_t args_ = parse(s); 102 | GetconfArgument args; 103 | args.parse_args(args_.argc, args_.argv); 104 | 105 | if (envptr == nullptr) { 106 | Env env; 107 | return getconfFunc(env, args); 108 | } 109 | else { 110 | return getconfFunc(*envptr, args); 111 | } 112 | } 113 | 114 | // remark: the implementation assumes that a customer's transactions 115 | // appear as a contiguous block in the binary input data, and 116 | // therefore, in the user-supplied database. 
117 | // 118 | // ceeboo 2007 119 | -------------------------------------------------------------------------------- /csrc/getconf.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Yukio Fukuzawa on 7/12/18. 3 | // 4 | 5 | #ifndef UTILITIES_GETCONF_H 6 | #define UTILITIES_GETCONF_H 7 | 8 | #include "common.h" 9 | #include "argv_parser.h" 10 | #include "Env.h" 11 | 12 | class GetconfArgument { 13 | public: 14 | string input; //input file name 15 | string confn; 16 | bool use_seq = true; 17 | 18 | void parse_args(int argc, char **argv) { 19 | auto cmdl = argh::parser(argc, argv, argh::parser::PREFER_PARAM_FOR_UNREG_OPTION); 20 | cmdl("i") >> input; 21 | cmdl("o") >> confn; 22 | cmdl("a") >> use_seq; 23 | 24 | if (input.empty() || confn.empty()) { 25 | cerr << "usage: getconf [-a] -i -o\n"; 26 | throw runtime_error("getconf needs valid value of -i and -o"); 27 | } 28 | 29 | input += ".data"; 30 | confn += ".conf"; 31 | } 32 | }; 33 | 34 | result_t getconfFunc(Env& env, const GetconfArgument& args); 35 | 36 | /** 37 | * Call getconf given the argument list as string 38 | * @param args e.g. 'getconf -i zaki -o zaki' 39 | */ 40 | result_t getconfWrapper(const string &args, shared_ptr& envptr); 41 | 42 | #endif //UTILITIES_GETCONF_H 43 | -------------------------------------------------------------------------------- /csrc/getconf_main.cc: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include "getconf.h" 3 | 4 | int main(int argc, char **argv) { 5 | try { 6 | Env env; 7 | GetconfArgument args; 8 | args.parse_args(argc, argv); 9 | getconfFunc(env, args); 10 | 11 | cout << env.logger.str(); 12 | cout << env.summary.str(); 13 | return 0; 14 | } 15 | catch (exception &e) { 16 | cerr << "getconf: Caught exception: " << e.what() << endl; 17 | } 18 | catch (...) 
{ 19 | cerr << "getconf: Caught unknown exception" << endl; 20 | } 21 | return 1; 22 | } -------------------------------------------------------------------------------- /csrc/main.cc: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include "wrappers.h" 3 | 4 | 5 | int main(int argc, char** argv) { 6 | string filename; 7 | spade_arg_t args; 8 | args.maxsize = 10; 9 | args.maxlen = 10; 10 | args.maxgap = 1; 11 | 12 | auto cmdl = argh::parser(argc, argv, argh::parser::PREFER_PARAM_FOR_UNREG_OPTION); 13 | cmdl({"-i", "--input"}) >> filename; 14 | 15 | if (filename.empty() || !cmdl({"-s", "--support"})) { 16 | cerr << "usage: cspade-full -i -s\n"; 17 | throw runtime_error("cspade-full needs valid value of -i and -s"); 18 | } 19 | 20 | cmdl({"-s", "--support"}) >> args.support; 21 | if (cmdl({"-u", "--max-gap"})) cmdl({"-u", "--max-gap"}) >> args.maxgap; 22 | if (cmdl({"-m", "--maxsize"})) cmdl({"-m", "--maxsize"}) >> args.maxsize; 23 | if (cmdl({"-M", "--maxlen"})) cmdl({"-M", "--maxlen"})>> args.maxlen; 24 | 25 | result_t result = runSpade(filename, args); 26 | cout << result.seqstrm; 27 | return 0; 28 | } 29 | 30 | //int main1(int argc, char **argv) { 31 | // bool mb = false; 32 | // bool gc = false; 33 | // bool et = false; 34 | // bool spade = false; 35 | // 36 | // if (argc == 1) { 37 | // mb = true; 38 | // gc = true; 39 | // et = true; 40 | // spade = true; 41 | // } else { 42 | // char *command = argv[1]; 43 | // if (!strcmp(command, "makebin")) { 44 | // mb = true; 45 | // gc = false; 46 | // et = false; 47 | // spade = false; 48 | // } 49 | // if (!strcmp(command, "getconf")) { 50 | // mb = true; 51 | // gc = true; 52 | // et = false; 53 | // spade = false; 54 | // } 55 | // if (!strcmp(command, "exttpose")) { 56 | // mb = true; 57 | // gc = true; 58 | // et = true; 59 | // spade = false; 60 | // } 61 | // if (!strcmp(command, "spade")) { 62 | // mb = true; 63 | // gc = true; 64 | // et = true; 65 
| // spade = true; 66 | // } 67 | // } 68 | // if (mb) { 69 | // makebinWrapper("makebin test/zaki.txt zaki.data"); 70 | // } 71 | // 72 | // if (gc) { 73 | // getconfWrapper("getconf -i zaki -o zaki"); 74 | // } 75 | // 76 | // if (et) { 77 | // exttposeWrapper("exttpose -i zaki -o zaki -p 1 -l -x -s 0.3"); 78 | // } 79 | // 80 | // if (spade) { 81 | // spadeWrapper("spade -i zaki -s 0.3 -Z 10 -z 10 -u 1 -r -e 1 -o"); 82 | // } 83 | // 84 | // result_t result = getResult(); 85 | // cout << result.mined; 86 | // cout << result.nsequences; 87 | // return 0; 88 | //} 89 | -------------------------------------------------------------------------------- /csrc/makebin.cc: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include "argv_parser.h" 3 | #include "Env.h" 4 | 5 | #define ITSZ sizeof(int) 6 | 7 | const int lineSize = 8192; 8 | const int wdSize = 256; 9 | 10 | void convbin(ostream &fout, char *inBuf, int inSize) { 11 | char *p; 12 | int it; 13 | 14 | for (;;) { 15 | it = strtol(inBuf, &p, 10); 16 | if (p == inBuf) 17 | break; 18 | 19 | fout.write((char *) &it, ITSZ); 20 | inBuf = p; 21 | } 22 | } 23 | 24 | result_t makebinFunc(Env& env, const string& infilename, const string& outfilename) { 25 | char inBuf[lineSize]; 26 | int inSize; 27 | ifstream fin(infilename); 28 | 29 | if (!fin) { 30 | string error_message = "can't open ascii file: " + infilename; 31 | throw runtime_error(error_message); 32 | } 33 | ofstream fout(outfilename, ios::binary); 34 | if (!fout) { 35 | string error_message = "can't open binary file: " + outfilename; 36 | throw runtime_error(error_message); 37 | } 38 | 39 | while (fin.getline(inBuf, lineSize)) { 40 | inSize = fin.gcount(); 41 | // env.logger << "IN SIZE " << inSize << endl; 42 | convbin(fout, inBuf, inSize); 43 | } 44 | 45 | fin.close(); 46 | fout.close(); 47 | 48 | result_t result; 49 | result.logger = env.logger.str(); 50 | return result; 51 | } 52 | 53 | result_t 
makebinWrapper(const string &s, shared_ptr& envptr) { 54 | args_t args = parse(s); 55 | Env env; 56 | string infilename(args.argv[1]); 57 | string outfilename(args.argv[2]); 58 | 59 | if (envptr == nullptr) { 60 | return makebinFunc(env, infilename, outfilename); 61 | } 62 | else { 63 | return makebinFunc(*envptr, infilename, outfilename); 64 | } 65 | } -------------------------------------------------------------------------------- /csrc/makebin.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Yukio Fukuzawa on 7/12/18. 3 | // 4 | 5 | #ifndef UTILITIES_MAKEBIN_H 6 | #define UTILITIES_MAKEBIN_H 7 | 8 | #include "common.h" 9 | #include "argv_parser.h" 10 | #include "Env.h" 11 | 12 | result_t makebinFunc(Env& env, const string& infilename, const string& outfilename); 13 | 14 | 15 | /** 16 | * Call makebin given the argument list as string 17 | * @param args e.g 'makebin test/zaki.txt zaki.data' 18 | */ 19 | result_t makebinWrapper(const string &arg, shared_ptr& envptr); 20 | 21 | #endif //UTILITIES_MAKEBIN_H 22 | -------------------------------------------------------------------------------- /csrc/makebin_main.cc: -------------------------------------------------------------------------------- 1 | #include "makebin.h" 2 | 3 | int main(int argc, char **argv) { 4 | try { 5 | Env env; 6 | string infilename(argv[1]); 7 | string outfilename(argv[2]); 8 | makebinFunc(env, infilename, outfilename); 9 | cout << env.logger.str(); 10 | return 0; 11 | } 12 | catch (exception &e) { 13 | cerr << "exttpose: Caught exception: " << e.what() << endl; 14 | } 15 | catch (...) 
{ 16 | cerr << "exttpose: Caught unknown exception" << endl; 17 | } 18 | return 1; 19 | } -------------------------------------------------------------------------------- /csrc/spade_main.cc: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include "Sequence.h" 3 | 4 | int main(int argc, char **argv) { 5 | try { 6 | Env env; 7 | shared_ptr args = make_shared(); 8 | args->parse_args(argc, argv); 9 | result_t result = sequenceFunc(env, args); 10 | 11 | cout << result.logger << endl; 12 | cout << result.summary << endl; 13 | cout << result.seqstrm << endl; 14 | cout << result.nsequences << endl; 15 | return 0; 16 | } 17 | catch (exception &e) { 18 | cerr << "sequence: Caught '" << typeid(e).name() << "' exception: " << e.what() << endl; 19 | } 20 | catch (...) { 21 | cerr << "sequence: Caught unknown exception" << endl; 22 | } 23 | return 1; 24 | } -------------------------------------------------------------------------------- /csrc/test.cc: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include "wrappers.h" 3 | 4 | 5 | int main(int argc, char** argv) { 6 | string filename = "bb-tmi.txt"; 7 | spade_arg_t args; 8 | args.maxsize = 10; 9 | args.maxlen = 10; 10 | args.maxgap = 1; 11 | args.support = 0.01; 12 | 13 | result_t result = runSpade(filename, args); 14 | 15 | ifstream f("test-cases/bb-tmi.Z10z10s0.01"); 16 | string str((std::istreambuf_iterator(f)), std::istreambuf_iterator()); 17 | 18 | if (str != result.seqstrm){ 19 | cerr << "TEST FAILED" << endl; 20 | } 21 | else { 22 | cout << "TEST PASSED" << endl; 23 | } 24 | } -------------------------------------------------------------------------------- /csrc/wrappers.cc: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include "wrappers.h" 3 | #include "Env.h" 4 | #include "makebin.h" 5 | #include "getconf.h" 6 | #include 
"SpadeArguments.h" 7 | #include "Sequence.h" 8 | #include "exttpose.h" 9 | 10 | void clean_up(const string& tmpprefix, ostream& logger, const string& tmpfolder) { 11 | list tmpfiles = list_files(tmpfolder, tmpprefix); 12 | for (string& tmpfile : tmpfiles) { 13 | string filepath = tmpfolder + tmpfile; 14 | if(remove(filepath.c_str()) != 0) { 15 | logger << "Error deleting file " << filepath << endl; 16 | } 17 | else { 18 | logger << "Cleaned up successful: " << filepath << endl; 19 | } 20 | } 21 | } 22 | 23 | result_t runSpade(const string &filename, spade_arg_t args, const string& tmpdir) { 24 | shared_ptr envptr = make_shared(); 25 | Env &env(*envptr); 26 | 27 | if (!file_exists(filename)) { 28 | throw runtime_error("File " + filename + " does not exist."); 29 | } 30 | 31 | if (args.support <= 0 || args.support > 1) { 32 | throw runtime_error("Support must be a floating point in range (0-1]"); 33 | } 34 | 35 | if (args.mingap > 0 && args.maxgap > 0 && args.maxgap < args.mingap) { 36 | args.mingap = args.maxgap; 37 | } 38 | 39 | int nrows = num_lines(filename); 40 | ostringstream opt; 41 | 42 | auto nop = static_cast(ceil((nrows + 2 * nrows) * sizeof(long) / pow(4, 10) / 5)); 43 | if (args.memsize > 0) { 44 | opt << " -m " << args.memsize; 45 | nop = static_cast(ceil(nop * 32 / float(args.memsize))); 46 | } 47 | 48 | if (args.numpart > 0) { 49 | if (args.numpart < nop) { 50 | env.logger << "numpart less than recommended\n"; 51 | } 52 | nop = args.numpart; 53 | } 54 | 55 | string random_suffix = random_id(16); 56 | string tmpprefix = "cspade-" + random_suffix; 57 | string otherfile = tmpdir + tmpprefix; 58 | string datafile = otherfile + ".data"; 59 | 60 | ostringstream makebin_args; 61 | ostringstream getconf_args; 62 | ostringstream exttpose_args; 63 | ostringstream spade_args; 64 | 65 | makebin_args << "makebin \"" << filename << "\" \"" << datafile +"\""; 66 | getconf_args << "getconf -i \"" << otherfile << "\" -o \"" << otherfile + "\""; 67 | exttpose_args << 
"exttpose -i \"" << otherfile << "\" -o \"" << otherfile << "\" -p " << nop << " -l -x -s " << args.support; 68 | 69 | if (args.maxsize > 0) { 70 | opt << " -Z " << args.maxsize; 71 | } 72 | if (args.maxlen > 0) { 73 | opt << " -z " << args.maxlen; 74 | } 75 | if (args.mingap > 0) { 76 | opt << " -l " << args.mingap; 77 | } 78 | if (args.maxgap > 0) { 79 | opt << " -u " << args.maxgap; 80 | } 81 | if (args.maxwin > 0) { 82 | opt << " -w " << args.maxwin; 83 | } 84 | if (!args.bfstype) { 85 | opt << " -r"; 86 | } 87 | if (args.tid_lists) { 88 | opt << " -y"; 89 | } 90 | 91 | spade_args << "spade -i \"" << otherfile << "\" -s " << args.support << opt.str() << " -e " << nop << " -o"; 92 | 93 | try { 94 | makebinWrapper(makebin_args.str(), envptr); 95 | getconfWrapper(getconf_args.str(), envptr); 96 | exttposeWrapper(exttpose_args.str(), envptr); 97 | result_t result = sequenceWrapper(spade_args.str(), envptr); 98 | clean_up(tmpprefix, env.logger, tmpdir); 99 | 100 | result.logger = env.logger.str(); 101 | return result; 102 | } 103 | catch (runtime_error& e) { 104 | clean_up(tmpprefix, env.logger, tmpdir); 105 | cerr << e.what(); 106 | throw e; 107 | } 108 | catch (std::exception& e) { 109 | clean_up(tmpprefix, env.logger, tmpdir); 110 | ostringstream message; 111 | message << "Caught '" << typeid(e).name() << "' exception: " << e.what() << endl; 112 | cerr << message.str(); 113 | throw runtime_error(message.str()); 114 | } 115 | catch (...) { 116 | ostringstream message; 117 | message << "sequence: Caught unknown exception" << endl; 118 | cerr << message.str(); 119 | throw runtime_error(message.str()); 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /csrc/wrappers.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Yukio Fukuzawa on 3/12/18. 
def c_runspade(filename, support=0.1, maxsize=None, maxlen=None, mingap=None, maxgap=None, memsize=None, numpart=None,
               maxwin=None, bfstype=None, tid_lists=None):
    """
    Fill a spade_arg_t from Python values and call the C++ runSpade().

    Integer options that are None (or 0) are passed as -1 and boolean options
    that are None are passed as False.

    :param filename: path to the ASCII input file (str); encoded as latin-1 before the call
    :param support: minimum support passed through unchanged
    :return: the result_t returned by runSpade (nsequences, seqstrm, logger, summary)
    :raises RuntimeError: when runSpade throws a C++ exception
    """
    cdef spade_arg_t args
    args.support = support
    args.maxsize = maxsize or -1
    args.maxlen = maxlen or -1
    args.mingap = mingap or -1
    args.maxgap = maxgap or -1
    args.memsize = memsize or -1
    args.numpart = numpart or -1
    args.maxwin = maxwin or -1
    args.bfstype = bfstype or False
    # Was assigned from bfstype, which ignored the tid_lists argument.
    args.tid_lists = tid_lists or False

    filename = filename.encode('latin-1')
    return runSpade(filename, args)
def parse_results(result):
    """
    Build Sequence objects from the mined text and store them in ``result['mined_objects']``.

    Each line of ``result['seqstrm']`` that starts with a digit is expected to look like
    ``"<items> -> <items> -- <noccurs> ..."``. Blank lines and lines that do not start with
    a digit are ignored, so an empty result stream yields an empty list.

    For every multi-element sequence the function links, when they were mined as well:
    the sequence without its last element (``up_to_prev``), the last element on its own
    (``last_child``) and the sequence without its first element (``frm_second``).
    Confidence and lift are derived from those links; finally each sequence adds its
    occurrence count to its ``frm_second`` chain.

    :param result: dict with ``'nsequences'`` (int) and ``'seqstrm'`` (str); modified in place
    """
    nseqs = result['nsequences']
    lines = result['seqstrm'].split('\n')
    lines.sort()
    sequences = {}

    # First pass - one Sequence per mined line
    for line in lines:
        # An empty stream splits into [''], and indexing line[0] on it would raise.
        if not line or not '0' <= line[0] <= '9':
            continue

        sequence_str, stats = line.split(' -- ')
        item_strs = sequence_str.split(' -> ')
        noccurs = int(stats[:stats.index(' ')])

        sequence = Sequence(sequence_str, noccurs)
        if len(item_strs) > 1:
            sequence.up_to_prev_str = ' -> '.join(item_strs[:-1])
            sequence.last_child_str = item_strs[-1]
            sequence.frm_second_str = ' -> '.join(item_strs[1:])

        for item_str in item_strs:
            sequence.add_item(Item([int(element) for element in item_str.split(' ')]))
        sequences[sequence_str] = sequence

    # Second pass - resolve the links and compute confidence / lift
    for sequence in sequences.values():
        sequence.up_to_prev = up_to_prev = sequences.get(sequence.up_to_prev_str, None)
        sequence.last_child = last_child = sequences.get(sequence.last_child_str, None)
        sequence.frm_second = sequences.get(sequence.frm_second_str, None)

        if up_to_prev is not None:
            sequence.confidence = sequence.noccurs / up_to_prev.noccurs

            # Lift needs both parents, so it is only computed when both were mined.
            if last_child is not None:
                sequence.lift = sequence.noccurs * nseqs / (up_to_prev.noccurs * last_child.noccurs)

    # Third pass - to calculate accumulated occurrence counts
    for sequence in sequences.values():
        if sequence.frm_second is not None:
            sequence.frm_second.accumulate_occurs(sequence.noccurs)

    result['mined_objects'] = list(sequences.values())
def spade(filename=None, data=None, support=0.1, maxsize=None, maxlen=None, mingap=None, maxgap=None, memsize=None,
          numpart=None, maxwin=None, bfstype=None, tid_lists=None, parse=True):
    '''
    Call C++'s cspade() on a file or on in-memory data.

    Exactly one of ``filename`` and ``data`` must be given.

    :param filename: full path to the input file (ascii)
    :param data: non-empty collection of ``(sid, eid, items)`` triples; it is written to a
                 temporary file that is removed again before returning
    :param support: threshold of minimum normalised support; must be within (0, 1]
    :param maxsize: an integer value specifying the maximum number of items of an element of a sequence
    :param maxlen: an integer value specifying the maximum number of elements of a sequence
    :param mingap: an integer value specifying the minimum time difference between consecutive elements of a sequence
    :param maxgap: an integer value specifying the maximum time difference between consecutive elements of a sequence
    :param memsize: integer passed through to the C++ code
    :param numpart: integer passed through to the C++ code
    :param maxwin: passed through to the C++ code
    :param bfstype: passed through to the C++ code
    :param tid_lists: passed through to the C++ code
    :param parse: if True, also build ``result['mined_objects']`` via parse_results()

    :return: dict with keys:
        -nsequences: number of sequences reported by the C++ code
        -seqstrm: the mined sequences as text
        -logger: general logging
        -summary: same content as summary.out created by the original C code
        -mined_objects: parsed sequences (only when ``parse`` is True)
    '''
    # Empty data is treated like missing data: there would be nothing to mine and no
    # file to hand to the C++ code.
    if filename is None and not data:
        raise Exception('You must provide either filename or data')
    if filename is not None and data is not None:
        raise Exception('You must provide either filename or data')

    if filename and not os.path.isfile(filename):
        raise Exception('File {} does not exist'.format(filename))

    if memsize and not isinstance(memsize, int):
        raise Exception('memsize must be integer')
    if numpart and not isinstance(numpart, int):
        raise Exception('numpart must be integer')

    assert (0 < support <= 1), 'support must be a floating point in range (0-1]'

    if mingap is not None:
        assert mingap > 0, 'mingap cannot be 0 - that would mean two transactions happen at the same time'
    if maxgap is not None:
        assert maxgap > 0, 'maxgap cannot be 0'
        if mingap and maxgap < mingap:
            mingap = maxgap

    tmp_created = False
    try:
        if data:
            import tempfile

            # Use the platform's temp directory instead of a hard-coded /tmp.
            filename = os.path.join(tempfile.gettempdir(), 'cspade-{}.txt'.format(uuid.uuid4().hex))
            tmp_created = True
            with open(filename, 'w', encoding='latin-1') as f:
                for row in data_to_rows(data):
                    f.write(row)
                    f.write('\n')

        result = c_runspade(filename, support, maxsize, maxlen, mingap, maxgap, memsize, numpart, maxwin, bfstype,
                            tid_lists)
        decode_result(result)
        if parse:
            parse_results(result)
        return result

    finally:
        # Also reached when writing the temp file fails part-way.
        if tmp_created and os.path.exists(filename):
            os.remove(filename)
def decode_results(result):
    """
    Decode the raw mined output and store parsed sequences in ``result['mined_objects']``.

    ``result['mined']`` is a bytes object; it is stripped, decoded as latin-1 and split
    into lines. Each line that starts with a digit is expected to look like
    ``"<items> -> <items> -- <noccurs> ..."``. Blank lines and lines that do not start
    with a digit are ignored, so empty output yields an empty list.

    For every multi-element sequence the function links, when they were mined as well:
    the sequence without its last element (``up_to_prev``), the last element on its own
    (``last_child``) and the sequence without its first element (``frm_second``), then
    derives confidence and lift and accumulates occurrence counts along ``frm_second``.

    :param result: dict with ``'nsequences'`` (int) and ``'mined'`` (bytes); modified in place
    """
    nseqs = result['nsequences']

    mined = result['mined']
    lines = mined.strip().decode('latin-1').split('\n')
    lines.sort()
    sequences = {}

    # First pass - one Sequence per mined line
    for line in lines:
        # Empty output splits into [''], and indexing line[0] on it would raise.
        if not line or not '0' <= line[0] <= '9':
            continue

        sequence_str, stats = line.split(' -- ')
        item_strs = sequence_str.split(' -> ')
        noccurs = int(stats[:stats.index(' ')])

        sequence = Sequence(sequence_str, noccurs)
        if len(item_strs) > 1:
            sequence.up_to_prev_str = ' -> '.join(item_strs[:-1])
            sequence.last_child_str = item_strs[-1]
            sequence.frm_second_str = ' -> '.join(item_strs[1:])

        for item_str in item_strs:
            sequence.add_item(Item([int(element) for element in item_str.split(' ')]))
        sequences[sequence_str] = sequence

    # Second pass - resolve the links and compute confidence / lift
    for sequence in sequences.values():
        sequence.up_to_prev = up_to_prev = sequences.get(sequence.up_to_prev_str, None)
        sequence.last_child = last_child = sequences.get(sequence.last_child_str, None)
        sequence.frm_second = sequences.get(sequence.frm_second_str, None)

        if up_to_prev is not None:
            sequence.confidence = sequence.noccurs / up_to_prev.noccurs

            # Lift needs both parents, so it is only computed when both were mined.
            if last_child is not None:
                sequence.lift = sequence.noccurs * nseqs / (up_to_prev.noccurs * last_child.noccurs)

    # Third pass - to calculate accumulated occurrence counts
    for sequence in sequences.values():
        if sequence.frm_second is not None:
            sequence.frm_second.accumulate_occurs(sequence.noccurs)

    result['mined_objects'] = list(sequences.values())
where: 110 | -result: the mined sequences 111 | -logger: general logging 112 | -summary: equivalent to the content of summary.out 113 | """ 114 | if filename is None and data is None: 115 | raise Exception('You must provide either filename or data') 116 | if filename is not None and data is not None: 117 | raise Exception('You must provide either filename or data') 118 | 119 | if data: 120 | rows = data_to_rows(data) 121 | hex = uuid.uuid4().hex 122 | filename = '/tmp/{}.ascii.data'.format(hex) 123 | with open(filename, 'w', encoding='latin-1') as f: 124 | for row in rows: 125 | f.write(row) 126 | f.write('\n') 127 | 128 | try: 129 | retval = cpp_cspade(filename, support, maxsize, maxlen, mingap, maxgap, decode=False) 130 | decode_results(retval) 131 | return retval 132 | finally: 133 | if data: 134 | os.remove(filename) 135 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | flake8==2.6.2 2 | flake8-putty==0.4.0 3 | flake8-docstrings 4 | twine 5 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | # This flag says that the code is written to work on both Python 2 and Python 3 | # 3. If at all possible, it is good practice to do this. If you cannot, you 4 | # will need to generate wheels for each Python version that you support. 
5 | universal=1 6 | [metadata] 7 | description-file=README.md 8 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, Extension 2 | from codecs import open 3 | import sys 4 | import os 5 | 6 | # is_windows = sys.platform.startswith('win') 7 | 8 | def is_platform_mac(): 9 | return sys.platform == 'darwin' 10 | 11 | def is_platform_windows(): 12 | return sys.platform == 'win32' or sys.platform == 'cygwin' 13 | 14 | try: 15 | from Cython.Distutils import build_ext 16 | except ImportError: 17 | use_cython = False 18 | else: 19 | use_cython = True 20 | 21 | if use_cython: 22 | sourcefiles = ['pycspade/cspade.pyx'] 23 | else: 24 | sourcefiles = ['pycspade/cspade.cpp'] 25 | 26 | extra_files = ['csrc/{}'.format(x) for x in [ 27 | 'makebin.cc', 28 | 'getconf.cc', 29 | 'exttpose.cc', 30 | 'wrappers.cc', 31 | 'calcdb.cc', 32 | 'TransArray.cc', 33 | 'Array.cc', 34 | 'Itemset.cc', 35 | 'Lists.cc', 36 | 'Eqclass.cc', 37 | 'InvertDatabase.cc', 38 | 'Partition.cc', 39 | 'Sequence.cc', 40 | 'common.cc', 41 | 'argv_parser.cc', 42 | 'SpadeArguments.cc', 43 | 'FreqIt.cc', 44 | 'EqGrNode.cc', 45 | 'ClassInfo.cc' 46 | ]] 47 | 48 | 49 | # Fix compatibility when compiling on Mac Mojave. 
50 | # Explanation: https://github.com/pandas-dev/pandas/issues/23424#issuecomment-446393981 51 | # Code credit: https://github.com/pandas-dev/pandas/pull/24274/commits/256faf2011a12424e684a42c147e1ba7ac32c6fb 52 | if is_platform_mac(): 53 | import _osx_support 54 | import distutils.sysconfig 55 | if not 'MACOSX_DEPLOYMENT_TARGET' in os.environ: 56 | current_system = list(map(int, _osx_support._get_system_version().split('.'))) 57 | python_osx_target_str = distutils.sysconfig.get_config_var('MACOSX_DEPLOYMENT_TARGET') 58 | python_osx_target = list(map(int, python_osx_target_str.split('.'))) 59 | if python_osx_target < [10, 9] and current_system >= [10, 9]: 60 | os.environ['MACOSX_DEPLOYMENT_TARGET'] = '10.9' 61 | 62 | os.environ['CC'] = 'clang' 63 | os.environ['CXX'] = 'clang' 64 | 65 | if is_platform_windows(): 66 | extra_compiler_args = [] 67 | else: 68 | extra_compiler_args = [ 69 | '-std=c++11', 70 | '-Wno-sign-compare', 71 | '-Wno-incompatible-pointer-types', 72 | '-Wno-unused-variable', 73 | '-Wno-absolute-value', 74 | '-Wno-visibility', 75 | '-Wno-#warnings', 76 | ] 77 | 78 | if is_platform_mac(): 79 | ext_modules = [ 80 | Extension('pycspade.cspade', 81 | sourcefiles + extra_files, 82 | include_dirs=['csrc/'], 83 | language='c++', 84 | extra_compile_args=extra_compiler_args, 85 | extra_link_args=["-O2", "-march=native", '-stdlib=libc++'], 86 | ), 87 | ] 88 | else: 89 | ext_modules = [ 90 | Extension('pycspade.cspade', 91 | sourcefiles + extra_files, 92 | include_dirs=['csrc/'], 93 | language='c++', 94 | extra_compile_args=extra_compiler_args, 95 | ), 96 | ] 97 | 98 | with open('README.md', 'r') as fh: 99 | long_description = fh.read() 100 | 101 | setup_args = dict( 102 | name='pycspade', 103 | ext_modules=ext_modules, 104 | license='MIT', 105 | packages=['pycspade'], 106 | version='0.6.6', 107 | author=['Mohammed J. 
Zaki', 'Yukio Fukuzawa'], 108 | description='C-SPADE Python Implementation', 109 | long_description=long_description, 110 | long_description_content_type='text/markdown', 111 | url='https://github.com/fzyukio/python-cspade', 112 | keywords=['cspade', 'c-spade', 'sequence mining'], 113 | install_requires=['Cython'], 114 | ) 115 | 116 | if use_cython: 117 | setup_args['cmdclass'] = {'build_ext': build_ext} 118 | 119 | setup( 120 | **setup_args 121 | ) 122 | -------------------------------------------------------------------------------- /test-global.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | rm pycspade/cspade.cpp pycspade/*.so; 3 | pip uninstall -y "pycspade>=0.0.0" 4 | python setup.py clean; 5 | python setup.py install 6 | python tests/example.py 7 | -------------------------------------------------------------------------------- /test-local.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | rm pycspade/cspade.cpp pycspade/*.so; 3 | pip uninstall -y "pycspade>=0.0.0" 4 | python setup.py clean; 5 | python setup.py build_ext --inplace; 6 | python tests/example.py -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzyukio/python-cspade/abb46fed3d9edef3a0ac24bc4e226bdcc47c67aa/tests/__init__.py -------------------------------------------------------------------------------- /tests/example.py: -------------------------------------------------------------------------------- 1 | from pycspade.helpers import spade, print_result 2 | 3 | if __name__ == '__main__': 4 | result = spade(filename='tests/zaki.txt', support=0.3) 5 | print('Sequences mined:') 6 | print((result['seqstrm'])) 7 | print('Logger:') 8 | print((result['logger'])) 9 | print('Summary:') 10 | 
print((result['summary'])) 11 | 12 | print_result(result) 13 | 14 | # data = [ 15 | # [1, 10, [3, 4]], 16 | # [1, 15, [1, 2, 3]], 17 | # [1, 20, [1, 2, 6]], 18 | # [1, 25, [1, 3, 4, 6]], 19 | # [2, 15, [1, 2, 6]], 20 | # [2, 20, [5]], 21 | # [3, 10, [1, 2, 6]], 22 | # [4, 10, [4, 7, 8]], 23 | # [4, 20, [2, 6]], 24 | # [4, 25, [1, 7, 8]] 25 | # ] 26 | # 27 | # result = spade(data=data, support=0.3) 28 | # print_result(result) 29 | -------------------------------------------------------------------------------- /tests/simplest.txt: -------------------------------------------------------------------------------- 1 | 1 1 1 1 2 | 1 2 1 2 3 | 1 3 1 3 4 | 1 4 1 4 5 | 2 1 1 1 6 | 2 2 1 2 7 | 2 3 1 3 8 | 3 1 1 1 9 | 3 2 1 2 10 | 3 3 1 3 11 | 3 4 1 3 -------------------------------------------------------------------------------- /tests/test.ascii.data: -------------------------------------------------------------------------------- 1 | 1 1 3 8 37 42 2 | 1 2 4 4 11 37 42 3 | 1 3 3 27 64 91 4 | 1 4 2 3 4 5 | 1 5 3 4 24 73 6 | 1 6 2 26 67 7 | 1 7 3 4 58 84 8 | 1 8 3 19 62 88 9 | 2 1 2 10 73 10 | 2 2 1 72 11 | 2 3 3 4 24 77 12 | 2 4 3 19 32 39 13 | 2 5 2 50 72 14 | 2 6 2 3 22 15 | 2 7 3 51 68 72 16 | 2 8 4 11 27 53 54 17 | 2 9 3 47 77 91 18 | 2 10 3 3 13 58 19 | 3 1 3 48 62 78 20 | 3 2 3 9 32 48 21 | 3 3 5 40 62 67 72 76 22 | 3 4 3 10 47 58 23 | 3 5 2 35 37 24 | 3 6 2 45 77 25 | 3 7 1 53 26 | 3 8 5 3 9 11 64 92 27 | 4 1 4 11 32 62 97 28 | 4 2 4 37 50 56 58 29 | 4 3 4 3 17 18 92 30 | 4 4 3 64 68 84 31 | 4 5 4 17 58 60 94 32 | 4 6 6 22 27 62 80 91 92 33 | 4 7 5 18 39 60 72 83 34 | 4 8 3 18 58 72 35 | 4 9 4 6 40 42 63 36 | 4 10 5 22 49 60 72 77 37 | 4 11 3 19 48 59 38 | 4 12 4 9 36 79 91 39 | 4 13 5 14 32 57 60 75 40 | 4 14 3 6 26 44 41 | 4 15 4 12 44 77 91 42 | 4 16 1 55 43 | 4 17 6 12 23 42 53 69 84 44 | 5 1 4 30 53 71 72 45 | 5 2 3 55 72 99 46 | 5 3 3 0 11 59 47 | 5 4 2 22 48 48 | 5 5 4 3 11 71 74 49 | 5 6 5 22 42 43 72 80 50 | 5 7 4 3 34 62 72 51 | 5 8 5 26 35 48 68 72 52 | 5 
9 6 13 23 26 55 62 80 53 | 6 1 5 14 18 33 39 60 54 | 6 2 4 28 47 62 77 55 | 6 3 4 4 40 44 57 56 | 6 4 4 23 48 72 88 57 | 6 5 3 4 53 85 58 | 6 6 6 15 28 33 44 75 92 59 | 6 7 4 26 27 40 96 60 | 6 8 5 3 11 42 47 48 61 | 6 9 4 4 8 17 47 62 | 6 10 3 28 32 40 63 | 6 11 4 10 58 67 68 64 | 6 12 4 3 37 62 87 65 | 7 1 3 30 39 72 66 | 7 2 2 6 40 67 | 7 3 1 18 68 | 7 4 4 22 32 72 80 69 | 7 5 3 13 53 77 70 | 7 6 3 14 53 72 71 | 7 7 3 4 42 69 72 | 7 8 1 91 73 | 7 9 5 4 15 22 71 84 74 | 7 10 2 39 56 75 | 8 1 6 6 26 48 68 72 77 76 | 8 2 5 1 33 50 58 68 77 | 8 3 4 29 39 60 71 78 | 8 4 4 12 79 82 88 79 | 8 5 6 11 12 22 48 53 80 80 | 8 6 2 11 71 81 | 8 7 2 17 45 82 | 8 8 4 18 22 43 64 83 | 9 1 3 10 49 72 84 | 9 2 2 91 92 85 | 9 3 2 22 51 86 | 9 4 2 53 91 87 | 9 5 2 3 30 88 | 9 6 2 32 69 89 | 9 7 1 71 90 | 9 8 2 26 48 91 | 9 9 1 92 92 | 9 10 2 50 58 93 | 9 11 3 39 40 87 94 | 9 12 2 40 70 95 | 9 13 3 5 13 50 96 | 10 1 3 4 39 45 97 | 10 2 3 4 55 80 98 | 10 3 3 23 30 95 99 | 10 4 1 35 100 | 10 5 3 13 33 37 101 | 10 6 2 40 72 102 | 10 7 4 48 49 58 95 103 | 10 8 2 95 98 104 | 10 9 1 4 105 | -------------------------------------------------------------------------------- /tests/test_cspade.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from pycspade import spade 4 | 5 | 6 | class Test(unittest.TestCase): 7 | def setUp(self): 8 | result = spade(filename='tests/zaki.txt', support=2, maxsize=5, maxlen=5) 9 | self.nseqs = result['nsequences'] 10 | self.occurs = {} 11 | self.supports = {} 12 | self.confids = {} 13 | self.lifts = {} 14 | self.accum_occurs = {} 15 | self.sequences = [] 16 | 17 | for mined_object in result['mined_objects']: 18 | sequence = '->'.join(list(map(str, mined_object.items))) 19 | self.sequences.append(sequence) 20 | self.occurs[sequence] = mined_object.noccurs 21 | self.supports[sequence] = mined_object.noccurs / self.nseqs 22 | self.confids[sequence] = mined_object.confidence 23 | self.lifts[sequence] = 
class Test(unittest.TestCase):
    """Checks the sequences mined from tests/zaki.txt and their derived statistics."""

    def setUp(self):
        # spade() only accepts a normalised support in (0, 1]. zaki.txt holds 4 sequences
        # and the smallest expected occurrence count below is 2, i.e. a support of 0.5.
        result = spade(filename='tests/zaki.txt', support=0.5, maxsize=5, maxlen=5)
        self.nseqs = result['nsequences']
        self.occurs = {}
        self.supports = {}
        self.confids = {}
        self.lifts = {}
        self.accum_occurs = {}
        self.sequences = []

        # Index every statistic by the printable form of the sequence, e.g. '(4)->(2 6)'.
        for mined_object in result['mined_objects']:
            sequence = '->'.join(list(map(str, mined_object.items)))
            self.sequences.append(sequence)
            self.occurs[sequence] = mined_object.noccurs
            self.supports[sequence] = mined_object.noccurs / self.nseqs
            self.confids[sequence] = mined_object.confidence
            self.lifts[sequence] = mined_object.lift
            self.accum_occurs[sequence] = mined_object.accum_occurs

    def test_sequences(self):
        correct_sequences = ['(1)', '(1 2)', '(1 2 6)', '(1 6)', '(2)', '(2)->(1)', '(2 6)', '(2 6)->(1)', '(4)',
                             '(4)->(1)',
                             '(4)->(2)', '(4)->(2)->(1)', '(4)->(2 6)', '(4)->(2 6)->(1)', '(4)->(6)', '(4)->(6)->(1)',
                             '(6)', '(6)->(1)']
        correct_occurs = {'(1)': 4, '(1 2)': 3, '(1 2 6)': 3, '(1 6)': 3, '(2)': 4, '(2)->(1)': 2, '(2 6)': 4,
                          '(2 6)->(1)': 2, '(4)': 2, '(4)->(1)': 2, '(4)->(2)': 2, '(4)->(2)->(1)': 2, '(4)->(2 6)': 2,
                          '(4)->(2 6)->(1)': 2, '(4)->(6)': 2, '(4)->(6)->(1)': 2, '(6)': 4, '(6)->(1)': 2}
        correct_supports = {'(1)': 1.0, '(1 2)': 0.75, '(1 2 6)': 0.75, '(1 6)': 0.75, '(2)': 1.0, '(2)->(1)': 0.5,
                            '(2 6)': 1.0, '(2 6)->(1)': 0.5, '(4)': 0.5, '(4)->(1)': 0.5, '(4)->(2)': 0.5,
                            '(4)->(2)->(1)': 0.5, '(4)->(2 6)': 0.5, '(4)->(2 6)->(1)': 0.5, '(4)->(6)': 0.5,
                            '(4)->(6)->(1)': 0.5, '(6)': 1.0, '(6)->(1)': 0.5}
        correct_lifts = {'(1)': None, '(1 2)': None, '(1 2 6)': None, '(1 6)': None, '(2)': None, '(2)->(1)': 0.5,
                         '(2 6)': None, '(2 6)->(1)': 0.5, '(4)': None, '(4)->(1)': 1.0, '(4)->(2)': 1.0,
                         '(4)->(2)->(1)': 1.0, '(4)->(2 6)': 1.0, '(4)->(2 6)->(1)': 1.0, '(4)->(6)': 1.0,
                         '(4)->(6)->(1)': 1.0, '(6)': None, '(6)->(1)': 0.5}
        correct_confids = {'(1)': None, '(1 2)': None, '(1 2 6)': None, '(1 6)': None, '(2)': None, '(2)->(1)': 0.5,
                           '(2 6)': None, '(2 6)->(1)': 0.5, '(4)': None, '(4)->(1)': 1.0, '(4)->(2)': 1.0,
                           '(4)->(2)->(1)': 1.0, '(4)->(2 6)': 1.0, '(4)->(2 6)->(1)': 1.0, '(4)->(6)': 1.0,
                           '(4)->(6)->(1)': 1.0, '(6)': None, '(6)->(1)': 0.5}

        self.assertListEqual(self.sequences, correct_sequences)
        self.assertDictEqual(self.occurs, correct_occurs)
        self.assertDictEqual(self.supports, correct_supports)
        self.assertDictEqual(self.lifts, correct_lifts)
        self.assertDictEqual(self.confids, correct_confids)

        print((self.accum_occurs))
#ifndef __ARRAY_H
#define __ARRAY_H
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <unistd.h>
#include <iostream>

using namespace std;

/*
 * Array -- fixed-capacity buffer of ints that is written to a file
 * descriptor in blocks.
 *
 * The buffer holds at most totSize ints. add() appends values and calls
 * flushbuf() first when the next append would not fit. flushbuf() writes
 * the buffered ints at element offset offset[pos] of the file and then
 * advances offset[pos] by the number of ints written. One offset is kept
 * per partition (npart of them, all starting at 0).
 *
 * The constructor and destructor are defined inline below the class so
 * that this header carries the complete implementation.
 */
class Array {
protected:
   int *theArray;          // malloc'ed storage; stays NULL when sz <= 0
   char theFlg;
   int lastPos;
   unsigned int theSize;   // number of ints currently buffered
   unsigned int totSize;   // capacity, in ints
   long *offset;           // per-partition file position, in ints

   // Report the failed operation and terminate. errno is saved before
   // perror() can clobber it, and a zero errno never yields exit status 0.
   static void die(const char *what)
   {
      int saved = errno;
      perror(what);
      exit(saved != 0 ? saved : EXIT_FAILURE);
   }

public:

   Array(int sz, int npart=1);
   ~Array();

   int operator [] (unsigned int index)
   {
      return theArray[index];
   }

   char flg()
   {
      return theFlg;
   }
   void setflg(char flg)
   {
      theFlg = flg;
   }
   int lastpos()
   {
      return lastPos;
   }

   // to be used only for use_seq
   void setlastpos()
   {
      theArray[lastPos+1] = theSize-lastPos-2;
      lastPos = theSize;
   }
   long get_offset(int pos=0)
   {
      return offset[pos];
   }
   void set_offset(long off, int pos=0)
   {
      offset[pos] = off;
   }

   int totsize()
   {
      return totSize;
   }
   void reset()
   {
      theSize = 0;
      lastPos = 0;
      theFlg = 0;
   }

   int *array()
   {
      return theArray;
   }

   int size()
   {
      return theSize;
   }
   void setsize(int size)
   {
      theSize = size;
   }

   void setitem(int pos, int item)
   {
      theArray[pos] = item;
   }

   // Append without any capacity check; the caller guarantees room.
   void additem(int item){
      theArray[theSize] = item;
      theSize++;
   }

   /*
    * Write every buffered int to fd at element offset offset[pos], advance
    * offset[pos] accordingly and empty the buffer.
    *
    * use_seq is accepted for interface compatibility and is not used.
    * Any seek or write failure terminates the process via die().
    */
   void flushbuf(int fd, int use_seq, int pos=0)
   {
      (void) use_seq;
      if (lseek(fd, (off_t) offset[pos] * (off_t) sizeof(int), SEEK_SET)
          == (off_t) -1){
         die("Error seeking");
      }

      const char *src = (const char *) theArray;
      size_t left = (size_t) theSize * sizeof(int);
      while (left > 0){
         // The result stays signed: comparing it against an unsigned
         // byte count would turn -1 into a huge value and hide failures.
         ssize_t res = ::write(fd, src, left);
         if (res < 0 && errno == EINTR) continue;
         if (res <= 0) die("Error writing");
         src += res;
         left -= (size_t) res;
      }
      offset[pos] += theSize;
      theSize = 0;
   }

   /*
    * Append item, flushing to fd first when the buffer is full.
    * In sequence mode (use_seq != 0) custid is stored immediately before
    * the item, so room for two ints is required.
    */
   void add (int fd, int item, int use_seq, int pos, int custid=-1)
   {
      if (use_seq){
         if (theSize+2 > totSize){
            flushbuf(fd,use_seq,pos);
         }
         theArray[theSize++] = custid;
      }
      else{
         if (theSize+1 > totSize){
            flushbuf(fd,use_seq,pos);
         }
      }
      theArray[theSize++] = item;
   }
};

/*
 * Allocate room for sz ints (no storage when sz <= 0) and npart file
 * offsets, all zero. Terminates the process if the buffer cannot be
 * allocated.
 */
inline Array::Array (int sz, int npart){
   totSize = sz;
   theSize = 0;
   lastPos = 0;
   theFlg = 0;
   theArray = NULL;
   offset = new long[npart];
   for (int i=0; i < npart; i++) offset[i]=0;
   if (sz > 0){
      theArray = (int *) malloc (totSize*sizeof(int));
      if (theArray == NULL){
         die("memory:: Array");
      }
   }
}

inline Array::~Array(){
   free(theArray);      // free(NULL) is a no-op
   theArray = NULL;
   delete [] offset;
}
#endif //__ARRAY_H
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <unistd.h>
#include <iostream>

using namespace std;

/*
 * b2a -- dump a binary file of native ints as space-separated text.
 *
 * Usage: b2a <binary-file>
 *
 * The file is mapped read-only and every complete int in it is printed,
 * each preceded by a space, followed by a single newline. A file shorter
 * than one int prints only the newline. Returns 0 on success; exits with
 * a non-zero status when the file cannot be opened, sized or mapped.
 */
int main(int argc, char **argv)
{
   if (argc < 2){
      cerr << "usage: b2a <binary-file>" << endl;
      return EXIT_FAILURE;
   }

   int fd = open(argv[1], O_RDONLY);
   if (fd < 0){
      int saved = errno;
      perror("cant openfile ");
      exit(saved != 0 ? saved : EXIT_FAILURE);
   }

   off_t flen = lseek(fd, 0, SEEK_END);
   if (flen < 0){
      int saved = errno;
      perror("cant size file ");
      exit(saved != 0 ? saved : EXIT_FAILURE);
   }

   const size_t nints = (size_t) flen / sizeof(int);
   if (nints > 0){
      // mmap() rejects zero-length mappings, so only map non-empty files.
      // A private read-only mapping is all that printing needs.
      void *map = mmap(NULL, (size_t) flen, PROT_READ, MAP_PRIVATE, fd, 0);
      if (map == MAP_FAILED){
         int saved = errno;
         perror("MMAP ERROR");
         exit(saved != 0 ? saved : EXIT_FAILURE);
      }
      const int *ary = (const int *) map;
      for (size_t i=0; i < nints; i++)
         cout << " " << ary[i];
      munmap(map, (size_t) flen);
   }
   cout << endl;

   close(fd);
   return 0;
}
#ifndef __DATABASE_H
#define __DATABASE_H

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
#include <iostream>

using namespace std;

extern int use_seq;

#define ITSZ sizeof(int)
#define DCBBUFSZ 2048
//#define TRANSOFF ((use_seq)?3:2)
#define TRANSOFF 3

/*
 * Dbase_Ctrl_Blk -- buffered sequential reader for a binary transaction
 * file made of native ints.
 *
 * Each transaction is laid out as
 *     custid, tid, nitems, item[0] ... item[nitems-1]
 * (TRANSOFF header ints followed by the items). The file is read into
 * `buf` in blocks of at most buf_size ints; a transaction that straddles
 * the end of a block is moved to the front of the buffer before the next
 * block is read behind it.
 *
 * Typical use: get_first_blk(), then get_next_trans() until eof().
 * The member functions are all defined inline below the class.
 */
class Dbase_Ctrl_Blk{
public:
   Dbase_Ctrl_Blk(const char *infile, int buf_sz=DCBBUFSZ);
   ~Dbase_Ctrl_Blk();

   void get_next_trans_ext();
   inline void get_first_blk();
   inline void get_next_trans(int *&lbuf, int &numitem, int &tid, int &custid);

   // True once get_next_trans() has run past the last transaction.
   int eof()
   {
      return (readall == 1);
   }
   int fd;              // descriptor of the open database file
   int buf_size;        // capacity of buf, in ints
   int * buf;           // block buffer
   int cur_blk_size;    // number of valid ints currently in buf
   int cur_buf_pos;     // index in buf of the next unread transaction
   int endpos;          // file size in bytes, taken at construction
   char readall;        // set to 1 when the whole file has been consumed
};

/*
 * Open infile read-only and allocate a buffer of buf_sz ints.
 * Prints an error and exits with -1 when the file cannot be opened.
 */
inline Dbase_Ctrl_Blk::Dbase_Ctrl_Blk(const char *infile, int buf_sz)
{
   fd = open (infile, O_RDONLY);
   if (fd < 0){
      printf("ERROR: InvalidFile -- Dbase_Ctrl_Blk()\n");
      exit(-1);
   }
   buf_size = buf_sz;
   buf = new int [buf_sz];
   cur_buf_pos = 0;
   cur_blk_size = 0;
   readall = 0;
   endpos = lseek(fd,0,SEEK_END);
}

inline Dbase_Ctrl_Blk::~Dbase_Ctrl_Blk()
{
   delete [] buf;
   close(fd);
}

/*
 * Refill the buffer: keep the unread tail (a partial transaction, if any)
 * at the front of buf, read more ints from the file behind it and reset
 * the read position to 0. Exits on a read error.
 */
inline void Dbase_Ctrl_Blk::get_next_trans_ext()
{
   int keep = cur_blk_size - cur_buf_pos;
   if (keep > 0)
   {
      // Source and destination overlap whenever the unread tail is longer
      // than the consumed prefix, so memmove is required here, not memcpy.
      memmove((void *)buf,
              (void *)(buf + cur_buf_pos),
              keep * ITSZ);
      cur_blk_size = keep;
   }
   else
   {
      // No partial transaction in buffer
      cur_blk_size = 0;
   }

   ssize_t nread = read(fd, (void *)(buf + cur_blk_size),
                        ((buf_size - cur_blk_size)*ITSZ));
   if (nread < 0){
      perror("reading in database");
      exit(errno);
   }
   cur_blk_size += (int)(nread/ITSZ);
   cur_buf_pos = 0;
}

/*
 * Rewind the file and load the first block. Exits on a read error.
 */
inline void Dbase_Ctrl_Blk::get_first_blk()
{
   readall=0;
   lseek(fd, 0, SEEK_SET);
   // Check the raw result before dividing: dividing -1 by the unsigned
   // ITSZ first would turn the error into a large positive count.
   ssize_t nread = read(fd,(void *)buf, (buf_size*ITSZ));
   if (nread < 0){
      perror("get_first_blk");
      exit(errno);
   }
   cur_blk_size = (int)(nread/ITSZ);
   cur_buf_pos = 0;
}

/*
 * Fetch the next transaction. On success cid, tid and nitems receive the
 * header fields and lbuf points at the items inside the internal buffer;
 * that pointer is only valid until the next call. When the file is
 * exhausted, eof() becomes true and the output arguments are left
 * untouched.
 */
inline void Dbase_Ctrl_Blk::get_next_trans (int *&lbuf,
                                            int &nitems, int &tid, int &cid)
{
   if (cur_buf_pos+TRANSOFF >= cur_blk_size ||
       cur_buf_pos+buf[cur_buf_pos+TRANSOFF-1]+TRANSOFF > cur_blk_size){
      if (lseek(fd, 0, SEEK_CUR) == endpos) readall = 1;
      if (!readall){
         // Need to get more items from file
         get_next_trans_ext();
      }
   }

   if (!readall){
      cid = buf[cur_buf_pos];
      tid = buf[cur_buf_pos+TRANSOFF-2];
      nitems = buf[cur_buf_pos+TRANSOFF-1];
      lbuf = buf + cur_buf_pos + TRANSOFF;
      cur_buf_pos += nitems + TRANSOFF;
   }
}
#endif //__DATABASE_H
61 | int i; 62 | 63 | int custid, tid, nitem; 64 | int *buf; 65 | int oldcustid=-1; 66 | int oldtcnt = 0; 67 | int tsizesum = 0; 68 | int tcustsum = 0; 69 | int tsizesq = 0; 70 | int maxnitem = 0; 71 | 72 | Dbase_Ctrl_Blk *DCB = new Dbase_Ctrl_Blk(input); 73 | DCB->get_first_blk(); 74 | DCB->get_next_trans(buf, nitem, tid, custid); 75 | DBASE_MINTRANS = custid; 76 | while (!DCB->eof()){ 77 | //printf ("%d %d %d\n", custid, tid, nitem); 78 | DBASE_MAXTRANS = custid; 79 | if (use_seq){ 80 | if (oldcustid != custid){ 81 | tcustsum += DBASE_NUM_TRANS - oldtcnt; 82 | oldtcnt = DBASE_NUM_TRANS; 83 | DBASE_NUM_CUST++; 84 | oldcustid = custid; 85 | } 86 | } 87 | DBASE_NUM_TRANS++; 88 | tsizesum += nitem; 89 | if (nitem > maxnitem) maxnitem = nitem; 90 | 91 | tsizesq += (nitem*nitem); 92 | for (i=0; i < nitem; i++) 93 | if (buf[i] > DBASE_MAXITEM) DBASE_MAXITEM = buf[i]; 94 | DCB->get_next_trans(buf, nitem, tid, custid); 95 | } 96 | tcustsum += DBASE_NUM_TRANS - oldtcnt; 97 | DBASE_MAXITEM++; 98 | 99 | if (use_seq) DBASE_AVG_CUST_SZ = (1.0*tcustsum)/DBASE_NUM_CUST; 100 | DBASE_AVG_TRANS_SZ = (1.0*tsizesum)/DBASE_NUM_TRANS; 101 | double trans_sq_avg = (1.0*tsizesq)/DBASE_NUM_TRANS; 102 | double stddev = sqrt(trans_sq_avg - 103 | (DBASE_AVG_TRANS_SZ*DBASE_AVG_TRANS_SZ)); 104 | 105 | 106 | //write config info to new file 107 | int conffd; 108 | if ((conffd = open(confn, (O_WRONLY|O_CREAT), 0666)) < 0){ 109 | perror("Can't open out file"); 110 | exit (errno); 111 | } 112 | if (use_seq){ 113 | write(conffd,(char *)&DBASE_NUM_CUST,ITSZ); 114 | write(conffd,(char *)&DBASE_MAXITEM,ITSZ); 115 | write(conffd,(char *)&DBASE_AVG_CUST_SZ, sizeof(float)); 116 | write(conffd,(char *)&DBASE_AVG_TRANS_SZ, sizeof(float)); 117 | write(conffd,(char *)&DBASE_NUM_TRANS,ITSZ); 118 | write(conffd,(char *)&DBASE_MINTRANS,ITSZ); 119 | write(conffd,(char *)&DBASE_MAXTRANS,ITSZ); 120 | } 121 | else{ 122 | write(conffd,(char *)&DBASE_NUM_TRANS,ITSZ); 123 | write(conffd,(char *)&DBASE_MAXITEM,ITSZ); 124 
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>

using namespace std;

#define ITSZ sizeof(int)
ifstream fin;
ofstream fout;

// Convert every whitespace-separated token of `line` to an int and append
// its native binary representation to fout. A token without leading
// digits is written as 0.
static void write_tokens(const string &line)
{
   istringstream ist(line);
   string word;
   while (ist >> word){
      int it = (int) strtol(word.c_str(), NULL, 10);
      fout.write((char*)&it, ITSZ);
   }
}

/*
 * Convert the first inSize bytes of inBuf (one line of ASCII integers)
 * to binary ints on fout. Does nothing for a NULL or empty buffer.
 */
void convbin(char *inBuf, int inSize)
{
   if (inBuf == NULL || inSize <= 0) return;
   write_tokens(string(inBuf, (size_t) inSize));
}

/*
 * Convert an ASCII file of whitespace-separated integers into a file of
 * native binary ints.
 *
 *   argv[1]  input (text) file
 *   argv[2]  output (binary) file, created or truncated
 *
 * Lines may be of any length. Both streams are closed before returning,
 * so the output is complete on disk and the function can be called again
 * in the same process.
 *
 * Returns 0 on success and 1 when an argument is missing; exits with a
 * non-zero status when a file cannot be opened or written.
 */
int makebinFunc(int argc, char **argv)
{
   if (argc < 3){
      cerr << "usage: makebin <ascii-in> <binary-out>" << endl;
      return 1;
   }

   // The streams are globals: drop any state left by a previous call.
   if (fin.is_open()) fin.close();
   if (fout.is_open()) fout.close();
   fin.clear();
   fout.clear();

   fin.open(argv[1]);
   if (!fin){
      int saved = errno;
      perror("cannot open in file");
      exit(saved != 0 ? saved : EXIT_FAILURE);
   }
   // binary mode: the ints must reach the file byte-for-byte on every platform
   fout.open(argv[2], ios::out | ios::binary);
   if (!fout){
      int saved = errno;
      perror("cannot open out file");
      exit(saved != 0 ? saved : EXIT_FAILURE);
   }

   string line;
   while (getline(fin, line)){
      write_tokens(line);
   }

   fin.close();
   fout.close();
   if (fout.fail()){
      int saved = errno;
      perror("cannot write out file");
      exit(saved != 0 ? saved : EXIT_FAILURE);
   }
   return 0;
}