├── .flake8
├── .gitignore
├── .gitlab-ci.yml
├── LICENSE
├── MANIFEST.in
├── README.md
├── csrc
    ├── Array.cc
    ├── Array.h
    ├── ClassInfo.cc
    ├── ClassInfo.h
    ├── Env.h
    ├── EqGrNode.cc
    ├── EqGrNode.h
    ├── Eqclass.cc
    ├── Eqclass.h
    ├── FreqIt.cc
    ├── FreqIt.h
    ├── InvertDatabase.cc
    ├── InvertDatabase.h
    ├── Itemset.cc
    ├── Itemset.h
    ├── Lists.cc
    ├── Lists.h
    ├── Partition.cc
    ├── Partition.h
    ├── Sequence.cc
    ├── Sequence.h
    ├── SpadeArguments.cc
    ├── SpadeArguments.h
    ├── TransArray.cc
    ├── TransArray.h
    ├── argh.h
    ├── argv_parser.cc
    ├── argv_parser.h
    ├── calcdb.cc
    ├── calcdb.h
    ├── common.cc
    ├── common.h
    ├── dirent-win.h
    ├── exttpose.cc
    ├── exttpose.h
    ├── exttpose_main.cc
    ├── getconf.cc
    ├── getconf.h
    ├── getconf_main.cc
    ├── main.cc
    ├── makebin.cc
    ├── makebin.h
    ├── makebin_main.cc
    ├── spade_main.cc
    ├── test.cc
    ├── wrappers.cc
    └── wrappers.h
├── pycspade
    ├── __init__.py
    ├── cspade.cpp
    ├── cspade.pyx
    ├── helpers.py
    └── shortcuts.py
├── requirements.txt
├── setup.cfg
├── setup.py
├── test-global.sh
├── test-local.sh
├── tests
    ├── __init__.py
    ├── bb-tmi.txt
    ├── example.py
    ├── simplest.txt
    ├── test.ascii.data
    ├── test1.ascii.data
    ├── test_cspade.py
    ├── zaki.conf
    ├── zaki.data
    ├── zaki.idx
    ├── zaki.tpose
    └── zaki.txt
├── uppypi.sh
└── utilssrc
    ├── Array.cc
    ├── Array.h
    ├── b2a.cc
    ├── calcdb.cc
    ├── calcdb.h
    ├── exttpose.cc
    ├── getconf.cc
    └── makebin.cc


/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 120
3 | ignore = D100,D101,D102,D103,D105,D106,D107,D200,D205,D400,D401,D413,F403,F405
4 | exclude = .venv,__init__.py,./build/
5 | 
6 | putty-ignore =
7 |   test_*.py : E501
8 | 
9 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | .idea/
  2 | .venv
  3 | build
  4 | *.pyc
  5 | *.so
  6 | 
  7 | # Created by https://www.gitignore.io/api/c++,python,pycharm
  8 | 
  9 | ### C++ ###
 10 | # Prerequisites
 11 | *.d
 12 | 
 13 | # Compiled Object files
 14 | *.slo
 15 | *.lo
 16 | *.o
 17 | *.obj
 18 | 
 19 | # Precompiled Headers
 20 | *.gch
 21 | *.pch
 22 | 
 23 | # Compiled Dynamic libraries
 24 | *.so
 25 | *.dylib
 26 | *.dll
 27 | 
 28 | # Fortran module files
 29 | *.mod
 30 | *.smod
 31 | 
 32 | # Compiled Static libraries
 33 | *.lai
 34 | *.la
 35 | *.a
 36 | *.lib
 37 | 
 38 | # Executables
 39 | *.exe
 40 | *.out
 41 | *.app
 42 | 
 43 | ### PyCharm ###
 44 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
 45 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
 46 | 
 47 | # User-specific stuff
 48 | .idea/**/workspace.xml
 49 | .idea/**/tasks.xml
 50 | .idea/**/usage.statistics.xml
 51 | .idea/**/dictionaries
 52 | .idea/**/shelf
 53 | 
 54 | # Sensitive or high-churn files
 55 | .idea/**/dataSources/
 56 | .idea/**/dataSources.ids
 57 | .idea/**/dataSources.local.xml
 58 | .idea/**/sqlDataSources.xml
 59 | .idea/**/dynamic.xml
 60 | .idea/**/uiDesigner.xml
 61 | .idea/**/dbnavigator.xml
 62 | 
 63 | # Gradle
 64 | .idea/**/gradle.xml
 65 | .idea/**/libraries
 66 | 
 67 | # Gradle and Maven with auto-import
 68 | # When using Gradle or Maven with auto-import, you should exclude module files,
 69 | # since they will be recreated, and may cause churn.  Uncomment if using
 70 | # auto-import.
 71 | # .idea/modules.xml
 72 | # .idea/*.iml
 73 | # .idea/modules
 74 | 
 75 | # CMake
 76 | cmake-build-*/
 77 | 
 78 | # Mongo Explorer plugin
 79 | .idea/**/mongoSettings.xml
 80 | 
 81 | # File-based project format
 82 | *.iws
 83 | 
 84 | # IntelliJ
 85 | out/
 86 | 
 87 | # mpeltonen/sbt-idea plugin
 88 | .idea_modules/
 89 | 
 90 | # JIRA plugin
 91 | atlassian-ide-plugin.xml
 92 | 
 93 | # Cursive Clojure plugin
 94 | .idea/replstate.xml
 95 | 
 96 | # Crashlytics plugin (for Android Studio and IntelliJ)
 97 | com_crashlytics_export_strings.xml
 98 | crashlytics.properties
 99 | crashlytics-build.properties
100 | fabric.properties
101 | 
102 | # Editor-based Rest Client
103 | .idea/httpRequests
104 | 
105 | ### PyCharm Patch ###
106 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
107 | 
108 | # *.iml
109 | # modules.xml
110 | # .idea/misc.xml
111 | # *.ipr
112 | 
113 | # Sonarlint plugin
114 | .idea/sonarlint
115 | 
116 | ### Python ###
117 | # Byte-compiled / optimized / DLL files
118 | __pycache__/
119 | *.py[cod]
120 | *$py.class
121 | 
122 | # C extensions
123 | 
124 | # Distribution / packaging
125 | .Python
126 | build/
127 | develop-eggs/
128 | dist/
129 | downloads/
130 | eggs/
131 | .eggs/
132 | lib/
133 | lib64/
134 | parts/
135 | sdist/
136 | var/
137 | wheels/
138 | *.egg-info/
139 | .installed.cfg
140 | *.egg
141 | MANIFEST
142 | 
143 | # PyInstaller
144 | #  Usually these files are written by a python script from a template
145 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
146 | *.manifest
147 | *.spec
148 | 
149 | # Installer logs
150 | pip-log.txt
151 | pip-delete-this-directory.txt
152 | 
153 | # Unit test / coverage reports
154 | htmlcov/
155 | .tox/
156 | .coverage
157 | .coverage.*
158 | .cache
159 | nosetests.xml
160 | coverage.xml
161 | *.cover
162 | .hypothesis/
163 | .pytest_cache/
164 | 
165 | # Translations
166 | *.mo
167 | *.pot
168 | 
169 | # Django stuff:
170 | *.log
171 | local_settings.py
172 | db.sqlite3
173 | 
174 | # Flask stuff:
175 | instance/
176 | .webassets-cache
177 | 
178 | # Scrapy stuff:
179 | .scrapy
180 | 
181 | # Sphinx documentation
182 | docs/_build/
183 | 
184 | # PyBuilder
185 | target/
186 | 
187 | # Jupyter Notebook
188 | .ipynb_checkpoints
189 | 
190 | # pyenv
191 | .python-version
192 | 
193 | # celery beat schedule file
194 | celerybeat-schedule
195 | 
196 | # SageMath parsed files
197 | *.sage.py
198 | 
199 | # Environments
200 | .env
201 | .venv
202 | env/
203 | venv/
204 | ENV/
205 | env.bak/
206 | venv.bak/
207 | 
208 | # Spyder project settings
209 | .spyderproject
210 | .spyproject
211 | 
212 | # Rope project settings
213 | .ropeproject
214 | 
215 | # mkdocs documentation
216 | /site
217 | 
218 | # mypy
219 | .mypy_cache/
220 | 
221 | ### Python Patch ###
222 | .venv/
223 | 
224 | 
225 | # End of https://www.gitignore.io/api/c++,python,pycharm
226 | 


--------------------------------------------------------------------------------
/.gitlab-ci.yml:
--------------------------------------------------------------------------------
 1 | image: agileware/python-3.6.1-node-6.11
 2 | 
 3 | before_script:
 4 |   - pip install pycodestyle==2.0.0 flake8==2.6.2 flake8-docstrings==1.3.0 flake8-polyfill==1.0.2 flake8-putty==0.4.0
 5 | 
 6 | check-coding-standard-compliance:
 7 |   script:
 8 |   - flake8
 9 |   allow_failure: false
10 | 
11 | 
12 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018-2022 Mohammed J. Zaki and Yukio Fukuzawa
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 6 | 
 7 | The above copyright notice and this permission notice (including the next paragraph) shall be included in all copies or substantial portions of the Software.
 8 | 
 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
10 | 
11 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include csrc/ *.c *.cpp *.cc *.h *.py *.pyx
2 | recursive-include pycspade/ *.c *.cpp *.cc *.h *.py *.pyx
3 | include *.c *.cpp *.h *.py *.pyx
4 | include README.md
5 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | Pycspade
  2 | ===
  3 | 
  4 | #### What is this?
  5 | This is a Python wrapper for the C++ implementation of the C-SPADE algorithm by its author, Mohammed J. Zaki.
  6 | The original code was downloaded from http://www.cs.rpi.edu/~zaki/www-new/pmwiki.php/Software/Software#toc11.
  7 | Since this is just a wrapper, it is as fast as the C++ code.
  8 | 
  9 | #### How to install?
 10 | Compatible with Python 2 and 3. 
 11 | On Windows, Visual Studio 2015 Build Tools is also required.
 12 | 
 13 | ```bash
 14 | pip install Cython pycspade
 15 | ```
 16 | 
 17 | #### How to use?
 18 | Your data needs to be in a particular format similar to the following:
 19 | ```text
 20 | 1 1 3 8 37 42
 21 | 1 2 4 4 11 37 42
 22 | 2 1 2 10 73
 23 | 2 2 1 72
 24 | 2 3 3 4 24 77
 25 | ...
 26 | ```
 27 | 
 28 | The first number is the sequence index, the second is the event index, and the third is the number of elements,
 29 | followed by the elements themselves, separated by spaces.
 30 | 
 31 | Let's call this file `data.txt`. You can then call cspade as follows:
 32 | ```python
 33 | from pycspade.helpers import spade, print_result
 34 | 
 35 | # To get raw SPADE output
 36 | result = spade(filename='tests/zaki.txt', support=0.3, parse=False)
 37 | print(result['mined'])
 38 | ```
 39 | ```bash
 40 | 1 -- 4 4 
 41 | 2 -- 4 4 
 42 | 4 -- 2 2 
 43 | 6 -- 4 4 
 44 | 4 -> 6 -- 2 2 
 45 | 4 -> 2 -- 2 2 
 46 | 2 -> 1 -- 2 2 
 47 | 4 -> 1 -- 2 2 
 48 | 6 -> 1 -- 2 2 
 49 | 4 -> 6 -> 1 -- 2 2 
 50 | 4 -> 2 -> 1 -- 2 2
 51 | ```
 52 | ```python
 53 | print(result['logger'])
 54 | ```
 55 | ```bash
 56 | CONF 4 9 2.7 2.5
 57 | args.MINSUPPORT 2 4
 58 | MINMAX 1 4
 59 | 1 SUPP 4
 60 | 2 SUPP 4
 61 | 4 SUPP 2
 62 | 6 SUPP 4
 63 | numfreq 4 :   SUMSUP SUMDIFF = 0 0
 64 | EXTRARYSZ 2465792
 65 | OPENED /tmp/cspade-WWv9bQWBYdDyH85T.idx
 66 | OFF 9 38
 67 | Wrote Offt 
 68 | BOUNDS 1 5
 69 | WROTE INVERT 
 70 | Cleaned up successful: /tmp/cspade-WWv9bQWBYdDyH85T.tpose
 71 | Cleaned up successful: /tmp/cspade-WWv9bQWBYdDyH85T.idx
 72 | Cleaned up successful: /tmp/cspade-WWv9bQWBYdDyH85T.data
 73 | Cleaned up successful: /tmp/cspade-WWv9bQWBYdDyH85T.conf
 74 | ```
 75 | ```python
 76 | print(result['summary'])
 77 | ```
 78 | ```bash
 79 | CONF 4 9 2.5 2.7 10 1 4 0.781025 4
 80 | TPOSE SEQ NOF2 /tmp/cspade-WWv9bQWBYdDyH85T.data 0.3 4 2 1 
 81 | F1stats = [ 4 0 0 ]
 82 | SPADE /tmp/cspade-WWv9bQWBYdDyH85T.tpose 0.3 2 7 0 0 0 0 0 -1 1 100 100 4 5 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 83 | ```
 84 | 
 85 | ```python
 86 | # To also get other sequence mining's measures, incl. lift, support, confidence:
 87 | result = spade(filename='tests/zaki.txt', support=0.3, parse=True)
 88 | # Pretty print result:
 89 | print_result(result)
 90 | ```
 91 | ```bash
 92 |    Occurs     Accum   Support    Confid      Lift          Sequence
 93 |         4        14 1.0000000       N/A       N/A               (1) 
 94 |         4         6 1.0000000       N/A       N/A               (2) 
 95 |         2         4 0.5000000 0.5000000 0.5000000          (2)->(1) 
 96 |         2         2 0.5000000       N/A       N/A               (4) 
 97 |         2         2 0.5000000 1.0000000 1.0000000          (4)->(1) 
 98 |         2         2 0.5000000 1.0000000 1.0000000          (4)->(2) 
 99 |         2         2 0.5000000 1.0000000 1.0000000     (4)->(2)->(1) 
100 |         2         2 0.5000000 1.0000000 1.0000000          (4)->(6) 
101 |         2         2 0.5000000 1.0000000 1.0000000     (4)->(6)->(1) 
102 |         4         6 1.0000000       N/A       N/A               (6) 
103 |         2         4 0.5000000 0.5000000 0.5000000          (6)->(1) 
104 | ```
105 | 
106 | ##### You can provide cspade with list of sequences instead of a file:
107 | ```python
108 | data = [
109 |     [1, 10, [3, 4]],
110 |     [1, 15, [1, 2, 3]],
111 |     [1, 20, [1, 2, 6]],
112 |     [1, 25, [1, 3, 4, 6]],
113 |     [2, 15, [1, 2, 6]],
114 |     [2, 20, [5]],
115 |     [3, 10, [1, 2, 6]],
116 |     [4, 10, [4, 7, 8]],
117 |     [4, 20, [2, 6]],
118 |     [4, 25, [1, 7, 8]]
119 | ]
120 | 
121 | result = spade(data=data, support=0.01)
122 | print_result(result)
123 | ```
124 | 
125 | The result `seq` is a string that has multiple rows and looks like this:
126 | 
127 | ```text
128 | 22 80 -> 72 -> 42 -> 22 -- 2 2
129 | 22 -> 45 71 -> 42 -- 1 1
130 | 80 -> 45 71 -> 42 -- 1 1
131 | 22 80 -> 45 71 -> 42 -- 1 1
132 | ```
133 | Let's decipher the first row:
134 | ```bash
135 | 22 80 -> 72 -> 42 -> 22 -- 2 2
136 | ```
137 | 
138 | It gives you the frequent sequence followed by support (the last two numbers, which will be the same in this application).
139 | The row reads: the itemset (22 80) is followed by (72) followed by (42) followed by (22).
140 | 
141 | 
142 | There are many parameters that can be passed to this function. The most important ones are:
143 | 
144 | - `support`: the minimum support level; defaults to 0 (nothing is excluded)
145 | - `max_gap`: the maximum number of itemsets that can be skipped in a sequence
146 | - `min_gap`: the minimum number of itemsets that must be skipped in a sequence
147 | 
148 | Read the original paper and the C++ implementation for more details.
149 | 
150 | #### How to contribute?
151 | - Fork this repo
152 | - Make change
153 | - Pull request
154 | 
155 | #### How to recompile to use in IDE?
156 | - `rm cspade.cpp; python setup.py build_ext --inplace`
157 | 
158 | #### Licence
159 | - MIT


--------------------------------------------------------------------------------
/csrc/Array.cc:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <errno.h>
 3 | #include "Array.h"
 4 | 
 5 | Array::Array(int sz) {
 6 |     totSize = sz;
 7 |     theSize = 0;
 8 |     if (sz > 0) {
 9 |         theArray = make_shared<vint>(sz);
10 |     }
11 | }
12 | 
13 | Array::~Array() = default;
14 | 
15 | ostream &operator<<(ostream &outputStream, Array &arr) {
16 |     for (int i = 0; i < arr.theSize; i++)
17 |         outputStream << arr[i] << " ";
18 |     return outputStream;
19 | }
20 | 
21 | int Array::subsequence(Array_S ar) {
22 |     int i, j;
23 |     int sz1, sz2;
24 |     Array_S ar1, ar2;
25 |     int retval;
26 | 
27 |     if (theSize <= ar->theSize) {
28 |         sz1 = theSize;
29 |         sz2 = ar->theSize;
30 |         ar1 = shared_ptr<Array>(this);
31 |         ar2 = ar;
32 |         retval = 1;
33 |     } else {
34 |         sz1 = ar->theSize;
35 |         sz2 = theSize;
36 |         ar1 = ar;
37 |         ar2 = shared_ptr<Array>(this);
38 |         retval = -1;
39 |     }
40 |     int start = 0;
41 |     for (i = 0; i < sz1; i++) {
42 |         for (j = start; j < sz2; j++) {
43 |             if (ar1->theArray->at(i) == ar2->theArray->at(j)) {
44 |                 start = j + 1;
45 |                 break;
46 |             }
47 |         }
48 |         if (j >= ar2->theSize) return 0;
49 |     }
50 |     return retval;
51 | }
52 | 
53 | 
54 | int Array::compare(Array &ar2) {
55 |     int len;
56 |     if (size() <= ar2.size()) len = size();
57 |     else len = ar2.size();
58 |     for (int i = 0; i < len; i++) {
59 |         if (theArray->at(i) > ar2.theArray->at(i)) return 1;
60 |         else if (theArray->at(i) < ar2.theArray->at(i)) return -1;
61 |     }
62 |     if (size() < ar2.size()) return -1;
63 |     else if (size() > ar2.size()) return 1;
64 |     else return 0;
65 | }
66 | 
67 | 
68 | 
69 | 
70 | 
71 | 


--------------------------------------------------------------------------------
/csrc/Array.h:
--------------------------------------------------------------------------------
  1 | #ifndef __ARRAY_H
  2 | #define __ARRAY_H
  3 | 
  4 | #include "common.h"
  5 | 
  6 | class Array;
  7 | typedef shared_ptr<Array> Array_S;
  8 | typedef shared_ptr<vector<Array_S>> Array_SS;
  9 | 
 10 | class Array {
 11 | private:
 12 |     shared_ptr<vint> theArray = nullptr;
 13 |     int theSize;        // DD
 14 |     int totSize;        // DD
 15 |     //unsigned int theIncr;
 16 | public:
 17 | 
 18 |     //Array (int sz, int incr);
 19 |     explicit Array(int sz);
 20 | 
 21 |     ~Array();
 22 | 
 23 |     int subsequence(Array_S ar);
 24 | 
 25 |     //void add (int, unsigned int);
 26 |     void add_ext(int val, int off, int *ary) {
 27 |         ary[off + theSize] = val;
 28 |         theSize++;
 29 |     }
 30 | 
 31 |     int operator[](unsigned int index) {
 32 |         return theArray->at(index);
 33 |     };
 34 | 
 35 |     void setitem(int pos, int val) {
 36 |         theArray->at(pos) = val;
 37 |     };
 38 | 
 39 |     int totsize() {
 40 |         return totSize;
 41 |     }
 42 | 
 43 |     void set_totsize(int sz) {
 44 |         totSize = sz;
 45 |     }
 46 | 
 47 |     void set_size(int sz) {
 48 |         theSize = sz;
 49 |     }
 50 | 
 51 |     void reset() {
 52 |         theSize = 0;
 53 |     }
 54 | 
 55 |     shared_ptr<vint> array() {
 56 |         return theArray;
 57 |     }
 58 | 
 59 |     void set_array(shared_ptr<vint> ary) {
 60 |         theArray = ary;
 61 |     }
 62 | 
 63 |     //int subsequence(Array&);
 64 |     //int compare(Array&);
 65 |     friend ostream &operator<<(ostream &outputStream, Array &arr);
 66 | 
 67 |     int compare(Array &ar2);
 68 | 
 69 |     int item(unsigned int index) {
 70 |         return theArray->at(index);
 71 |     }
 72 | 
 73 |     int size()        // DD
 74 |     {
 75 |         return theSize;
 76 |     }
 77 | 
 78 |     void resize(int newsz) {
 79 |         totSize = newsz;
 80 |         theArray->resize(totSize);
 81 |     }
 82 | 
 83 |     void compact() {
 84 |         theArray->resize(theSize);
 85 |     }
 86 | 
 87 |     void optadd(int item) {
 88 |         add(item);
 89 |     }
 90 | 
 91 |     void add(int item) {
 92 |         if (theSize + 1 > totSize) {
 93 |             resize((int) (totSize * 1.5));
 94 |         }
 95 |         theArray->at(theSize) = (item);
 96 |         theSize++;
 97 |     }
 98 | };
 99 | 
100 | #endif //__ARRAY_H
101 | 
102 | 
103 | 


--------------------------------------------------------------------------------
/csrc/ClassInfo.cc:
--------------------------------------------------------------------------------
 1 | #include "ClassInfo.h"
 2 | 
 3 | void ClassInfo::init() {
 4 |     int i, maxval = 0;
 5 |     long numtrans;        // DD
 6 | 
 7 |     const string& classfn = args->classf;
 8 |     const double min_support_per_class = args->min_support_per_class;
 9 |     has_class = args->use_class;
10 | 
11 |     if (has_class) {
12 |         fstream classf(classfn, ios::binary);
13 |         if (!classf.is_open()) {
14 |             throw runtime_error("Unable to read file " + classfn);
15 |         }
16 | 
17 |         long fdlen = file_size(classf);
18 |         int *clsaddr = read_file(classf, fdlen);
19 |         if (! classf) {
20 |             throw runtime_error("Error reading file " + classfn);
21 |         }
22 | 
23 |         // first entry contains num classes
24 |         num_class = clsaddr[0];
25 |         //input is num_class followed by <cid, class> pairs
26 |         numtrans = (fdlen / INT_SIZE - 1) / 2;
27 |         maxval = clsaddr[numtrans * 2 - 1] + 1;
28 |         classes.resize(maxval);
29 |         for (i = 0; i < maxval; i++) classes[i] = NOCLASS;
30 |         for (i = 1; i < (int) (fdlen / INT_SIZE); i += 2) {        // DD
31 |             classes[clsaddr[i]] = clsaddr[i + 1];
32 |         }
33 |         delete [] clsaddr;
34 |     }
35 |     else {
36 |         classes.resize(num_class);
37 |     }
38 | 
39 |     class_count.resize(num_class, 0);
40 |     tmpe.resize(num_class);
41 |     tmpm.resize(num_class);
42 |     tmpl.resize(num_class);
43 |     min_supports.resize(num_class);
44 | 
45 |     if (has_class) {
46 |         // class frequency
47 |         for (i = 0; i < maxval; i++)
48 |             if (classes[i] != NOCLASS)
49 |                 class_count[classes[i]]++;
50 |     }
51 |     else {
52 |         class_count[0] = args->total_trans_count;
53 |     }
54 | 
55 |     for (i = 0; i < num_class; i++) {
56 |         min_supports[i] = (int) ceil(min_support_per_class * class_count[i]);
57 |         if (min_supports[i] < 1) min_supports[i] = 1;
58 |     }
59 | }
60 | 
61 | int ClassInfo::get_num_class() const {
62 |     return num_class;
63 | }
64 | 
65 | int ClassInfo::get_min_support(int idx) const {
66 |     return min_supports[idx];
67 | }
68 | 
69 | const vint &ClassInfo::get_tmpe() const {
70 |     return tmpe;
71 | }
72 | 
73 | const vint &ClassInfo::get_tmpm() const {
74 |     return tmpm;
75 | }
76 | 
77 | const vint &ClassInfo::get_tmpl() const {
78 |     return tmpl;
79 | }
80 | 
81 | void ClassInfo::setArgs(const shared_ptr<SpadeArguments> &args) {
82 |     ClassInfo::args = args;
83 | }
84 | 
85 | ClassInfo::ClassInfo() {
86 | 
87 | }
88 | 


--------------------------------------------------------------------------------
/csrc/ClassInfo.h:
--------------------------------------------------------------------------------
 1 | #ifndef CSPADE_CLASSINFO_H
 2 | #define CSPADE_CLASSINFO_H
 3 | 
 4 | 
 5 | #include "common.h"
 6 | #include "SpadeArguments.h"
 7 | 
 8 | #define NOCLASS -1
 9 | 
10 | class ClassInfo {
11 | private:
12 |     bool has_class;
13 |     vint clsaddr;
14 |     vint classes;
15 |     vint class_count;
16 |     vint min_supports;
17 |     vint tmpe;            // temporary variables to keep support
18 |     vint tmpm;            // counts during intersections
19 |     vint tmpl;
20 |     int num_class = 1;
21 |     shared_ptr<SpadeArguments> args;
22 | public:
23 |     ClassInfo();
24 | 
25 |     void setArgs(const shared_ptr<SpadeArguments> &args);
26 | 
27 |     void reset_temps() {
28 |         std::fill(tmpe.begin(), tmpe.end(), 0);
29 |         std::fill(tmpm.begin(), tmpm.end(), 0);
30 |         std::fill(tmpl.begin(), tmpl.end(), 0);
31 |     }
32 | 
33 |     void increase_tmpl(int i) {
34 |         tmpl[getcls(i)]++;
35 |     }
36 | 
37 |     void init();
38 | 
39 |     void increase_tmpm(int i) {
40 |         tmpm[getcls(i)]++;
41 |     }
42 | 
43 |     void increase_tmpe(int i) {
44 |         tmpe[getcls(i)]++;
45 |     }
46 | 
47 |     void set_tmpe_item(int i, int val) {
48 |         tmpe[i] = val;
49 |     }
50 | 
51 |     int get_tmpe_item(int i) {
52 |         return tmpe[i];
53 |     }
54 | 
55 |     bool strong_lsupport(int i) {
56 |         return tmpl[i] >= min_supports[i];
57 |     }
58 | 
59 |     bool strong_esupport(int i) {
60 |         return tmpe[i] >= min_supports[i];
61 |     }
62 | 
63 |     bool strong_msupport(int i) {
64 |         return tmpm[i] >= min_supports[i];
65 |     }
66 | 
67 |     const vint &get_tmpe() const;
68 | 
69 |     const vint &get_tmpm() const;
70 | 
71 |     const vint &get_tmpl() const;
72 | 
73 |     int get_num_class() const;
74 | 
75 |     int get_min_support(int idx) const;
76 | 
77 |     int getcnt(int cls = -1) {
78 |         if (cls == -1) {
79 |             int sum = 0;
80 |             for (int i = 0; i < num_class; i++)
81 |                 sum += class_count[i];
82 |             return sum;
83 |         } else return class_count[cls];
84 |     }
85 | 
86 |     int getcls(int idx) {
87 |         if (has_class)
88 |             return classes[idx];
89 |         else
90 |             return 0;
91 |     }
92 | };
93 | 
94 | #endif //CSPADE_CLASSINFO_H
95 | 


--------------------------------------------------------------------------------
/csrc/Env.h:
--------------------------------------------------------------------------------
 1 | //
 2 | // Created by Yukio Fukuzawa on 10/12/18.
 3 | //
 4 | 
 5 | #ifndef UTILITIES_ENV_H
 6 | #define UTILITIES_ENV_H
 7 | 
 8 | #include "common.h"
 9 | 
10 | class Env {
11 | public:
12 |     ostringstream seqstrm; // Print the sequences
13 |     ostringstream logger;
14 |     ostringstream summary;
15 |     ostringstream idlstrm; // To print ID list
16 | };
17 | 
18 | 
19 | #endif //UTILITIES_ENV_H
20 | 


--------------------------------------------------------------------------------
/csrc/EqGrNode.cc:
--------------------------------------------------------------------------------
 1 | //
 2 | // Created by Yukio Fukuzawa on 8/12/18.
 3 | //
 4 | 
 5 | #include "EqGrNode.h"
 6 | #include "Array.h"
 7 | #include "FreqIt.h"
 8 | 
 9 | 
10 | EqGrNode::EqGrNode(int sz, int num_class_) {
11 |     num_class = num_class_;
12 |     if (sz > 0) {
13 |         theElements.reset(new Array(sz));
14 |         stheElements.reset(new Array(sz));
15 | 
16 |         _set_sup = make_shared<vector<Array_S>>();
17 |         _seq_sup = make_shared<vector<Array_S>>();
18 | 
19 |         _set_sup->reserve(num_class);
20 |         _seq_sup->reserve(num_class);
21 | 
22 |         for (int i = 0; i < num_class; i++) {
23 |             _set_sup->push_back(make_shared<Array>(sz));
24 |             _seq_sup->push_back(make_shared<Array>(sz));
25 |         }
26 |     } else {
27 |         theElements = nullptr;
28 |         stheElements = nullptr;
29 |         _set_sup->resize(0);
30 |         _seq_sup->resize(0);
31 |     }
32 | 
33 |     freqArray = nullptr;
34 |     freqArraySz = 0;
35 |     theFlg = 0;
36 | }
37 | 
38 | EqGrNode::~EqGrNode() = default;
39 | 
40 | 
41 | //assume that elements are sorted in descending order
42 | int EqGrNode::bsearch(int min, int max, FreqIt_SS freqArray, FreqIt_S fit, int recursive) {
43 |     int mid = (max + min) / 2;
44 |     if (max < min) return -1;
45 | 
46 |     int res = freqArray->at(mid)->compare(fit, recursive);
47 |     if (res == 0) return mid;
48 |     else if (res < 0) return bsearch(min, mid - 1, freqArray, fit, recursive);
49 |     else return bsearch(mid + 1, max, freqArray, fit, recursive);
50 | }
51 | 
52 | 
53 | int EqGrNode::bsearch(int min, int max, shared_ptr<vint> itary, int it) {
54 |     int mid = (max + min) / 2;
55 |     if (max < min) return -1;
56 | 
57 |     if (it == itary->at(mid)) return mid;
58 |     else if (it < itary->at(mid)) return bsearch(min, mid - 1, itary, it);
59 |     else return bsearch(mid + 1, max, itary, it);
60 | }
61 | 
62 | 
63 | int EqGrNode::find_freqarray(FreqIt_S fit, int recursive) {
64 |     if (freqArraySz > 0)
65 |         return bsearch(0, freqArraySz - 1, freqArray, fit, recursive);
66 |     else return 0;
67 | }
68 | 
69 | 
70 | ostream &operator<<(ostream &outputStream, EqGrNode &EQ) {
71 |     int i;
72 |     if (EQ.theElements) {
73 |         outputStream << "SET " << *EQ.theElements << endl;
74 |         for (i = 0; i < EQ.num_class; i++)
75 |             outputStream << "Sup" << i << " : " << *EQ._set_sup->at(i) << endl;
76 |         outputStream << "Tot";
77 |         for (i = 0; i < EQ.theElements->size(); i++)
78 |             outputStream << " " << EQ.get_sup(i);
79 |         outputStream << endl;
80 |     }
81 |     if (EQ.stheElements) {
82 |         outputStream << "SEQ " << *EQ.stheElements << endl;
83 |         for (i = 0; i < EQ.num_class; i++)
84 |             outputStream << "SSup" << i << " : " << *EQ._seq_sup->at(i) << endl;
85 |         outputStream << "Tot";
86 |         for (i = 0; i < EQ.stheElements->size(); i++)
87 |             outputStream << " " << EQ.get_seqsup(i);
88 |         outputStream << endl;
89 |     }
90 | 
91 |     return outputStream;
92 | }
93 | 


--------------------------------------------------------------------------------
/csrc/EqGrNode.h:
--------------------------------------------------------------------------------
  1 | //
  2 | // Created by Yukio Fukuzawa on 8/12/18.
  3 | //
  4 | 
  5 | #ifndef CSPADE_EQGRNODE_H
  6 | #define CSPADE_EQGRNODE_H
  7 | 
  8 | #include "common.h"
  9 | #include "Array.h"
 10 | #include "FreqIt.h"
 11 | 
 12 | class EqGrNode {
 13 | private:
 14 |     Array_S theElements;
 15 |     Array_S stheElements;
 16 | 
 17 |     Array_SS _set_sup;          // support in different classes
 18 |     Array_SS _seq_sup;          // support in different classes
 19 | 
 20 |     FreqIt_SS freqArray; //frequent seq from this class
 21 |     int freqArraySz;
 22 | 
 23 |     int theFlg; //indicates if class is in memory
 24 | 
 25 |     int num_class;
 26 | 
 27 | public:
 28 |     static int bsearch(int min, int max, FreqIt_SS freqArray, FreqIt_S fit, int recursive);
 29 | 
 30 |     static int bsearch(int min, int max, shared_ptr<vint> itary, int it);
 31 | 
 32 |     EqGrNode(int sz, int num_class);
 33 | 
 34 |     ~EqGrNode();
 35 | 
 36 |     FreqIt_SS freqarray() {
 37 |         return freqArray;
 38 |     }
 39 | 
 40 |     int freqarraysz() {
 41 |         return freqArraySz;
 42 |     }
 43 | 
 44 |     void set_freqarray(FreqIt_SS fit, int sz) {
 45 |         freqArray = fit;
 46 |         freqArraySz = sz;
 47 |     }
 48 | 
 49 |     int find_freqarray(FreqIt_S fit, int recursive);
 50 | 
 51 |     int getflg() {
 52 |         return theFlg;
 53 |     }
 54 | 
 55 |     void setflg(int val) {
 56 |         theFlg = val;
 57 |     }
 58 | 
 59 |     void add_sup(int sup, int clas) {
 60 |         _set_sup->at(clas)->add(sup);
 61 |     }
 62 | 
 63 |     void add_seqsup(int sup, int clas) {
 64 |         _seq_sup->at(clas)->add(sup);
 65 |     }
 66 | 
 67 |     int get_sup(int idx, int clas = -1) {
 68 |         if (clas == -1) {
 69 |             int sum = 0;
 70 |             //return sup in all classes
 71 |             for (int i = 0; i < num_class; i++)
 72 |                 sum += (*_set_sup->at(i))[idx];
 73 |             return sum;
 74 |         } else return (*_set_sup->at(clas))[idx]; //return sup in class only
 75 |     }
 76 | 
 77 |     int get_seqsup(int idx, int clas = -1) {
 78 |         if (clas == -1) {
 79 |             int sum = 0;
 80 |             //return sup in all classes
 81 |             for (int i = 0; i < num_class; i++)
 82 |                 sum += (*_seq_sup->at(i))[idx];
 83 |             return sum;
 84 |         } else return (*_seq_sup->at(clas))[idx]; //return sup in class only
 85 |     }
 86 | 
 87 |     Array_S elements() {
 88 |         return theElements;
 89 |     }
 90 | 
 91 |     int num_elements() {
 92 |         if (theElements)
 93 |             return theElements->size();
 94 |         else return 0;
 95 |     }
 96 | 
 97 |     void add_element(int el) {
 98 |         //theElements[numElements] = el;
 99 |         //numElements++;
100 |         theElements->add(el);
101 |     }
102 | 
103 |     void add_element(int el, int pos) {
104 |         theElements->setitem(pos, el);
105 |     }
106 | 
107 |     int get_element(int pos) {
108 |         return (*theElements)[pos];
109 |     }
110 | 
111 |     void seqsetelements(Array_S ary) {
112 |         stheElements = ary;
113 |     }
114 | 
115 |     Array_S seqelements() {
116 |         return stheElements;
117 |     }
118 | 
119 |     int seqnum_elements() {
120 |         if (stheElements)
121 |             return stheElements->size();
122 |         else return 0;
123 |     }
124 | 
125 |     void seqadd_element(int el) {
126 |         stheElements->add(el);
127 |         //snumElements++;
128 |     }
129 | 
130 |     void seqadd_element(int el, int pos) {
131 |         stheElements->setitem(pos, el);
132 |     }
133 | 
134 |     int seqget_element(int pos) {
135 |         return (*stheElements)[pos];
136 |     }
137 | 
138 | 
139 |     int find(int it) {
140 |         if (theElements) {
141 |             //for (int i=0; i <theElements->size(); i++)
142 |             //   if ((*theElements)[i] == it) return 1;
143 |             return bsearch(0, theElements->size() - 1, theElements->array(), it);
144 |         }
145 |         return -1;
146 |     }
147 | 
148 |     int seqfind(int it) {
149 |         if (stheElements) {
150 |             //for (int i=0; i <stheElements->size(); i++)
151 |             //   if ((*stheElements)[i] == it) return 1;
152 |             return bsearch(0, stheElements->size() - 1, stheElements->array(), it);
153 |         }
154 |         return -1;
155 |     }
156 | 
157 |     friend ostream &operator<<(ostream &outputStream, EqGrNode &EQ);
158 | };
159 | 
160 | typedef shared_ptr<EqGrNode> EqGrNode_S;
161 | typedef shared_ptr<vector<EqGrNode_S>> EqGrNode_SS;
162 | 
163 | #endif //CSPADE_EQGRNODE_H
164 | 


--------------------------------------------------------------------------------
/csrc/Eqclass.cc:
--------------------------------------------------------------------------------
 1 | #include "common.h"
 2 | #include "Eqclass.h"
 3 | 
 4 | Eqclass::Eqclass(int iset_sz, int eqt) {
 5 |     Iset_size = iset_sz;
 6 |     Eqtype = eqt;
 7 |     theList.reset(new Lists<shared_ptr<Itemset>>());
 8 |     seqTemplate = seqTemplate2 = 0;
 9 | 
10 |     if (Eqtype == EQCTYP1) {
11 |         theList2.reset(new Lists<shared_ptr<Itemset>>());
12 |     }
13 | }
14 | 
// Defaulted: both lists are held through shared_ptr members, so nothing is released by hand.
Eqclass::~Eqclass() = default;
16 | 
17 | 
18 | int Eqclass::templ_sz() {
19 |     return Iset_size;
20 | }
21 | 
22 | int Eqclass::eqtype() {
23 |     return Eqtype;
24 | }
25 | 
26 | shared_ptr<Lists<shared_ptr<Itemset>>> &Eqclass::list() {
27 |     return theList;
28 | }
29 | 
30 | shared_ptr<Lists<shared_ptr<Itemset>>> &Eqclass::list2() {
31 |     return theList2;
32 | }
33 | 
34 | unsigned int Eqclass::templ() {
35 |     return seqTemplate;
36 | }
37 | 
38 | unsigned int Eqclass::templ2() {
39 |     return seqTemplate2;
40 | }
41 | 
42 | void Eqclass::set_templ(unsigned int val) {
43 |     seqTemplate = val;
44 | }
45 | 
46 | void Eqclass::set_templ2(unsigned int val) {
47 |     seqTemplate2 = val;
48 | }
49 | 
50 | void Eqclass::append(Itemset_S it) {
51 |     theList->append(std::move(it));
52 | }
53 | 
54 | void Eqclass::append2(Itemset_S it) {
55 |     theList2->append(std::move(it));
56 | }
57 | 
58 | void Eqclass::prepend(Itemset_S it) {
59 |     theList->prepend(std::move(it));
60 | }
61 | 
62 | void Eqclass::prepend2(Itemset_S it) {
63 |     theList2->prepend(std::move(it));
64 | }
65 | 


--------------------------------------------------------------------------------
/csrc/Eqclass.h:
--------------------------------------------------------------------------------
 1 | #ifndef _EQCLASS_H
 2 | #define _EQCLASS_H
 3 | 
 4 | #include "common.h"
 5 | #include "Lists.h"
 6 | #include "Itemset.h"
 7 | 
 8 | #define EQCTYP1 1
 9 | #define EQCTYP2 2
10 | #define EQCTYP3 3
11 | 
12 | class Eqclass;
13 | typedef shared_ptr<Eqclass> Eqclass_S;
14 | typedef shared_ptr<vector<Eqclass_S>> Eqclass_SS;
15 | 
16 | class Eqclass {
17 | private:
18 |     shared_ptr<Lists<shared_ptr<Itemset>>> theList;
19 |     int Iset_size;
20 |     unsigned int seqTemplate;
21 |     shared_ptr<Lists<shared_ptr<Itemset>>> theList2;
22 |     unsigned int seqTemplate2;
23 |     int Eqtype;
24 | public:
25 |     Eqclass(int iset_sz, int eqt);
26 | 
27 |     ~Eqclass();
28 | 
29 |     int templ_sz();
30 | 
31 |     int eqtype();
32 | 
33 |     shared_ptr<Lists<shared_ptr<Itemset>>>& list();
34 | 
35 |     shared_ptr<Lists<shared_ptr<Itemset>>>& list2();
36 | 
37 |     unsigned int templ();
38 | 
39 |     unsigned int templ2();
40 | 
41 |     void set_templ(unsigned int val);
42 | 
43 |     void set_templ2(unsigned int val);
44 | 
45 |     void append(Itemset_S it);
46 | 
47 |     void append2(Itemset_S it);
48 | 
49 |     void prepend(Itemset_S it);
50 | 
51 |     void prepend2(Itemset_S it);
52 | };
53 | 
54 | #endif
55 | 
56 | 


--------------------------------------------------------------------------------
/csrc/FreqIt.cc:
--------------------------------------------------------------------------------
 1 | //
 2 | // Created by Yukio Fukuzawa on 8/12/18.
 3 | //
 4 | 
 5 | #include "FreqIt.h"
 6 | 
 7 | int FreqIt::compare(shared_ptr<Itemset> fit, unsigned int itpl) {
 8 |     int i;
 9 | 
10 |     //first compare seqsz, one with larger seqsz is smaller
11 |     if (seqsz > fit->size()) return -1;
12 |     else if (seqsz < fit->size()) return 1;
13 | 
14 |     int *seq_data = seq->data();
15 | 
16 |     //compare items & template bits
17 |     if (seq_data[0] < (*fit)[0]) return -1;
18 |     else if (seq_data[0] > (*fit)[0]) return 1;
19 | 
20 |     int bpos = seqsz - 1;
21 |     int b1, b2;
22 |     for (i = 1; i < seqsz; i++) {
23 |         b1 = GETBIT(templ, bpos - i);
24 |         b2 = GETBIT(itpl, bpos - i);
25 |         if (b1 < b2) return -1;
26 |         else if (b1 > b2) return 1;
27 | 
28 |         if (seq_data[i] < (*fit)[i]) return -1;
29 |         else if (seq_data[i] > (*fit)[i]) return 1;
30 |     }
31 |     return 0;
32 | }
33 | 
34 | 
35 | int FreqIt::compare(FreqIt_S fit, int recursive) {
36 |     int i;
37 |     int *seq_data = seq->data();
38 |     int *fit_data = fit->seq->data();
39 | 
40 |     //first compare seqsz, one with larger seqsz is smaller
41 |     if (seqsz > fit->seqsz) return -1;
42 |     else if (seqsz < fit->seqsz) return 1;
43 | 
44 |     //compare items & template bits
45 |     if (seq_data[seqsz - 1] < fit_data[fit->seqsz - 1]) return -1;
46 |     else if (seq_data[seqsz - 1] > fit_data[fit->seqsz - 1]) return 1;
47 | 
48 |     int bpos = 0;
49 |     int b1, b2;
50 |     for (i = seqsz - 2; i >= 0; i--, bpos++) {
51 |         b1 = GETBIT(templ, bpos);
52 |         b2 = GETBIT(fit->templ, bpos);
53 |         if (b1 < b2) return -1;
54 |         else if (b1 > b2) return 1;
55 | 
56 |         if (seq_data[i] < fit_data[i]) return -1;
57 |         else if (seq_data[i] > fit_data[i]) return 1;
58 |     }
59 |     return 0;
60 | }
61 | 
62 | ostream &operator<<(ostream &outputStream, FreqIt_S freq) {
63 |     int *freq_data = freq->seq->data();
64 | 
65 |     outputStream << "FREQ : ";
66 |     for (int i = 0; i < freq->seqsz; i++)
67 |         outputStream << " " << freq_data[i];
68 |     outputStream << " --- " << freq->templ << endl;
69 |     return outputStream;
70 | }
71 | 
72 | int FreqIt::size() {
73 |     return seqsz;
74 | }
75 | 
76 | FreqIt::FreqIt(int sz, unsigned int tpl) {
77 |     templ = tpl;
78 |     seqsz = sz;
79 |     seq = make_shared<vint>(sz);
80 | }
81 | 
82 | FreqIt::FreqIt(shared_ptr<vint> ary, int sz, unsigned int tpl) {
83 |     templ = tpl;
84 |     seqsz = sz;
85 |     seq = make_shared<vint>();
86 |     seq->reserve(sz);
87 |     auto ary_data = ary->data();
88 |     for (int i = 0; i < sz; i++) {
89 |         seq->push_back(ary_data[i]);
90 |     }
91 | }


--------------------------------------------------------------------------------
/csrc/FreqIt.h:
--------------------------------------------------------------------------------
 1 | //
 2 | // Created by Yukio Fukuzawa on 8/12/18.
 3 | //
 4 | 
 5 | #ifndef CSPADE_FREQIT_H
 6 | #define CSPADE_FREQIT_H
 7 | 
 8 | #include "common.h"
 9 | #include "Itemset.h"
10 | 
11 | class FreqIt;
12 | typedef shared_ptr<FreqIt> FreqIt_S;
13 | typedef shared_ptr<vector<FreqIt_S>> FreqIt_SS;
14 | 
15 | class FreqIt {
16 | public:
17 |     shared_ptr<vint> seq;
18 |     int seqsz;
19 |     unsigned int templ;
20 | 
21 |     FreqIt(int sz, unsigned int tpl);
22 | 
23 |     FreqIt(shared_ptr<vint> ary, int sz, unsigned int tpl);
24 | 
25 |     ~FreqIt() = default;
26 | 
27 |     int size();
28 | 
29 |     int compare(Itemset_S iset, unsigned int itpl);
30 | 
31 |     int compare(FreqIt_S fit, int recursive);
32 | 
33 |     friend ostream &operator<<(ostream &outputStream, FreqIt_S freq);
34 | };
35 | 
36 | 
37 | 
38 | #endif //CSPADE_FREQIT_H
39 | 


--------------------------------------------------------------------------------
/csrc/InvertDatabase.cc:
--------------------------------------------------------------------------------
  1 | #include "common.h"
  2 | #include "InvertDatabase.h"
  3 | 
  4 | void InvertDatabase::incr(int sz) {
  5 |     int oldsz = numcust;
  6 |     numcust = sz;
  7 | 
  8 |     curits.resize(numcust);
  9 |     curcnts.resize(numcust);
 10 |     curcids.resize(numcust);
 11 |     curitszs.resize(numcust);
 12 | 
 13 |     int i;
 14 |     int ttval = (int) (args->avg_cust_size * args->avg_trans_count);
 15 |     for (i = oldsz; i < numcust; i++) {
 16 |         curitszs[i] = ttval;
 17 |         curits[i].resize(curitszs[i]);
 18 |         curcnts[i] = 0;
 19 |         curcids[i] = NOCLASS;
 20 |     }
 21 | }
 22 | 
 23 | void InvertDatabase::incr_curit(int midx) {
 24 |     curitszs[midx] *= 2;
 25 |     curits[midx].resize(curitszs[midx]);
 26 | }
 27 | 
 28 | 
 29 | void InvertDatabase::print_idlist(ostream& idlstrm, shared_ptr<vint> ival, int supsz) {
 30 |     int i, cid, cnt;
 31 |     int *ival_data = ival->data();
 32 | 
 33 |     if (supsz > 0) {
 34 |         cid = ival_data[0];
 35 |         cnt = 0;
 36 |         for (i = 0; i < supsz;) {
 37 |             if (cid == ival_data[i]) {
 38 |                 cnt++;
 39 |                 i += 2;
 40 |             } else {
 41 |                 idlstrm << cid << " " << cnt << " ";
 42 |                 cid = ival_data[i];
 43 |                 cnt = 0;
 44 |             }
 45 |         }
 46 |         idlstrm << cid << " " << cnt;
 47 |     }
 48 | }
 49 | 
 50 | void InvertDatabase::get_l2file(const string& fname, char use_seq, int &l2cnt) {
 51 |     fstream file(fname.c_str(), ios::binary);
 52 |     if (!file.is_open()) {
 53 |         throw runtime_error("can't open l2 file");
 54 |     }
 55 |     unsigned long flen = file_size(file);
 56 |     if (flen > 0) {
 57 |         int *cntary = read_file(file, flen);
 58 | 
 59 |         if (!file) {
 60 |             throw runtime_error("Error reading file " + fname);
 61 |         }
 62 |         file.close();
 63 | 
 64 |         // build eqgraph -- large 2-itemset relations
 65 |         int lim = flen / INT_SIZE;
 66 |         char lflg = 0;
 67 |         int i, j;
 68 |         for (i = 0; i < lim; i += 3) {
 69 |             lflg = 0;
 70 |             for (j = 0; j < cls->get_num_class(); j++) {
 71 |                 if (cntary[i + 2] >= cls->get_min_support(j)) {
 72 |                     lflg = 1;
 73 |                     break;
 74 |                 }
 75 |             }
 76 |             if (lflg) {
 77 |                 if (!extl2_pre_pruning(cntary[i + 2], cntary[i + 1], cntary[i], use_seq, vuint_null)) {
 78 |                     suffix_add_item_eqgraph(use_seq, cntary[i], cntary[i + 1]);
 79 |                     l2cnt++;
 80 |                     //assign sup to a single class, sice we don't know breakup
 81 |                     if (use_seq) eqgraph[cntary[i + 1]]->add_seqsup(cntary[i + 2], 0);
 82 |                     else eqgraph[cntary[i + 1]]->add_sup(cntary[i + 2], 0);
 83 |                     for (j = 1; j < cls->get_num_class(); j++)
 84 |                         if (use_seq) eqgraph[cntary[i + 1]]->add_seqsup(0, j);
 85 |                         else eqgraph[cntary[i + 1]]->add_sup(0, j);
 86 |                 }
 87 |             }
 88 |         }
 89 | 
 90 |         delete [] cntary;
 91 |     }
 92 | }
 93 | 
 94 | void InvertDatabase::suffix_add_item_eqgraph(char use_seq, int it1, int it2) {
 95 |     if (eqgraph[it2] == nullptr) {
 96 |         eqgraph[it2].reset(new EqGrNode(2, cls->get_num_class()));
 97 |     }
 98 |     if (use_seq) eqgraph[it2]->seqadd_element(it1);
 99 |     else eqgraph[it2]->add_element(it1);
100 | }
101 | 
102 | int InvertDatabase::make_l1_pass() {
103 |     int i, j;
104 |     int supsz;
105 |     int bsz = 100;
106 | 
107 |     ostringstream &seqstrm = env.seqstrm;
108 |     ostringstream &idlstrm = env.idlstrm;
109 | 
110 |     backidx.resize(bsz);
111 |     fidx.resize(args->dbase_max_item);
112 | 
113 |     numfreq = 0;
114 |     int ivalsz = 100;
115 |     shared_ptr<vint> ival = make_shared<vint>(ivalsz);
116 | //   int tt=0;
117 |     for (i = 0; i < args->dbase_max_item; i++) {
118 |         supsz = partition->partition_get_idxsup(i);
119 |         if (ivalsz < supsz) {
120 |             ivalsz = supsz;
121 |             ival->resize(ivalsz);
122 |         }
123 |         int *ival_data = ival->data();
124 |         partition->partition_read_item(ival, i);
125 |         for (j = 0; j < cls->get_num_class(); j++) {
126 |             cls->set_tmpe_item(j, 0);
127 |         }
128 | 
129 |         int cid = -1;
130 |         for (j = 0; j < supsz; j += 2) {
131 |             if (cid != ival_data[j]) {
132 |                 cls->increase_tmpe(j);
133 |             }
134 |             cid = ival_data[j];
135 |         }
136 | 
137 |         char lflg = 0;
138 |         fidx[i] = -1;       // default init
139 |         for (j = 0; j < cls->get_num_class(); j++) {
140 |             if (cls->strong_esupport(j)) {
141 |                 lflg = 1;
142 |                 if (numfreq + 1 > bsz) {
143 |                     bsz = 2 * bsz;
144 |                     backidx.resize(bsz);
145 |                 }
146 |                 backidx[numfreq] = i;
147 |                 fidx[i] = numfreq;
148 |                 //   cls->TMPE[j] << endl;
149 |                 numfreq++;
150 |                 break;
151 |             }
152 |         }
153 | 
154 |         if (lflg) {
155 |             for (j = 0; j < cls->get_num_class(); j++)
156 |                 add_sup(cls->get_tmpe_item(j), j);
157 |             if (args->outputfreq) {
158 |                 seqstrm << i << " --";
159 |                 seqstrm << " " << get_sup(i);
160 |                 for (j = 0; j < cls->get_num_class(); j++)
161 |                     seqstrm << " " << cls->get_tmpe_item(j);
162 |                 seqstrm << " ";
163 |                 if (args->print_tidlist) print_idlist(idlstrm, ival, supsz);
164 |                 seqstrm << endl;
165 |             }
166 |         }
167 |     }
168 | 
169 |     backidx.resize(numfreq);
170 |     ival = nullptr;
171 | 
172 |     return numfreq;
173 | }
174 | 
175 | void InvertDatabase::add_sup(int sup, int clsidx) {
176 |     itsup[clsidx].add(sup);
177 | }
178 | 
179 | int InvertDatabase::get_sup(int it, int clsidx) {
180 |     if (clsidx == -1) {
181 |         int sum = 0;
182 |         auto num_class = cls->get_num_class();
183 |         for (int i = 0; i < num_class; i++)
184 |             sum += itsup[i][fidx[it]];
185 |         return sum;
186 |     } else return itsup[clsidx][fidx[it]];
187 | }
188 | 
189 | void InvertDatabase::process_cust_invert(int custidx) {
190 |     int cid = curcids[custidx];
191 |     int curcnt = curcnts[custidx];
192 |     vint curit = curits[custidx];
193 | 
194 |     int i, j, k, l;
195 |     int nv1, nv2, diff;
196 |     int it1, it2;
197 | 
198 |     for (i = 0; i < curcnt; i = nv1) {
199 |         nv1 = i;
200 |         it1 = curit[i];
201 |         while (nv1 < curcnt && it1 == curit[nv1]) nv1 += 2;
202 |         for (j = i; j < curcnt; j = nv2) {
203 |             nv2 = j;
204 |             it2 = curit[j];
205 |             while (nv2 < curcnt && it2 == curit[nv2]) nv2 += 2;
206 |             if (!seq_sup[it1].empty() && curit[i + 1] + args->min_gap <= curit[nv2 - 1]) {
207 |                 for (k = i, l = j; k < nv1 && l < nv2;) {
208 |                     diff = curit[l + 1] - curit[k + 1];
209 |                     if (diff < args->min_gap) l += 2;
210 |                     else if (diff > args->max_gap) k += 2;
211 |                     else {
212 |                         seq_sup[it1][it2][cls->getcls(cid)]++;
213 |                         break;
214 |                     }
215 |                 }
216 |             }
217 | 
218 |             if (j > i) {
219 |                 if (!seq_sup[it2].empty() && curit[j + 1] + args->min_gap <= curit[nv1 - 1]) {
220 |                     for (k = j, l = i; k < nv2 && l < nv1;) {
221 |                         diff = curit[l + 1] - curit[k + 1];
222 |                         if (diff < args->min_gap) l += 2;
223 |                         else if (diff > args->max_gap) k += 2;
224 |                         else {
225 |                             seq_sup[it2][it1][cls->getcls(cid)]++;
226 |                             break;
227 |                         }
228 |                     }
229 |                 }
230 | 
231 |                 if (!set_sup[it1].empty()) {
232 |                     for (k = i, l = j; k < nv1 && l < nv2;) {
233 |                         if (curit[k + 1] > curit[l + 1]) l += 2;
234 |                         else if (curit[k + 1] < curit[l + 1]) k += 2;
235 |                         else {
236 |                             set_sup[it1][it2 - it1 - 1][cls->getcls(cid)]++;
237 |                             break;
238 |                         }
239 |                     }
240 |                 }
241 |             }
242 |         }
243 |     }
244 | }
245 | 
246 | void InvertDatabase::process_invert(int pnum) {
247 |     int i, k;
248 |     int minv, maxv;
249 |     partition->partition_get_minmaxcustid(backidx, numfreq, pnum, minv, maxv);
250 |     if (numcust < maxv - minv + 1)
251 |         incr(maxv - minv + 1);
252 | 
253 |     int supsz;
254 |     int ivalsz = 0;
255 |     shared_ptr<vint> ival = make_shared<vint>();
256 |     for (i = 0; i < numfreq; i++) {
257 |         supsz = partition->partition_get_lidxsup(pnum, backidx[i]);
258 |         if (ivalsz < supsz) {
259 |             ivalsz = supsz;
260 |             ival->resize(ivalsz);
261 |         }
262 |         partition->partition_lclread_item(ival, pnum, backidx[i]);
263 | 
264 |         int cid;
265 |         int midx;
266 |         int *ival_data = ival->data();
267 |         for (int pos = 0; pos < supsz; pos += 2) {
268 |             cid = ival_data[pos];
269 |             midx = cid - minv;
270 |             if (curcnts[midx] + 2 > curitszs[midx]) {
271 |                 incr_curit(midx);
272 |             }
273 |             curcids[midx] = cid;
274 |             curits[midx][curcnts[midx]++] = i;
275 |             curits[midx][curcnts[midx]++] = ival_data[pos + 1];
276 | 
277 |         }
278 |     }
279 |     for (k = 0; k < maxv - minv + 1; k++) {
280 |         if (curcnts[k] > 0) {
281 |             process_cust_invert(k);
282 |         }
283 |         curcnts[k] = 0;
284 |         curcids[k] = NOCLASS;
285 |     }
286 | }
287 | 
288 | bool InvertDatabase::extl2_pre_pruning(int totsup, int it, int pit, char use_seq, vuint& clsup) {
289 |     ostringstream &logger = env.logger;
290 | 
291 |     float conf, conf2;
292 |     int itsup;
293 |     if (args->pruning_type == NOPRUNING) return false;
294 |     if (use_seq) return false;
295 |     if (GETBIT(args->pruning_type, FOLLOWPRUNING - 1)) {
296 |         itsup = get_sup(it);
297 |         conf = (1.0f * totsup) / itsup;
298 |         conf2 = (1.0f * totsup) / get_sup(pit);
299 |         if (conf >= args->follow_thresh || conf2 >= args->follow_thresh) {
300 |             if (args->outputfreq && !clsup.empty()) {
301 |                 logger << "PRUNE_EXT " << pit << (use_seq ? " -2 " : " ")
302 |                      << it << " -1 " << totsup;
303 |                 for (int i = 0; i < cls->get_num_class(); i++)
304 |                     logger << " " << clsup[i];
305 |                 logger << endl;
306 |             }
307 |             args->prepruning++;
308 |             return true;
309 |         }
310 |     }
311 |     return false;
312 | }
313 | 
314 | void InvertDatabase::get_F2(int &l2cnt) {
315 |     int i, j, k;
316 |     int fcnt;
317 |     char lflg;
318 |     char use_seq;
319 | 
320 |     for (j = 0; j < numfreq; j++) {
321 |         if (set_sup[j].empty()) {
322 |             use_seq = 0;
323 |             for (k = j + 1; k < numfreq; k++) {
324 |                 lflg = 0;
325 |                 for (i = 0; i < cls->get_num_class(); i++) {
326 |                     fcnt = set_sup[j][k - j - 1][i];
327 |                     if (fcnt >= cls->get_min_support(i)) {
328 |                         lflg = 1;
329 |                         break;
330 |                     }
331 |                 }
332 |                 if (lflg) {
333 |                     fcnt = 0;
334 |                     for (i = 0; i < cls->get_num_class(); i++) {
335 |                         fcnt += set_sup[j][k - j - 1][i];
336 |                     }
337 |                     if (!extl2_pre_pruning(fcnt, backidx[k], backidx[j], use_seq, set_sup[j][k - j - 1])) {
338 |                         suffix_add_item_eqgraph(use_seq, backidx[j], backidx[k]);
339 |                         for (i = 0; i < cls->get_num_class(); i++) {
340 |                             int ffcnt = set_sup[j][k - j - 1][i];
341 |                             eqgraph[backidx[k]]->add_sup(ffcnt, i);
342 |                         }
343 |                         l2cnt++;
344 |                     }
345 |                 }
346 |             }
347 |         }
348 |         if (!seq_sup[j].empty()) {
349 |             use_seq = 1;
350 |             for (k = 0; k < numfreq; k++) {
351 |                 lflg = 0;
352 |                 for (i = 0; i < cls->get_num_class(); i++) {
353 |                     fcnt = seq_sup[j][k][i];
354 |                     if (fcnt >= cls->get_min_support(i)) {
355 |                         lflg = 1;
356 |                         break;
357 |                     }
358 |                 }
359 |                 if (lflg) {
360 |                     fcnt = 0;
361 |                     for (i = 0; i < cls->get_num_class(); i++) {
362 |                         fcnt += seq_sup[j][k][i];
363 |                     }
364 |                     if (!extl2_pre_pruning(fcnt, backidx[k], backidx[j], use_seq, seq_sup[j][k])) {
365 |                         suffix_add_item_eqgraph(use_seq, backidx[j], backidx[k]);
366 |                         l2cnt++;
367 |                         for (i = 0; i < cls->get_num_class(); i++) {
368 |                             int ffcnt = seq_sup[j][k][i];
369 |                             eqgraph[backidx[k]]->add_seqsup(ffcnt, i);
370 |                         }
371 |                     }
372 |                 }
373 |             }
374 |         }
375 |     }
376 | }
377 | 
378 | int InvertDatabase::make_l2_pass() {
379 |     int i, j;
380 | 
381 |     int l2cnt = 0;
382 |     int num_class = cls->get_num_class();
383 | 
384 |     // support for 2-itemsets
385 |     set_sup.resize(numfreq);
386 |     seq_sup.resize(numfreq);
387 | 
388 |     int low, high;
389 | 
390 |     for (low = 0; low < numfreq; low = high) {
391 |         for (high = low; high < numfreq; high++) {
392 |             if (args->max_iset_len > 1 && numfreq - high - 1 > 0) {
393 |                 set_sup[high].resize(numfreq - high - 1);
394 |                 for (i = 0; i < numfreq - high - 1; i++) {
395 |                     set_sup[high][i].resize(num_class, 0);
396 |                 }
397 |             }
398 |             if (args->max_seq_len > 1) {
399 |                 seq_sup[high].resize(numfreq);
400 |                 for (i = 0; i < numfreq; i++) {
401 |                     seq_sup[high][i].resize(num_class, 0);
402 |                 }
403 |             }
404 |         }
405 |         for (int p = 0; p < args->num_partitions; p++) {
406 |             process_invert(p);
407 |         }
408 |         get_F2(l2cnt);
409 |     }
410 | 
411 |     return l2cnt;
412 | }
413 | 
414 | void InvertDatabase::init(int sz) {
415 |     int i;
416 |     numcust = 0;
417 |     incr(sz);
418 |     eqgraph.resize(args->dbase_max_item);
419 | 
420 |     auto num_class = cls->get_num_class();
421 |     itsup.reserve(num_class);
422 |     for (i = 0; i <num_class; i++) {
423 |         itsup.emplace_back(2);
424 |     }
425 | }
426 | 
427 | InvertDatabase::InvertDatabase(Env& env_) : env(env_) {}
428 | 
429 | void InvertDatabase::set_cls(const shared_ptr<ClassInfo> &cls) {
430 |     InvertDatabase::cls = cls;
431 | }
432 | 
433 | void InvertDatabase::setPartition(const shared_ptr<Partition> &partition) {
434 |     InvertDatabase::partition = partition;
435 | }
436 | 
437 | void InvertDatabase::setArgs(const shared_ptr<SpadeArguments> &args) {
438 |     InvertDatabase::args = args;
439 | }
440 | 


--------------------------------------------------------------------------------
/csrc/InvertDatabase.h:
--------------------------------------------------------------------------------
  1 | #ifndef __EXT_H_
  2 | #define __EXT_H_
  3 | 
  4 | #include "common.h"
  5 | #include "Partition.h"
  6 | #include "Eqclass.h"
  7 | #include "EqGrNode.h"
  8 | #include "ClassInfo.h"
  9 | #include "Env.h"
 10 | 
 11 | class InvertDatabase {
 12 |     int numcust;
 13 |     vvint curits;
 14 |     vint curcnts;
 15 |     vint curcids;
 16 |     vint curitszs;
 17 |     Env& env;
 18 |     shared_ptr<ClassInfo> cls;
 19 |     shared_ptr<Partition> partition;
 20 |     shared_ptr<SpadeArguments> args;
 21 |     vector<EqGrNode_S> eqgraph;
 22 |     vint backidx;
 23 |     vint fidx;
 24 |     int numfreq;
 25 |     vector<Array> itsup;
 26 |     vector<shared_ptr<Itemset>> _items;
 27 |     vector<vector<vector<unsigned int>>> set_sup, seq_sup;
 28 | public:
 29 |     InvertDatabase(Env& env);
 30 | 
 31 |     const EqGrNode_S &get_eqgraph_item(int i) {
 32 |         return eqgraph[i];
 33 |     }
 34 | 
 35 |     void set_cls(const shared_ptr<ClassInfo> &cls);
 36 | 
 37 |     void setPartition(const shared_ptr<Partition> &partition);
 38 | 
 39 |     void setArgs(const shared_ptr<SpadeArguments> &args);
 40 | 
 41 |     void init(int sz);
 42 | 
 43 |     void incr(int sz);
 44 | 
 45 |     void incr_curit(int midx);
 46 | 
 47 |     void process_invert(int pnum);
 48 | 
 49 |     void process_cust_invert(int custidx);
 50 | 
 51 |     int make_l1_pass();
 52 | 
 53 |     int make_l2_pass();
 54 | 
 55 |     void get_l2file(const string &fname, char use_seq, int &l2cnt);
 56 | 
 57 |     bool extl2_pre_pruning(int totsup, int it, int pit, char use_seq, vuint& clsup);
 58 | 
 59 |     void suffix_add_item_eqgraph(char use_seq, int it1, int it2);
 60 | 
 61 |     void get_F2(int &l2cnt);
 62 | 
 63 |     void print_idlist(ostream& idlstrm, shared_ptr<vint> ival, int supsz);
 64 | 
 65 |     void add_sup(int sup, int cls);
 66 | 
 67 |     int get_sup(int it, int cls = -1);
 68 | 
 69 |     int in_mem(int it) {
 70 |         if (_items[fidx[it]]->ival()->array() == nullptr) return 0;
 71 |         else return 1;
 72 |     }
 73 | 
 74 |     Itemset_S get_item(int it) {
 75 |         if (!in_mem(it)) get_ext_item(it);
 76 |         return _items[fidx[it]];
 77 |     }
 78 | 
 79 |     void get_ext_item(int it) {
 80 |         int supsz = partition->partition_get_idxsup(it);
 81 |         shared_ptr<vint> newit = make_shared<vint>(supsz);
 82 |         partition->partition_read_item(newit, it);
 83 |         _items[fidx[it]]->set_support(supsz);
 84 |         _items[fidx[it]]->ival()->set_size(supsz);
 85 |         _items[fidx[it]]->ival()->set_array(newit);
 86 |     }
 87 | 
 88 |     void init_buffer(int num_class, int size) {
 89 |         _items.reserve(size);
 90 |         bool print_idlist = args->print_tidlist;
 91 | 
 92 |         for (int i = 0; i < size; i++) {
 93 |             Itemset_S tmp = make_shared<Itemset>(1, 0, num_class, print_idlist);
 94 |             tmp->setitem(0, backidx[i]);
 95 |             _items.push_back(tmp);
 96 |         }
 97 |     }
 98 | 
 99 | };
100 | 
101 | #endif //__EXT_H_
102 | 


--------------------------------------------------------------------------------
/csrc/Itemset.cc:
--------------------------------------------------------------------------------
 1 | #include "Itemset.h"
 2 | 
 3 | 
 4 | Itemset::Itemset(int it_sz, int ival_sz, int nclass, bool print) {
 5 |     num_class = nclass;
 6 |     do_print = print;
 7 |     theItemset.reset(new Array(it_sz));
 8 |     theIval.reset(new Array(ival_sz));
 9 |     theSupport = 0;
10 |     clsSup.resize(num_class, 0);
11 | }
12 | 
// Defaulted: no manual cleanup is performed here.
Itemset::~Itemset() = default;
14 | 
15 | int Itemset::compare(Itemset &ar2) {
16 |     int len;
17 |     if (size() <= ar2.size()) len = size();
18 |     else len = ar2.size();
19 |     for (int i = 0; i < len; i++) {
20 |         if ((*theItemset)[i] > (*ar2.theItemset)[i]) return 1;
21 |         else if ((*theItemset)[i] < (*ar2.theItemset)[i]) return -1;
22 |     }
23 |     if (size() < ar2.size()) return -1;
24 |     else if (size() > ar2.size()) return 1;
25 |     else return 0;
26 | }
27 | 
28 | int Itemset::subsequence(Itemset_S ar) {
29 |     int i, j;
30 |     if (size() > ar->size()) return 0;
31 |     int start = 0;
32 |     for (i = 0; i < size(); i++) {
33 |         for (j = start; j <ar->size(); j++) {
34 |             if ((*theItemset)[i] == (*ar->theItemset)[j]) {
35 |                 start = j + 1;
36 |                 break;
37 |             }
38 |         }
39 |         if (j >= ar->size()) return 0;
40 |     }
41 |     return 1;
42 | }
43 | 
44 | ostream &operator<<(ostream &outputStream, Itemset &itemset) {
45 |     outputStream << "ITEM: ";
46 |     outputStream << *itemset.theItemset;
47 |     outputStream << "(" << itemset.theSupport << ")";
48 |     outputStream << "\n";
49 |     return outputStream;
50 | }
51 | 
52 | void Itemset::print_seq(ostream& seqstrm, int itempl) {
53 |     int i;
54 |     int sz = size();
55 |     seqstrm << (*theItemset)[0] << " ";
56 |     for (i = 1; i < sz - 1; i++) {
57 |         if (GETBIT(itempl, sz - 1 - i))
58 |             seqstrm << "-> ";
59 |         seqstrm << (*theItemset)[i] << " ";
60 |     }
61 |     if (GETBIT(itempl, sz - 1 - i))
62 |         seqstrm << "-> ";
63 |     seqstrm << (*theItemset)[sz - 1] << " ";
64 |     seqstrm << "-- " << theSupport;
65 |     for (i = 0; i < num_class; i++)
66 |         seqstrm << " " << clsSup[i];
67 |     seqstrm << " ";
68 | //    if (do_print) print_idlist();
69 |     seqstrm << endl;
70 | }
71 | 
72 | void Itemset::print_idlist(ostream& idlstrm) {
73 |     int i, cid, cnt;
74 | 
75 |     if (theIval && theIval->size() > 0) {
76 |         cid = (*theIval)[0];
77 |         cnt = 0;
78 |         for (i = 0; i <theIval->size();) {
79 |             if (cid == (*theIval)[i]) {
80 |                 cnt++;
81 |                 i += 2;
82 |             } else {
83 |                 idlstrm << cid << " " << cnt << " ";
84 |                 cid = (*theIval)[i];
85 |                 cnt = 0;
86 |             }
87 |         }
88 |         idlstrm << cid << " " << cnt;
89 |     }
90 | }
91 | 
// Global Itemset instance built through the bool-tagged constructor, whose
// body is empty, so none of its members are set up by that constructor.
// NOTE(review): presumably used as a sentinel value — confirm at the use sites.
Itemset_S placeholder = make_shared<Itemset>(true);


--------------------------------------------------------------------------------
/csrc/Itemset.h:
--------------------------------------------------------------------------------
  1 | 
  2 | #ifndef __ITEMSET_H
  3 | #define __ITEMSET_H
  4 | 
  5 | #include "common.h"
  6 | #include "Array.h"
  7 | #include "Lists.h"
  8 | #include "SpadeArguments.h"
  9 | #include "Partition.h"
 10 | 
// SETBIT(a, v, b): yields `a` with bit `b` set when `v` is non-zero and
//                  cleared otherwise; `a` itself is not modified.
// GETBIT(a, b):    yields `a` masked with bit `b`, i.e. 0 or the bit's own
//                  value (1 << b) — not normalized to 0/1.
#define SETBIT(a, v, b)  (((v) != 0) ? ((a) | (01 << (b))): ((a) & ~(01 << (b))))
#define GETBIT(a, b) ((a) & (01 << (b)))
 13 | 
 14 | class Itemset;
 15 | typedef shared_ptr<Itemset> Itemset_S;
 16 | 
 17 | class Itemset {
 18 | protected:
 19 |     Array_S theItemset;
 20 |     Array_S theIval;
 21 |     int theSupport;
 22 |     vint clsSup;
 23 |     int num_class;
 24 |     bool do_print;
 25 | //    shared_ptr<ClassInfo> cls;
 26 | //    shared_ptr<SpadeArguments> args;
 27 | public:
 28 |     explicit Itemset(bool x){}
 29 | 
 30 |     Itemset(int it_sz, int ival_sz, int nclass, bool print);
 31 | 
 32 |     ~Itemset();
 33 | 
 34 |     friend ostream &operator<<(ostream &outputStream, Itemset &itemset);
 35 | 
 36 |     int compare(Itemset &ar2);
 37 | 
 38 |     int subsequence(Itemset_S ar);
 39 | 
 40 |     void print_seq(ostream& seqstrm, int itempl);
 41 | 
 42 |     void print_idlist(ostream& idlstrm);
 43 | 
 44 |     Array_S &ival() {
 45 |         return theIval;
 46 |     }
 47 | 
 48 |     int ival(int pos) {
 49 |         return (*theIval)[pos];
 50 |     }
 51 | 
 52 |     int ivalsize() {
 53 |         return theIval->size();
 54 |     }
 55 | 
 56 |     void reallocival() {
 57 |         theIval->resize(ivalsize());
 58 |     }
 59 | 
 60 |     int operator[](int pos) {
 61 |         return (*theItemset)[pos];
 62 |     };
 63 | 
 64 |     void setitem(int pos, int val) {
 65 |         theItemset->setitem(pos, val);
 66 |     };
 67 | 
 68 |     Array_S itemset() {
 69 |         return theItemset;
 70 |     };
 71 | 
 72 |     void add_item(int val) {
 73 |         theItemset->add(val);
 74 |     };
 75 | 
 76 |     int size() {
 77 |         return theItemset->size();
 78 |     };
 79 | 
 80 |     int support() {
 81 |         return theSupport;
 82 |     };
 83 | 
 84 |     void set_support(int sup) {
 85 |         theSupport = sup;
 86 |     }
 87 | 
 88 |     void increment_support() {
 89 |         theSupport++;
 90 |     };
 91 | 
 92 |     int cls_support(int cls) {
 93 |         return clsSup[cls];
 94 |     }
 95 | 
 96 |     void increment_cls_support(int cls) {
 97 |         clsSup[cls]++;
 98 |     }
 99 | 
100 |     void set_cls_support(int sup, int cls) {
101 |         clsSup[cls] = sup;
102 |     }
103 | 
104 |     static int Itemcompare(void *iset1, void *iset2) {
105 |         Itemset* it1 = (Itemset* ) iset1;
106 |         Itemset* it2 = (Itemset* ) iset2;
107 |         return it1->compare(*it2);
108 |     }
109 | };
110 | 
111 | extern Itemset_S placeholder;
112 | 
113 | #endif //__ITEMSET_H
114 | 
115 | 


--------------------------------------------------------------------------------
/csrc/Lists.cc:
--------------------------------------------------------------------------------
 1 | #include "common.h"
 2 | #include "Lists.h"
 3 | #include "Array.h"
 4 | #include "Eqclass.h"
 5 | #include "Itemset.h"
 6 | 
 7 | template<class T>
 8 | ListNodes<T>::ListNodes(T item, shared_ptr<ListNodes<T>> next) {
 9 |     theItem = item;
10 |     theNext = next;
11 | }
12 | 
13 | template<class T>
14 | ListNodes<T>::~ListNodes() {
15 |     theNext = nullptr;
16 |     theItem = nullptr;
17 | }
18 | 
19 | template<class T>
20 | Lists<T>::Lists() {
21 |     theHead = 0;
22 |     theLast = 0;
23 |     theSize = 0;
24 | }
25 | 
// Defaulted destructor: the list only drops its own head/tail references.
// When T is a raw pointer (e.g. Lists<int *>), the pointed-to object is *not* deleted.
template<class T>
Lists<T>::~Lists() = default;
30 | 
31 | //listnodes are deleted, if node->item() is a pointer to some object
32 | //that object is *also*  deleted
33 | template<class T>
34 | void Lists<T>::clear() {
35 |     theHead = nullptr;
36 |     theLast = nullptr;
37 |     theSize = 0;
38 | }
39 | 
40 | 
41 | template<class T>
42 | void Lists<T>::append(T item) {
43 |     shared_ptr<ListNodes<T>> node;
44 | 
45 |     theSize++;
46 |     node.reset(new ListNodes<T>(item, 0));
47 | 
48 |     if (theHead == nullptr) {
49 |         theHead = node;
50 |         theLast = node;
51 |     } else {
52 |         theLast->set_next(node);
53 |         theLast = node;
54 |     }
55 | }
56 | 
57 | 
58 | template<class T>
59 | void Lists<T>::prepend(T item) {
60 |     shared_ptr<ListNodes<T>> node;
61 | 
62 |     theSize++;
63 |     node.reset(new ListNodes<T>(item, 0));
64 | 
65 |     if (theHead == 0) {
66 |         theHead = node;
67 |         theLast = node;
68 |     } else {
69 |         node->set_next(theHead);
70 |         theHead = node;
71 |     }
72 | }
73 | 
// Explicit instantiations. The member templates above are defined in this
// source file rather than in Lists.h, so each element type that other
// translation units use has to be instantiated here.
template
class Lists<int *>;

template
class Lists<Array_S>;

template
class Lists<shared_ptr<Itemset>>;

template
class Lists<Eqclass_S>;
85 | 


--------------------------------------------------------------------------------
/csrc/Lists.h:
--------------------------------------------------------------------------------
 1 | #ifndef __LISTS_H
 2 | #define __LISTS_H
 3 | 
 4 | #include "common.h"
 5 | 
 6 | typedef int (*CMP_FUNC)(void *, void *);
 7 | 
 8 | template<class T>
 9 | class ListNodes {
10 | private:
11 |     shared_ptr<ListNodes<T>> theNext;
12 |     T theItem;
13 | 
14 | public:
15 |     ListNodes(T item, shared_ptr<ListNodes<T>> next);
16 | 
17 |     ~ListNodes();
18 | 
19 |     shared_ptr<ListNodes<T>> next() {
20 |         return theNext;
21 |     }
22 | 
23 |     void set_next(shared_ptr<ListNodes<T>> next) {
24 |         theNext = next;
25 |     }
26 | 
27 |     T& item() {
28 |         return theItem;
29 |     }
30 | 
31 |     void set_item(T item) {
32 |         theItem = item;
33 |     }
34 | };
35 | 
36 | template<class T>
37 | class Lists {
38 | private:
39 |     shared_ptr<ListNodes<T>> theHead;
40 |     shared_ptr<ListNodes<T>> theLast;
41 |     int theSize;
42 | 
43 | public:
44 | 
45 |     Lists();
46 | 
47 |     ~Lists();
48 | 
49 |     void clear();
50 | 
51 |     shared_ptr<ListNodes<T>> head() {
52 |         return theHead;
53 |     };
54 | 
55 |     shared_ptr<ListNodes<T>> last() {
56 |         return theLast;
57 |     };
58 | 
59 |     int size() {
60 |         return theSize;
61 |     };
62 | 
63 |     void append(T item);
64 | 
65 |     void prepend(T item);
66 | 
67 |     T find(T item, CMP_FUNC cmpare);
68 | };
69 | 
70 | #endif// __LISTS_H
71 | 
72 | 
73 | 


--------------------------------------------------------------------------------
/csrc/Partition.cc:
--------------------------------------------------------------------------------
  1 | #include "common.h"
  2 | #include "Partition.h"
  3 | #include "Sequence.h"
  4 | 
  5 | 
  6 | Partition::Partition() {}
  7 | 
  8 | void Partition::init() {
  9 |     DATAFD.resize(args->num_partitions);
 10 |     ITEMIDX.reserve(args->num_partitions);
 11 | }
 12 | 
 13 | void Partition::partition_alloc() {
 14 |     ostringstream datafnstream;
 15 |     ostringstream idxfnstream;
 16 | 
 17 |     for (int i = 0; i < args->num_partitions; i++) {
 18 |         datafnstream.str(string());
 19 |         idxfnstream.str(string());
 20 |         datafnstream << args->dataf;
 21 |         idxfnstream << args->idxf;
 22 | 
 23 |         if (args->num_partitions > 1) {
 24 |             datafnstream << ".P" << i;
 25 |             idxfnstream << ".P" << i;
 26 |         }
 27 |         fstream &datafstream = DATAFD[i];
 28 |         datafstream.open(datafnstream.str(), ios::binary | ios::in);
 29 |         if (!datafstream.is_open()) {
 30 |             throw runtime_error("can't open data file: " + datafnstream.str());
 31 |         }
 32 | 
 33 |         fstream idxfstream(idxfnstream.str().c_str(), ios::binary | ios::in);
 34 |         if (!idxfstream.is_open()) {
 35 |             throw runtime_error("can't open idx file: " + idxfnstream.str());
 36 |         }
 37 | 
 38 |         auto idxflen = file_size(idxfstream);
 39 |         vint chunk;
 40 |         chunk.reserve(idxflen);
 41 | 
 42 |         int *chunk_buf = read_file(idxfstream, idxflen);
 43 | 
 44 |         for (int j=0; j<idxflen; j++) {
 45 |             chunk.push_back(chunk_buf[j]);
 46 |         }
 47 | 
 48 |         ITEMIDX.push_back(chunk);
 49 |         delete [] chunk_buf;
 50 |     }
 51 | }
 52 | 
 53 | 
 54 | int Partition::partition_get_idxsup(int it) {
 55 |     int supsz = 0;
 56 |     for (int i = 0; i < args->num_partitions; i++) {
 57 |         int* data = ITEMIDX[i].data();
 58 |         supsz += data[it + 1] - data[it];
 59 |     }
 60 |     return supsz;
 61 | }
 62 | 
 63 | int Partition::partition_get_lidxsup(int idx, int it) {
 64 |     int* data = ITEMIDX[idx].data();
 65 |     return (data[it + 1] - data[it]);
 66 | }
 67 | 
 68 | void Partition::partition_read_item(shared_ptr<vint> ival, int it) {
 69 |     int ipos = 0;
 70 |     int supsz;
 71 |     for (int i = 0; i < args->num_partitions; i++) {
 72 |         supsz = ITEMIDX[i][it + 1] - ITEMIDX[i][it];
 73 |         if (supsz > 0) {
 74 |             fstream& f = DATAFD[i];
 75 |             f.seekg(ITEMIDX[i][it] * INT_SIZE, ios::beg);
 76 |             f.read((char *) &ival->at(ipos), supsz * INT_SIZE);
 77 |             if (!f) {
 78 |                 throw runtime_error("Error reading item");
 79 |             }
 80 |             ipos += supsz;
 81 |         }
 82 |     }
 83 | }
 84 | 
 85 | void Partition::partition_lclread_item(shared_ptr<vint> ival, int pnum, int it) {
 86 |     int supsz;
 87 |     supsz = ITEMIDX[pnum][it + 1] - ITEMIDX[pnum][it];
 88 |     if (supsz > 0) {
 89 |         fstream& f = DATAFD[pnum];
 90 |         f.seekg(ITEMIDX[pnum][it] * INT_SIZE, ios::beg);
 91 |         f.read((char *) ival->data(), supsz * INT_SIZE);
 92 | 
 93 |         if (!f) {
 94 |             throw runtime_error("Error reading item");
 95 |         }
 96 |     }
 97 | }
 98 | 
 99 | 
100 | void Partition::partition_get_minmaxcustid(vint& backidx, int numit, int pnum, int &minv, int &maxv) {
101 |     int custid, it, i, supsz;
102 |     minv = INT_MAX;
103 |     maxv = 0;
104 |     for (i = 0; i < numit; i++) {
105 |         it = backidx[i];
106 |         supsz = ITEMIDX[pnum][it + 1] - ITEMIDX[pnum][it];
107 |         if (supsz > 0) {
108 |             fstream& f = DATAFD[pnum];
109 |             f.seekg(ITEMIDX[pnum][it] * INT_SIZE, ios::beg);
110 |             f.read((char *) &custid, INT_SIZE);
111 |             if (minv > custid) {
112 |                 minv = custid;
113 |             }
114 | 
115 |             f.seekg((supsz - 3) * INT_SIZE, ios::cur);
116 |             f.read((char *) &custid, INT_SIZE);
117 | 
118 |             if (maxv < custid) {
119 |                 maxv = custid;
120 |             }
121 |         }
122 |     }
123 | }
124 | 
125 | void Partition::set_args(const shared_ptr<SpadeArguments> &args) {
126 |     Partition::args = args;
127 | }


--------------------------------------------------------------------------------
/csrc/Partition.h:
--------------------------------------------------------------------------------
 1 | #ifndef __PARTITION_H_
 2 | #define __PARTITION_H_
 3 | 
 4 | //#include "spade.h"
 5 | //#include "sequence.h"
 6 | 
 7 | #include "common.h"
 8 | #include "SpadeArguments.h"
 9 | #include "Array.h"
10 | 
// Access to the on-disk database, which may be split into several partitions.
// Keeps one open data stream and one in-memory index table per partition.
class Partition {
private:
    shared_ptr<SpadeArguments> args;   // supplies num_partitions, dataf and idxf
    vector<fstream> DATAFD;            // data stream of each partition
    vvint ITEMIDX;                     // index table of each partition; entries it and it + 1 delimit item `it`
public:
    Partition();

    // Sizes DATAFD / reserves ITEMIDX for args->num_partitions; needs set_args() first.
    void init();

    void set_args(const shared_ptr<SpadeArguments> &args);

    // Opens every partition's data file and loads its index file; throws
    // runtime_error when a file cannot be opened.
    void partition_alloc();

    // Number of ints stored for item `it`, summed over all partitions.
    int partition_get_idxsup(int it);

    // Number of ints stored for item `it` in partition `idx`.
    int partition_get_lidxsup(int idx, int it);

    // Reads item `it` from all partitions into *ival, back to back.
    void partition_read_item(shared_ptr<vint> ival, int it);

    // Reads item `it` from partition `pnum` into the start of *ival.
    void partition_lclread_item(shared_ptr<vint> ival, int pnum, int it);

    // Writes the smallest/largest customer id found for the first `numit`
    // entries of `backidx` in partition `pnum` to minv/maxv.
    void partition_get_minmaxcustid(vint& backidx, int numit, int pnum, int &minv, int &maxv);
};
35 | 
36 | #endif// __PARTITION_H_
37 | 


--------------------------------------------------------------------------------
/csrc/Sequence.h:
--------------------------------------------------------------------------------
  1 | //
  2 | // Created by Yukio Fukuzawa on 7/12/18.
  3 | //
  4 | 
  5 | #ifndef CSPADE_SEQUENCE_H
  6 | #define CSPADE_SEQUENCE_H
  7 | 
  8 | #include "common.h"
  9 | #include "SpadeArguments.h"
 10 | #include "argv_parser.h"
 11 | #include "Partition.h"
 12 | #include "InvertDatabase.h"
 13 | #include "FreqIt.h"
 14 | 
 15 | result_t sequenceFunc(Env& env, const shared_ptr<SpadeArguments>& args);
 16 | 
/**
 * Call spade given the argument list as string
 * @param s the command line, e.g 'spade -i zaki -s 0.3 -Z 10 -z 10 -u 1 -r -e 1 -o'
 * @param envptr shared Env handle, passed by reference (NOTE(review): its exact
 *               role is not visible from this header -- confirm in the definition)
 */
result_t sequenceWrapper(const string &s, shared_ptr<Env>& envptr);
 22 | 
// Driver object for a mining run. It holds a reference to the Env plus shared
// handles to the arguments, the partitioned database, the class information and
// the inverted database, all of which are injected through the set_* members.
// Apart from get_file_l2(), the members are only declared in this header.
class Sequence {
private:
    Env& env;   // reference member: the Env has to outlive this object
    shared_ptr<SpadeArguments> args;
    shared_ptr<Partition> partition;
    shared_ptr<ClassInfo> cls;
    shared_ptr<InvertDatabase> invdb;

    Array_S l_array, e_array, m_array;
    FreqIt_SS FreqArray;
    unsigned long FreqArraySz = 100;
    int FreqArrayPos = 0;
    shared_ptr<vint> numLargeItemset;
public:

    explicit Sequence(Env& env_);

    // Dependency setters; each takes a shared handle by const reference.
    void set_args(const shared_ptr<SpadeArguments> &args);

    void set_partition(const shared_ptr<Partition> &partition);

    void set_cls(const shared_ptr<ClassInfo> &cls);

    void set_num_large_dataset(const shared_ptr<vint> &numLargeItemset);

    void set_invdb(const shared_ptr<InvertDatabase> &invdb);

    void read_files();

    // Loads the length-2 files through invdb and returns the resulting count.
    // The item-set file (args->it2f) is read only when max_iset_len > 1 and the
    // sequence file (args->seqf) only when max_seq_len > 1. The count is also
    // logged to cerr as "L2 : <count>".
    int get_file_l2() {
        int l2cnt = 0;

        if (args->max_iset_len > 1) {
            // NOTE(review): the 0/1 argument presumably selects item-set vs.
            // sequence mode, and l2cnt is presumably updated by reference --
            // confirm in InvertDatabase::get_l2file.
            invdb->get_l2file(args->it2f, 0, l2cnt);
        }
        if (args->max_seq_len > 1) {
            invdb->get_l2file(args->seqf, 1, l2cnt);
        }

        cerr << "L2 : " << l2cnt << endl;
        return l2cnt;
    }

    void get_tmpnewf_intersect(Itemset_S &ljoin, Itemset_S &ejoin, Itemset_S &mjoin,
                               int &lcnt, int &ecnt, int &mcnt,
                               Itemset_S& it1, Itemset_S& it2, int iter);

    void make_itemset(Itemset_S& it, Array_S& ary, int cnt, const vint &clscnt);

    void pre_pruning(Itemset_S &join, unsigned int ptempl, Itemset_S& clas, Itemset_S& prefix, char use_seq);

    void post_pruning(Itemset_S &iset, unsigned int templ);

    void newSeq();

    void process_class(int it);

    Eqclass_S get_ext_eqclass(int it);

    void get_2newf_intersect(Itemset_S& ljoin, Itemset_S& ejoin, shared_ptr<vint> vit1, shared_ptr<vint> vit2, int sup1,
                             int sup2);

    void add_freq(Itemset_S &it, int templ);

    Itemset_S prune_decision(Itemset_S& it1, Itemset_S& it2, unsigned int ptempl, int jflg);

    void find_large(Eqclass_S cluster, int it);

    void insert_freqarray(shared_ptr<Lists<Eqclass_S>>& LargeL);

    int get_valid_el(int it, vector<char> &ibvec, vector<char> &sbvec);

    void process_itemset(Itemset_S iset, unsigned int templ, int iter);

    void process_maxgap(Eqclass_S L2);

    void process_cluster1(Eqclass_S cluster, shared_ptr<Lists<Eqclass_S>> LargeL, int iter);

    void process_cluster_list1(shared_ptr<ListNodes<shared_ptr<Itemset>>>& hdr1,
                               shared_ptr<Lists<shared_ptr<Itemset>>>& cluster1,
                               shared_ptr<Lists<shared_ptr<Itemset>>>& cluster2,
                               shared_ptr<Lists<Eqclass_S>>& LargeL,
                               int iter, int eqtype, Eqclass_S& parent);

    void process_cluster_list2(shared_ptr<ListNodes<shared_ptr<Itemset>>>& hdr1, int i, Eqclass_SS& EQ,
                               shared_ptr<Lists<shared_ptr<Itemset>>>& cluster,
                               shared_ptr<Lists<Eqclass_S>>& LargeL,
                               int iter, int eqtype, Eqclass_S& parent);

    void fill_join(Itemset_S& join, Itemset_S& hdr1, Itemset_S& hdr2);
};
114 | 
115 | #endif //CSPADE_SEQUENCE_H
116 | 


--------------------------------------------------------------------------------
/csrc/SpadeArguments.cc:
--------------------------------------------------------------------------------
 1 | //
 2 | // Created by Yukio Fukuzawa on 8/12/18.
 3 | //
 4 | 
 5 | #include "SpadeArguments.h"
 6 | 
 7 | void SpadeArguments::parse_args(int argc, char **argv) {
 8 |     string name;
 9 | 
10 |     auto cmdl = argh::parser(argc, argv, argh::parser::PREFER_PARAM_FOR_UNREG_OPTION);
11 |     cmdl("i") >> name;
12 | 
13 |     if (name.empty() || !cmdl("s")) {
14 |         cerr << "usage: usage: spade [OPTION]... -i<infile> -s<support>\n";
15 |         throw runtime_error("spade needs valid value of -i and -s");
16 |     }
17 |     cmdl("s") >> min_support_per_class;
18 |     cmdl("a") >> use_ascending;
19 |     cmdl("c") >> use_class;
20 |     if (cmdl("e")) {
21 |         ext_l2_pass = 1;
22 |         cmdl("e") >> num_partitions;
23 |     }
24 |     if (cmdl["h"]) use_hash = 1;
25 |     if (cmdl["o"]) outputfreq = 1;
26 |     if (cmdl["r"]) recursive = 1;
27 |     cmdl("l") >> min_gap;
28 |     if (cmdl("u")) {
29 |         use_maxgap = 1;
30 |         use_hash = 0;
31 |         cmdl("u") >> max_gap;
32 |     }
33 |     cmdl("t") >> pruning_type;
34 |     cmdl("v") >> min_support;
35 |     if (cmdl("w") || cmdl["w"]) {
36 |         if (!cmdl("u")) {
37 |             cerr << "-u is required when -w is enabled" << endl;
38 |             throw runtime_error("-u is required when -w is enabled");
39 |         }
40 |         use_window = 1;
41 |     }
42 |     cmdl("y") >> print_tidlist;
43 |     cmdl("z") >> max_seq_len;
44 |     cmdl("Z") >> max_iset_len;
45 | 
46 | 
47 |     dataf = name + ".tpose";
48 |     idxf = name + ".idx";
49 |     conf = name + ".conf";
50 |     it2f = name + ".2it";
51 |     seqf = name + ".2seq";
52 |     classf = name + ".class";
53 | 
54 |     ifstream conff(conf, ios::binary);
55 |     if (!conff.is_open()) {
56 |         throw runtime_error("File " + string(conf) + " doesn\'t exist.");
57 |     }
58 | 
59 |     conff.read((char *) &total_trans_count, INT_SIZE);
60 |     if (min_support == -1)
61 |         min_support = (int) ceil(min_support_per_class * total_trans_count);
62 |     //ensure that support is at least 2
63 |     if (min_support < 1) {
64 |         min_support = 1;
65 |     }
66 | 
67 |     conff.read((char *) &dbase_max_item, INT_SIZE);
68 |     conff.read((char *) &avg_cust_size, FLOAT_SIZE);
69 |     conff.read((char *) &avg_trans_count, FLOAT_SIZE);
70 |     conff.read((char *) &dbase_total_trans, INT_SIZE);
71 |     conff.close();
72 | }


--------------------------------------------------------------------------------
/csrc/SpadeArguments.h:
--------------------------------------------------------------------------------
 1 | //
 2 | // Created by Yukio Fukuzawa on 8/12/18.
 3 | //
 4 | 
 5 | #ifndef CSPADE_SPADE_ARGS_T_H
 6 | #define CSPADE_SPADE_ARGS_T_H
 7 | 
 8 | #include "common.h"
 9 | #include "argv_parser.h"
10 | 
11 | //join type
12 | #define LJOIN 0
13 | #define EJOIN 1
14 | #define MJOIN 2
15 | 
16 | //pruning types
17 | #define NOPRUNING 0            // no pruning
18 | #define L2PRUNING 1
19 | #define ZEROPRUNING 2     // when sup goes to zero in other classes, prune
20 | #define FOLLOWPRUNING 4         //
21 | 
22 | class SpadeArguments {
23 | public:
24 |     int pruning_type = NOPRUNING;
25 |     string dataf;
26 |     string idxf;
27 |     string conf;
28 |     string it2f;
29 |     string seqf;
30 |     string classf;
31 |     int ext_l2_pass = 0;
32 |     int use_hash = 0;
33 |     int num_intersect = 0;
34 |     int recursive = 0;
35 |     int maxiter = 2;
36 |     int min_gap = 1;
37 |     int max_gap = INT_MAX;
38 |     char use_maxgap = 0;
39 |     char use_window = 0;
40 |     int use_ascending = -2;
41 |     bool use_class = false;
42 |     char outputfreq = 0;
43 |     char print_tidlist = 0;
44 | 
45 |     int L2pruning = 0;
46 |     int prepruning = 0;
47 |     int postpruning = 0;
48 | 
49 |     int max_seq_len = 100;
50 |     int max_iset_len = 100;
51 | 
52 |     int total_trans_count;
53 |     int dbase_max_item;
54 |     float avg_trans_count;
55 |     float avg_cust_size;
56 |     int dbase_total_trans;
57 | 
58 |     double min_support_per_class;
59 |     float follow_thresh = 1.0;
60 |     float zero_thresh = 0.0;
61 |     int min_support = -1;
62 | 
63 |     int num_partitions;
64 | 
65 |     void parse_args(int argc, char **argv);
66 | };
67 | 
68 | 
69 | #endif //CSPADE_SPADE_ARGS_T_H
70 | 


--------------------------------------------------------------------------------
/csrc/TransArray.cc:
--------------------------------------------------------------------------------
 1 | #include "common.h"
 2 | #include "TransArray.h"
 3 | 
 4 | TransArray::TransArray(long sz, int npart) {
 5 |     totSize = sz;
 6 |     theSize = 0;
 7 |     lastPos = 0;
 8 |     theFlg = 0;
 9 |     offset.reserve(npart);
10 |     for (int i = 0; i < npart; i++) offset[i] = 0;
11 |     if (sz > 0) {
12 |         theArray.reserve(totSize);
13 |     }
14 | }
15 | 
16 | TransArray::~TransArray() {
17 |     theArray.clear();
18 |     offset.clear();
19 | }
20 | 
21 | 
22 | 
23 | 
24 | 
25 | 
26 | 


--------------------------------------------------------------------------------
/csrc/TransArray.h:
--------------------------------------------------------------------------------
  1 | #ifndef __TRANS_ARRAY_H
  2 | #define __TRANS_ARRAY_H
  3 | 
  4 | #include "common.h"
  5 | 
  6 | class TransArray {
  7 | private:
  8 |     vector<int> theArray;
  9 |     char theFlg;
 10 |     int lastPos;
 11 |     // DD: remove unsigned to avoid gcc warning signed vs non-signed cmp
 12 |     int theSize;
 13 |     // DD: remove unsigned to avoid gcc warning signed vs non-signed cmp
 14 |     long totSize;
 15 |     vector<long> offset;
 16 | public:
 17 | 
 18 |     explicit TransArray(long sz, int npart = 1);
 19 | 
 20 |     ~TransArray();
 21 | 
 22 |     int operator[](unsigned int index) {
 23 |         return theArray[index];
 24 |     };
 25 | 
 26 |     char flg() {
 27 |         return theFlg;
 28 |     }
 29 | 
 30 |     void setflg(char flg) {
 31 |         theFlg = flg;
 32 |     }
 33 | 
 34 |     int lastpos() {
 35 |         return lastPos;
 36 |     }
 37 | 
 38 |     //to be used ony for use_seq
 39 |     void setlastpos() {
 40 |         theArray[lastPos + 1] = theSize - lastPos - 2;
 41 |         lastPos = theSize;
 42 |     }
 43 | 
 44 |     long get_offset(int pos = 0) {
 45 |         return offset[pos];
 46 |     }
 47 | 
 48 |     void set_offset(long off, int pos = 0) {
 49 |         offset[pos] = off;
 50 |     }
 51 | 
 52 |     int totsize() {
 53 |         return totSize;
 54 |     }
 55 | 
 56 |     void reset() {
 57 |         theSize = 0;
 58 |         lastPos = 0;
 59 |         theFlg = 0;
 60 |     }
 61 | 
 62 |     vector<int>& array() {
 63 |         return theArray;
 64 |     }
 65 | 
 66 |     int size() {
 67 |         return theSize;
 68 |     }
 69 | 
 70 |     void setsize(int size) {
 71 |         theSize = size;
 72 |     }
 73 | 
 74 |     void setitem(int pos, int item) {
 75 |         theArray[pos] = item;
 76 |     }
 77 | 
 78 |     void additem(int item) {
 79 |         theArray[theSize] = item;
 80 |         theSize++;
 81 |     }
 82 | 
 83 |     void flushbuf(int fd, int use_seq, int pos = 0) {
 84 |         lseek(fd, offset[pos] * sizeof(int), SEEK_SET);
 85 |         int wblk = theSize;
 86 |         if (wblk > 0) {
 87 |             auto res = write(fd, (char *) theArray.data(), wblk * sizeof(int));
 88 |             if (res < wblk * sizeof(int)) {
 89 |                 throw runtime_error("Error writing");
 90 |             }
 91 |             offset[pos] += wblk;
 92 |         }
 93 |         theSize = 0;
 94 |     }
 95 | 
 96 |     void add(int fd, int item, int use_seq, int pos, int custid = -1) {
 97 |         if (use_seq) {
 98 |             if (theSize + 2 > totSize) {
 99 |                 flushbuf(fd, use_seq, pos);
100 |             }
101 |             theArray[theSize++] = custid;
102 |         } else {
103 |             if (theSize + 1 > totSize) {
104 |                 flushbuf(fd, use_seq, pos);
105 |             }
106 |         }
107 |         theArray[theSize++] = item;
108 |     }
109 | };
110 | 
111 | #endif //__TRANS_ARRAY_H
112 | 
113 | 
114 | 


--------------------------------------------------------------------------------
/csrc/argh.h:
--------------------------------------------------------------------------------
  1 | //
  2 | // Created by Yukio Fukuzawa on 11/12/18.
  3 | //
  4 | 
  5 | #ifndef UTILITIES_ARGH_H
  6 | #define UTILITIES_ARGH_H
  7 | 
  8 | #pragma once
  9 | 
 10 | #include <algorithm>
 11 | #include <sstream>
 12 | #include <string>
 13 | #include <vector>
 14 | #include <set>
 15 | #include <map>
 16 | #include <cassert>
 17 | 
 18 | namespace argh {
 19 |     // Terminology:
 20 |     // A command line is composed of 2 types of args:
 21 |     // 1. Positional args, i.e. free standing values
 22 |     // 2. Options: args beginning with '-'. We identify two kinds:
 23 |     //    2.1: Flags: boolean options =>  (exist ? true : false)
 24 |     //    2.2: Parameters: a name followed by a non-option value
 25 | 
 26 | #if !defined(__GNUC__) || (__GNUC__ >= 5)
 27 |     using string_stream = std::istringstream;
 28 | #else
 29 | 
    // Until GCC 5, istringstream did not have a move constructor.
    // stringstream_proxy is used instead, as a workaround.
    // It wraps a std::istringstream and is copyable, so it can be returned by
    // value from the parser's accessors.
    class stringstream_proxy {
    public:
        stringstream_proxy() = default;

        // Construct with a value.
        stringstream_proxy(std::string const &value) :
                stream_(value) {}

        // Copy constructor.
        // The copy is rebuilt from the other stream's full string and state
        // flags; the other stream's read position is not carried over.
        stringstream_proxy(const stringstream_proxy &other) :
                stream_(other.stream_.str()) {
            stream_.setstate(other.stream_.rdstate());
        }

        void setstate(std::ios_base::iostate state) { stream_.setstate(state); }

        // Stream out the value of the parameter.
        // If the conversion was not possible, the stream will enter the fail state,
        // and operator bool will return false.
        template<typename T>
        stringstream_proxy &operator>>(T &thing) {
            stream_ >> thing;
            return *this;
        }


        // Get the string value.
        std::string str() const { return stream_.str(); }

        std::stringbuf *rdbuf() const { return stream_.rdbuf(); }

        // Check the state of the stream.
        // False when the most recent stream operation failed
        operator bool() const { return !!stream_; }

        ~stringstream_proxy() = default;

    private:
        std::istringstream stream_;
    };
 72 | 
 73 |     using string_stream = stringstream_proxy;
 74 | #endif
 75 | 
    // Command-line parser. parse() sorts the arguments into positional
    // arguments, flags and named parameters, which are then queried through
    // operator[] (flags / positional strings) and operator() (typed streams).
    class parser {
    public:
        // Bit flags that can be OR-ed together and passed as `mode`.
        enum Mode {
            PREFER_FLAG_FOR_UNREG_OPTION = 1 << 0,
            PREFER_PARAM_FOR_UNREG_OPTION = 1 << 1,
            NO_SPLIT_ON_EQUALSIGN = 1 << 2,     // when unset, "name=value" options are stored as parameters
            SINGLE_DASH_IS_MULTIFLAG = 1 << 3,
        };

        parser() = default;

        parser(std::initializer_list<char const *const> pre_reg_names) { add_params(pre_reg_names); }

        // Parses a null-terminated argv array.
        parser(char **argv, int mode = PREFER_FLAG_FOR_UNREG_OPTION) { parse(argv, mode); }

        parser(int argc, char **argv, int mode = PREFER_FLAG_FOR_UNREG_OPTION) { parse(argc, argv, mode); }

        void add_param(std::string const &name);

        void add_params(std::initializer_list<char const *const> init_list);

        // Counts the entries of a null-terminated argv, then parses them.
        void parse(char **argv, int mode = PREFER_FLAG_FOR_UNREG_OPTION);

        void parse(int argc, char **argv, int mode = PREFER_FLAG_FOR_UNREG_OPTION);

        std::multiset<std::string> const &flags() const { return flags_; }

        std::map<std::string, std::string> const &params() const { return params_; }

        std::vector<std::string> const &pos_args() const { return pos_args_; }

        // begin() and end() for using range-for over positional args.
        std::vector<std::string>::const_iterator begin() const { return pos_args_.cbegin(); }

        std::vector<std::string>::const_iterator end() const { return pos_args_.cend(); }

        // Number of positional args.
        size_t size() const { return pos_args_.size(); }

        //////////////////////////////////////////////////////////////////////////
        // Accessors

        // flag (boolean) accessors: return true if the flag appeared, otherwise false.
        bool operator[](std::string const &name) const;

        // multiple flag (boolean) accessors: return true if at least one of the flag appeared, otherwise false.
        bool operator[](std::initializer_list<char const *const> init_list) const;

        // returns positional arg string by order. Like argv[] but without the options
        std::string const &operator[](size_t ind) const;

        // returns a std::istream that can be used to convert a positional arg to a typed value.
        string_stream operator()(size_t ind) const;

        // same as above, but with a default value in case the arg is missing (index out of range).
        template<typename T>
        string_stream operator()(size_t ind, T &&def_val) const;

        // parameter accessors, give a name get an std::istream that can be used to convert to a typed value.
        // call .str() on result to get as string
        string_stream operator()(std::string const &name) const;

        // accessor for a parameter with multiple names, give a list of names, get an std::istream that can be used to convert to a typed value.
        // call .str() on result to get as string
        // returns the first value in the list to be found.
        string_stream operator()(std::initializer_list<char const *const> init_list) const;

        // same as above, but with a default value in case the param was missing.
        // Non-string def_val types must have an operator<<() (output stream operator)
        // If T only has an input stream operator, pass the string version of the type as in "3" instead of 3.
        template<typename T>
        string_stream operator()(std::string const &name, T &&def_val) const;

        // same as above but for a list of names. returns the first value to be found.
        template<typename T>
        string_stream operator()(std::initializer_list<char const *const> init_list, T &&def_val) const;

    private:
        string_stream bad_stream() const;

        std::string trim_leading_dashes(std::string const &name) const;

        bool is_number(std::string const &arg) const;

        bool is_option(std::string const &arg) const;

        bool got_flag(std::string const &name) const;

        bool is_param(std::string const &name) const;

    private:
        std::vector<std::string> args_;               // raw arguments as passed to parse()
        std::map<std::string, std::string> params_;   // parameter name -> value
        std::vector<std::string> pos_args_;           // arguments that are not options
        std::multiset<std::string> flags_;
        std::set<std::string> registeredParams_;
        std::string empty_;
    };
173 | 
174 | 
175 |     //////////////////////////////////////////////////////////////////////////
176 | 
177 |     inline void parser::parse(char **argv, int mode) {
178 |         int argc = 0;
179 |         for (auto argvp = argv; *argvp; ++argc, ++argvp);
180 |         parse(argc, argv, mode);
181 |     }
182 | 
183 |     //////////////////////////////////////////////////////////////////////////
184 | 
185 |     inline void parser::parse(int argc, char **argv, int mode /*= PREFER_FLAG_FOR_UNREG_OPTION*/) {
186 |         // convert to strings
187 |         args_.resize(argc);
188 |         std::transform(argv, argv + argc, args_.begin(), [](const char *const arg) { return arg; });
189 | 
190 |         // parse line
191 |         for (auto i = 0u; i < args_.size(); ++i) {
192 |             if (!is_option(args_[i])) {
193 |                 pos_args_.emplace_back(args_[i]);
194 |                 continue;
195 |             }
196 | 
197 |             auto name = trim_leading_dashes(args_[i]);
198 | 
199 |             if (!(mode & NO_SPLIT_ON_EQUALSIGN)) {
200 |                 auto equalPos = name.find('=');
201 |                 if (equalPos != std::string::npos) {
202 |                     params_.insert({name.substr(0, equalPos), name.substr(equalPos + 1)});
203 |                     continue;
204 |                 }
205 |             }
206 | 
207 |             // if the option is unregistered and should be a multi-flag
208 |             if (1 == (args_[i].size() - name.size()) &&         // single dash
209 |                 argh::parser::SINGLE_DASH_IS_MULTIFLAG & mode && // multi-flag mode
210 |                 !is_param(name))                                  // unregistered
211 |             {
212 |                 std::string keep_param;
213 | 
214 |                 if (!name.empty() && is_param(std::string(1ul, name.back()))) // last char is param
215 |                 {
216 |                     keep_param += name.back();
217 |                     name.resize(name.size() - 1);
218 |                 }
219 | 
220 |                 for (auto const &c : name) {
221 |                     flags_.emplace(std::string{c});
222 |                 }
223 | 
224 |                 if (!keep_param.empty()) {
225 |                     name = keep_param;
226 |                 } else {
227 |                     continue; // do not consider other options for this arg
228 |                 }
229 |             }
230 | 
231 |             // any potential option will get as its value the next arg, unless that arg is an option too
232 |             // in that case it will be determined a flag.
233 |             if (i == args_.size() - 1 || is_option(args_[i + 1])) {
234 |                 flags_.emplace(name);
235 |                 continue;
236 |             }
237 | 
238 |             // if 'name' is a pre-registered option, then the next arg cannot be a free parameter to it is skipped
239 |             // otherwise we have 2 modes:
240 |             // PREFER_FLAG_FOR_UNREG_OPTION: a non-registered 'name' is determined a flag.
241 |             //                               The following value (the next arg) will be a free parameter.
242 |             //
243 |             // PREFER_PARAM_FOR_UNREG_OPTION: a non-registered 'name' is determined a parameter, the next arg
244 |             //                                will be the value of that option.
245 | 
246 |             assert(!(mode & argh::parser::PREFER_FLAG_FOR_UNREG_OPTION)
247 |                    || !(mode & argh::parser::PREFER_PARAM_FOR_UNREG_OPTION));
248 | 
249 |             bool preferParam = mode & argh::parser::PREFER_PARAM_FOR_UNREG_OPTION;
250 | 
251 |             if (is_param(name) || preferParam) {
252 |                 params_.insert({name, args_[i + 1]});
253 |                 ++i; // skip next value, it is not a free parameter
254 |                 continue;
255 |             } else {
256 |                 flags_.emplace(name);
257 |             }
258 |         };
259 |     }
260 | 
261 |     //////////////////////////////////////////////////////////////////////////
262 | 
263 |     inline string_stream parser::bad_stream() const {
264 |         string_stream bad;
265 |         bad.setstate(std::ios_base::failbit);
266 |         return bad;
267 |     }
268 | 
269 |     //////////////////////////////////////////////////////////////////////////
270 | 
271 |     inline bool parser::is_number(std::string const &arg) const {
272 |         // inefficient but simple way to determine if a string is a number (which can start with a '-')
273 |         std::istringstream istr(arg);
274 |         double number;
275 |         istr >> number;
276 |         return !(istr.fail() || istr.bad());
277 |     }
278 | 
279 |     //////////////////////////////////////////////////////////////////////////
280 | 
281 |     inline bool parser::is_option(std::string const &arg) const {
282 |         assert(0 != arg.size());
283 |         if (is_number(arg))
284 |             return false;
285 |         return '-' == arg[0];
286 |     }
287 | 
288 |     //////////////////////////////////////////////////////////////////////////
289 | 
290 |     inline std::string parser::trim_leading_dashes(std::string const &name) const {
291 |         auto pos = name.find_first_not_of('-');
292 |         return std::string::npos != pos ? name.substr(pos) : name;
293 |     }
294 | 
295 |     //////////////////////////////////////////////////////////////////////////
296 | 
297 |     inline bool argh::parser::got_flag(std::string const &name) const {
298 |         return flags_.end() != flags_.find(trim_leading_dashes(name));
299 |     }
300 | 
301 |     //////////////////////////////////////////////////////////////////////////
302 | 
303 |     inline bool argh::parser::is_param(std::string const &name) const {
304 |         return registeredParams_.count(name);
305 |     }
306 | 
307 |     //////////////////////////////////////////////////////////////////////////
308 | 
309 |     inline bool parser::operator[](std::string const &name) const {
310 |         return got_flag(name);
311 |     }
312 | 
313 |     //////////////////////////////////////////////////////////////////////////
314 | 
315 |     inline bool parser::operator[](std::initializer_list<char const *const> init_list) const {
316 |         return std::any_of(init_list.begin(), init_list.end(), [&](char const *const name) { return got_flag(name); });
317 |     }
318 | 
319 |     //////////////////////////////////////////////////////////////////////////
320 | 
321 |     inline std::string const &parser::operator[](size_t ind) const {
322 |         if (ind < pos_args_.size())
323 |             return pos_args_[ind];
324 |         return empty_;
325 |     }
326 | 
327 |     //////////////////////////////////////////////////////////////////////////
328 | 
329 |     inline string_stream parser::operator()(std::string const &name) const {
330 |         auto optIt = params_.find(trim_leading_dashes(name));
331 |         if (params_.end() != optIt)
332 |             return string_stream(optIt->second);
333 |         return bad_stream();
334 |     }
335 | 
336 |     //////////////////////////////////////////////////////////////////////////
337 | 
338 |     inline string_stream parser::operator()(std::initializer_list<char const *const> init_list) const {
339 |         for (auto &name : init_list) {
340 |             auto optIt = params_.find(trim_leading_dashes(name));
341 |             if (params_.end() != optIt)
342 |                 return string_stream(optIt->second);
343 |         }
344 |         return bad_stream();
345 |     }
346 | 
347 |     //////////////////////////////////////////////////////////////////////////
348 | 
349 |     template<typename T>
350 |     string_stream parser::operator()(std::string const &name, T &&def_val) const {
351 |         auto optIt = params_.find(trim_leading_dashes(name));
352 |         if (params_.end() != optIt)
353 |             return string_stream(optIt->second);
354 | 
355 |         std::ostringstream ostr;
356 |         ostr << def_val;
357 |         return string_stream(ostr.str()); // use default
358 |     }
359 | 
360 |     //////////////////////////////////////////////////////////////////////////
361 | 
362 |     // same as above but for a list of names. returns the first value to be found.
363 |     template<typename T>
364 |     string_stream parser::operator()(std::initializer_list<char const *const> init_list, T &&def_val) const {
365 |         for (auto &name : init_list) {
366 |             auto optIt = params_.find(trim_leading_dashes(name));
367 |             if (params_.end() != optIt)
368 |                 return string_stream(optIt->second);
369 |         }
370 |         std::ostringstream ostr;
371 |         ostr << def_val;
372 |         return string_stream(ostr.str()); // use default
373 |     }
374 | 
375 |     //////////////////////////////////////////////////////////////////////////
376 | 
377 |     inline string_stream parser::operator()(size_t ind) const {
378 |         if (pos_args_.size() <= ind)
379 |             return bad_stream();
380 | 
381 |         return string_stream(pos_args_[ind]);
382 |     }
383 | 
384 |     //////////////////////////////////////////////////////////////////////////
385 | 
386 |     template<typename T>
387 |     string_stream parser::operator()(size_t ind, T &&def_val) const {
388 |         if (pos_args_.size() <= ind) {
389 |             std::ostringstream ostr;
390 |             ostr << def_val;
391 |             return string_stream(ostr.str());
392 |         }
393 | 
394 |         return string_stream(pos_args_[ind]);
395 |     }
396 | 
397 |     //////////////////////////////////////////////////////////////////////////
398 | 
399 |     inline void parser::add_param(std::string const &name) {
400 |         registeredParams_.insert(trim_leading_dashes(name));
401 |     }
402 | 
403 |     //////////////////////////////////////////////////////////////////////////
404 | 
405 |     inline void parser::add_params(std::initializer_list<char const *const> init_list) {
406 |         for (auto &name : init_list)
407 |             registeredParams_.insert(trim_leading_dashes(name));
408 |     }
409 | }
410 | 
411 | #endif //UTILITIES_ARGH_H
412 | 


--------------------------------------------------------------------------------
/csrc/argv_parser.cc:
--------------------------------------------------------------------------------
  1 | #include "common.h"
  2 | #include "argv_parser.h"
  3 | 
  4 | 
  5 | char *string2chars(const string &str) {
  6 |     int len = str.length();
  7 |     auto *chars = new char[len + 1];
  8 |     for (int i = 0; i < str.length(); i++) {
  9 |         chars[i] = str[i];
 10 |     }
 11 |     chars[len] = 0;
 12 |     return chars;
 13 | }
 14 | 
 15 | string ensure_one_newline(const string &s) {
 16 |     string str(s);
 17 |     str.erase(std::remove(str.begin(), str.end(), '\n'), str.end());
 18 |     return str + "\n";
 19 | }
 20 | 
 21 | args_t parse(const string &s) {
 22 |     list<string> argv;
 23 |     ostringstream token;
 24 | 
 25 |     bool in_token;
 26 |     bool in_container;
 27 |     bool escaped;
 28 |     char container_start;
 29 |     char c;
 30 |     int len;
 31 |     int i;
 32 | 
 33 |     string str = ensure_one_newline(s);
 34 | 
 35 |     container_start = 0;
 36 |     in_token = false;
 37 |     in_container = false;
 38 |     escaped = false;
 39 | 
 40 |     len = static_cast<int>(str.length());
 41 | 
 42 |     for (i = 0; i < len; i++) {
 43 |         c = str[i];
 44 | 
 45 |         switch (c) {
 46 |             /* handle whitespace */
 47 |             case ' ':
 48 |             case '\t':
 49 |             case '\n':
 50 |                 if (!in_token)
 51 |                     continue;
 52 | 
 53 |                 if (in_container) {
 54 |                     token << c;
 55 |                     continue;
 56 |                 }
 57 | 
 58 |                 if (escaped) {
 59 |                     escaped = false;
 60 |                     token << c;
 61 |                     continue;
 62 |                 }
 63 | 
 64 |                 /* if reached here, we're at end of token */
 65 |                 in_token = false;
 66 |                 argv.push_back(token.str());
 67 |                 token.str(string());
 68 |                 token.clear();
 69 |                 break;
 70 | 
 71 |                 /* handle quotes */
 72 |             case '\'':
 73 |             case '\"':
 74 | 
 75 |                 if (escaped) {
 76 |                     token << c;
 77 |                     escaped = false;
 78 |                     continue;
 79 |                 }
 80 | 
 81 |                 if (!in_token) {
 82 |                     in_token = true;
 83 |                     in_container = true;
 84 |                     container_start = c;
 85 |                     continue;
 86 |                 }
 87 | 
 88 |                 if (in_container) {
 89 |                     if (c == container_start) {
 90 |                         in_container = false;
 91 |                         in_token = false;
 92 |                         argv.push_back(token.str());
 93 |                         token.str(string());
 94 |                         token.clear();
 95 |                         continue;
 96 |                     } else {
 97 |                         token << c;
 98 |                         continue;
 99 |                     }
100 |                 }
101 | 
102 |                 /* XXX in this case, we:
103 |                  *    1. have a quote
104 |                  *    2. are in a token
105 |                  *    3. and not in a container
106 |                  * e.g.
107 |                  *    hell"o
108 |                  *
109 |                  * what'str done here appears shell-dependent,
110 |                  * but overall, it'str an error.... i *think*
111 |                  */
112 |                 throw runtime_error("Parse Error! Bad quotes");
113 |             case '\\':
114 | 
115 |                 if (in_container && str[i + 1] != container_start) {
116 |                     token << c;
117 |                     continue;
118 |                 }
119 | 
120 |                 if (escaped) {
121 |                     token << c;
122 |                     continue;
123 |                 }
124 | 
125 |                 escaped = true;
126 |                 break;
127 | 
128 |             default:
129 |                 if (!in_token) {
130 |                     in_token = true;
131 |                 }
132 | 
133 |                 token << c;
134 |         }
135 |     }
136 | 
137 |     if (in_container)
138 |         throw runtime_error("Parse Error! Still in container\n");
139 | 
140 |     if (escaped)
141 |         throw runtime_error("Parse Error! Unused escape (\\)\n");
142 | 
143 |     return args_t(argv);
144 | }
145 | 


--------------------------------------------------------------------------------
/csrc/argv_parser.h:
--------------------------------------------------------------------------------
 1 | #ifndef ARGV_PARSER_H
 2 | #define ARGV_PARSER_H
 3 | 
 4 | #include "common.h"
 5 | 
 6 | /**
 7 |  * Same as strdup. Make a deep copy of a string and return char*
 8 |  * @param s
 9 |  * @return
10 |  */
11 | char *string2chars(const string &str);
12 | 
/**
 * Owning (argc, argv) pair in the shape expected by a main()-style entry point.
 *
 * argv holds argc heap-allocated, NUL-terminated C strings followed by one
 * terminating null pointer (argv[argc] == nullptr), so it can also be handed to
 * code that scans for the null entry instead of using argc.
 *
 * The object owns all of that memory: copying makes a deep copy, moving transfers
 * ownership and leaves the source with argc == 0 and argv == nullptr.
 */
struct args_t {
    int argc;
    char **argv;

    /** Builds argv from `args`, one C string per list element, in list order. */
    args_t(const std::list<string> &args)
            : argc(static_cast<int>(args.size())), argv(new char *[args.size() + 1]) {
        int idx = 0;
        for (const string &arg : args) {
            argv[idx] = new char[arg.size() + 1];
            arg.copy(argv[idx], arg.size());
            argv[idx][arg.size()] = '\0';
            ++idx;
        }
        argv[argc] = nullptr;
    }

    /** Deep copy: every string is duplicated, so both objects can be destroyed safely. */
    args_t(const args_t &other) : argc(other.argc), argv(nullptr) {
        if (other.argv == nullptr) {
            return; // copying a moved-from object yields another empty one
        }
        argv = new char *[argc + 1];
        for (int i = 0; i < argc; i++) {
            const std::size_t len = std::char_traits<char>::length(other.argv[i]);
            argv[i] = new char[len + 1];
            std::char_traits<char>::copy(argv[i], other.argv[i], len + 1);
        }
        argv[argc] = nullptr;
    }

    /** Takes over the arrays of `other`, which is left empty. */
    args_t(args_t &&other) noexcept : argc(other.argc), argv(other.argv) {
        other.argc = 0;
        other.argv = nullptr;
    }

    /** Copy/move assignment: the by-value parameter is the copy; the old state dies with it. */
    args_t &operator=(args_t other) noexcept {
        const int tmp_argc = argc;
        char **const tmp_argv = argv;
        argc = other.argc;
        argv = other.argv;
        other.argc = tmp_argc;
        other.argv = tmp_argv;
        return *this;
    }

    ~args_t() {
        for (int i = 0; i < argc; i++) {
            delete[] argv[i];
        }
        delete[] argv;
    }
};
38 | 
39 | /**
40 |  * Parse a string of arguments into char** (such that can be used by main())
41 |  * This function is string aware, e.g. "hello world" is one arg, not two
42 |  * @param s
43 |  * @return
44 |  */
45 | args_t parse(const string &s);
46 | 
47 | #endif //ARGV_PARSER_H
48 | 


--------------------------------------------------------------------------------
/csrc/calcdb.cc:
--------------------------------------------------------------------------------
 1 | #include "calcdb.h"
 2 | #include <string.h>
 3 | 
 4 | void DbaseCtrlBlk::init(const string &infilename, int buf_sz) {
 5 |     fd = open(infilename.c_str(), O_RDONLY | O_BINARY);
 6 |     if (fd < 0) {
 7 |         throw runtime_error("ERROR: InvalidFile -- Dbase_Ctrl_Blk()");
 8 |     }
 9 | 
10 |     buf_size = buf_sz;
11 |     buf = new int[buf_sz];
12 |     cur_buf_pos = 0;
13 |     cur_blk_size = 0;
14 |     readall = 0;
15 |     endpos = lseek(fd, 0, SEEK_END);
16 | }
17 | 
18 | DbaseCtrlBlk::~DbaseCtrlBlk() {
19 |     delete[] buf;
20 |     close(fd);
21 | }
22 | 
23 | void DbaseCtrlBlk::get_next_trans_ext() {
24 |     // Need to get more items from file
25 |     auto res = cur_blk_size - cur_buf_pos;
26 |     if (res > 0) {
27 |         // First copy partial transaction to beginning of buffer
28 |         memcpy((void *) buf,
29 |                (void *) (buf + cur_buf_pos),
30 |                res * INT_SIZE);
31 |         cur_blk_size = res;
32 |     } else {
33 |         // No partial transaction in buffer
34 |         cur_blk_size = 0;
35 |     }
36 | 
37 |     res = read(fd, (void *) (buf + cur_blk_size), ((buf_size - cur_blk_size) * INT_SIZE));
38 | 
39 |     if (res < 0) {
40 |         throw runtime_error("reading in database");
41 |     }
42 |     cur_blk_size += res / INT_SIZE;
43 |     cur_buf_pos = 0;
44 | }
45 | 
46 | void DbaseCtrlBlk::get_first_blk() {
47 |     readall = 0;
48 |     lseek(fd, 0, SEEK_SET);
49 |     cur_blk_size = (read(fd, (void *) buf, (buf_size * INT_SIZE))) / INT_SIZE;
50 |     if (cur_blk_size < 0) {
51 |         throw runtime_error("get_first_blk");
52 |     }
53 |     cur_buf_pos = 0;
54 | }
55 | 
56 | void DbaseCtrlBlk::get_next_trans(int *&lbuf, int &nitems, int &tid, int &cid) {
57 |     if (cur_buf_pos + TRANSOFF >= cur_blk_size ||
58 |         cur_buf_pos + buf[cur_buf_pos + TRANSOFF - 1] + TRANSOFF > cur_blk_size) {
59 |         if (lseek(fd, 0, SEEK_CUR) == endpos) readall = 1;
60 |         if (!readall) {
61 |             // Need to get more items from file
62 |             get_next_trans_ext();
63 |         }
64 |     }
65 | 
66 |     if (!readall) {
67 |         cid = buf[cur_buf_pos];
68 |         tid = buf[cur_buf_pos + TRANSOFF - 2];
69 |         nitems = buf[cur_buf_pos + TRANSOFF - 1];
70 |         lbuf = buf + cur_buf_pos + TRANSOFF;
71 |         cur_buf_pos += nitems + TRANSOFF;
72 |     }
73 | }
74 | 
75 | 
76 | 


--------------------------------------------------------------------------------
/csrc/calcdb.h:
--------------------------------------------------------------------------------
 1 | #ifndef __DATABASE_H
 2 | #define __DATABASE_H
 3 | 
 4 | #include "common.h"
 5 | 
 6 | #define DCBBUFSZ 2048
 7 | #define TRANSOFF 3
 8 | 
 9 | #ifndef O_BINARY
10 | #define O_BINARY 0
11 | #endif
12 | 
13 | class DbaseCtrlBlk {
14 | public:
15 |     DbaseCtrlBlk() = default;
16 | 
17 |     void init(const string& infilename, int buf_sz = DCBBUFSZ);
18 | 
19 |     ~DbaseCtrlBlk();
20 | 
21 |     void get_next_trans_ext();
22 | 
23 |     void get_first_blk();
24 | 
25 |     void get_next_trans(int *&lbuf, int &nitems, int &tid, int &cid);
26 | 
27 |     int eof() {
28 |         return (readall == 1);
29 |     }
30 | 
31 |     int fd;
32 |     int buf_size;
33 |     int *buf;
34 |     unsigned long cur_blk_size;
35 |     int cur_buf_pos;
36 |     unsigned long endpos;
37 |     char readall;
38 | };
39 | 
40 | #endif //__DATABASE_H
41 | 
42 | 
43 | 
44 | 
45 | 
46 | 


--------------------------------------------------------------------------------
/csrc/common.cc:
--------------------------------------------------------------------------------
  1 | #include <random>
  2 | #include "common.h"
  3 | 
// Byte sizes of int and float, used for raw binary reads and writes.
const int INT_SIZE = sizeof(int);
const int FLOAT_SIZE = sizeof(float);

// Shared empty vector of unsigned ints.
vuint vuint_null;
// Character pool for random_id(); alphanumlen excludes the terminating NUL.
const char alphanum[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
const int alphanumlen = sizeof(alphanum) - 1;

// Temporary-file directory, computed once during static initialisation by get_temp_folder().
const string TMPDIR = get_temp_folder();
 12 | 
 13 | unsigned long file_size(fstream& file) {
 14 |     std::streampos fsize = 0;
 15 |     fsize = file.tellg();
 16 |     file.seekg( 0, std::ios::end);
 17 |     fsize = file.tellg() - fsize;
 18 |     file.clear();
 19 |     file.seekg( 0, std::ios::beg);
 20 |     return static_cast<unsigned long>(fsize);
 21 | }
 22 | 
 23 | int* read_file(fstream& file, unsigned long flen) {
 24 |     if (!flen) return nullptr;
 25 | 
 26 |     auto buffer = new char[flen];
 27 |     auto intbuf = (int *) buffer;
 28 |     file.read(buffer, flen);
 29 |     return intbuf;
 30 | }
 31 | 
 32 | bool file_exists(const string &name) {
 33 |     ifstream f(name.c_str());
 34 |     return f.good();
 35 | }
 36 | 
 37 | /* Reads a file and returns the number of lines in this file. */
 38 | int num_lines(const string &filename) {
 39 |     FILE *file = fopen(filename.c_str(), "r");
 40 |     int lines = 0;
 41 |     int c;
 42 |     int last = '\n';
 43 |     while (EOF != (c = fgetc(file))) {
 44 |         if (c == '\n' && last != '\n') {
 45 |             ++lines;
 46 |         }
 47 |         last = c;
 48 |     }
 49 |     fclose(file);
 50 |     return lines;
 51 | }
 52 | 
 53 | 
 54 | bool starts_with(const string &haystack, const string &needle) {
 55 |     return needle.length() <= haystack.length()
 56 |            && equal(needle.begin(), needle.end(), haystack.begin());
 57 | }
 58 | 
 59 | 
 60 | list<string> list_files(const string &folder, const string &prefix) {
 61 |     struct dirent *entry;
 62 |     list<string> retval;
 63 |     DIR *dir = opendir(folder.c_str());
 64 |     if (dir == nullptr) {
 65 |         return retval;
 66 |     }
 67 | 
 68 |     bool check_prefix = prefix.length() > 0;
 69 | 
 70 |     while ((entry = readdir(dir)) != nullptr) {
 71 |         string filename = entry->d_name;
 72 |         if (check_prefix && starts_with(filename, prefix)) {
 73 |             retval.push_back(filename);
 74 |         }
 75 |     }
 76 |     closedir(dir);
 77 | 
 78 |     return retval;
 79 | }
 80 | 
 81 | 
 82 | string random_id(const int len) {
 83 |     std::random_device rd;
 84 |     std::mt19937 gen(rd());
 85 |     std::uniform_int_distribution<> dis(0, alphanumlen - 1);
 86 | 
 87 |     auto *s = new char[len + 1];
 88 |     int rand_idx;
 89 |     for (int i = 0; i < len; ++i) {
 90 |         rand_idx = dis(gen);
 91 |         s[i] = alphanum[rand_idx];
 92 |     }
 93 |     s[len] = 0;
 94 |     string retval(s);
 95 |     delete [] s;
 96 |     return retval;
 97 | }
 98 | 
 99 | string get_temp_folder() {
100 | #if defined(_MSC_VER) || defined(MS_WINDOWS) ||  defined(WIN32)
101 |     string buffer;
102 |     buffer.resize(1000);
103 |     const auto new_size = GetTempPathA(buffer.size(), &buffer[0]); //deal with newsize == 0
104 |     buffer.resize(new_size);
105 |     return buffer;
106 | #else
107 |     return "/tmp/";
108 | #endif
109 | }
110 | 


--------------------------------------------------------------------------------
/csrc/common.h:
--------------------------------------------------------------------------------
 1 | #ifndef UTILS_H
 2 | #define UTILS_H
 3 | 
 4 | #include <iostream>
 5 | #include <fstream>
 6 | #include <sstream>
 7 | #include <list>
 8 | #include <vector>
 9 | #include <string>
10 | #include <cerrno>
11 | #include <cstdio>
12 | #include <cstdlib>
13 | #include <sys/stat.h>
14 | #include <cmath>
15 | #include <exception>
16 | #include <stdexcept>
17 | #include <memory>
18 | #include <algorithm>
19 | #if defined(_MSC_VER) || defined(MS_WINDOWS) ||  defined(WIN32)
20 | #include <io.h>
21 | #include <fcntl.h>
22 | #include "dirent-win.h"
23 | #else
24 | #include <fcntl.h>
25 | #include <unistd.h>
26 | #include <dirent.h>
27 | #endif
28 | #include <sys/types.h>
29 | #include <sys/stat.h>
30 | #include "argh.h"
31 | 
32 | #ifndef INT_MAX
33 | #define INT_MAX 2147483647
34 | #endif
35 | 
36 | using std::cout;
37 | using std::cerr;
38 | using std::endl;
39 | using std::flush;
40 | 
41 | using std::ostringstream;
42 | using std::string;
43 | using std::ifstream;
44 | using std::ofstream;
45 | using std::ostream;
46 | using std::ios;
47 | using std::fstream;
48 | 
49 | using std::list;
50 | using std::vector;
51 | 
52 | 
53 | using std::shared_ptr;
54 | using std::unique_ptr;
55 | using std::make_shared;
56 | 
57 | using std::runtime_error;
58 | using std::exception;
59 | 
60 | #define min(a, b) ((a) < (b) ? (a) : (b))
61 | 
// Result bundle: one count plus three text fields.
// NOTE(review): the comments below are inferred from the field names only - confirm against the code that fills this struct.
struct result_t {
    int nsequences;   // presumably the number of sequences found
    string seqstrm;   // presumably the textual sequence output
    string logger;    // presumably the collected log text
    string summary;   // presumably the summary text
};
68 | 
69 | #define ulong int
70 | extern const int INT_SIZE;
71 | extern const int FLOAT_SIZE;
72 | 
73 | typedef vector<int> vint;
74 | typedef vector<vector<int>> vvint;
75 | typedef vector<unsigned int> vuint;
76 | 
77 | extern vuint vuint_null;
78 | 
79 | bool file_exists(const string &name);
80 | 
81 | unsigned long file_size(fstream& file);
82 | 
83 | int* read_file(fstream& file, unsigned long flen);
84 | 
85 | int num_lines(const string &filename);
86 | 
87 | list<string> list_files(const string& folder, const string& prefix = "");
88 | 
89 | string random_id(const int len);
90 | 
91 | string get_temp_folder();
92 | 
93 | extern const string TMPDIR;
94 | 
95 | #endif
96 | 


--------------------------------------------------------------------------------
/csrc/exttpose.cc:
--------------------------------------------------------------------------------
  1 | #include "common.h"
  2 | #include "exttpose.h"
  3 | #include "calcdb.h"
  4 | #include "Env.h"
  5 | #include "TransArray.h"
  6 | 
  7 | 
  8 | int cmp2it(const void *a, const void *b) {
  9 |     auto *ary = (int *) a;
 10 |     auto *bry = (int *) b;
 11 |     if (ary[0] < bry[0]) return -1;
 12 |     else if (ary[0] > bry[0]) return 1;
 13 |     else {
 14 |         if (ary[1] < bry[1]) return -1;
 15 |         else if (ary[1] > bry[1]) return 1;
 16 |         else return 0;
 17 |     }
 18 | }
 19 | 
 20 | void Exttpose::sort_get_l2(int &l2cnt, fstream &file, ofstream &ofd, vector<unsigned char> &cntary, bool use_seq) {
 21 |     //write 2-itemsets counts to file
 22 | 
 23 |     int i, j, k;
 24 |     int fcnt;
 25 |     int lit;
 26 |     int itbuf[3];
 27 |     int *sortary;
 28 | 
 29 |     unsigned long filesize = file_size(file);
 30 | 
 31 |     if (filesize > 0) {
 32 |         sortary = read_file(file, filesize);
 33 | 
 34 |         if (!file) {
 35 |             throw runtime_error("Error reading file.");
 36 |         }
 37 |         qsort(sortary, (filesize / sizeof(int)) / 2, 2 * INT_SIZE, cmp2it);
 38 |     }
 39 | 
 40 |     unsigned long numel = filesize / INT_SIZE;
 41 |     i = 0;
 42 |     fcnt = 0;
 43 |     for (j = 0; j < numfreq; j++) {
 44 |         if (args.use_seq) k = 0;
 45 |         else k = j + 1;
 46 |         for (; k < numfreq; k++) {
 47 |             fcnt = 0;
 48 |             if (filesize > 0 && i < numel) {
 49 |                 while (i < numel &&
 50 |                        j == freqidx[sortary[i]] && k == freqidx[sortary[i + 1]]) {
 51 |                     fcnt += 256;
 52 |                     i += 2;
 53 |                 }
 54 |             }
 55 |             if (args.use_seq) fcnt += (int) cntary[j * numfreq + k];
 56 |             else {
 57 |                 lit = j;
 58 |                 lit = (offsets[lit] - lit - 1);
 59 |                 fcnt += (int) cntary[lit + k];
 60 |             }
 61 | 
 62 |             if (fcnt >= args.MINSUPPORT) {
 63 |                 if (args.write_only_fcnt) {
 64 |                     ofd.write((char *) &fcnt, INT_SIZE);
 65 |                 } else {
 66 |                     itbuf[0] = backidx[j];
 67 |                     itbuf[1] = backidx[k];
 68 |                     itbuf[2] = fcnt;
 69 |                     ofd.write((char *) itbuf, 3 * sizeof(int));
 70 |                 }
 71 |                 l2cnt++;
 72 |             }
 73 |         }
 74 |     }
 75 |     if (filesize > 0) delete[] sortary;
 76 | }
 77 | 
 78 | 
 79 | void Exttpose::process_cust(int fcnt, fstream &seqfd, fstream &isetfd) {
 80 |     int j, k;
 81 |     int ii1, ii2, lit;
 82 | 
 83 |     for (k = 0; k < fcnt; k++) {
 84 |         for (j = k; j < fcnt; j++) {
 85 |             if (args.use_seq && extary[fidx[j]].size() > 0) {
 86 |                 lit = extary[fidx[j]].size() - 1;
 87 |                 if (extary[fidx[k]][0] < extary[fidx[j]][lit]) {
 88 |                     if ((++seq2[fidx[k] * numfreq + fidx[j]]) == 0) {
 89 |                         seqfd.write((char *) &backidx[fidx[k]], INT_SIZE);
 90 |                         seqfd.write((char *) &backidx[fidx[j]], INT_SIZE);
 91 |                     }
 92 |                 }
 93 |             }
 94 |             if (j > k) {
 95 |                 if (fidx[k] < fidx[j]) {
 96 |                     ii1 = fidx[k];
 97 |                     ii2 = fidx[j];
 98 |                 } else {
 99 |                     ii2 = fidx[k];
100 |                     ii1 = fidx[j];
101 |                 }
102 |                 lit = offsets[ii1] - ii1 - 1;
103 |                 if (ocust[lit + ii2] == 1) {
104 |                     if ((++itcnt2[lit + ii2]) == 0) {
105 |                         isetfd.write((char *) &backidx[ii1], INT_SIZE);
106 |                         isetfd.write((char *) &backidx[ii2], INT_SIZE);
107 |                         //itcnt2[lit+ii2] = 0;
108 |                     }
109 |                     ocust[lit + ii2] = 0;
110 |                 }
111 | 
112 |                 if (extary[fidx[k]].size() > 0) {
113 |                     lit = extary[fidx[k]].size() - 1;
114 |                     if (extary[fidx[j]][0] < extary[fidx[k]][lit]) {
115 |                         if ((++seq2[fidx[j] * numfreq + fidx[k]]) == 0) {
116 |                             seqfd.write((char *) &backidx[fidx[j]], INT_SIZE);
117 |                             seqfd.write((char *) &backidx[fidx[k]], INT_SIZE);
118 |                         }
119 |                     }
120 |                 }
121 |             }
122 |         }
123 |         extary[fidx[k]].reset();
124 |     }
125 | }
126 | 
127 | void Exttpose::do_invert_db(int pblk, int mincustid, int maxcustid) {
128 |     // Writes the inverted database, one output file per partition (args.output, plus a
129 |     // ".P<p>" suffix when there are several). Each transaction's tid is added to the
130 |     // TransArray of every frequent item it contains (args.use_diff: does not contain).
131 |     // Partition p covers customer ids [plb, pub); throws runtime_error if open() fails.
132 |     int numitem = 0, tid = 0, custid = 0; // DD: to avoid gcc warning uninitialized var
133 |     int *buf = nullptr; // DD: to avoid gcc warning uninitialized var
134 |     ostringstream tmpname;
135 |     int fd;
136 |     int i, j, k, idx;
137 | 
138 |     // The scan starts once, before the partition loop: each partition resumes at the
139 |     // transaction on which the previous one stopped.
140 |     dcb.get_first_blk();
141 |     dcb.get_next_trans(buf, numitem, tid, custid);
142 |     int ocid;
143 |     for (int p = 0; p < args.num_partitions; p++) {
144 |         tmpname << args.output;
145 |         if (args.num_partitions > 1) {
146 |             tmpname << ".P" << p;
147 |         }
148 |         string tmpnam = tmpname.str();
149 |         tmpname.str(string()); // empty the stream for the next partition's name
150 | 
151 |         if ((fd = open(tmpnam.c_str(), (O_WRONLY | O_CREAT | O_TRUNC | O_BINARY), 0666)) < 0) {
152 |             throw runtime_error("Can't open out file");
153 |         }
154 | 
155 |         for (i = 0; i < numfreq; i++) {
156 |             extary[i].reset();
157 |         }
158 |         // Customer-id bounds of this partition. The clamp has to be the same one tpose()
159 |         // applies when it computes the per-partition offsets (pub > maxcustid): with the
160 |         // former ">=" a partition whose pub equalled maxcustid also took the last customer,
161 |         // which the offsets count in the following partition.
162 |         int plb = p * pblk + mincustid;
163 |         int pub = plb + pblk;
164 |         if (pub > maxcustid) pub = maxcustid + 1;
165 |         env.logger << "BOUNDS " << plb << " " << pub << endl;
166 |         int fcnt;
167 |         for (; !dcb.eof() && custid < pub;) {
168 |             fcnt = 0;
169 |             ocid = custid;
170 |             // consume the run of consecutive transactions that belong to customer ocid
171 |             while (!dcb.eof() && ocid == custid && custid < pub) {
172 |                 if (args.use_diff) {
173 |                     //add this tid to all items not in the trans
174 |                     k = 0;
175 |                     for (j = 0; j < numitem; j++) {
176 |                         if (freqidx[buf[j]] == -1) continue;
177 | 
178 |                         while (backidx[k] < buf[j]) {
179 |                             idx = k;
180 |                             if (!args.use_newformat)
181 |                                 extary[idx].add(fd, tid, args.use_seq, p);
182 |                             else extary[idx].add(fd, tid, args.use_seq, p, custid);
183 |                             k++;
184 |                         }
185 |                         k++; //skip over buf[j]
186 |                     }
187 |                     for (; k < numfreq; k++) {
188 |                         idx = k;
189 |                         if (!args.use_newformat)
190 |                             extary[idx].add(fd, tid, args.use_seq, p);
191 |                         else extary[idx].add(fd, tid, args.use_seq, p, custid);
192 |                     }
193 |                 } else {
194 |                     // add this tid to all items in the trans
195 |                     for (j = 0; j < numitem; j++) {
196 |                         idx = freqidx[buf[j]];
197 |                         if (idx != -1) {
198 |                             if (!args.use_newformat) {
199 |                                 // sequences without the new format: the first tid of an
200 |                                 // item within a customer is added with the customer id,
201 |                                 // and the item is kept in fidx until the customer ends
202 |                                 if (args.use_seq && extary[idx].flg() == 0) {
203 |                                     fidx[fcnt] = idx;
204 |                                     fcnt++;
205 |                                     extary[idx].setflg(1);
206 |                                     extary[idx].add(fd, tid, args.use_seq, p, custid);
207 |                                 } else {
208 |                                     extary[idx].add(fd, tid, args.use_seq, p);
209 |                                 }
210 |                             } else {
211 |                                 extary[idx].add(fd, tid, args.use_seq, p, custid);
212 |                             }
213 |                         }
214 |                     }
215 |                 }
216 | 
217 |                 dcb.get_next_trans(buf, numitem, tid, custid);
218 |             }
219 |             if (!args.use_newformat && args.use_seq) {
220 |                 for (k = 0; k < fcnt; k++) {
221 |                     extary[fidx[k]].setlastpos();
222 |                     extary[fidx[k]].setflg(0);
223 |                 }
224 |                 fcnt = 0;
225 |             }
226 |         }
227 | 
228 |         for (i = 0; i < numfreq; i++) {
229 |             extary[i].flushbuf(fd, args.use_seq, p);
230 |         }
231 |         close(fd);
232 |     }
233 |     env.logger << "WROTE INVERT " << endl;
234 | }
235 | 
236 | Exttpose::Exttpose(Env &env_, ExttposeArgument &args_) : env(env_), args(args_) {
237 |     // Hand the input file name to the database reader, then size the four per-item tables.
238 |     dcb.init(args.input);
239 |     const auto item_slots = args.DBASE_MAXITEM;
240 |     for (auto *table : {&itcnt, &itlen, &freqidx}) table->resize(item_slots);
241 |     ocnt.resize(item_slots, -1); // -1: no customer recorded for the item yet
242 | }
243 | 
244 | void Exttpose::tpose() {
245 |     // Driver of the transposition. In order:
246 |     //   1. scan the database and count, per item, its occurrences (itlen) and, for
247 |     //      sequences, the number of distinct customers that contain it (itcnt);
248 |     //   2. give items whose count reaches args.MINSUPPORT a dense index (freqidx / backidx);
249 |     //   3. if args.do_invert, write one offset file (args.idxfn) per partition;
250 |     //   4. if args.do_l2, count pairs of frequent items and write args.it2fn / args.seqfn;
251 |     //   5. if args.do_invert, write the inverted database with do_invert_db().
252 |     // Throws runtime_error when an output or temporary file cannot be opened.
253 |     int i, j, l;
254 |     int idx;
255 |     int custid = 0, tid = 0, numitem = 0, fcnt; // initialised: only get_next_trans sets them
256 |     ofstream ofd;
257 |     int sumsup = 0, sumdiff = 0;
258 | 
259 |     //count 1 items
260 |     int *buf = nullptr;
261 |     dcb.get_first_blk();
262 |     dcb.get_next_trans(buf, numitem, tid, custid);
263 |     int mincustid = custid;
264 |     while (!dcb.eof()) {
265 |         for (j = 0; j < numitem; j++) {
266 |             itlen[buf[j]]++;
267 |             // ocnt holds the last customer counted for the item: one itcnt per customer
268 |             if (args.use_seq && ocnt[buf[j]] != custid) {
269 |                 itcnt[buf[j]]++;
270 |                 ocnt[buf[j]] = custid;
271 |             }
272 |         }
273 |         dcb.get_next_trans(buf, numitem, tid, custid);
274 |     }
275 |     int maxcustid = static_cast<int>(custid);
276 |     env.logger << "MINMAX " << mincustid << " " << maxcustid << endl;
277 | 
278 |     for (i = 0; i < args.DBASE_MAXITEM; i++) {
279 |         if (args.use_seq) {
280 |             if (itcnt[i] >= args.MINSUPPORT) {
281 |                 env.logger << i << " SUPP " << itcnt[i] << endl;
282 |                 freqidx[i] = numfreq;
283 |                 numfreq++;
284 |             } else freqidx[i] = static_cast<int>(-1);
285 |         } else {
286 |             if (itlen[i] >= args.MINSUPPORT) {
287 |                 freqidx[i] = numfreq;
288 |                 numfreq++;
289 |                 sumsup += itlen[i];
290 |                 sumdiff += (args.DBASE_NUM_TRANS - itlen[i]);
291 |             } else freqidx[i] = static_cast<int>(-1);
292 |         }
293 |     }
294 | 
295 |     backidx.resize(numfreq);
296 | 
297 |     numfreq = 0;
298 |     for (i = 0; i < args.DBASE_MAXITEM; i++) {
299 |         if (args.use_seq) {
300 |             if (itcnt[i] >= args.MINSUPPORT)
301 |                 backidx[numfreq++] = i;
302 |         } else {
303 |             if (itlen[i] >= args.MINSUPPORT)
304 |                 backidx[numfreq++] = i;
305 |         }
306 |     }
307 | 
308 |     env.logger << "numfreq " << numfreq << " :  " << " SUMSUP SUMDIFF = " << sumsup << " " << sumdiff << endl;
309 |     env.summary << " F1stats = [ " << numfreq << " " << sumsup << " " << sumdiff << " ]";
310 | 
311 |     if (numfreq == 0) return;
312 | 
313 |     long extarysz = args.AMEM / numfreq;
314 |     extarysz /= INT_SIZE;
315 |     env.logger << "EXTRARYSZ " << extarysz << endl;
316 |     if (extarysz < 2) extarysz = 2;
317 |     extary.reserve(numfreq);
318 |     for (i = 0; i < numfreq; i++) {
319 |         extary.emplace_back(extarysz, args.num_partitions);
320 |     }
321 | 
322 |     int plb, pub, pblk;
323 |     pblk = static_cast<int>(ceil(((double) (maxcustid - mincustid + 1)) / args.num_partitions));
324 |     if (args.do_invert) {
325 |         if (args.num_partitions > 1) {
326 |             dcb.get_first_blk();
327 |             dcb.get_next_trans(buf, numitem, tid, custid);
328 |         }
329 |         for (j = 0; j < args.num_partitions; j++) {
330 |             //construct offsets for 1-itemsets
331 |             ostringstream tmpnamestrm;
332 |             tmpnamestrm << args.idxfn;
333 |             if (args.num_partitions > 1) {
334 |                 tmpnamestrm << ".P" << j;
335 |                 plb = j * pblk + mincustid;
336 |                 pub = plb + pblk;
337 |                 if (pub > maxcustid) pub = maxcustid + 1;
338 |                 std::fill(itcnt.begin(), itcnt.end(), 0);
339 |                 std::fill(ocnt.begin(), ocnt.end(), -1);
340 |                 std::fill(itlen.begin(), itlen.end(), 0);
341 |                 // recount the items over the customers of this partition only
342 |                 for (; !dcb.eof() && custid < pub;) {
343 |                     for (i = 0; i < numitem; i++) {
344 |                         itlen[buf[i]]++;
345 |                         if (args.use_seq && ocnt[buf[i]] != custid) {
346 |                             itcnt[buf[i]]++;
347 |                             ocnt[buf[i]] = custid;
348 |                         }
349 |                     }
350 |                     dcb.get_next_trans(buf, numitem, tid, custid);
351 |                 }
352 |             }
353 |             string tmpnam = tmpnamestrm.str();
354 |             env.logger << "OPENED " << tmpnam << endl;
355 |             ofd.open(tmpnam, ios::binary);
356 |             if (!ofd) {
357 |                 throw runtime_error("Can't open file " + tmpnam);
358 |             }
359 | 
360 |             int file_offset = 0;
361 |             int null = -1;
362 |             for (i = 0; i < args.DBASE_MAXITEM; i++) {
363 |                 // frequent item: record its start offset, then advance the running offset
364 |                 // by the size of its list; infrequent item: running offset (-x) or -1
365 |                 if (freqidx[i] != -1) {
366 |                     ofd.write((char *) &file_offset, INT_SIZE);
367 |                     extary[freqidx[i]].set_offset(file_offset, j);
368 |                     if (args.use_seq) {
369 |                         if (args.use_newformat) file_offset += (2 * itlen[i]);
370 |                         else file_offset += (2 * itcnt[i] + itlen[i]);
371 |                     } else {
372 |                         if (args.use_diff) file_offset += (args.DBASE_NUM_TRANS - itlen[i]);
373 |                         else file_offset += itlen[i];
374 |                     }
375 |                 } else if (args.no_minus_off) {
376 |                     ofd.write((char *) &file_offset, INT_SIZE);
377 |                 } else ofd.write((char *) &null, INT_SIZE);
378 |             }
379 |             env.logger << "OFF " << i << " " << file_offset << endl;
380 |             ofd.write((char *) &file_offset, INT_SIZE);
381 |             ofd.close();
382 |         }
383 |     }
384 | 
385 |     env.logger << "Wrote Offt " << endl;
386 | 
387 |     fidx.resize(numfreq);
388 | 
389 |     int ocid = -1;
390 |     if (args.do_l2) {
391 |         fstream isetfd;
392 |         fstream seqfd;
393 |         string tmpseq, tmpiset;
394 |         // The temp files are written here and handed to sort_get_l2() as fstreams, so open
395 |         // them read/write: fstream::open() fails for a mode without ios::in or ios::out.
396 |         const ios::openmode tmpmode = ios::in | ios::out | ios::binary | ios::trunc;
397 |         if (args.use_seq) {
398 |             tmpseq = args.tmpfn;
399 |             seqfd.open(tmpseq, tmpmode);
400 |             if (!seqfd.is_open()) {
401 |                 throw runtime_error("Can't open out file");
402 |             }
403 |         }
404 |         tmpiset = args.tmpfn + "iset";
405 |         isetfd.open(tmpiset, tmpmode);
406 |         if (!isetfd.is_open()) {
407 |             throw runtime_error("Can't open out file");
408 |         }
409 |         // resize, not reserve: process_cust() increments seq2[...], so the counters must exist
410 |         if (args.use_seq) {
411 |             seq2.resize(numfreq * numfreq);
412 |         }
413 |         itcnt2.resize(numfreq * (numfreq - 1) / 2);
414 |         ocust.resize(numfreq * (numfreq - 1) / 2);
415 |         offsets.resize(numfreq);
416 | 
417 |         int offt = 0;
418 |         int start = static_cast<int>(numfreq);
419 |         for (i = start - 1; i >= 0; i--) {
420 |             offsets[numfreq - i - 1] = offt;
421 |             offt += i;
422 |         }
423 | 
424 |         ocid = -1;
425 |         int lit;
426 |         //count 2-itemsets
427 |         dcb.get_first_blk();
428 |         dcb.get_next_trans(buf, numitem, tid, custid);
429 |         while (!dcb.eof()) {
430 |             fcnt = 0;
431 |             ocid = custid;
432 |             while (!dcb.eof() && ocid == custid) {
433 |                 for (j = 0; j < numitem; j++) {
434 |                     idx = freqidx[buf[j]];
435 |                     if (idx != -1) {
436 |                         if (args.use_seq) {
437 |                             if (extary[idx].size() == 0) {
438 |                                 fidx[fcnt] = idx;
439 |                                 fcnt++;
440 |                                 // first tid of the item for this customer: slot 0 keeps
441 |                                 // that first tid, slot 1 is overwritten with each later one
442 |                                 extary[idx].setitem(0, tid);
443 |                                 extary[idx].setitem(1, tid);
444 |                                 extary[idx].setsize(2);
445 |                             } else {
446 |                                 extary[idx].setitem(1, tid);
447 |                             }
448 | 
449 |                             lit = offsets[idx] - idx - 1;
450 |                             for (l = j + 1; l < numitem; l++) {
451 |                                 if (freqidx[buf[l]] != -1) {
452 |                                     ocust[lit + freqidx[buf[l]]] = 1;
453 |                                 }
454 |                             }
455 |                         } else {
456 |                             lit = offsets[idx] - idx - 1;
457 |                             for (l = j + 1; l < numitem; l++) {
458 |                                 if (freqidx[buf[l]] != -1) {
459 |                                     if ((++itcnt2[lit + freqidx[buf[l]]]) == 0) { // 8-bit counter wrapped
460 |                                         isetfd.write((char *) &buf[j], INT_SIZE);
461 |                                         isetfd.write((char *) &buf[l], INT_SIZE);
462 |                                     }
463 |                                 }
464 |                             }
465 |                         }
466 |                     }
467 |                 }
468 |                 dcb.get_next_trans(buf, numitem, tid, custid);
469 |             }
470 | 
471 |             if (args.use_seq) {
472 |                 process_cust(fcnt, seqfd, isetfd);
473 |             }
474 |         }
475 |         ocust.clear();
476 |         env.logger << "2-IT " << " " << endl;
477 | 
478 |         //write 2-itemsets counts to file
479 |         int l2cnt = 0;
480 |         if (args.use_seq) {
481 |             ofd.open(args.seqfn, ios::binary);
482 |             if (ofd.fail()) {
483 |                 throw runtime_error("Can't open seq file");
484 |             }
485 |             sort_get_l2(l2cnt, seqfd, ofd, seq2, true);
486 | 
487 |             ofd.close();
488 |             env.logger << "SEQ2 cnt " << l2cnt << endl;
489 |             env.summary << " " << l2cnt;
490 |         }
491 |         int seqs = l2cnt;
492 | 
493 |         ofd.open(args.it2fn, ios::binary);
494 |         // isetfd holds the item pairs written while counting above
495 |         if (ofd.fail()) {
496 |             throw runtime_error("Can't open it2 file");
497 |         }
498 |         sort_get_l2(l2cnt, isetfd, ofd, itcnt2, false);
499 |         ofd.close();
500 |         env.logger << "SORT " << l2cnt << "  " << endl;
501 | 
502 |         env.summary << " F2stats = [" << l2cnt << " " << seqs << " ]";
503 |         offsets.clear();
504 |         itcnt2.clear();
505 |         seq2.clear();
506 |     }
507 | 
508 |     if (args.do_invert) {
509 |         do_invert_db(pblk, mincustid, maxcustid);
510 |     }
511 | 
512 |     freqidx.clear();
513 |     backidx.clear();
514 | }
515 | 
516 | 
517 | result_t exttposeFunc(Env &env, ExttposeArgument &args) {
518 |     // Logs the configuration, derives args.MINSUPPORT from args.MINSUP_PER, runs the
519 |     // transposition and returns the logger and summary text accumulated in env.
520 |     env.logger << "CONF " << args.DBASE_NUM_TRANS << " " << args.DBASE_MAXITEM << " "
521 |                << args.DBASE_AVG_TRANS_SZ << " " << args.DBASE_AVG_CUST_SZ << endl;
522 |     if (args.use_diff) env.logger << "SEQ TURNED OFF and PARTITIONS = 1\n";
523 | 
524 |     // absolute threshold: the fraction of DBASE_NUM_TRANS plus 0.5, rounded by lround
525 |     const double scaled_support = args.MINSUP_PER * args.DBASE_NUM_TRANS + 0.5;
526 |     args.MINSUPPORT = lround(scaled_support);
527 |     // never go below a support of 1, except when args.write_only_fcnt is set
528 |     if (args.MINSUPPORT < 1 && !args.write_only_fcnt) args.MINSUPPORT = 1;
529 |     env.logger << "args.MINSUPPORT " << args.MINSUPPORT << " " << args.DBASE_NUM_TRANS << endl;
530 | 
531 |     auto &summary = env.summary;
532 |     summary << "TPOSE";
533 |     if (args.use_diff) summary << " DIFF";
534 |     if (args.use_seq) summary << " SEQ";
535 |     if (!args.do_invert) summary << " NOINVERT";
536 |     if (!args.do_l2) summary << " NOF2";
537 |     summary << " " << args.input << " " << args.MINSUP_PER << " " << args.DBASE_NUM_TRANS << " "
538 |             << args.MINSUPPORT << " " << args.num_partitions;
539 | 
540 |     Exttpose worker(env, args);
541 |     worker.tpose();
542 |     result_t outcome;
543 |     outcome.logger = env.logger.str();
544 |     outcome.summary = env.summary.str();
545 |     return outcome;
546 | }
547 | 
548 | result_t exttposeWrapper(const string &s, shared_ptr<Env>& envptr) {
549 |     // Splits the command string into argc/argv, parses the exttpose options and runs
550 |     // exttposeFunc on the caller's Env, or on a temporary Env when envptr is null.
551 |     args_t parsed = parse(s);
552 |     ExttposeArgument options;
553 |     options.parse_args(parsed.argc, parsed.argv);
554 |     if (envptr != nullptr) {
555 |         return exttposeFunc(*envptr, options);
556 |     }
557 |     Env scratch_env;
558 |     return exttposeFunc(scratch_env, options);
559 | }


--------------------------------------------------------------------------------
/csrc/exttpose.h:
--------------------------------------------------------------------------------
  1 | //
  2 | // Created by Yukio Fukuzawa on 7/12/18.
  3 | //
  4 | 
  5 | #ifndef UTILITIES_EXTTPOSE_H
  6 | #define UTILITIES_EXTTPOSE_H
  7 | 
  8 | #include "common.h"
  9 | #include "Env.h"
 10 | #include "argv_parser.h"
 11 | #include "TransArray.h"
 12 | #include "calcdb.h"
 13 | 
 14 | #define MEG (1024*1024) // bytes per megabyte; the former 1024*1204 had two digits swapped
 15 | 
 16 | 
 17 | class ExttposeArgument {
 18 | // Command-line options of exttpose plus the database statistics read from the .conf file.
 19 | public:
 20 |     string input;       //input file name
 21 |     string output;       //output file name
 22 |     string idxfn;       // offset file, <o>.idx
 23 |     string inconfn;     // configuration file read by parse_args, <i>.conf
 24 |     string it2fn;       // 2-itemset output, <o>.2it
 25 |     string seqfn;       // 2-sequence output, <o>.2seq
 26 |     string tmpfn;    // template for temporary files
 27 |     double MINSUP_PER = 0.0;   // -s
 28 |     long MINSUPPORT = 0;       // absolute support threshold; not set by parse_args
 29 |     int do_invert = 1;         // cleared by -v
 30 |     int do_l2 = 1;             // cleared by -l
 31 |     int use_seq = 1;           // cleared by -a and by -d
 32 |     int write_only_fcnt = 1;   // value given with -a; forced to 0 while use_seq is set
 33 |     char use_newformat = 1;    // cleared by -f
 34 |     int num_partitions = 1;    // -p; forced to 1 by -d
 35 |     char no_minus_off = 0;     // set by -x
 36 |     char use_diff = 0;         // set by -d
 37 |     int DBASE_NUM_TRANS = 0; //tot trans for assoc, num cust for sequences
 38 |     int DBASE_MAXITEM = 0;   //number of items
 39 |     float DBASE_AVG_TRANS_SZ = 0; //avg trans size
 40 |     float DBASE_AVG_CUST_SZ = 0; //avg cust size for sequences
 41 |     int DBASE_TOT_TRANS = 0; //tot trans for sequences
 42 | 
 43 |     long AMEM = 32 * MEG;      // -m, given in units of MEG
 44 | 
 45 |     string name;               // scratch: base name taken from -i, then from -o
 46 |     void parse_args(int argc, char **argv) {
 47 |         // Fills the options from argv and the statistics from the .conf file.
 48 |         // Throws runtime_error on a failed option check or an unreadable/short .conf file.
 49 |         // NOTE(review): the check below may pass when only one of -s/-i/-o is present - confirm
 50 |         auto cmdl = argh::parser(argc, argv, argh::parser::PREFER_PARAM_FOR_UNREG_OPTION);
 51 |         if (!cmdl({"s", "i", "o"})) {
 52 |             cerr << "usage: exttpose [OPTION]... -i<infile> -o<outfile> -s\n";
 53 |             throw runtime_error("exttpose needs valid value of -i -o and -s");
 54 |         }
 55 |         cmdl("i") >> name;
 56 |         input = name + ".data";
 57 |         inconfn = name + ".conf";
 58 |         cmdl("o") >> name;
 59 |         output = name + ".tpose";
 60 |         idxfn = name + ".idx";
 61 |         it2fn = name + ".2it";
 62 |         seqfn = name + ".2seq";
 63 |         tmpfn = name + ".tmp";
 64 |         cmdl("p") >> num_partitions;
 65 |         cmdl("s") >> MINSUP_PER;
 66 |         if (cmdl("a")) {
 67 |             use_seq = 0;
 68 |             cmdl("a") >> write_only_fcnt;
 69 |         }
 70 |         if (cmdl["d"]) use_diff = 1;
 71 |         if (cmdl["l"]) do_l2 = 0;
 72 |         if (cmdl["v"]) do_invert = 0;
 73 |         if (cmdl["f"]) use_newformat = 0;
 74 |         if (cmdl("m")) {
 75 |             cmdl("m") >> AMEM;
 76 |             AMEM *= MEG;
 77 |         }
 78 |         if (cmdl["x"]) no_minus_off = 1;
 79 |         ifstream inconff(inconfn, ios::binary);
 80 |         if (!inconff) {
 81 |             throw runtime_error("ERROR: Can\'t read conf file: " + inconfn);
 82 |         }
 83 |         // Both layouts start with NUM_TRANS and MAXITEM; the sequence layout adds the
 84 |         // average customer size before, and the total transaction count after, AVG_TRANS_SZ.
 85 |         inconff.read((char *) &DBASE_NUM_TRANS, INT_SIZE);
 86 |         inconff.read((char *) &DBASE_MAXITEM, INT_SIZE);
 87 |         if (use_seq) inconff.read((char *) &DBASE_AVG_CUST_SZ, sizeof(float));
 88 |         inconff.read((char *) &DBASE_AVG_TRANS_SZ, sizeof(float));
 89 |         if (use_seq) inconff.read((char *) &DBASE_TOT_TRANS, INT_SIZE);
 90 |         if (!inconff) {
 91 |             // a short file would otherwise leave the statistics only partly filled in
 92 |             throw runtime_error("ERROR: Can\'t read conf file: " + inconfn);
 93 |         }
 94 |         if (use_diff) {
 95 |             use_seq = 0;
 96 |             num_partitions = 1;
 97 |         }
 98 |         if (use_seq) {
 99 |             write_only_fcnt = 0;
100 |         }
101 |     }
102 | };
103 | 
104 | class Exttpose { // state and steps of one exttpose run; tpose() is the driver
105 |     Env& env; // provides the logger and summary streams written by the methods
106 |     ExttposeArgument& args; // parsed options and database statistics
107 |     DbaseCtrlBlk dcb; // reader of the input database (get_first_blk / get_next_trans / eof)
108 |     unsigned long numfreq = 0; // number of frequent items, set by tpose()
109 |     // per-item tables, indexed by item id and sized to args.DBASE_MAXITEM in the constructor
110 |     vector<int> itcnt; // distinct customers containing the item (counted only with use_seq)
111 |     vector<int> ocnt; // id of the last customer counted for the item, -1 if none
112 |     vector<int> itlen; // occurrences of the item over the scanned transactions
113 |     vector<int> freqidx; // item id -> dense index of the frequent item, -1 if infrequent
114 |     vector<int> backidx; // dense index -> item id
115 |     vector<TransArray> extary; // one TransArray per frequent item
116 |     vector<unsigned char> seq2; // 8-bit counters per ordered pair, indexed a * numfreq + b
117 |     vector<unsigned char> itcnt2; // 8-bit counters per unordered pair, numfreq*(numfreq-1)/2 entries
118 |     vector<char> ocust; // same size as itcnt2: pairs flagged for the current customer
119 |     vector<int> offsets; // per dense index, start of its row in the pair arrays
120 |     vector<int> fidx; // dense indices of the frequent items met for the current customer
121 | public:
122 |     void sort_get_l2(int &l2cnt, fstream& file, ofstream &ofd, vector<unsigned char> &cntary, bool use_seq);
123 |     // process_cust is called by tpose() after each customer's transactions when use_seq is set
124 |     void process_cust(int fcnt, fstream &seqfd, fstream &isetfd);
125 |     // do_invert_db writes the per-partition inverted database; tpose() calls it last
126 |     void do_invert_db(int pblk, int mincustid, int maxcustid);
127 |     void tpose();
128 |     // keeps references to env_ and args_, which must outlive the object
129 |     Exttpose(Env& env_, ExttposeArgument& args_);
130 | };
131 | 
132 | result_t exttposeFunc(Env &env, ExttposeArgument &args); // runs exttpose on already-parsed arguments
133 | 
134 | /**
135 |  * Call exttpose given the argument list as string; a null envptr makes it use a local Env
136 |  * @param args e.g. 'exttpose -i zaki -o zaki -p 1 -l -x -s 0.3'
137 |  */
138 | result_t exttposeWrapper(const string& args, shared_ptr<Env>& envptr);
139 | 
140 | #endif //UTILITIES_EXTTPOSE_H
141 | 


--------------------------------------------------------------------------------
/csrc/exttpose_main.cc:
--------------------------------------------------------------------------------
 1 | #include "common.h"
 2 | #include "exttpose.h"
 3 | 
 4 | int main(int argc, char **argv) {
 5 |     // exttpose entry point: prints log and summary, returns 0; returns 1 after an exception.
 6 |     int status = 1;
 7 |     try {
 8 |         Env env;
 9 |         ExttposeArgument args;
10 |         args.parse_args(argc, argv);
11 |         exttposeFunc(env, args);
12 |         cout << env.logger.str() << endl;
13 |         cout << env.summary.str() << endl;
14 |         status = 0;
15 |     } catch (exception &e) {
16 |         cerr << "exttpose: Caught exception: " << e.what() << endl;
17 |     } catch (...) {
18 |         cerr << "exttpose: Caught unknown exception" << endl;
19 |     }
20 |     return status;
21 | }


--------------------------------------------------------------------------------
/csrc/getconf.cc:
--------------------------------------------------------------------------------
  1 | #include "common.h"
  2 | #include "argv_parser.h"
  3 | #include "calcdb.h"
  4 | #include "Env.h"
  5 | #include "getconf.h"
  6 | 
  7 | 
  8 | result_t getconfFunc(Env& env, const GetconfArgument& args) {
  9 |     // Scans the binary database args.input once, derives its statistics, writes them to
 10 |     // args.confn and appends a "CONF ..." line to env.summary.
 11 |     // Throws runtime_error if the configuration file cannot be opened for writing.
 12 |     int DBASE_NUM_TRANS = 0;
 13 |     int DBASE_MAXITEM = 0;
 14 |     int DBASE_NUM_CUST = 0;
 15 |     int DBASE_MINTRANS = 0;
 16 |     int DBASE_MAXTRANS = 0;
 17 |     float DBASE_AVG_TRANS_SZ = 0;
 18 |     float DBASE_AVG_CUST_SZ = 0;
 19 | 
 20 |     int cust = 0, trans_id = 0, item_count = 0;
 21 |     int *items = nullptr;
 22 |     int prev_cust = -1;     // customer id of the previous transaction
 23 |     int counted_trans = 0;  // DBASE_NUM_TRANS at the moment the current customer started
 24 |     int item_total = 0;
 25 |     int cust_trans_total = 0;
 26 |     int item_sq_total = 0;
 27 |     int largest_trans = 0;
 28 | 
 29 |     DbaseCtrlBlk dcb;
 30 |     dcb.init(args.input);
 31 |     dcb.get_first_blk();
 32 |     dcb.get_next_trans(items, item_count, trans_id, cust);
 33 |     DBASE_MINTRANS = cust;
 34 |     // single pass over the transactions; the next one is fetched by the loop's step
 35 |     for (; !dcb.eof(); dcb.get_next_trans(items, item_count, trans_id, cust)) {
 36 |         DBASE_MAXTRANS = cust;
 37 |         if (args.use_seq && prev_cust != cust) {
 38 |             // a new customer begins: bank the transactions counted for the previous one
 39 |             cust_trans_total += DBASE_NUM_TRANS - counted_trans;
 40 |             counted_trans = DBASE_NUM_TRANS;
 41 |             DBASE_NUM_CUST++;
 42 |             prev_cust = cust;
 43 |         }
 44 |         DBASE_NUM_TRANS++;
 45 |         item_total += item_count;
 46 |         if (largest_trans < item_count) largest_trans = item_count;
 47 |         item_sq_total += (item_count * item_count);
 48 |         for (int pos = 0; pos < item_count; pos++) {
 49 |             if (DBASE_MAXITEM < items[pos]) DBASE_MAXITEM = items[pos];
 50 |         }
 51 |     }
 52 |     // transactions read since the last customer change
 53 |     cust_trans_total += DBASE_NUM_TRANS - counted_trans;
 54 |     // stored as the largest item id seen plus one
 55 |     DBASE_MAXITEM++;
 56 | 
 57 |     if (args.use_seq) {
 58 |         DBASE_AVG_CUST_SZ = (1.0 * cust_trans_total) / DBASE_NUM_CUST;
 59 |     }
 60 |     DBASE_AVG_TRANS_SZ = (1.0 * item_total) / DBASE_NUM_TRANS;
 61 |     const double mean_sq = (1.0 * item_sq_total) / DBASE_NUM_TRANS;
 62 |     const double stddev = sqrt(mean_sq - (DBASE_AVG_TRANS_SZ * DBASE_AVG_TRANS_SZ));
 63 | 
 64 |     //write config info to new file
 65 |     ofstream conffd(args.confn.c_str(), ios::binary | ios::out);
 66 |     if (!conffd) {
 67 |         throw runtime_error("can't open file: " + args.confn);
 68 |     }
 69 | 
 70 |     // each helper writes the in-memory bytes of one value
 71 |     auto put_int = [&conffd](int value) { conffd.write((char *) &value, INT_SIZE); };
 72 |     auto put_float = [&conffd](float value) { conffd.write((char *) &value, sizeof(float)); };
 73 | 
 74 |     if (args.use_seq) {
 75 |         put_int(DBASE_NUM_CUST);
 76 |         put_int(DBASE_MAXITEM);
 77 |         put_float(DBASE_AVG_CUST_SZ);
 78 |         put_float(DBASE_AVG_TRANS_SZ);
 79 |         put_int(DBASE_NUM_TRANS);
 80 |     } else {
 81 |         put_int(DBASE_NUM_TRANS);
 82 |         put_int(DBASE_MAXITEM);
 83 |         put_float(DBASE_AVG_TRANS_SZ);
 84 |     }
 85 |     // both layouts end with the first and the last customer id read
 86 |     put_int(DBASE_MINTRANS);
 87 |     put_int(DBASE_MAXTRANS);
 88 |     conffd.close();
 89 | 
 90 |     env.summary << "CONF " << DBASE_NUM_CUST << " " << DBASE_MAXITEM << " " << DBASE_AVG_CUST_SZ
 91 |             << " " << DBASE_AVG_TRANS_SZ << " " << DBASE_NUM_TRANS << " " << DBASE_MINTRANS
 92 |             << " " << DBASE_MAXTRANS << " " << stddev << " " << largest_trans << endl;
 93 | 
 94 |     result_t result;
 95 |     result.logger = env.logger.str();
 96 |     result.summary = env.summary.str();
 97 |     return result;
 98 | }
 99 | 
100 | result_t getconfWrapper(const string &s, shared_ptr<Env>& envptr) {
101 |     // Splits the command string into argc/argv, parses the getconf options and runs
102 |     // getconfFunc on the caller's Env, or on a temporary Env when envptr is null.
103 |     args_t parsed = parse(s);
104 |     GetconfArgument options;
105 |     options.parse_args(parsed.argc, parsed.argv);
106 | 
107 |     if (envptr != nullptr) {
108 |         return getconfFunc(*envptr, options);
109 |     }
110 |     Env scratch_env;
111 |     return getconfFunc(scratch_env, options);
112 | }
113 | 
114 | // remark: the implementation assumes that a customer's transactions
115 | //         appear as a contiguous block in the binary input data, and
116 | //         therefore, in the user-supplied database.
117 | //
118 | // ceeboo 2007
119 | 


--------------------------------------------------------------------------------
/csrc/getconf.h:
--------------------------------------------------------------------------------
 1 | //
 2 | // Created by Yukio Fukuzawa on 7/12/18.
 3 | //
 4 | 
 5 | #ifndef UTILITIES_GETCONF_H
 6 | #define UTILITIES_GETCONF_H
 7 | 
 8 | #include "common.h"
 9 | #include "argv_parser.h"
10 | #include "Env.h"
11 | 
12 | class GetconfArgument {
13 | // Command-line options of getconf; parse_args throws runtime_error without -i and -o.
14 | public:
15 |     string input;       //input file name
16 |     string confn;       // configuration file to write (".conf" is appended)
17 |     bool use_seq = true; // sequence statistics unless -a says otherwise
18 |     void parse_args(int argc, char **argv) {
19 |         auto cmdl = argh::parser(argc, argv, argh::parser::PREFER_PARAM_FOR_UNREG_OPTION);
20 |         cmdl("i") >> input;
21 |         cmdl("o") >> confn;
22 |         cmdl("a") >> use_seq; // "-a <0|1>" form
23 |         // a bare -a (as in the usage text) is an argh flag, which cmdl("a") does not see
24 |         if (cmdl["a"]) use_seq = false;
25 |         if (input.empty() || confn.empty()) {
26 |             cerr << "usage: getconf [-a] -i<infile> -o<outfile>\n";
27 |             throw runtime_error("getconf needs valid value of -i and -o");
28 |         }
29 |         input += ".data";
30 |         confn += ".conf";
31 |     }
32 | };
33 | 
34 | result_t getconfFunc(Env& env, const GetconfArgument& args); // runs getconf on already-parsed arguments
35 | 
36 | /**
37 |  * Call getconf given the argument list as string; a null envptr makes it use a local Env
38 |  * @param args e.g. 'getconf -i zaki -o zaki'
39 |  */
40 | result_t getconfWrapper(const string &args, shared_ptr<Env>& envptr);
41 | 
42 | #endif //UTILITIES_GETCONF_H
43 | 


--------------------------------------------------------------------------------
/csrc/getconf_main.cc:
--------------------------------------------------------------------------------
 1 | #include "common.h"
 2 | #include "getconf.h"
 3 | 
 4 | int main(int argc, char **argv) {
 5 |     // getconf command-line entry point: prints the log and summary and returns 0 on
 6 |     // success; any exception is reported on stderr and yields 1.
 7 |     int status = 1;
 8 |     try {
 9 |         Env env;
10 |         GetconfArgument args;
11 |         args.parse_args(argc, argv);
12 |         getconfFunc(env, args);
13 |         cout << env.logger.str();
14 |         cout << env.summary.str();
15 |         status = 0;
16 |     } catch (exception &e) {
17 |         cerr << "getconf: Caught exception: " << e.what() << endl;
18 |     } catch (...) {
19 |         cerr << "getconf: Caught unknown exception" << endl;
20 |     }
21 |     return status;
22 | }


--------------------------------------------------------------------------------
/csrc/main.cc:
--------------------------------------------------------------------------------
 1 | #include "common.h"
 2 | #include "wrappers.h"
 3 | 
 4 | 
 5 | int main(int argc, char** argv) {
 6 |     // cspade-full entry point: parse the options, call runSpade, print the result's seqstrm.
 7 |     try {
 8 |         string filename;
 9 |         spade_arg_t args;
10 |         args.maxsize = 10;
11 |         args.maxlen = 10;
12 |         args.maxgap = 1;
13 |         auto cmdl = argh::parser(argc, argv, argh::parser::PREFER_PARAM_FOR_UNREG_OPTION);
14 |         cmdl({"-i", "--input"}) >> filename;
15 |         if (filename.empty() || !cmdl({"-s", "--support"})) {
16 |             cerr << "usage: cspade-full -i<infile> -s<support>\n";
17 |             throw runtime_error("cspade-full needs valid value of -i and -s");
18 |         }
19 |         cmdl({"-s", "--support"}) >> args.support;
20 |         if (cmdl({"-u", "--max-gap"})) cmdl({"-u", "--max-gap"}) >> args.maxgap;
21 |         if (cmdl({"-m", "--maxsize"})) cmdl({"-m", "--maxsize"}) >> args.maxsize;
22 |         if (cmdl({"-M", "--maxlen"})) cmdl({"-M", "--maxlen"}) >> args.maxlen;
23 |         cout << runSpade(filename, args).seqstrm;
24 |         return 0;
25 |     } catch (const exception &e) { // report and exit with 1 instead of ending in std::terminate
26 |         cerr << "cspade-full: Caught exception: " << e.what() << endl;
27 |     }
28 |     return 1;
29 | }
30 | //int main1(int argc, char **argv) {
31 | //    bool mb = false;
32 | //    bool gc = false;
33 | //    bool et = false;
34 | //    bool spade = false;
35 | //
36 | //    if (argc == 1) {
37 | //        mb = true;
38 | //        gc = true;
39 | //        et = true;
40 | //        spade = true;
41 | //    } else {
42 | //        char *command = argv[1];
43 | //        if (!strcmp(command, "makebin")) {
44 | //            mb = true;
45 | //            gc = false;
46 | //            et = false;
47 | //            spade = false;
48 | //        }
49 | //        if (!strcmp(command, "getconf")) {
50 | //            mb = true;
51 | //            gc = true;
52 | //            et = false;
53 | //            spade = false;
54 | //        }
55 | //        if (!strcmp(command, "exttpose")) {
56 | //            mb = true;
57 | //            gc = true;
58 | //            et = true;
59 | //            spade = false;
60 | //        }
61 | //        if (!strcmp(command, "spade")) {
62 | //            mb = true;
63 | //            gc = true;
64 | //            et = true;
65 | //            spade = true;
66 | //        }
67 | //    }
68 | //    if (mb) {
69 | //        makebinWrapper("makebin test/zaki.txt zaki.data");
70 | //    }
71 | //
72 | //    if (gc) {
73 | //        getconfWrapper("getconf -i zaki -o zaki");
74 | //    }
75 | //
76 | //    if (et) {
77 | //        exttposeWrapper("exttpose -i zaki -o zaki -p 1 -l -x -s 0.3");
78 | //    }
79 | //
80 | //    if (spade) {
81 | //        spadeWrapper("spade -i zaki -s 0.3 -Z 10 -z 10 -u 1 -r -e 1 -o");
82 | //    }
83 | //
84 | //    result_t result = getResult();
85 | //    cout << result.mined;
86 | //    cout << result.nsequences;
87 | //    return 0;
88 | //}
89 | 


--------------------------------------------------------------------------------
/csrc/makebin.cc:
--------------------------------------------------------------------------------
 1 | #include "common.h"
 2 | #include "argv_parser.h"
 3 | #include "Env.h"
 4 | 
 5 | #define ITSZ sizeof(int)
 6 | 
 7 | const int lineSize = 8192;
 8 | const int wdSize = 256;
 9 | 
/**
 * Convert one text line of whitespace-separated decimal integers to binary.
 *
 * Each number parsed by strtol() is written to fout as the raw bytes of an
 * int (ITSZ bytes). Parsing stops at the first position where strtol() cannot
 * consume any digits, i.e. at the end of the string or at a non-numeric token.
 *
 * @param fout   destination stream
 * @param inBuf  NUL-terminated text to parse
 * @param inSize length reported by the caller; not used by the conversion
 */
void convbin(ostream &fout, char *inBuf, int inSize) {
    char *cursor = inBuf;
    while (true) {
        char *next = nullptr;
        int value = strtol(cursor, &next, 10);
        // strtol leaves the end pointer at the start when nothing was parsed.
        if (next == cursor) {
            return;
        }
        fout.write((char *) &value, ITSZ);
        cursor = next;
    }
}
23 | 
24 | result_t makebinFunc(Env& env, const string& infilename, const string& outfilename) {
25 |     char inBuf[lineSize];
26 |     int inSize;
27 |     ifstream fin(infilename);
28 | 
29 |     if (!fin) {
30 |         string error_message = "can't open ascii file: " + infilename;
31 |         throw runtime_error(error_message);
32 |     }
33 |     ofstream fout(outfilename, ios::binary);
34 |     if (!fout) {
35 |         string error_message = "can't open binary file: " + outfilename;
36 |         throw runtime_error(error_message);
37 |     }
38 | 
39 |     while (fin.getline(inBuf, lineSize)) {
40 |         inSize = fin.gcount();
41 | //        env.logger << "IN SIZE " << inSize << endl;
42 |         convbin(fout, inBuf, inSize);
43 |     }
44 | 
45 |     fin.close();
46 |     fout.close();
47 | 
48 |     result_t result;
49 |     result.logger = env.logger.str();
50 |     return result;
51 | }
52 | 
53 | result_t makebinWrapper(const string &s, shared_ptr<Env>& envptr) {
54 |     args_t args = parse(s);
55 |     Env env;
56 |     string infilename(args.argv[1]);
57 |     string outfilename(args.argv[2]);
58 | 
59 |     if (envptr == nullptr) {
60 |         return makebinFunc(env, infilename, outfilename);
61 |     }
62 |     else {
63 |         return makebinFunc(*envptr, infilename, outfilename);
64 |     }
65 | }


--------------------------------------------------------------------------------
/csrc/makebin.h:
--------------------------------------------------------------------------------
 1 | //
 2 | // Created by Yukio Fukuzawa on 7/12/18.
 3 | //
 4 | 
 5 | #ifndef UTILITIES_MAKEBIN_H
 6 | #define UTILITIES_MAKEBIN_H
 7 | 
 8 | #include "common.h"
 9 | #include "argv_parser.h"
10 | #include "Env.h"
11 | 
/**
 * Convert an ASCII database file to binary: the input is read line by line and
 * the decimal integers found on each line are written to the output file as
 * raw ints.
 * @param env environment; the content of its logger is returned in the result
 * @param infilename path to the ASCII input file
 * @param outfilename path to the binary output file
 * @return result_t with the logger field filled from env.logger
 * @throws runtime_error if the input or the output file cannot be opened
 */
result_t makebinFunc(Env& env, const string& infilename, const string& outfilename);
13 | 
14 | 
/**
 * Call makebin given the argument list as a single string.
 * @param arg command line, e.g. 'makebin test/zaki.txt zaki.data'; after parsing,
 *            argv[1] is used as the input path and argv[2] as the output path
 * @param envptr environment to run with; when null, a temporary Env is used
 * @return the result of makebinFunc
 */
result_t makebinWrapper(const string &arg, shared_ptr<Env>& envptr);
20 | 
21 | #endif //UTILITIES_MAKEBIN_H
22 | 


--------------------------------------------------------------------------------
/csrc/makebin_main.cc:
--------------------------------------------------------------------------------
 1 | #include "makebin.h"
 2 | 
 3 | int main(int argc, char **argv) {
 4 |     try {
 5 |         Env env;
 6 |         string infilename(argv[1]);
 7 |         string outfilename(argv[2]);
 8 |         makebinFunc(env, infilename, outfilename);
 9 |         cout << env.logger.str();
10 |         return 0;
11 |     }
12 |     catch (exception &e) {
13 |         cerr << "exttpose: Caught exception: " << e.what() << endl;
14 |     }
15 |     catch (...) {
16 |         cerr << "exttpose: Caught unknown exception" << endl;
17 |     }
18 |     return 1;
19 | }


--------------------------------------------------------------------------------
/csrc/spade_main.cc:
--------------------------------------------------------------------------------
 1 | #include "common.h"
 2 | #include "Sequence.h"
 3 | 
 4 | int main(int argc, char **argv) {
 5 |     try {
 6 |         Env env;
 7 |         shared_ptr<SpadeArguments> args = make_shared<SpadeArguments>();
 8 |         args->parse_args(argc, argv);
 9 |         result_t result = sequenceFunc(env, args);
10 | 
11 |         cout << result.logger << endl;
12 |         cout << result.summary << endl;
13 |         cout << result.seqstrm << endl;
14 |         cout << result.nsequences << endl;
15 |         return 0;
16 |     }
17 |     catch (exception &e) {
18 |         cerr << "sequence: Caught '" << typeid(e).name() << "' exception: " << e.what() << endl;
19 |     }
20 |     catch (...) {
21 |         cerr << "sequence: Caught unknown exception" << endl;
22 |     }
23 |     return 1;
24 | }


--------------------------------------------------------------------------------
/csrc/test.cc:
--------------------------------------------------------------------------------
 1 | #include "common.h"
 2 | #include "wrappers.h"
 3 | 
 4 | 
 5 | int main(int argc, char** argv) {
 6 |     string filename = "bb-tmi.txt";
 7 |     spade_arg_t args;
 8 |     args.maxsize = 10;
 9 |     args.maxlen = 10;
10 |     args.maxgap = 1;
11 |     args.support = 0.01;
12 | 
13 |     result_t result = runSpade(filename, args);
14 | 
15 |     ifstream f("test-cases/bb-tmi.Z10z10s0.01");
16 |     string str((std::istreambuf_iterator<char>(f)), std::istreambuf_iterator<char>());
17 | 
18 |     if (str != result.seqstrm){
19 |         cerr << "TEST FAILED" << endl;
20 |     }
21 |     else {
22 |         cout << "TEST PASSED" << endl;
23 |     }
24 | }


--------------------------------------------------------------------------------
/csrc/wrappers.cc:
--------------------------------------------------------------------------------
  1 | #include "common.h"
  2 | #include "wrappers.h"
  3 | #include "Env.h"
  4 | #include "makebin.h"
  5 | #include "getconf.h"
  6 | #include "SpadeArguments.h"
  7 | #include "Sequence.h"
  8 | #include "exttpose.h"
  9 | 
 10 | void clean_up(const string& tmpprefix, ostream& logger, const string& tmpfolder) {
 11 |     list<string> tmpfiles = list_files(tmpfolder, tmpprefix);
 12 |     for (string& tmpfile : tmpfiles) {
 13 |         string filepath = tmpfolder + tmpfile;
 14 |         if(remove(filepath.c_str()) != 0) {
 15 |             logger << "Error deleting file " << filepath << endl;
 16 |         }
 17 |         else {
 18 |             logger << "Cleaned up successful: " << filepath << endl;
 19 |         }
 20 |     }
 21 | }
 22 | 
 23 | result_t runSpade(const string &filename, spade_arg_t args, const string& tmpdir) {
 24 |     shared_ptr<Env> envptr = make_shared<Env>();
 25 |     Env &env(*envptr);
 26 | 
 27 |     if (!file_exists(filename)) {
 28 |         throw runtime_error("File " + filename + " does not exist.");
 29 |     }
 30 | 
 31 |     if (args.support <= 0 || args.support > 1) {
 32 |         throw runtime_error("Support must be a floating point in range (0-1]");
 33 |     }
 34 | 
 35 |     if (args.mingap > 0 && args.maxgap > 0 && args.maxgap < args.mingap) {
 36 |         args.mingap = args.maxgap;
 37 |     }
 38 | 
 39 |     int nrows = num_lines(filename);
 40 |     ostringstream opt;
 41 | 
 42 |     auto nop = static_cast<int>(ceil((nrows + 2 * nrows) * sizeof(long) / pow(4, 10) / 5));
 43 |     if (args.memsize > 0) {
 44 |         opt << " -m " << args.memsize;
 45 |         nop = static_cast<int>(ceil(nop * 32 / float(args.memsize)));
 46 |     }
 47 | 
 48 |     if (args.numpart > 0) {
 49 |         if (args.numpart < nop) {
 50 |             env.logger << "numpart less than recommended\n";
 51 |         }
 52 |         nop = args.numpart;
 53 |     }
 54 | 
 55 |     string random_suffix = random_id(16);
 56 |     string tmpprefix = "cspade-" + random_suffix;
 57 |     string otherfile = tmpdir + tmpprefix;
 58 |     string datafile = otherfile + ".data";
 59 | 
 60 |     ostringstream makebin_args;
 61 |     ostringstream getconf_args;
 62 |     ostringstream exttpose_args;
 63 |     ostringstream spade_args;
 64 | 
 65 |     makebin_args << "makebin \"" << filename << "\" \"" << datafile +"\"";
 66 |     getconf_args << "getconf -i \"" << otherfile << "\" -o \"" << otherfile + "\"";
 67 |     exttpose_args << "exttpose -i \"" << otherfile << "\" -o \"" << otherfile << "\" -p " << nop << " -l -x -s " << args.support;
 68 | 
 69 |     if (args.maxsize > 0) {
 70 |         opt << " -Z " << args.maxsize;
 71 |     }
 72 |     if (args.maxlen > 0) {
 73 |         opt << " -z " << args.maxlen;
 74 |     }
 75 |     if (args.mingap > 0) {
 76 |         opt << " -l " << args.mingap;
 77 |     }
 78 |     if (args.maxgap > 0) {
 79 |         opt << " -u " << args.maxgap;
 80 |     }
 81 |     if (args.maxwin > 0) {
 82 |         opt << " -w " << args.maxwin;
 83 |     }
 84 |     if (!args.bfstype) {
 85 |         opt << " -r";
 86 |     }
 87 |     if (args.tid_lists) {
 88 |         opt << " -y";
 89 |     }
 90 | 
 91 |     spade_args << "spade -i \"" << otherfile << "\" -s " << args.support << opt.str() << " -e " << nop << " -o";
 92 | 
 93 |     try {
 94 |         makebinWrapper(makebin_args.str(), envptr);
 95 |         getconfWrapper(getconf_args.str(), envptr);
 96 |         exttposeWrapper(exttpose_args.str(), envptr);
 97 |         result_t result = sequenceWrapper(spade_args.str(), envptr);
 98 |         clean_up(tmpprefix, env.logger, tmpdir);
 99 | 
100 |         result.logger = env.logger.str();
101 |         return result;
102 |     }
103 |     catch (runtime_error& e) {
104 |         clean_up(tmpprefix, env.logger, tmpdir);
105 |         cerr << e.what();
106 |         throw e;
107 |     }
108 |     catch (std::exception& e) {
109 |         clean_up(tmpprefix, env.logger, tmpdir);
110 |         ostringstream message;
111 |         message << "Caught '" << typeid(e).name() << "' exception: " << e.what() << endl;
112 |         cerr << message.str();
113 |         throw runtime_error(message.str());
114 |     }
115 |     catch (...) {
116 |         ostringstream message;
117 |         message << "sequence: Caught unknown exception" << endl;
118 |         cerr << message.str();
119 |         throw runtime_error(message.str());
120 |     }
121 | }
122 | 


--------------------------------------------------------------------------------
/csrc/wrappers.h:
--------------------------------------------------------------------------------
 1 | //
 2 | // Created by Yukio Fukuzawa on 3/12/18.
 3 | //
 4 | 
 5 | #ifndef SPADE_UTILITY_WRAPPERS_H
 6 | #define SPADE_UTILITY_WRAPPERS_H
 7 | 
 8 | #include "common.h"
 9 | 
/**
 * Options accepted by runSpade().
 *
 * runSpade() forwards the integer options only when they are positive, so the
 * default of -1 means "not set".
 */
struct spade_arg_t {
    double support{0.1};    // minimum support; runSpade() requires a value in (0, 1]
    int maxsize{-1};        // forwarded to spade as -Z
    int maxlen{-1};         // forwarded to spade as -z
    int mingap{-1};         // forwarded to spade as -l
    int maxgap{-1};         // forwarded to spade as -u
    int memsize{-1};        // forwarded to spade as -m; also scales the partition count
    int numpart{-1};        // explicit number of partitions, overrides the computed one
    int maxwin{-1};         // forwarded to spade as -w
    bool bfstype{false};    // when false, -r is passed to spade
    bool tid_lists{false};  // when true, -y is passed to spade
};
22 | 
/**
 * One function to call all 4 stages (makebin, getconf, exttpose, spade) and return the result
 * @param filename name of the input file, e.g. /path/to/zaki.txt
 * @param args arguments to spade.
 * @param tmpdir temporary folder for spade to operate. Temp files will be cleaned afterwards. Must end with a slash
 * @return the result of the spade stage; its logger field holds the log of the whole run
 * @throws runtime_error if the file does not exist, support is outside (0, 1], or any stage fails
 */
result_t runSpade(const string& filename, spade_arg_t args, const string& tmpdir = TMPDIR);
31 | 
32 | #endif //SPADE_UTILITY_WRAPPERS_H
33 | 


--------------------------------------------------------------------------------
/pycspade/__init__.py:
--------------------------------------------------------------------------------
1 | from .helpers import spade
2 | 


--------------------------------------------------------------------------------
/pycspade/cspade.pyx:
--------------------------------------------------------------------------------
 1 | from libcpp cimport bool
 2 | from libcpp.string cimport string as c_string
 3 | 
 4 | cdef extern from "../csrc/common.h":
 5 |     cdef struct result_t:
 6 |         int nsequences;
 7 |         c_string seqstrm;
 8 |         c_string logger;
 9 |         c_string summary;
10 | 
11 |     cdef struct spade_arg_t:
12 |         double support;
13 |         int maxsize;
14 |         int maxlen;
15 |         int mingap;
16 |         int maxgap;
17 |         int memsize;
18 |         int numpart;
19 |         int maxwin;
20 |         bool bfstype;
21 |         bool tid_lists;
22 | 
23 | cdef extern from "../csrc/wrappers.h":
24 |     cdef result_t runSpade(const c_string& filename, spade_arg_t args) except +RuntimeError;
25 | 
26 | 
def c_runspade(filename, support=0.1, maxsize=None, maxlen=None, mingap=None, maxgap=None, memsize=None, numpart=None,
               maxwin=None, bfstype=None, tid_lists=None):
    """
    Fill a spade_arg_t from the Python arguments and call the C++ runSpade().

    Integer options that are None (or 0) are passed as -1; the two flags
    default to False.

    :param filename: path of the input file as str; it is encoded as latin-1
    :return: the result_t returned by runSpade()
    """
    cdef spade_arg_t args
    args.support = support
    args.maxsize = maxsize or -1
    args.maxlen = maxlen or -1
    args.mingap = mingap or -1
    args.maxgap = maxgap or -1
    args.memsize = memsize or -1
    args.numpart = numpart or -1
    args.maxwin = maxwin or -1
    args.bfstype = bfstype or False
    # Each flag comes from its own argument (tid_lists used to be a copy of bfstype).
    args.tid_lists = tid_lists or False

    filename = filename.encode('latin-1')
    return runSpade(filename, args)


--------------------------------------------------------------------------------
/pycspade/helpers.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import uuid
  3 | 
  4 | from pycspade.cspade import c_runspade
  5 | 
  6 | 
  7 | def data_to_rows(data):
  8 |     rows = ['{} {} {} {}'.format(sid, eid, len(els), ' '.join(list(map(str, els)))) for sid, eid, els in data]
  9 |     return rows
 10 | 
 11 | 
 12 | def file_len(fname):
 13 |     with open(fname) as f:
 14 |         i = 0
 15 |         for l in f:
 16 |             if len(l):
 17 |                 i += 1
 18 |     return i
 19 | 
 20 | 
 21 | class Item:
 22 |     def __init__(self, elements):
 23 |         self.elements = elements
 24 | 
 25 |     def __repr__(self):
 26 |         return '({})'.format(' '.join(list(map(str, self.elements))))
 27 | 
 28 | 
 29 | class Sequence:
 30 |     def __init__(self, name, noccurs):
 31 |         self.items = []
 32 |         self.name = name
 33 |         self.noccurs = noccurs
 34 |         self.accum_occurs = noccurs
 35 |         self.confidence = None
 36 |         self.lift = None
 37 |         self.up_to_prev = None
 38 |         self.last_child = None
 39 |         self.frm_second = None
 40 |         self.up_to_prev_str = None
 41 |         self.last_child_str = None
 42 |         self.frm_second_str = None
 43 | 
 44 |     def add_item(self, item):
 45 |         self.items.append(item)
 46 | 
 47 |     def accumulate_occurs(self, child_occurs):
 48 |             self.accum_occurs += child_occurs
 49 |             if self.frm_second:
 50 |                 self.frm_second.accumulate_occurs(child_occurs)
 51 | 
 52 |     def __repr__(self):
 53 |         return '{} - [{}]'.format('->'.join(list(map(str, self.items))), self.noccurs)
 54 | 
 55 | 
 56 | def parse_results(result):
 57 |     lifts = {}
 58 |     confidences = {}
 59 |     nseqs = result['nsequences']
 60 |     lines = result['seqstrm'].split('\n')
 61 |     lines.sort()
 62 |     sequences = {}
 63 |     for line in lines:
 64 |         if '0' <= line[0] <= '9':
 65 |             sequence_str, stats = line.split(' -- ')
 66 |             item_strs = sequence_str.split(' -> ')
 67 |             noccurs = int(stats[:stats.index(' ')])
 68 | 
 69 |             sequence = Sequence(sequence_str, noccurs)
 70 |             if len(item_strs) > 1:
 71 |                 sequence.up_to_prev_str = ' -> '.join(item_strs[:-1])
 72 |                 sequence.last_child_str = item_strs[-1]
 73 |                 sequence.frm_second_str = ' -> '.join(item_strs[1:])
 74 | 
 75 |             for _item in item_strs:
 76 |                 _elements = list(map(int, _item.split(' ')))
 77 |                 item = Item(_elements)
 78 |                 sequence.add_item(item)
 79 |             sequences[sequence_str] = sequence
 80 | 
 81 |     # Second pass
 82 |     for sequence in list(sequences.values()):
 83 |         sequence.up_to_prev = up_to_prev = sequences.get(sequence.up_to_prev_str, None)
 84 |         sequence.last_child = last_child = sequences.get(sequence.last_child_str, None)
 85 |         sequence.frm_second = sequences.get(sequence.frm_second_str, None)
 86 | 
 87 |         if up_to_prev is not None:
 88 |             sequence.confidence = sequence.noccurs / up_to_prev.noccurs
 89 |             confidences[sequence.name] = sequence.confidence
 90 | 
 91 |             if last_child is not None:
 92 |                 sequence.lift = sequence.noccurs * nseqs / (up_to_prev.noccurs * last_child.noccurs)
 93 |                 lifts[sequence.name] = sequence.lift
 94 | 
 95 |     # Third pass - to calculate accummulated occurrence counts
 96 |     for sequence in list(sequences.values()):
 97 |         if sequence.frm_second is not None:
 98 |             sequence.frm_second.accumulate_occurs(sequence.noccurs)
 99 | 
100 |     result['mined_objects'] = list(sequences.values())
101 | 
102 | 
def spade(filename=None, data=None, support=0.1, maxsize=None, maxlen=None, mingap=None, maxgap=None, memsize=None,
          numpart=None, maxwin=None, bfstype=None, tid_lists=None, parse=True):
    '''
    Call C++'s cspade()
    :param filename: full path to the input file (ascii); give either filename or data
    :param data: sequence of (sid, eid, elements) triples; it is written to a temporary file that is removed
                 afterwards
    :param support: minimum normalised support, must be within (0, 1]
    :param maxsize: an integer value specifying the maximum number of items of an element of a sequence
    :param maxlen: an integer value specifying the maximum number of elements of a sequence
    :param mingap: an integer value specifying the minimum time difference between consecutive elements of a sequence
    :param maxgap: an integer value specifying the maximum time difference between consecutive elements of a sequence;
                   if it is smaller than mingap, mingap is lowered to maxgap
    :param memsize: optional integer, passed on to the C++ code
    :param numpart: optional integer, passed on to the C++ code
    :param maxwin: passed on to the C++ code
    :param bfstype: passed on to the C++ code
    :param tid_lists: passed on to the C++ code
    :param parse: if True, the mined sequences are parsed and stored under 'mined_objects'

    :return: the result returned by c_runspade, with these entries decoded to str:
             -seqstrm: the mined sequences
             -logger: general logging
             -summary: same content as summary.out created by the original C code
             and, when parse is True, an additional 'mined_objects' entry
    '''
    import tempfile

    # `not data` also rejects an empty data set, which would otherwise reach
    # c_runspade with filename=None.
    if filename is None and not data:
        raise Exception('You must provide either filename or data')
    if filename is not None and data is not None:
        raise Exception('You must provide either filename or data')

    if filename and not os.path.isfile(filename):
        raise Exception('File {} does not exist'.format(filename))

    if memsize:
        if not isinstance(memsize, int):
            raise Exception('memsize must be integer')
    if numpart:
        if not isinstance(numpart, int):
            raise Exception('numpart must be integer')

    assert (0 < support <= 1), 'support must be a floating point in range (0-1]'

    if mingap is not None:
        assert mingap > 0, 'mingap cannot be 0 - that would mean two transactions happen at the same time'
    if maxgap is not None:
        assert maxgap > 0, 'maxgap cannot be 0'
        if mingap and maxgap < mingap:
            mingap = maxgap

    if data:
        rows = data_to_rows(data)
        # Use the platform's temporary directory instead of a hard-coded /tmp.
        filename = os.path.join(tempfile.gettempdir(), 'cspade-{}.txt'.format(uuid.uuid4().hex))
        with open(filename, 'w', encoding='latin-1') as f:
            for row in rows:
                f.write(row)
                f.write('\n')

    try:
        result = c_runspade(filename, support, maxsize, maxlen, mingap, maxgap, memsize, numpart, maxwin, bfstype,
                            tid_lists)
        decode_result(result)
        if parse:
            parse_results(result)
        return result

    finally:
        # Only the file created here is removed, never a caller-supplied one.
        if data:
            os.remove(filename)
164 | 
165 | 
def print_result(result):
    """
    Print the mined sequences as a table.

    One header line is followed by one line per entry of result['mined_objects']
    with occurrences, accumulated occurrences, support (occurrences divided by
    result['nsequences']), confidence, lift and the sequence itself. Confidence
    and lift are shown as 'N/A' when they are not set.
    """
    nseqs = result['nsequences']
    header_format = '{0:>9s} {1:>9s} {2:>9s} {3:>9s} {4:>9s} {5:>80s}'
    row_format = '{0:>9d} {1:>9d} {2:>0.7f} {3:>9s} {4:>9s} {5:>80s} '

    print(header_format.format('Occurs', 'Accum', 'Support', 'Confid', 'Lift', 'Sequence'))

    for mined_object in result['mined_objects']:
        conf = '{:0.7f}'.format(mined_object.confidence) if mined_object.confidence else 'N/A'
        lift = '{:0.7f}'.format(mined_object.lift) if mined_object.lift else 'N/A'
        row = row_format.format(
            mined_object.noccurs,
            mined_object.accum_occurs,
            mined_object.noccurs / nseqs,
            conf,
            lift,
            '->'.join(str(item) for item in mined_object.items))
        print(row)
185 | 
186 | 
def decode_result(result):
    """Strip the 'seqstrm', 'logger' and 'summary' byte strings and decode them, in place, as latin-1."""
    for key in ('seqstrm', 'logger', 'summary'):
        result[key] = result[key].strip().decode('latin-1')


--------------------------------------------------------------------------------
/pycspade/shortcuts.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import uuid
  3 | 
  4 | from .cspade import cpp_cspade
  5 | 
  6 | 
  7 | def data_to_rows(data):
  8 |     rows = ['{} {} {} {}'.format(sid, eid, len(els), ' '.join(list(map(str, els)))) for sid, eid, els in data]
  9 |     return rows
 10 | 
 11 | 
 12 | class Item:
 13 |     def __init__(self, elements):
 14 |         self.elements = elements
 15 | 
 16 |     def __repr__(self):
 17 |         return '({})'.format(' '.join(list(map(str, self.elements))))
 18 | 
 19 | 
 20 | class Sequence:
 21 |     def __init__(self, name, noccurs):
 22 |         self.items = []
 23 |         self.name = name
 24 |         self.noccurs = noccurs
 25 |         self.accum_occurs = noccurs
 26 |         self.confidence = None
 27 |         self.lift = None
 28 |         self.up_to_prev = None
 29 |         self.last_child = None
 30 |         self.frm_second = None
 31 |         self.up_to_prev_str = None
 32 |         self.last_child_str = None
 33 |         self.frm_second_str = None
 34 | 
 35 |     def add_item(self, item):
 36 |         self.items.append(item)
 37 | 
 38 |     def accumulate_occurs(self, child_occurs):
 39 |             self.accum_occurs += child_occurs
 40 |             if self.frm_second:
 41 |                 self.frm_second.accumulate_occurs(child_occurs)
 42 | 
 43 |     def __repr__(self):
 44 |         return '{} - [{}]'.format('->'.join(list(map(str, self.items))), self.noccurs)
 45 | 
 46 | 
 47 | def decode_results(result):
 48 |     lifts = {}
 49 |     confidences = {}
 50 |     nseqs = result['nsequences']
 51 | 
 52 |     mined = result['mined']
 53 |     lines = mined.strip().decode('latin-1').split('\n')
 54 |     lines.sort()
 55 |     sequences = {}
 56 |     for line in lines:
 57 |         if '0' <= line[0] <= '9':
 58 |             sequence_str, stats = line.split(' -- ')
 59 |             item_strs = sequence_str.split(' -> ')
 60 |             noccurs = int(stats[:stats.index(' ')])
 61 | 
 62 |             sequence = Sequence(sequence_str, noccurs)
 63 |             if len(item_strs) > 1:
 64 |                 sequence.up_to_prev_str = ' -> '.join(item_strs[:-1])
 65 |                 sequence.last_child_str = item_strs[-1]
 66 |                 sequence.frm_second_str = ' -> '.join(item_strs[1:])
 67 | 
 68 |             for _item in item_strs:
 69 |                 _elements = list(map(int, _item.split(' ')))
 70 |                 item = Item(_elements)
 71 |                 sequence.add_item(item)
 72 |             sequences[sequence_str] = sequence
 73 | 
 74 |     # Second pass
 75 |     for sequence in list(sequences.values()):
 76 |         sequence.up_to_prev = up_to_prev = sequences.get(sequence.up_to_prev_str, None)
 77 |         sequence.last_child = last_child = sequences.get(sequence.last_child_str, None)
 78 |         sequence.frm_second = sequences.get(sequence.frm_second_str, None)
 79 | 
 80 |         if up_to_prev is not None:
 81 |             sequence.confidence = sequence.noccurs / up_to_prev.noccurs
 82 |             confidences[sequence.name] = sequence.confidence
 83 | 
 84 |             if last_child is not None:
 85 |                 sequence.lift = sequence.noccurs * nseqs / (up_to_prev.noccurs * last_child.noccurs)
 86 |                 lifts[sequence.name] = sequence.lift
 87 | 
 88 |     # Third pass - to calculate accummulated occurrence counts
 89 |     for sequence in list(sequences.values()):
 90 |         if sequence.frm_second is not None:
 91 |             sequence.frm_second.accumulate_occurs(sequence.noccurs)
 92 | 
 93 |     result['mined_objects'] = list(sequences.values())
 94 | 
 95 | 
 96 | def cspade(filename=None, data=None, support=3, maxsize=None, maxlen=None, mingap=None, maxgap=None):
 97 |     """
 98 |     Shortcut to call cspade
 99 |     :param filename: path to the ascii file, must be given if data is None
100 |     :param data: raw data as list of transactions, must be given if filename is None
101 |     :param support: is interpreted as the threshold of mimimum normalised support if within [0, 1]:
102 |                          if > 1: interpreted as the threshold of absolute support (e.g. 50 over 100 transactions)
103 |     :param maxsize: an integer value specifying the maximum number of items of a sequence (default=100)
104 |     :param maxlen: an integer value specifying the maximum number of elements of a sequence (default=100)
105 |     :param mingap: an integer value specifying the minimum time difference between consecutive elements of a sequence
106 |     :param maxgap: an integer value specifying the maximum time difference between consecutive elements of a sequence
107 |     :param decode: if True, the return strings will be decoded and line-separated, otherwise raw C++ strings
108 |                    (python bytes) are returned
109 |     :return: (result, logger, summary). where:
110 |              -result: the mined sequences
111 |              -logger: general logging
112 |              -summary: equivalent to the content of summary.out
113 |     """
114 |     if filename is None and data is None:
115 |         raise Exception('You must provide either filename or data')
116 |     if filename is not None and data is not None:
117 |         raise Exception('You must provide either filename or data')
118 | 
119 |     if data:
120 |         rows = data_to_rows(data)
121 |         hex = uuid.uuid4().hex
122 |         filename = '/tmp/{}.ascii.data'.format(hex)
123 |         with open(filename, 'w', encoding='latin-1') as f:
124 |             for row in rows:
125 |                 f.write(row)
126 |                 f.write('\n')
127 | 
128 |     try:
129 |         retval = cpp_cspade(filename, support, maxsize, maxlen, mingap, maxgap, decode=False)
130 |         decode_results(retval)
131 |         return retval
132 |     finally:
133 |         if data:
134 |             os.remove(filename)
135 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | flake8==2.6.2
2 | flake8-putty==0.4.0
3 | flake8-docstrings
4 | twine
5 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [bdist_wheel]
2 | # This flag says that the code is written to work on both Python 2 and Python
3 | # 3. If at all possible, it is good practice to do this. If you cannot, you
4 | # will need to generate wheels for each Python version that you support.
5 | universal=1
6 | [metadata]
7 | description-file=README.md
8 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
  1 | from setuptools import setup, Extension
  2 | from codecs import open
  3 | import sys
  4 | import os
  5 | 
  6 | # is_windows = sys.platform.startswith('win')
  7 | 
  8 | def is_platform_mac():
  9 |     return sys.platform == 'darwin'
 10 | 
 11 | def is_platform_windows():
 12 |     return sys.platform == 'win32' or sys.platform == 'cygwin'
 13 | 
 14 | try:
 15 |     from Cython.Distutils import build_ext
 16 | except ImportError:
 17 |     use_cython = False
 18 | else:
 19 |     use_cython = True
 20 | 
 21 | if use_cython:
 22 |     sourcefiles = ['pycspade/cspade.pyx']
 23 | else:
 24 |     sourcefiles = ['pycspade/cspade.cpp']
 25 | 
 26 | extra_files = ['csrc/{}'.format(x) for x in [
 27 |     'makebin.cc',
 28 |     'getconf.cc',
 29 |     'exttpose.cc',
 30 |     'wrappers.cc',
 31 |     'calcdb.cc',
 32 |     'TransArray.cc',
 33 |     'Array.cc',
 34 |     'Itemset.cc',
 35 |     'Lists.cc',
 36 |     'Eqclass.cc',
 37 |     'InvertDatabase.cc',
 38 |     'Partition.cc',
 39 |     'Sequence.cc',
 40 |     'common.cc',
 41 |     'argv_parser.cc',
 42 |     'SpadeArguments.cc',
 43 |     'FreqIt.cc',
 44 |     'EqGrNode.cc',
 45 |     'ClassInfo.cc'
 46 | ]]
 47 | 
 48 | 
 49 | # Fix compatibility when compiling on Mac Mojave.
 50 | # Explanation: https://github.com/pandas-dev/pandas/issues/23424#issuecomment-446393981
 51 | # Code credit: https://github.com/pandas-dev/pandas/pull/24274/commits/256faf2011a12424e684a42c147e1ba7ac32c6fb
 52 | if is_platform_mac():
 53 |     import _osx_support
 54 |     import distutils.sysconfig
 55 |     if not 'MACOSX_DEPLOYMENT_TARGET' in os.environ:
 56 |         current_system = list(map(int, _osx_support._get_system_version().split('.')))
 57 |         python_osx_target_str = distutils.sysconfig.get_config_var('MACOSX_DEPLOYMENT_TARGET')
 58 |         python_osx_target = list(map(int, python_osx_target_str.split('.')))
 59 |         if python_osx_target < [10, 9] and current_system >= [10, 9]:
 60 |             os.environ['MACOSX_DEPLOYMENT_TARGET'] = '10.9'
 61 | 
 62 |     os.environ['CC'] = 'clang'
 63 |     os.environ['CXX'] = 'clang'
 64 | 
 65 | if is_platform_windows():
 66 |     extra_compiler_args = []
 67 | else:
 68 |     extra_compiler_args = [
 69 |         '-std=c++11',
 70 |         '-Wno-sign-compare',
 71 |         '-Wno-incompatible-pointer-types',
 72 |         '-Wno-unused-variable',
 73 |         '-Wno-absolute-value',
 74 |         '-Wno-visibility',
 75 |         '-Wno-#warnings',
 76 |     ]
 77 | 
 78 | if is_platform_mac():
 79 |     ext_modules = [
 80 |         Extension('pycspade.cspade',
 81 |                   sourcefiles + extra_files,
 82 |                   include_dirs=['csrc/'],
 83 |                   language='c++',
 84 |                   extra_compile_args=extra_compiler_args,
 85 |                   extra_link_args=["-O2", "-march=native", '-stdlib=libc++'],
 86 |                   ),
 87 |     ]
 88 | else:
 89 |     ext_modules = [
 90 |         Extension('pycspade.cspade',
 91 |                   sourcefiles + extra_files,
 92 |                   include_dirs=['csrc/'],
 93 |                   language='c++',
 94 |                   extra_compile_args=extra_compiler_args,
 95 |                   ),
 96 |     ]
 97 | 
 98 | with open('README.md', 'r') as fh:
 99 |     long_description = fh.read()
100 | 
101 | setup_args = dict(
102 |     name='pycspade',
103 |     ext_modules=ext_modules,
104 |     license='MIT',
105 |     packages=['pycspade'],
106 |     version='0.6.6',
107 |     author=['Mohammed J. Zaki', 'Yukio Fukuzawa'],
108 |     description='C-SPADE Python Implementation',
109 |     long_description=long_description,
110 |     long_description_content_type='text/markdown',
111 |     url='https://github.com/fzyukio/python-cspade',
112 |     keywords=['cspade', 'c-spade', 'sequence mining'],
113 |     install_requires=['Cython'],
114 | )
115 | 
# Register the custom build_ext command only when the use_cython flag is set.
if use_cython:
    setup_args.update(cmdclass={'build_ext': build_ext})

setup(**setup_args)
122 | 


--------------------------------------------------------------------------------
/test-global.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# Rebuild pycspade from scratch, install it into the active environment and
# run the example script against the installed package.

# -f: the generated sources/binaries do not exist on a fresh checkout, and
# their absence is not an error.
rm -f pycspade/cspade.cpp pycspade/*.so
pip uninstall -y "pycspade>=0.0.0"
python setup.py clean
python setup.py install
python tests/example.py
7 | 


--------------------------------------------------------------------------------
/test-local.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# Rebuild the pycspade extension in place (no install) and run the example
# script against the local tree.

# -f: the generated sources/binaries do not exist on a fresh checkout, and
# their absence is not an error.
rm -f pycspade/cspade.cpp pycspade/*.so
pip uninstall -y "pycspade>=0.0.0"
python setup.py clean
python setup.py build_ext --inplace
python tests/example.py


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fzyukio/python-cspade/abb46fed3d9edef3a0ac24bc4e226bdcc47c67aa/tests/__init__.py


--------------------------------------------------------------------------------
/tests/example.py:
--------------------------------------------------------------------------------
 1 | from pycspade.helpers import spade, print_result
 2 | 
 3 | if __name__ == '__main__':
 4 |     result = spade(filename='tests/zaki.txt', support=0.3)
 5 |     print('Sequences mined:')
 6 |     print((result['seqstrm']))
 7 |     print('Logger:')
 8 |     print((result['logger']))
 9 |     print('Summary:')
10 |     print((result['summary']))
11 | 
12 |     print_result(result)
13 | 
14 |     # data = [
15 |     #     [1, 10, [3, 4]],
16 |     #     [1, 15, [1, 2, 3]],
17 |     #     [1, 20, [1, 2, 6]],
18 |     #     [1, 25, [1, 3, 4, 6]],
19 |     #     [2, 15, [1, 2, 6]],
20 |     #     [2, 20, [5]],
21 |     #     [3, 10, [1, 2, 6]],
22 |     #     [4, 10, [4, 7, 8]],
23 |     #     [4, 20, [2, 6]],
24 |     #     [4, 25, [1, 7, 8]]
25 |     # ]
26 |     #
27 |     # result = spade(data=data, support=0.3)
28 |     # print_result(result)
29 | 


--------------------------------------------------------------------------------
/tests/simplest.txt:
--------------------------------------------------------------------------------
 1 | 1 1 1 1
 2 | 1 2 1 2
 3 | 1 3 1 3
 4 | 1 4 1 4
 5 | 2 1 1 1
 6 | 2 2 1 2
 7 | 2 3 1 3
 8 | 3 1 1 1
 9 | 3 2 1 2
10 | 3 3 1 3
11 | 3 4 1 3


--------------------------------------------------------------------------------
/tests/test.ascii.data:
--------------------------------------------------------------------------------
  1 | 1 1 3 8 37 42
  2 | 1 2 4 4 11 37 42
  3 | 1 3 3 27 64 91
  4 | 1 4 2 3 4
  5 | 1 5 3 4 24 73
  6 | 1 6 2 26 67
  7 | 1 7 3 4 58 84
  8 | 1 8 3 19 62 88
  9 | 2 1 2 10 73
 10 | 2 2 1 72
 11 | 2 3 3 4 24 77
 12 | 2 4 3 19 32 39
 13 | 2 5 2 50 72
 14 | 2 6 2 3 22
 15 | 2 7 3 51 68 72
 16 | 2 8 4 11 27 53 54
 17 | 2 9 3 47 77 91
 18 | 2 10 3 3 13 58
 19 | 3 1 3 48 62 78
 20 | 3 2 3 9 32 48
 21 | 3 3 5 40 62 67 72 76
 22 | 3 4 3 10 47 58
 23 | 3 5 2 35 37
 24 | 3 6 2 45 77
 25 | 3 7 1 53
 26 | 3 8 5 3 9 11 64 92
 27 | 4 1 4 11 32 62 97
 28 | 4 2 4 37 50 56 58
 29 | 4 3 4 3 17 18 92
 30 | 4 4 3 64 68 84
 31 | 4 5 4 17 58 60 94
 32 | 4 6 6 22 27 62 80 91 92
 33 | 4 7 5 18 39 60 72 83
 34 | 4 8 3 18 58 72
 35 | 4 9 4 6 40 42 63
 36 | 4 10 5 22 49 60 72 77
 37 | 4 11 3 19 48 59
 38 | 4 12 4 9 36 79 91
 39 | 4 13 5 14 32 57 60 75
 40 | 4 14 3 6 26 44
 41 | 4 15 4 12 44 77 91
 42 | 4 16 1 55
 43 | 4 17 6 12 23 42 53 69 84
 44 | 5 1 4 30 53 71 72
 45 | 5 2 3 55 72 99
 46 | 5 3 3 0 11 59
 47 | 5 4 2 22 48
 48 | 5 5 4 3 11 71 74
 49 | 5 6 5 22 42 43 72 80
 50 | 5 7 4 3 34 62 72
 51 | 5 8 5 26 35 48 68 72
 52 | 5 9 6 13 23 26 55 62 80
 53 | 6 1 5 14 18 33 39 60
 54 | 6 2 4 28 47 62 77
 55 | 6 3 4 4 40 44 57
 56 | 6 4 4 23 48 72 88
 57 | 6 5 3 4 53 85
 58 | 6 6 6 15 28 33 44 75 92
 59 | 6 7 4 26 27 40 96
 60 | 6 8 5 3 11 42 47 48
 61 | 6 9 4 4 8 17 47
 62 | 6 10 3 28 32 40
 63 | 6 11 4 10 58 67 68
 64 | 6 12 4 3 37 62 87
 65 | 7 1 3 30 39 72
 66 | 7 2 2 6 40
 67 | 7 3 1 18
 68 | 7 4 4 22 32 72 80
 69 | 7 5 3 13 53 77
 70 | 7 6 3 14 53 72
 71 | 7 7 3 4 42 69
 72 | 7 8 1 91
 73 | 7 9 5 4 15 22 71 84
 74 | 7 10 2 39 56
 75 | 8 1 6 6 26 48 68 72 77
 76 | 8 2 5 1 33 50 58 68
 77 | 8 3 4 29 39 60 71
 78 | 8 4 4 12 79 82 88
 79 | 8 5 6 11 12 22 48 53 80
 80 | 8 6 2 11 71
 81 | 8 7 2 17 45
 82 | 8 8 4 18 22 43 64
 83 | 9 1 3 10 49 72
 84 | 9 2 2 91 92
 85 | 9 3 2 22 51
 86 | 9 4 2 53 91
 87 | 9 5 2 3 30
 88 | 9 6 2 32 69
 89 | 9 7 1 71
 90 | 9 8 2 26 48
 91 | 9 9 1 92
 92 | 9 10 2 50 58
 93 | 9 11 3 39 40 87
 94 | 9 12 2 40 70
 95 | 9 13 3 5 13 50
 96 | 10 1 3 4 39 45
 97 | 10 2 3 4 55 80
 98 | 10 3 3 23 30 95
 99 | 10 4 1 35
100 | 10 5 3 13 33 37
101 | 10 6 2 40 72
102 | 10 7 4 48 49 58 95
103 | 10 8 2 95 98
104 | 10 9 1 4
105 | 


--------------------------------------------------------------------------------
/tests/test_cspade.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from pycspade import spade
 4 | 
 5 | 
 6 | class Test(unittest.TestCase):
 7 |     def setUp(self):
 8 |         result = spade(filename='tests/zaki.txt', support=2, maxsize=5, maxlen=5)
 9 |         self.nseqs = result['nsequences']
10 |         self.occurs = {}
11 |         self.supports = {}
12 |         self.confids = {}
13 |         self.lifts = {}
14 |         self.accum_occurs = {}
15 |         self.sequences = []
16 | 
17 |         for mined_object in result['mined_objects']:
18 |             sequence = '->'.join(list(map(str, mined_object.items)))
19 |             self.sequences.append(sequence)
20 |             self.occurs[sequence] = mined_object.noccurs
21 |             self.supports[sequence] = mined_object.noccurs / self.nseqs
22 |             self.confids[sequence] = mined_object.confidence
23 |             self.lifts[sequence] = mined_object.lift
24 |             self.accum_occurs[sequence] = mined_object.accum_occurs
25 | 
26 |     def test_sequences(self):
27 |         correct_sequences = ['(1)', '(1 2)', '(1 2 6)', '(1 6)', '(2)', '(2)->(1)', '(2 6)', '(2 6)->(1)', '(4)',
28 |                              '(4)->(1)',
29 |                              '(4)->(2)', '(4)->(2)->(1)', '(4)->(2 6)', '(4)->(2 6)->(1)', '(4)->(6)', '(4)->(6)->(1)',
30 |                              '(6)', '(6)->(1)']
31 |         correct_occurs = {'(1)': 4, '(1 2)': 3, '(1 2 6)': 3, '(1 6)': 3, '(2)': 4, '(2)->(1)': 2, '(2 6)': 4,
32 |                           '(2 6)->(1)': 2, '(4)': 2, '(4)->(1)': 2, '(4)->(2)': 2, '(4)->(2)->(1)': 2, '(4)->(2 6)': 2,
33 |                           '(4)->(2 6)->(1)': 2, '(4)->(6)': 2, '(4)->(6)->(1)': 2, '(6)': 4, '(6)->(1)': 2}
34 |         correct_supports = {'(1)': 1.0, '(1 2)': 0.75, '(1 2 6)': 0.75, '(1 6)': 0.75, '(2)': 1.0, '(2)->(1)': 0.5,
35 |                             '(2 6)': 1.0, '(2 6)->(1)': 0.5, '(4)': 0.5, '(4)->(1)': 0.5, '(4)->(2)': 0.5,
36 |                             '(4)->(2)->(1)': 0.5, '(4)->(2 6)': 0.5, '(4)->(2 6)->(1)': 0.5, '(4)->(6)': 0.5,
37 |                             '(4)->(6)->(1)': 0.5, '(6)': 1.0, '(6)->(1)': 0.5}
38 |         correct_lifts = {'(1)': None, '(1 2)': None, '(1 2 6)': None, '(1 6)': None, '(2)': None, '(2)->(1)': 0.5,
39 |                          '(2 6)': None, '(2 6)->(1)': 0.5, '(4)': None, '(4)->(1)': 1.0, '(4)->(2)': 1.0,
40 |                          '(4)->(2)->(1)': 1.0, '(4)->(2 6)': 1.0, '(4)->(2 6)->(1)': 1.0, '(4)->(6)': 1.0,
41 |                          '(4)->(6)->(1)': 1.0, '(6)': None, '(6)->(1)': 0.5}
42 |         correct_confids = {'(1)': None, '(1 2)': None, '(1 2 6)': None, '(1 6)': None, '(2)': None, '(2)->(1)': 0.5,
43 |                            '(2 6)': None, '(2 6)->(1)': 0.5, '(4)': None, '(4)->(1)': 1.0, '(4)->(2)': 1.0,
44 |                            '(4)->(2)->(1)': 1.0, '(4)->(2 6)': 1.0, '(4)->(2 6)->(1)': 1.0, '(4)->(6)': 1.0,
45 |                            '(4)->(6)->(1)': 1.0, '(6)': None, '(6)->(1)': 0.5}
46 | 
47 |         self.assertListEqual(self.sequences, correct_sequences)
48 |         self.assertDictEqual(self.occurs, correct_occurs)
49 |         self.assertDictEqual(self.supports, correct_supports)
50 |         self.assertDictEqual(self.lifts, correct_lifts)
51 |         self.assertDictEqual(self.confids, correct_confids)
52 | 
53 |         print((self.accum_occurs))
54 | 


--------------------------------------------------------------------------------
/tests/zaki.conf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fzyukio/python-cspade/abb46fed3d9edef3a0ac24bc4e226bdcc47c67aa/tests/zaki.conf


--------------------------------------------------------------------------------
/tests/zaki.data:
--------------------------------------------------------------------------------
1 |                                                                                                                                     


--------------------------------------------------------------------------------
/tests/zaki.idx:
--------------------------------------------------------------------------------
1 |                     


--------------------------------------------------------------------------------
/tests/zaki.tpose:
--------------------------------------------------------------------------------
1 |                                                                   


--------------------------------------------------------------------------------
/tests/zaki.txt:
--------------------------------------------------------------------------------
 1 | 1 10 2 3 4
 2 | 1 15 3 1 2 3
 3 | 1 20 3 1 2 6
 4 | 1 25 4 1 3 4 6
 5 | 2 15 3 1 2 6
 6 | 2 20 1 5
 7 | 3 10 3 1 2 6
 8 | 4 10 3 4 7 8
 9 | 4 20 2 2 6
10 | 4 25 3 1 7 8


--------------------------------------------------------------------------------
/uppypi.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# Build a fresh source distribution and upload it with twine.

# Stop at the first failing step so a broken sdist build never reaches the upload.
set -e

rm -rf pycspade.egg-info
rm -rf dist/*
python setup.py sdist
twine upload dist/*
5 | 


--------------------------------------------------------------------------------
/utilssrc/Array.cc:
--------------------------------------------------------------------------------
 1 | #include <cstdio>
 2 | #include <cerrno>
 3 | #include "Array.h"
 4 | 
 5 | Array::Array (int sz, int npart){
 6 |    totSize = sz;
 7 |    theSize = 0;
 8 |    lastPos = 0;
 9 |    theFlg = 0;
10 |    //theIncr = incr;
11 |    theArray = NULL;
12 |    offset = new long[npart];
13 |    for (int i=0; i < npart; i++) offset[i]=0;
14 |    if (sz > 0){
15 |       theArray =  (int *) malloc (totSize*sizeof(int));
16 |       //theArray = new int [totSize];
17 |       if (theArray == NULL){
18 |          perror("memory:: Array");
19 |          exit(errno);
20 |       }
21 |    }
22 | }
23 | 
24 | Array::~Array(){
25 |    if (theArray) {
26 |       free(theArray);
27 |       //delete [] theArray;
28 |    }
29 |    delete [] offset;
30 |    theArray = NULL;
31 | }
32 | 
33 | 
34 | 
35 | 
36 | 
37 | 
38 | 


--------------------------------------------------------------------------------
/utilssrc/Array.h:
--------------------------------------------------------------------------------
  1 | #ifndef __ARRAY_H
  2 | #define __ARRAY_H
  3 | #include <cerrno>
  4 | #include <fcntl.h>
  5 | #include <unistd.h>
  6 | #include <stdlib.h>
  7 | #include <iostream>
  8 | #include <sys/types.h>
  9 | 
 10 | using namespace std;
 11 | 
 12 | class Array {
 13 | protected:
 14 |    int *theArray;
 15 |    char theFlg;
 16 |    int lastPos;
 17 |    unsigned int theSize;
 18 |    unsigned int totSize;
 19 |    long *offset;
 20 | public:
 21 |    
 22 |    Array(int sz, int npart=1);
 23 |    ~Array();
 24 | 
 25 |    int operator [] (unsigned int index)
 26 |    {
 27 |       return theArray[index];
 28 |    };
 29 | 
 30 |    char flg()
 31 |    {
 32 |       return theFlg;
 33 |    }
 34 |    void setflg(char flg)
 35 |    {
 36 |       theFlg = flg;
 37 |    }
 38 |    int lastpos()
 39 |    {
 40 |       return lastPos;
 41 |    }
 42 | 
 43 |    //to be used ony for use_seq
 44 |    void setlastpos()
 45 |    {
 46 |       theArray[lastPos+1] = theSize-lastPos-2;
 47 |       lastPos = theSize;
 48 |    }
 49 |    long get_offset(int pos=0)
 50 |    {
 51 |       return offset[pos];
 52 |    }
 53 |    void set_offset(long off, int pos=0)
 54 |    {
 55 |       offset[pos] = off;
 56 |    }
 57 |    
 58 |    int totsize()
 59 |    {
 60 |       return totSize;
 61 |    }
 62 |    void reset()
 63 |    {
 64 |       theSize = 0;
 65 |       lastPos = 0;
 66 |       theFlg = 0;
 67 |    }
 68 | 
 69 |    int *array()
 70 |    {
 71 |       return theArray;
 72 |    }
 73 |    
 74 |    int size() 
 75 |    {
 76 |       return theSize; 
 77 |    }
 78 |    void setsize(int size)
 79 |    {
 80 |       theSize = size;
 81 |    }
 82 |    
 83 |    void setitem(int pos, int item)
 84 |    {
 85 |       theArray[pos] = item;
 86 |    }
 87 | 
 88 |    void additem(int item){
 89 |       theArray[theSize] = item;
 90 |       theSize++;      
 91 |    }
 92 |    
 93 |    void flushbuf(int fd, int use_seq, int pos=0)
 94 |    {
 95 |       lseek(fd, offset[pos]*sizeof(int),SEEK_SET);
 96 | //       int wblk = (use_seq==1) ? lastPos : theSize;
 97 | //       //if (lastPos != theSize)
 98 | //       //   cout << "WBLK " << wblk << " " << lastPos << " "
 99 | //       //        << theSize << endl << flush;
100 |       int wblk = theSize;
101 |       if (wblk > 0){
102 |          int res = ::write(fd, (char *)theArray, wblk*sizeof(int));
103 |          if (res < wblk*sizeof(int)){
104 |             perror("Error writing");
105 |             exit(errno);
106 |          }
107 |          offset[pos] += wblk;
108 |       }
109 |       theSize = 0;
110 |    }
111 |    void add (int fd, int item, int use_seq, int pos, int custid=-1)
112 |    {
113 |       if (use_seq){
114 | //          if (theSize+1+((custid == -1)?0:2)> totSize){
115 | //             //cout << "WRITE " << item << " " << custid << " "
116 | //             //     << offset << " " << lastPos << " " << theSize << " "
117 | //             //     << totSize << endl << flush;
118 | //             if (lastPos == 0 && custid == -1){
119 | //                cout << "REALLOC " << totSize << " "<< theSize << endl;
120 | //                totSize *= 2;
121 | //                theArray = (int *)realloc(theArray, totSize*sizeof(int));
122 | //                if (theArray == NULL){
123 | //                   perror("ERROR IN REALLOC Array::add");
124 | //                   exit(errno);
125 | //                }
126 | //             }
127 | //             else{
128 | //                flushbuf(fd,use_seq,pos);
129 | //                for (int i=0; lastPos < theSize; i++, lastPos++)
130 | //                   theArray[i] = theArray[lastPos];
131 | //                theSize = i;
132 | //                lastPos = 0;
133 | //                //cout << "WROTE " << theSize << " " << lastPos << " "<<
134 | //                //   offset << endl <<flush;
135 | //             }
136 | //          }
137 | //          if (custid !=-1){
138 | //             theArray[theSize++] = custid; //store custid
139 | //             theSize++; //for the tid count
140 | //          }
141 |          if (theSize+2 > totSize){
142 |             flushbuf(fd,use_seq,pos);
143 |          }
144 |          theArray[theSize++] = custid;
145 |       }
146 |       else{
147 |          if (theSize+1 > totSize){
148 |             flushbuf(fd,use_seq,pos);
149 |          }
150 |       }
151 |       theArray[theSize++] = item;
152 |    }
153 | //    void add (int fd, int item, int use_seq, int custid=-1)
154 | //    {
155 | //       if (theSize+1 > totSize){
156 | //          totSize = (int) (totSize*2);
157 |          
158 | //          theArray = (int *)realloc(theArray, totSize*sizeof(int));
159 | //          if (theArray == NULL){
160 | //             cout << "MEMORY EXCEEDED\n";
161 | //             exit(-1);
162 | //          }
163 | //       }
164 | //       theArray[theSize] = item;
165 | //       theSize++;
166 | //    }
167 | };
168 | #endif //__ARRAY_H
169 | 
170 | 
171 | 


--------------------------------------------------------------------------------
/utilssrc/b2a.cc:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <cstdio>
 3 | #include <cerrno>
 4 | #include <fcntl.h>
 5 | #include <sys/stat.h>
 6 | #include <sys/types.h>
 7 | #include <sys/mman.h>
 8 | #include <unistd.h>
 9 | #include <cstdlib>
10 | 
11 | using namespace std;
12 | 
13 | int main(int argc, char **argv)
14 | {
15 |    int fd;
16 |    if ((fd = open(argv[1], O_RDONLY)) < 0){
17 |       perror("cant openfile ");
18 |       exit(errno);
19 |    }
20 |    long flen = lseek(fd, 0, SEEK_END);
21 |    int *ary;
22 | #ifdef SGI
23 |    ary = (int *) mmap((char *)NULL, flen,
24 |                           (PROT_WRITE|PROT_READ),
25 |                           MAP_PRIVATE, fd, 0);
26 | #else
27 |    ary = (int *) mmap((char *)NULL, flen,
28 |                           (PROT_WRITE|PROT_READ),
29 |                           (MAP_FILE|MAP_VARIABLE|MAP_PRIVATE), fd, 0);
30 | #endif
31 |    if (ary == (int *)-1){
32 |       perror("MMAP ERROR");
33 |       exit(errno);
34 |    }
35 |    for (int i=0; i < flen/sizeof(int); i++)
36 |       cout << " " << ary[i];
37 |    cout << endl;
38 | 
39 |    munmap((caddr_t)ary, flen);
40 |    close(fd);
41 |    
42 | }
43 | 


--------------------------------------------------------------------------------
/utilssrc/calcdb.cc:
--------------------------------------------------------------------------------
 1 | #include "calcdb.h"
 2 | #include <cstring>
 3 | 
 4 | Dbase_Ctrl_Blk::Dbase_Ctrl_Blk(char *infile, int buf_sz)
 5 | {
 6 |    fd = open (infile, O_RDONLY);
 7 |    if (fd < 0){
 8 |       printf("ERROR: InvalidFile -- Dbase_Ctrl_Blk()\n");
 9 |       exit(-1);
10 |    }
11 |    buf_size = buf_sz;
12 |    buf = new int [buf_sz];
13 |    cur_buf_pos = 0;
14 |    cur_blk_size = 0;
15 |    readall = 0;
16 |    endpos = lseek(fd,0,SEEK_END);
17 | }
18 |    
19 | Dbase_Ctrl_Blk::~Dbase_Ctrl_Blk()
20 | {
21 |    delete [] buf;
22 |    close(fd);
23 | }
24 | 
25 | void Dbase_Ctrl_Blk::get_next_trans_ext()
26 | {
27 |    // Need to get more items from file
28 |    int res = cur_blk_size - cur_buf_pos;
29 |    if (res > 0)
30 |    {
31 |       // First copy partial transaction to beginning of buffer
32 |       memcpy((void *)buf,
33 |              (void *)(buf + cur_buf_pos),
34 |              res * ITSZ);
35 |       cur_blk_size = res;
36 |    }
37 |    else
38 |    {
39 |       // No partial transaction in buffer
40 |       cur_blk_size = 0;
41 |    }
42 | 
43 |    res = read(fd, (void *)(buf + cur_blk_size),
44 |               ((buf_size - cur_blk_size)*ITSZ));
45 |    
46 |    if (res < 0){
47 |       perror("reading in database");
48 |       exit(errno);
49 |    }
50 |    cur_blk_size += res/ITSZ;
51 |    //if (cur_blk_size > 0)
52 |    //{
53 |    //   custid = buf[0];
54 |    //   tid = buf[1];
55 |    //   numitem = buf[2];
56 |    //   cur_buf_pos = 3;   
57 |    //}
58 |    cur_buf_pos = 0;
59 | }
60 | 
61 | 
62 | 
63 | 
64 | 


--------------------------------------------------------------------------------
/utilssrc/calcdb.h:
--------------------------------------------------------------------------------
 1 | #ifndef __DATABASE_H
 2 | #define __DATABASE_H
 3 | 
 4 | #include <cstdio>
 5 | #include <fstream>
 6 | #include <cstdlib>
 7 | #include <sys/types.h>
 8 | #include <unistd.h>
 9 | #include <sys/stat.h>
10 | #include <fcntl.h>
11 | #include <cerrno>
12 | 
13 | using namespace std;
14 | 
15 | extern int use_seq;
16 | 
17 | #define ITSZ sizeof(int)
18 | #define DCBBUFSZ 2048
19 | //#define TRANSOFF ((use_seq)?3:2)
20 | #define TRANSOFF 3
21 | 
22 | class Dbase_Ctrl_Blk{
23 | public:
24 |    Dbase_Ctrl_Blk(char *infile, int buf_sz=DCBBUFSZ);
25 |    ~Dbase_Ctrl_Blk();
26 | 
27 |    void get_next_trans_ext();
28 |    inline void get_first_blk();
29 |    inline void get_next_trans(int *&lbuf, int &numitem, int &tid, int &custid);
30 | 
31 |    int eof()
32 |    {
33 |       return (readall == 1);
34 |    }
35 |    int fd;     
36 |    int buf_size;
37 |    int * buf;
38 |    int cur_blk_size; 
39 |    int cur_buf_pos;
40 |    int endpos;
41 |    char readall;
42 | };
43 | 
44 | inline void Dbase_Ctrl_Blk::get_first_blk()
45 | {
46 |    readall=0;
47 |    lseek(fd, 0, SEEK_SET);
48 |    cur_blk_size = (read(fd,(void *)buf, (buf_size*ITSZ)))/ITSZ;
49 |    if (cur_blk_size < 0){
50 |       perror("get_first_blk");
51 |       exit(errno);
52 |    }
53 |    cur_buf_pos = 0;
54 | }
55 | 
56 | inline  void Dbase_Ctrl_Blk::get_next_trans (int *&lbuf,
57 |                                              int &nitems, int &tid, int &cid)
58 | {
59 |    if (cur_buf_pos+TRANSOFF >= cur_blk_size ||
60 |        cur_buf_pos+buf[cur_buf_pos+TRANSOFF-1]+TRANSOFF > cur_blk_size){
61 |       if (lseek(fd, 0, SEEK_CUR) == endpos) readall = 1;
62 |       if (!readall){
63 |          // Need to get more items from file
64 |          get_next_trans_ext();
65 |       }      
66 |    }
67 |    
68 |    if (!readall){
69 |       cid = buf[cur_buf_pos];
70 |       tid = buf[cur_buf_pos+TRANSOFF-2];
71 |       nitems = buf[cur_buf_pos+TRANSOFF-1];
72 |       lbuf = buf + cur_buf_pos + TRANSOFF;
73 |       cur_buf_pos += nitems + TRANSOFF;
74 |    }
75 | }
76 | #endif //__DATABASE_H
77 | 
78 | 
79 | 
80 | 
81 | 
82 | 


--------------------------------------------------------------------------------
/utilssrc/getconf.cc:
--------------------------------------------------------------------------------
  1 | #include <cerrno>
  2 | #include <cstdio>
  3 | #include <stdlib.h>
  4 | #include <iostream>
  5 | #include <fcntl.h>
  6 | #include <sys/stat.h>
  7 | #include <sys/types.h>
  8 | #include <sys/mman.h>
  9 | #include <sys/time.h>
 10 | #include <cstring>
 11 | #include <cmath>
 12 | 
 13 | #include "calcdb.h"
 14 | 
 15 | using namespace std;
 16 | 
 17 | #define ITSZ sizeof(int)
 18 | 
 19 | char input[300];       //input file name
 20 | char confn[300];
 21 | int use_seq = 1;
 22 | 
 23 | void parse_args(int argc, char **argv)
 24 | {
 25 |    extern char * optarg;
 26 |    int c;
 27 |    
 28 |    if (argc < 2)
 29 |       cout << "usage: assocFB -i<infile> -o<outfile>\n";
 30 |    else{
 31 |       while ((c=getopt(argc,argv,"ai:o:"))!=-1){
 32 |          switch(c){
 33 |          case 'a': //work on assoc
 34 |             use_seq = 0;
 35 | 	    printf("USE SEQ = 0\n");
 36 |             break;            
 37 |          case 'i':
 38 |             sprintf(input,"%s.data",optarg);
 39 |             break;
 40 |          case 'o':
 41 |             sprintf(confn, "%s.conf", optarg);
 42 |             break;
 43 |          }
 44 |       }
 45 |    }
 46 | }
 47 | 
 48 | 
 49 | int getconfFunc(int argc, char **argv)
 50 | {
 51 |    parse_args(argc, argv);
 52 | 
 53 |    int DBASE_NUM_TRANS=0;
 54 |    int DBASE_MAXITEM=0;
 55 |    int DBASE_NUM_CUST=0;
 56 |    int DBASE_MINTRANS=0;
 57 |    int DBASE_MAXTRANS=0;   
 58 |    float DBASE_AVG_TRANS_SZ=0;
 59 |    float DBASE_AVG_CUST_SZ=0;
 60 |    
 61 |    int i;
 62 | 
 63 |    int custid, tid, nitem;
 64 |    int *buf;
 65 |    int oldcustid=-1;
 66 |    int oldtcnt = 0;
 67 |    int tsizesum = 0;
 68 |    int tcustsum = 0;
 69 |    int tsizesq = 0;
 70 |    int maxnitem = 0;
 71 | 
 72 |    Dbase_Ctrl_Blk *DCB = new Dbase_Ctrl_Blk(input);
 73 |    DCB->get_first_blk();
 74 |    DCB->get_next_trans(buf, nitem, tid, custid);
 75 |    DBASE_MINTRANS = custid;  
 76 |    while (!DCB->eof()){
 77 |       //printf ("%d %d %d\n", custid, tid, nitem);
 78 |       DBASE_MAXTRANS = custid;  
 79 |       if (use_seq){
 80 |          if (oldcustid != custid){
 81 |             tcustsum += DBASE_NUM_TRANS - oldtcnt;
 82 |             oldtcnt = DBASE_NUM_TRANS;
 83 |             DBASE_NUM_CUST++;
 84 |             oldcustid = custid;
 85 |          }
 86 |       }
 87 |       DBASE_NUM_TRANS++;
 88 |       tsizesum += nitem;
 89 |       if (nitem > maxnitem) maxnitem = nitem;
 90 |       
 91 |       tsizesq += (nitem*nitem);
 92 |       for (i=0; i < nitem; i++)
 93 |          if (buf[i] > DBASE_MAXITEM) DBASE_MAXITEM = buf[i];
 94 |       DCB->get_next_trans(buf, nitem, tid, custid);
 95 |    }
 96 |    tcustsum += DBASE_NUM_TRANS - oldtcnt;
 97 |    DBASE_MAXITEM++;
 98 | 
 99 |    if (use_seq) DBASE_AVG_CUST_SZ = (1.0*tcustsum)/DBASE_NUM_CUST;
100 |    DBASE_AVG_TRANS_SZ = (1.0*tsizesum)/DBASE_NUM_TRANS;
101 |    double trans_sq_avg = (1.0*tsizesq)/DBASE_NUM_TRANS;
102 |    double stddev = sqrt(trans_sq_avg - 
103 |                         (DBASE_AVG_TRANS_SZ*DBASE_AVG_TRANS_SZ));
104 |    
105 | 
106 |    //write config info to new file
107 |    int conffd;
108 |    if ((conffd = open(confn, (O_WRONLY|O_CREAT), 0666)) < 0){
109 |       perror("Can't open out file");
110 |       exit (errno);      
111 |    }
112 |    if (use_seq){
113 |       write(conffd,(char *)&DBASE_NUM_CUST,ITSZ);
114 |       write(conffd,(char *)&DBASE_MAXITEM,ITSZ);
115 |       write(conffd,(char *)&DBASE_AVG_CUST_SZ, sizeof(float));
116 |       write(conffd,(char *)&DBASE_AVG_TRANS_SZ, sizeof(float));
117 |       write(conffd,(char *)&DBASE_NUM_TRANS,ITSZ);
118 |       write(conffd,(char *)&DBASE_MINTRANS,ITSZ);
119 |       write(conffd,(char *)&DBASE_MAXTRANS,ITSZ);
120 |    }
121 |    else{
122 |       write(conffd,(char *)&DBASE_NUM_TRANS,ITSZ);
123 |       write(conffd,(char *)&DBASE_MAXITEM,ITSZ);
124 |       write(conffd,(char *)&DBASE_AVG_TRANS_SZ, sizeof(float));
125 |       write(conffd,(char *)&DBASE_MINTRANS,ITSZ);
126 |       write(conffd,(char *)&DBASE_MAXTRANS,ITSZ);
127 |    }
128 |    
129 |    close(conffd);
130 |    printf("CONF %d %d %f %f %d %d %d %f %d\n", DBASE_NUM_CUST, DBASE_MAXITEM,
131 |           DBASE_AVG_CUST_SZ, DBASE_AVG_TRANS_SZ, DBASE_NUM_TRANS,
132 |           DBASE_MINTRANS, DBASE_MAXTRANS, stddev, maxnitem);
133 |    delete DCB;
134 |    exit(0);
135 | }
136 | 
137 | 
138 | 
139 | 
140 | 
141 | 
142 | 
143 | 
144 | 


--------------------------------------------------------------------------------
/utilssrc/makebin.cc:
--------------------------------------------------------------------------------
 1 | #include <cerrno>
 2 | #include <iostream>
 3 | #include <cstdio>
 4 | #include <fstream>
 5 | #include <strstream>
 6 | #include <stdlib.h>
 7 | #include <fcntl.h>
 8 | #include <sys/stat.h>
 9 | #include <unistd.h>
10 | #include <cmath>
11 | 
12 | using namespace std;
13 | 
14 | #define ITSZ sizeof(int)
15 | const int lineSize=8192;
16 | const int wdSize=256;
17 | ifstream fin;
18 | ofstream fout;
19 | 
20 | void convbin(char *inBuf, int inSize)
21 | {
22 |    char inStr[wdSize];
23 |    istrstream ist(inBuf, inSize);
24 |    int it;
25 |    while(ist >> inStr){
26 |       it = atoi(inStr);
27 |       //cout << it  << " ";
28 |       fout.write((char*)&it, ITSZ);
29 |    }
30 |    //cout << endl;
31 | }
32 | 
33 | int makebinFunc(int argc, char **argv)
34 | {
35 |    char inBuf[lineSize];
36 |    int inSize;
37 |    fin.open(argv[1]);
38 |    if (!fin){
39 |       perror("cannot open in file");
40 |       exit(errno);
41 |    }
42 |    fout.open(argv[2]);
43 |    if (!fout){
44 |       perror("cannot open out file");
45 |       exit(errno);
46 |    }
47 |    
48 |    while(fin.getline(inBuf, lineSize)){
49 |       inSize = fin.gcount();
50 |       //cout << "IN SIZE " << inSize << endl;
51 |       convbin(inBuf, inSize);
52 |    }
53 | }
54 | 


--------------------------------------------------------------------------------