├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
├── datasets
│   ├── alice.dat
│   ├── alice.lab
│   ├── classification
│   │   ├── aslbu.dat
│   │   ├── aslbu.lab
│   │   ├── aslgt.dat
│   │   ├── aslgt.lab
│   │   ├── auslan2.dat
│   │   ├── auslan2.lab
│   │   ├── context.dat
│   │   ├── context.lab
│   │   ├── pioneer.dat
│   │   ├── pioneer.lab
│   │   ├── skating.dat
│   │   └── skating.lab
│   ├── gazelle.dat
│   ├── jmlr.dat
│   ├── jmlr.lab
│   ├── parallel.dat
│   └── sign.dat
├── example.dat
├── pom.xml
├── run-ISM-all.sh
├── run-SQS.sh
├── run-local.sh
├── run-pr.sh
├── run-scaling.sh
├── scripts
│   ├── intervals.py
│   ├── pr.py
│   ├── pr_par.py
│   └── scaling.py
├── sequence-miner
│   ├── pom.xml
│   └── src
│       ├── main
│       │   ├── java
│       │   │   └── sequencemining
│       │   │       ├── eval
│       │   │       │   ├── ExclusiveSequences.java
│       │   │       │   ├── FrequentSequenceMining.java
│       │   │       │   ├── IntervalClassification.java
│       │   │       │   ├── PrecisionRecallBackground.java
│       │   │       │   ├── PrecisionRecallParallel.java
│       │   │       │   ├── SequenceScaling.java
│       │   │       │   ├── SequenceSymmetricDistance.java
│       │   │       │   └── StatisticalSequenceMining.java
│       │   │       ├── main
│       │   │       │   ├── EMStep.java
│       │   │       │   ├── InferenceAlgorithms.java
│       │   │       │   ├── SequenceMining.java
│       │   │       │   ├── SequenceMiningCore.java
│       │   │       │   ├── SparkEMStep.java
│       │   │       │   └── SparkSequenceMining.java
│       │   │       ├── sequence
│       │   │       │   ├── AbstractSequence.java
│       │   │       │   └── Sequence.java
│       │   │       ├── transaction
│       │   │       │   ├── Transaction.java
│       │   │       │   ├── TransactionDatabase.java
│       │   │       │   ├── TransactionGenerator.java
│       │   │       │   ├── TransactionList.java
│       │   │       │   └── TransactionRDD.java
│       │   │       └── util
│       │   │           ├── ClassRegistrator.java
│       │   │           ├── Logging.java
│       │   │           ├── MemoryLogger.java
│       │   │           ├── PartialLogFixer.java
│       │   │           └── Tuple2.java
│       │   └── resources
│       │       ├── log4j.properties
│       │       └── spark.properties
│       └── test
│           ├── java
│           │   └── sequencemining
│           │       ├── main
│           │       │   ├── InitialProbabilitiesTest.java
│           │       │   ├── SequenceMiningTest.java
│           │       │   └── SupportCountingTest.java
│           │       └── sequence
│           │           ├── PartitionTest.java
│           │           └── SequenceTest.java
│           └── resources
│               └── TOY.txt
└── sequence-mining
    └── pom.xml

/.gitignore:
--------------------------------------------------------------------------------
*.class

*.classpath
*.project
*.pydevproject
.metadata
bin/
tmp/
target/
*.tmp
*.bak
*.swp
*~.nib
local.properties
.loadpath
.DS_Store
.__afs*

# External tool builders
.externalToolBuilders/

# Locally stored "Eclipse launch configurations"
*.launch

# CDT-specific
.cproject

# PDT-specific
.buildpath

# Miltository specific
*.ser
runParameters.txt
default.properties
.settings/
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
language: java
jdk:
  - oraclejdk8
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
ISM: Interesting Sequence Miner [![Build Status](https://travis-ci.org/mast-group/sequence-mining.svg?branch=master)](https://travis-ci.org/mast-group/sequence-mining)
================

ISM is a novel algorithm that mines the subsequences that are most interesting under a probabilistic model of a sequence database. Our model is able to efficiently infer interesting subsequences directly from the database.

This is an implementation of the sequence miner from our paper:
[*A Subsequence Interleaving Model for Sequential Pattern Mining*](http://arxiv.org/abs/1602.05012)
J. Fowkes and C. Sutton. KDD 2016.


Installation
------------

#### Installing in Eclipse

Simply import as a maven project into [Eclipse](https://eclipse.org/) using the *File -> Import...* menu option (note that this requires [m2eclipse](http://eclipse.org/m2e/)).

It's also possible to export a runnable jar from Eclipse using the *File -> Export...* menu option.

#### Compiling a Runnable Jar

To compile a standalone runnable jar, simply run

```
mvn package
```

in the top-level directory (note that this requires [maven](https://maven.apache.org/)). This will create the standalone runnable jar ```sequence-mining-1.0.jar``` in the sequence-mining/target subdirectory. The main class is *sequencemining.main.SequenceMining* (see below).


Running ISM
-----------

ISM uses a Bayesian network model to determine which subsequences are the most interesting in a given dataset.

#### Mining Interesting Sequences

The main class *sequencemining.main.SequenceMining* mines subsequences from a specified sequence database file. It has the following command line options:

* **-f**   database file to mine (in [SPMF](http://www.philippe-fournier-viger.com/spmf/) format)
* **-i**   max. no. iterations
* **-s**   max. no. structure steps
* **-r**   max. runtime (min)
* **-l**   log level (INFO/FINE/FINER/FINEST)
* **-v**   print to console instead of log file

See the javadocs in *sequencemining.main.SequenceMining* for information on the Java interface.
In Eclipse you can set command line arguments for the ISM interface using the *Run Configurations...* menu option.

#### Example Usage

A complete example using the command line interface on a runnable jar. We can mine the provided example dataset ```example.dat``` as follows:

```sh
$ java -jar sequence-mining/target/sequence-mining-1.0.jar -i 100 -f example.dat -v
```

which will output to the console. Omitting the ```-v``` flag will redirect output to a log file in ```/tmp/```.
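ISM expects its input in the SPMF integer format (see "Input Format" below), so datasets whose items are not already integers need a small encoder. A minimal Python sketch of such a dictionary-based encoder (an illustrative helper, not part of the ISM codebase):

```python
# Minimal sketch: encode raw item sequences into SPMF format for ISM.
# Items are mapped to positive integers via a dictionary, and the mapping
# is returned so mined sequences can be decoded back to the original items.
# (Illustrative helper, not part of the ISM codebase.)

def encode_spmf(sequences):
    """Map each distinct item to a positive integer and emit SPMF lines."""
    item_to_id = {}
    lines = []
    for seq in sequences:
        ids = []
        for item in seq:
            if item not in item_to_id:
                item_to_id[item] = len(item_to_id) + 1  # SPMF ids start at 1
            ids.append(item_to_id[item])
        # items are separated by -1 and each sequence is terminated by -2
        lines.append(" -1 ".join(str(i) for i in ids) + " -1 -2")
    return lines, item_to_id

lines, mapping = encode_spmf([["the", "cat"], ["the", "dog"]])
print(lines[0])  # -> "1 -1 2 -1 -2"
```

Writing `lines` to a file, one entry per line, produces a database in the same format as ```example.dat```.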

Input/Output Formats
--------------------

#### Input Format

ISM takes as input a sequence database file in [SPMF](http://www.philippe-fournier-viger.com/spmf/) format. The SPMF format is very simple: each line of the input file represents a database sequence,
and each sequence is a list of items, represented by positive integers, separated by -1 and ending with -2. For example, the first few lines (database sequences) from ```example.dat``` are:

```text
1 -1 2 -1 3 -1 4 -1 -2
3 -1 5 -1 6 -1 4 -1 -2
3 -1 4 -1 -2
3 -1 5 -1 6 -1 7 -1 8 -1 4 -1 -2
3 -1 4 -1 -2
```

Note that any other item formats (e.g. words for text corpora)
need to be manually mapped to (and from) positive integers by means of a dictionary.

#### Output Format

ISM outputs a list of interesting sequences, one sequence per line, ordered first by their interestingness (given in the 'int' column) followed by their probability (given in the 'prob' column).
For example, the first few lines of output for the usage example above are:

```text
============= INTERESTING SEQUENCES =============
[3] prob: 1.00000 int: 1.00000
[4] prob: 1.00000 int: 1.00000
[7, 8] prob: 0.47500 int: 1.00000
[5, 6] prob: 0.32000 int: 1.00000
[9] prob: 0.10500 int: 1.00000
[12] prob: 0.02000 int: 1.00000
[16] prob: 0.01000 int: 1.00000
[9, 10, 5, 6, 10, 6, 9, 10, 5, 6] prob: 0.01000 int: 1.00000
[11, 7, 8, 5, 6] prob: 0.00500 int: 1.00000
[13, 14, 15, 13, 14, 5, 6] prob: 0.00500 int: 1.00000
```

See the accompanying [paper](http://arxiv.org/abs/1602.05012) for details of how to interpret 'interestingness' and 'probability' under ISM's probabilistic model.


Datasets
--------

The datasets used in the paper are available in the ```datasets``` subdirectory.
All datasets are in [SPMF](http://www.philippe-fournier-viger.com/spmf/) format (see above). The classification datasets additionally include the class labels for each transaction in a ```.lab``` file. 105 | 106 | 107 | Bugs 108 | ---- 109 | 110 | Please report any bugs using GitHub's issue tracker. 111 | 112 | 113 | License 114 | ------- 115 | 116 | This algorithm is released under the GNU GPLv3 license. Other licenses are available on request. 117 | -------------------------------------------------------------------------------- /datasets/classification/aslbu.lab: -------------------------------------------------------------------------------- 1 | 195 2 | 195 3 | 191 4 | 195 5 | 195 6 | 199 7 | 199 8 | 210 9 | 218 10 | 195 11 | 195 12 | 195 13 | 218 14 | 218 15 | 218 16 | 209 17 | 195 18 | 199 19 | 199 20 | 195 21 | 191 22 | 195 23 | 191 24 | 209 25 | 191 26 | 210 27 | 191 28 | 209 29 | 199 30 | 195 31 | 209 32 | 191 33 | 191 34 | 195 35 | 195 36 | 195 37 | 209 38 | 209 39 | 191 40 | 191 41 | 191 42 | 199 43 | 199 44 | 191 45 | 191 46 | 209 47 | 191 48 | 191 49 | 199 50 | 195 51 | 191 52 | 195 53 | 195 54 | 195 55 | 199 56 | 199 57 | 195 58 | 195 59 | 199 60 | 195 61 | 209 62 | 191 63 | 191 64 | 209 65 | 191 66 | 191 67 | 191 68 | 191 69 | 195 70 | 209 71 | 199 72 | 210 73 | 195 74 | 195 75 | 199 76 | 199 77 | 199 78 | 199 79 | 209 80 | 195 81 | 195 82 | 209 83 | 210 84 | 209 85 | 199 86 | 199 87 | 209 88 | 209 89 | 191 90 | 199 91 | 195 92 | 191 93 | 209 94 | 209 95 | 209 96 | 209 97 | 199 98 | 199 99 | 209 100 | 191 101 | 209 102 | 191 103 | 191 104 | 209 105 | 209 106 | 209 107 | 209 108 | 191 109 | 209 110 | 191 111 | 209 112 | 191 113 | 209 114 | 195 115 | 195 116 | 199 117 | 209 118 | 199 119 | 191 120 | 209 121 | 199 122 | 199 123 | 209 124 | 210 125 | 210 126 | 210 127 | 210 128 | 210 129 | 199 130 | 195 131 | 199 132 | 199 133 | 199 134 | 199 135 | 209 136 | 209 137 | 209 138 | 195 139 | 195 140 | 195 141 | 191 142 | 195 143 | 210 144 | 210 145 | 210 146 
| 218 147 | 210 148 | 195 149 | 195 150 | 218 151 | 210 152 | 209 153 | 210 154 | 209 155 | 195 156 | 195 157 | 195 158 | 191 159 | 195 160 | 191 161 | 209 162 | 191 163 | 209 164 | 191 165 | 210 166 | 191 167 | 209 168 | 209 169 | 209 170 | 209 171 | 209 172 | 191 173 | 191 174 | 191 175 | 195 176 | 195 177 | 209 178 | 209 179 | 209 180 | 210 181 | 210 182 | 210 183 | 199 184 | 199 185 | 195 186 | 195 187 | 210 188 | 210 189 | 209 190 | 210 191 | 210 192 | 210 193 | 210 194 | 191 195 | 191 196 | 209 197 | 191 198 | 191 199 | 191 200 | 191 201 | 191 202 | 191 203 | 209 204 | 209 205 | 191 206 | 191 207 | 191 208 | 191 209 | 209 210 | 209 211 | 209 212 | 209 213 | 209 214 | 195 215 | 209 216 | 209 217 | 209 218 | 209 219 | 209 220 | 203 221 | 195 222 | 195 223 | 195 224 | 209 225 | 195 226 | 210 227 | 195 228 | 210 229 | 195 230 | 195 231 | 195 232 | 195 233 | 203 234 | 203 235 | 203 236 | 203 237 | 203 238 | 195 239 | 203 240 | 203 241 | 195 242 | 203 243 | 195 244 | 195 245 | 195 246 | 209 247 | 195 248 | 195 249 | 195 250 | 195 251 | 195 252 | 195 253 | 195 254 | 195 255 | 195 256 | 195 257 | 195 258 | 195 259 | 195 260 | 195 261 | 203 262 | 195 263 | 199 264 | 195 265 | 191 266 | 199 267 | 210 268 | 199 269 | 199 270 | 195 271 | 210 272 | 199 273 | 191 274 | 195 275 | 191 276 | 191 277 | 210 278 | 199 279 | 209 280 | 195 281 | 199 282 | 191 283 | 191 284 | 195 285 | 191 286 | 191 287 | 209 288 | 209 289 | 191 290 | 191 291 | 191 292 | 191 293 | 191 294 | 191 295 | 195 296 | 191 297 | 191 298 | 191 299 | 191 300 | 191 301 | 191 302 | 195 303 | 191 304 | 195 305 | 191 306 | 191 307 | 191 308 | 195 309 | 195 310 | 195 311 | 195 312 | 195 313 | 195 314 | 195 315 | 195 316 | 195 317 | 195 318 | 195 319 | 195 320 | 195 321 | 195 322 | 195 323 | 195 324 | 195 325 | 195 326 | 195 327 | 195 328 | 195 329 | 191 330 | 191 331 | 191 332 | 191 333 | 195 334 | 195 335 | 195 336 | 195 337 | 195 338 | 195 339 | 195 340 | 195 341 | 195 342 | 195 343 | 195 344 | 195 345 | 195 346 
| 195 347 | 199 348 | 199 349 | 199 350 | 199 351 | 209 352 | 209 353 | 210 354 | 209 355 | 209 356 | 210 357 | 210 358 | 191 359 | 195 360 | 195 361 | 195 362 | 195 363 | 195 364 | 195 365 | 195 366 | 195 367 | 195 368 | 195 369 | 195 370 | 195 371 | 195 372 | 195 373 | 195 374 | 195 375 | 195 376 | 195 377 | 195 378 | 195 379 | 195 380 | 195 381 | 195 382 | 195 383 | 195 384 | 195 385 | 195 386 | 195 387 | 195 388 | 195 389 | 195 390 | 195 391 | 195 392 | 195 393 | 195 394 | 195 395 | 195 396 | 199 397 | 195 398 | 195 399 | 218 400 | 191 401 | 218 402 | 218 403 | 218 404 | 191 405 | 191 406 | 218 407 | 191 408 | 191 409 | 218 410 | 218 411 | 218 412 | 191 413 | 191 414 | 218 415 | 191 416 | 218 417 | 191 418 | 218 419 | 209 420 | 210 421 | 210 422 | 209 423 | 195 424 | 195 425 | -------------------------------------------------------------------------------- /datasets/classification/auslan2.dat: -------------------------------------------------------------------------------- 1 | 1 -1 2 -1 3 -1 4 -1 -2 2 | 3 -1 5 -1 6 -1 4 -1 -2 3 | 3 -1 4 -1 -2 4 | 3 -1 5 -1 6 -1 7 -1 8 -1 4 -1 -2 5 | 3 -1 4 -1 -2 6 | 3 -1 4 -1 -2 7 | 5 -1 6 -1 3 -1 4 -1 -2 8 | 3 -1 5 -1 6 -1 4 -1 -2 9 | 5 -1 6 -1 3 -1 9 -1 10 -1 5 -1 6 -1 4 -1 7 -1 1 -1 2 -1 3 -1 8 -1 4 -1 -2 10 | 3 -1 7 -1 8 -1 4 -1 -2 11 | 3 -1 4 -1 -2 12 | 3 -1 5 -1 6 -1 4 -1 -2 13 | 3 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 14 | 5 -1 6 -1 9 -1 3 -1 7 -1 8 -1 10 -1 5 -1 4 -1 6 -1 -2 15 | 3 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 16 | 3 -1 5 -1 6 -1 4 -1 -2 17 | 3 -1 4 -1 1 -1 2 -1 3 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 18 | 1 -1 2 -1 3 -1 4 -1 -2 19 | 3 -1 4 -1 -2 20 | 3 -1 4 -1 -2 21 | 3 -1 7 -1 4 -1 1 -1 2 -1 3 -1 8 -1 4 -1 -2 22 | 3 -1 7 -1 8 -1 4 -1 -2 23 | 1 -1 2 -1 3 -1 7 -1 8 -1 4 -1 -2 24 | 3 -1 7 -1 8 -1 4 -1 -2 25 | 3 -1 7 -1 8 -1 4 -1 -2 26 | 1 -1 2 -1 3 -1 7 -1 8 -1 11 -1 12 -1 7 -1 8 -1 4 -1 -2 27 | 3 -1 7 -1 8 -1 4 -1 -2 28 | 3 -1 7 -1 8 -1 4 -1 -2 29 | 3 -1 7 -1 8 -1 4 -1 13 -1 14 -1 -2 30 | 3 -1 7 -1 8 -1 4 -1 -2 31 | 3 -1 7 -1 8 -1 
4 -1 -2 32 | 1 -1 2 -1 3 -1 7 -1 8 -1 4 -1 -2 33 | 3 -1 7 -1 8 -1 4 -1 -2 34 | 13 -1 14 -1 3 -1 7 -1 4 -1 1 -1 2 -1 3 -1 4 -1 1 -1 2 -1 3 -1 8 -1 4 -1 -2 35 | 3 -1 7 -1 8 -1 4 -1 -2 36 | 3 -1 7 -1 8 -1 5 -1 4 -1 6 -1 -2 37 | 3 -1 7 -1 8 -1 4 -1 -2 38 | 1 -1 2 -1 3 -1 7 -1 4 -1 1 -1 2 -1 3 -1 8 -1 11 -1 4 -1 12 -1 7 -1 8 -1 -2 39 | 3 -1 7 -1 8 -1 4 -1 -2 40 | 13 -1 14 -1 3 -1 7 -1 8 -1 4 -1 5 -1 6 -1 -2 41 | 3 -1 7 -1 8 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 42 | 3 -1 7 -1 8 -1 4 -1 -2 43 | 3 -1 7 -1 8 -1 4 -1 1 -1 2 -1 3 -1 4 -1 5 -1 6 -1 -2 44 | 9 -1 3 -1 4 -1 1 -1 2 -1 3 -1 10 -1 5 -1 6 -1 4 -1 -2 45 | 3 -1 7 -1 4 -1 1 -1 2 -1 8 -1 3 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 46 | 3 -1 7 -1 8 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 47 | 3 -1 7 -1 8 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 48 | 3 -1 7 -1 8 -1 4 -1 5 -1 6 -1 -2 49 | 9 -1 10 -1 5 -1 6 -1 3 -1 7 -1 8 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 50 | 3 -1 7 -1 8 -1 4 -1 -2 51 | 3 -1 4 -1 1 -1 2 -1 7 -1 3 -1 8 -1 4 -1 -2 52 | 3 -1 4 -1 1 -1 2 -1 7 -1 8 -1 3 -1 4 -1 5 -1 6 -1 -2 53 | 3 -1 7 -1 8 -1 4 -1 -2 54 | 3 -1 7 -1 8 -1 4 -1 -2 55 | 3 -1 4 -1 -2 56 | 3 -1 7 -1 8 -1 4 -1 1 -1 2 -1 3 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 57 | 3 -1 5 -1 6 -1 4 -1 -2 58 | 3 -1 4 -1 7 -1 1 -1 2 -1 3 -1 8 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 59 | 3 -1 7 -1 8 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 60 | 3 -1 7 -1 8 -1 4 -1 5 -1 6 -1 -2 61 | 3 -1 4 -1 -2 62 | 3 -1 4 -1 -2 63 | 3 -1 4 -1 5 -1 6 -1 9 -1 10 -1 5 -1 6 -1 -2 64 | 3 -1 4 -1 -2 65 | 3 -1 4 -1 -2 66 | 1 -1 2 -1 3 -1 5 -1 6 -1 4 -1 -2 67 | 3 -1 4 -1 -2 68 | 3 -1 4 -1 -2 69 | 3 -1 4 -1 -2 70 | 3 -1 4 -1 -2 71 | 3 -1 4 -1 1 -1 2 -1 7 -1 8 -1 3 -1 4 -1 -2 72 | 3 -1 4 -1 -2 73 | 3 -1 4 -1 -2 74 | 5 -1 3 -1 6 -1 9 -1 10 -1 5 -1 6 -1 9 -1 10 -1 5 -1 4 -1 6 -1 -2 75 | 13 -1 14 -1 15 -1 3 -1 16 -1 13 -1 14 -1 4 -1 -2 76 | 3 -1 5 -1 6 -1 9 -1 10 -1 5 -1 6 -1 9 -1 10 -1 5 -1 6 -1 9 -1 10 -1 5 -1 6 -1 4 -1 -2 77 | 1 -1 2 -1 3 -1 4 -1 -2 78 | 3 -1 13 -1 14 -1 5 -1 15 -1 6 -1 16 -1 9 -1 10 -1 7 -1 8 -1 13 -1 14 -1 5 -1 6 -1 4 -1 -2 79 | 3 -1 5 -1 6 
-1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 80 | 5 -1 3 -1 6 -1 4 -1 -2 81 | 13 -1 14 -1 3 -1 7 -1 8 -1 4 -1 -2 82 | 3 -1 7 -1 8 -1 4 -1 -2 83 | 3 -1 4 -1 7 -1 1 -1 2 -1 3 -1 4 -1 1 -1 2 -1 3 -1 8 -1 4 -1 -2 84 | 1 -1 2 -1 3 -1 7 -1 8 -1 4 -1 -2 85 | 3 -1 7 -1 8 -1 4 -1 5 -1 6 -1 -2 86 | 3 -1 7 -1 8 -1 4 -1 -2 87 | 3 -1 7 -1 8 -1 4 -1 -2 88 | 3 -1 7 -1 8 -1 4 -1 5 -1 6 -1 -2 89 | 3 -1 7 -1 8 -1 4 -1 -2 90 | 3 -1 7 -1 8 -1 4 -1 -2 91 | 3 -1 7 -1 8 -1 4 -1 5 -1 6 -1 -2 92 | 3 -1 7 -1 8 -1 4 -1 -2 93 | 3 -1 7 -1 8 -1 4 -1 -2 94 | 1 -1 2 -1 3 -1 7 -1 8 -1 4 -1 -2 95 | 13 -1 14 -1 3 -1 7 -1 8 -1 4 -1 -2 96 | 3 -1 7 -1 5 -1 6 -1 8 -1 4 -1 -2 97 | 3 -1 7 -1 8 -1 4 -1 -2 98 | 3 -1 7 -1 8 -1 4 -1 -2 99 | 3 -1 7 -1 8 -1 4 -1 -2 100 | 3 -1 7 -1 4 -1 1 -1 2 -1 3 -1 4 -1 1 -1 2 -1 3 -1 8 -1 4 -1 -2 101 | 1 -1 2 -1 3 -1 7 -1 8 -1 5 -1 6 -1 4 -1 -2 102 | 1 -1 2 -1 3 -1 7 -1 5 -1 6 -1 8 -1 4 -1 -2 103 | 3 -1 7 -1 5 -1 8 -1 6 -1 4 -1 -2 104 | 3 -1 7 -1 8 -1 5 -1 6 -1 9 -1 10 -1 5 -1 6 -1 9 -1 10 -1 5 -1 6 -1 4 -1 -2 105 | 1 -1 2 -1 3 -1 5 -1 6 -1 9 -1 10 -1 5 -1 6 -1 4 -1 -2 106 | 3 -1 7 -1 5 -1 6 -1 9 -1 10 -1 5 -1 8 -1 6 -1 9 -1 10 -1 4 -1 5 -1 6 -1 -2 107 | 3 -1 5 -1 6 -1 9 -1 10 -1 5 -1 6 -1 9 -1 10 -1 5 -1 6 -1 4 -1 9 -1 10 -1 5 -1 6 -1 -2 108 | 3 -1 7 -1 5 -1 6 -1 9 -1 10 -1 5 -1 6 -1 8 -1 4 -1 -2 109 | 1 -1 2 -1 3 -1 5 -1 6 -1 4 -1 -2 110 | 3 -1 7 -1 4 -1 1 -1 2 -1 3 -1 5 -1 6 -1 9 -1 10 -1 5 -1 6 -1 9 -1 10 -1 5 -1 6 -1 8 -1 4 -1 -2 111 | 3 -1 7 -1 8 -1 11 -1 5 -1 12 -1 7 -1 8 -1 6 -1 9 -1 10 -1 5 -1 6 -1 4 -1 -2 112 | 3 -1 7 -1 5 -1 6 -1 8 -1 4 -1 -2 113 | 3 -1 5 -1 6 -1 4 -1 -2 114 | 3 -1 7 -1 8 -1 5 -1 6 -1 9 -1 10 -1 5 -1 6 -1 4 -1 -2 115 | 3 -1 7 -1 5 -1 8 -1 6 -1 4 -1 -2 116 | 3 -1 7 -1 5 -1 6 -1 9 -1 10 -1 5 -1 6 -1 8 -1 9 -1 10 -1 5 -1 6 -1 4 -1 -2 117 | 3 -1 7 -1 8 -1 5 -1 6 -1 9 -1 10 -1 5 -1 6 -1 4 -1 -2 118 | 3 -1 7 -1 5 -1 8 -1 6 -1 4 -1 -2 119 | 1 -1 2 -1 3 -1 7 -1 8 -1 4 -1 -2 120 | 3 -1 5 -1 6 -1 4 -1 -2 121 | 3 -1 4 -1 -2 122 | 3 -1 4 -1 -2 123 | 3 -1 4 -1 -2 124 | 3 -1 
4 -1 5 -1 6 -1 -2 125 | 3 -1 4 -1 -2 126 | 3 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 127 | 3 -1 4 -1 -2 128 | 3 -1 5 -1 6 -1 4 -1 -2 129 | 1 -1 2 -1 3 -1 4 -1 -2 130 | 3 -1 4 -1 -2 131 | 3 -1 4 -1 -2 132 | 3 -1 4 -1 -2 133 | 13 -1 14 -1 3 -1 4 -1 -2 134 | 1 -1 2 -1 3 -1 4 -1 -2 135 | 3 -1 4 -1 -2 136 | 3 -1 4 -1 5 -1 6 -1 -2 137 | 3 -1 4 -1 -2 138 | 3 -1 4 -1 -2 139 | 3 -1 4 -1 -2 140 | 3 -1 5 -1 6 -1 4 -1 -2 141 | 9 -1 1 -1 2 -1 3 -1 4 -1 10 -1 -2 142 | 5 -1 3 -1 6 -1 9 -1 10 -1 5 -1 4 -1 6 -1 -2 143 | 9 -1 3 -1 4 -1 10 -1 5 -1 6 -1 -2 144 | 3 -1 5 -1 4 -1 6 -1 -2 145 | 3 -1 4 -1 -2 146 | 3 -1 4 -1 -2 147 | 3 -1 4 -1 -2 148 | 3 -1 4 -1 -2 149 | 3 -1 4 -1 -2 150 | 3 -1 4 -1 -2 151 | 3 -1 4 -1 -2 152 | 3 -1 4 -1 -2 153 | 3 -1 4 -1 -2 154 | 3 -1 4 -1 5 -1 6 -1 -2 155 | 3 -1 4 -1 -2 156 | 3 -1 4 -1 -2 157 | 3 -1 4 -1 -2 158 | 3 -1 4 -1 -2 159 | 3 -1 4 -1 -2 160 | 3 -1 4 -1 -2 161 | 3 -1 4 -1 -2 162 | 3 -1 4 -1 -2 163 | 3 -1 4 -1 -2 164 | 3 -1 4 -1 -2 165 | 5 -1 3 -1 6 -1 4 -1 -2 166 | 3 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 167 | 3 -1 4 -1 1 -1 2 -1 13 -1 14 -1 3 -1 4 -1 -2 168 | 3 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 169 | 3 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 170 | 3 -1 4 -1 -2 171 | 3 -1 4 -1 1 -1 2 -1 3 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 172 | 3 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 173 | 1 -1 2 -1 3 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 174 | 3 -1 4 -1 -2 175 | 3 -1 5 -1 6 -1 4 -1 -2 176 | 1 -1 2 -1 3 -1 4 -1 -2 177 | 3 -1 4 -1 -2 178 | 3 -1 4 -1 5 -1 6 -1 -2 179 | 3 -1 4 -1 -2 180 | 3 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 181 | 3 -1 7 -1 8 -1 4 -1 1 -1 2 -1 3 -1 4 -1 1 -1 2 -1 3 -1 4 -1 5 -1 6 -1 -2 182 | 3 -1 4 -1 -2 183 | 3 -1 7 -1 8 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 184 | 3 -1 7 -1 8 -1 5 -1 4 -1 6 -1 -2 185 | 3 -1 7 -1 8 -1 4 -1 -2 186 | 1 -1 2 -1 3 -1 7 -1 8 -1 4 -1 -2 187 | 3 -1 7 -1 8 -1 4 -1 -2 188 | 1 -1 2 -1 3 -1 7 -1 8 -1 4 -1 -2 189 | 3 -1 7 -1 8 -1 4 -1 -2 190 | 3 -1 4 -1 -2 191 | 3 -1 7 -1 8 -1 4 -1 -2 192 | 3 -1 7 -1 8 -1 4 -1 -2 193 | 3 -1 7 -1 8 -1 4 -1 -2 194 | 3 -1 4 -1 5 -1 6 -1 -2 195 | 3 -1 7 
-1 8 -1 4 -1 -2 196 | 1 -1 2 -1 3 -1 7 -1 8 -1 4 -1 5 -1 6 -1 -2 197 | 3 -1 7 -1 8 -1 4 -1 1 -1 2 -1 3 -1 4 -1 5 -1 6 -1 -2 198 | 3 -1 7 -1 8 -1 4 -1 5 -1 6 -1 -2 199 | 3 -1 7 -1 4 -1 1 -1 2 -1 3 -1 8 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 200 | 3 -1 7 -1 8 -1 11 -1 4 -1 1 -1 2 -1 12 -1 -2 201 | -------------------------------------------------------------------------------- /datasets/classification/auslan2.lab: -------------------------------------------------------------------------------- 1 | 1 2 | 1 3 | 1 4 | 1 5 | 1 6 | 1 7 | 1 8 | 1 9 | 1 10 | 1 11 | 1 12 | 1 13 | 1 14 | 1 15 | 1 16 | 1 17 | 1 18 | 1 19 | 1 20 | 1 21 | 2 22 | 2 23 | 2 24 | 2 25 | 2 26 | 2 27 | 2 28 | 2 29 | 2 30 | 2 31 | 2 32 | 2 33 | 2 34 | 2 35 | 2 36 | 2 37 | 2 38 | 2 39 | 2 40 | 2 41 | 3 42 | 3 43 | 3 44 | 3 45 | 3 46 | 3 47 | 3 48 | 3 49 | 3 50 | 3 51 | 3 52 | 3 53 | 3 54 | 3 55 | 3 56 | 3 57 | 3 58 | 3 59 | 3 60 | 3 61 | 4 62 | 4 63 | 4 64 | 4 65 | 4 66 | 4 67 | 4 68 | 4 69 | 4 70 | 4 71 | 4 72 | 4 73 | 4 74 | 4 75 | 4 76 | 4 77 | 4 78 | 4 79 | 4 80 | 4 81 | 5 82 | 5 83 | 5 84 | 5 85 | 5 86 | 5 87 | 5 88 | 5 89 | 5 90 | 5 91 | 5 92 | 5 93 | 5 94 | 5 95 | 5 96 | 5 97 | 5 98 | 5 99 | 5 100 | 5 101 | 6 102 | 6 103 | 6 104 | 6 105 | 6 106 | 6 107 | 6 108 | 6 109 | 6 110 | 6 111 | 6 112 | 6 113 | 6 114 | 6 115 | 6 116 | 6 117 | 6 118 | 6 119 | 6 120 | 6 121 | 7 122 | 7 123 | 7 124 | 7 125 | 7 126 | 7 127 | 7 128 | 7 129 | 7 130 | 7 131 | 7 132 | 7 133 | 7 134 | 7 135 | 7 136 | 7 137 | 7 138 | 7 139 | 7 140 | 7 141 | 8 142 | 8 143 | 8 144 | 8 145 | 8 146 | 8 147 | 8 148 | 8 149 | 8 150 | 8 151 | 8 152 | 8 153 | 8 154 | 8 155 | 8 156 | 8 157 | 8 158 | 8 159 | 8 160 | 8 161 | 9 162 | 9 163 | 9 164 | 9 165 | 9 166 | 9 167 | 9 168 | 9 169 | 9 170 | 9 171 | 9 172 | 9 173 | 9 174 | 9 175 | 9 176 | 9 177 | 9 178 | 9 179 | 9 180 | 9 181 | 10 182 | 10 183 | 10 184 | 10 185 | 10 186 | 10 187 | 10 188 | 10 189 | 10 190 | 10 191 | 10 192 | 10 193 | 10 194 | 10 195 | 10 196 | 10 197 | 10 198 | 10 199 | 10 200 | 
10 201 | -------------------------------------------------------------------------------- /datasets/classification/context.lab: -------------------------------------------------------------------------------- 1 | 1 2 | 1 3 | 1 4 | 1 5 | 1 6 | 1 7 | 1 8 | 1 9 | 1 10 | 1 11 | 1 12 | 1 13 | 1 14 | 1 15 | 1 16 | 1 17 | 1 18 | 1 19 | 1 20 | 1 21 | 1 22 | 1 23 | 1 24 | 1 25 | 1 26 | 1 27 | 1 28 | 1 29 | 1 30 | 1 31 | 1 32 | 1 33 | 1 34 | 1 35 | 1 36 | 1 37 | 1 38 | 1 39 | 1 40 | 1 41 | 1 42 | 1 43 | 1 44 | 1 45 | 2 46 | 2 47 | 2 48 | 2 49 | 2 50 | 2 51 | 2 52 | 2 53 | 2 54 | 2 55 | 2 56 | 2 57 | 2 58 | 2 59 | 2 60 | 2 61 | 2 62 | 2 63 | 2 64 | 2 65 | 2 66 | 2 67 | 2 68 | 2 69 | 2 70 | 2 71 | 2 72 | 2 73 | 2 74 | 2 75 | 2 76 | 2 77 | 2 78 | 2 79 | 2 80 | 2 81 | 2 82 | 2 83 | 2 84 | 2 85 | 2 86 | 2 87 | 2 88 | 2 89 | 2 90 | 2 91 | 2 92 | 2 93 | 3 94 | 3 95 | 3 96 | 3 97 | 3 98 | 3 99 | 3 100 | 3 101 | 3 102 | 3 103 | 3 104 | 3 105 | 3 106 | 3 107 | 3 108 | 3 109 | 3 110 | 3 111 | 3 112 | 3 113 | 3 114 | 3 115 | 3 116 | 3 117 | 3 118 | 3 119 | 3 120 | 3 121 | 3 122 | 3 123 | 3 124 | 3 125 | 3 126 | 3 127 | 3 128 | 3 129 | 3 130 | 3 131 | 3 132 | 3 133 | 3 134 | 3 135 | 3 136 | 3 137 | 3 138 | 3 139 | 3 140 | 3 141 | 4 142 | 4 143 | 4 144 | 4 145 | 4 146 | 4 147 | 4 148 | 4 149 | 4 150 | 4 151 | 4 152 | 4 153 | 4 154 | 4 155 | 4 156 | 4 157 | 4 158 | 4 159 | 4 160 | 4 161 | 4 162 | 4 163 | 4 164 | 4 165 | 4 166 | 4 167 | 4 168 | 4 169 | 4 170 | 4 171 | 4 172 | 4 173 | 4 174 | 4 175 | 4 176 | 4 177 | 4 178 | 4 179 | 4 180 | 4 181 | 4 182 | 4 183 | 4 184 | 4 185 | 4 186 | 4 187 | 4 188 | 4 189 | 4 190 | 4 191 | 5 192 | 5 193 | 5 194 | 5 195 | 5 196 | 5 197 | 5 198 | 5 199 | 5 200 | 5 201 | 5 202 | 5 203 | 5 204 | 5 205 | 5 206 | 5 207 | 5 208 | 5 209 | 5 210 | 5 211 | 5 212 | 5 213 | 5 214 | 5 215 | 5 216 | 5 217 | 5 218 | 5 219 | 5 220 | 5 221 | 5 222 | 5 223 | 5 224 | 5 225 | 5 226 | 5 227 | 5 228 | 5 229 | 5 230 | 5 231 | 5 232 | 5 233 | 5 234 | 5 235 | 5 236 | 5 237 | 5 
238 | 5 239 | 5 240 | 5 241 | -------------------------------------------------------------------------------- /datasets/classification/pioneer.lab: -------------------------------------------------------------------------------- 1 | 1 2 | 1 3 | 1 4 | 1 5 | 1 6 | 1 7 | 1 8 | 1 9 | 1 10 | 1 11 | 1 12 | 1 13 | 1 14 | 1 15 | 1 16 | 1 17 | 2 18 | 2 19 | 2 20 | 2 21 | 2 22 | 2 23 | 2 24 | 2 25 | 2 26 | 2 27 | 2 28 | 2 29 | 2 30 | 2 31 | 2 32 | 2 33 | 2 34 | 2 35 | 2 36 | 2 37 | 2 38 | 2 39 | 2 40 | 2 41 | 2 42 | 2 43 | 2 44 | 2 45 | 2 46 | 2 47 | 2 48 | 2 49 | 2 50 | 2 51 | 2 52 | 2 53 | 2 54 | 2 55 | 2 56 | 2 57 | 2 58 | 2 59 | 2 60 | 2 61 | 2 62 | 2 63 | 2 64 | 2 65 | 2 66 | 2 67 | 2 68 | 2 69 | 2 70 | 2 71 | 2 72 | 2 73 | 2 74 | 2 75 | 2 76 | 2 77 | 2 78 | 2 79 | 2 80 | 2 81 | 2 82 | 2 83 | 2 84 | 2 85 | 2 86 | 2 87 | 2 88 | 2 89 | 2 90 | 2 91 | 2 92 | 2 93 | 2 94 | 2 95 | 2 96 | 2 97 | 2 98 | 2 99 | 2 100 | 2 101 | 2 102 | 2 103 | 2 104 | 2 105 | 2 106 | 2 107 | 2 108 | 2 109 | 2 110 | 2 111 | 2 112 | 2 113 | 2 114 | 2 115 | 2 116 | 2 117 | 2 118 | 2 119 | 3 120 | 3 121 | 3 122 | 3 123 | 3 124 | 3 125 | 3 126 | 3 127 | 3 128 | 3 129 | 3 130 | 3 131 | 3 132 | 3 133 | 3 134 | 3 135 | 3 136 | 3 137 | 3 138 | 3 139 | 3 140 | 3 141 | 3 142 | 3 143 | 3 144 | 3 145 | 3 146 | 3 147 | 3 148 | 3 149 | 3 150 | 3 151 | 3 152 | 3 153 | 3 154 | 3 155 | 3 156 | 3 157 | 3 158 | 3 159 | 3 160 | 3 161 | -------------------------------------------------------------------------------- /datasets/classification/skating.lab: -------------------------------------------------------------------------------- 1 | 1 2 | 1 3 | 1 4 | 1 5 | 1 6 | 1 7 | 1 8 | 1 9 | 1 10 | 1 11 | 1 12 | 1 13 | 1 14 | 1 15 | 1 16 | 2 17 | 2 18 | 2 19 | 2 20 | 2 21 | 2 22 | 2 23 | 2 24 | 2 25 | 2 26 | 2 27 | 2 28 | 2 29 | 2 30 | 3 31 | 3 32 | 3 33 | 3 34 | 3 35 | 3 36 | 3 37 | 3 38 | 3 39 | 3 40 | 3 41 | 3 42 | 3 43 | 3 44 | 4 45 | 4 46 | 4 47 | 4 48 | 4 49 | 4 50 | 4 51 | 4 52 | 4 53 | 4 54 | 4 55 | 4 56 | 4 57 | 4 
58 | 4 59 | 4 60 | 4 61 | 5 62 | 5 63 | 5 64 | 5 65 | 5 66 | 5 67 | 5 68 | 5 69 | 5 70 | 5 71 | 5 72 | 5 73 | 5 74 | 5 75 | 5 76 | 5 77 | 5 78 | 6 79 | 6 80 | 6 81 | 6 82 | 6 83 | 6 84 | 6 85 | 6 86 | 6 87 | 6 88 | 6 89 | 6 90 | 6 91 | 6 92 | 6 93 | 6 94 | 6 95 | 6 96 | 1 97 | 1 98 | 1 99 | 1 100 | 1 101 | 1 102 | 1 103 | 1 104 | 2 105 | 2 106 | 2 107 | 2 108 | 2 109 | 2 110 | 2 111 | 2 112 | 3 113 | 3 114 | 3 115 | 3 116 | 3 117 | 3 118 | 3 119 | 3 120 | 3 121 | 3 122 | 4 123 | 4 124 | 4 125 | 4 126 | 4 127 | 4 128 | 4 129 | 4 130 | 4 131 | 4 132 | 4 133 | 4 134 | 5 135 | 5 136 | 5 137 | 5 138 | 5 139 | 5 140 | 5 141 | 5 142 | 5 143 | 5 144 | 5 145 | 5 146 | 6 147 | 6 148 | 6 149 | 6 150 | 6 151 | 6 152 | 6 153 | 6 154 | 6 155 | 7 156 | 7 157 | 7 158 | 7 159 | 7 160 | 7 161 | 7 162 | 7 163 | 7 164 | 7 165 | 7 166 | 7 167 | 1 168 | 1 169 | 1 170 | 1 171 | 1 172 | 1 173 | 1 174 | 1 175 | 1 176 | 1 177 | 2 178 | 2 179 | 2 180 | 2 181 | 2 182 | 2 183 | 2 184 | 2 185 | 2 186 | 2 187 | 2 188 | 2 189 | 2 190 | 2 191 | 2 192 | 3 193 | 3 194 | 3 195 | 3 196 | 3 197 | 3 198 | 3 199 | 3 200 | 3 201 | 3 202 | 3 203 | 3 204 | 3 205 | 3 206 | 4 207 | 4 208 | 4 209 | 4 210 | 4 211 | 4 212 | 4 213 | 4 214 | 4 215 | 4 216 | 4 217 | 4 218 | 4 219 | 4 220 | 4 221 | 4 222 | 5 223 | 5 224 | 5 225 | 5 226 | 5 227 | 5 228 | 5 229 | 5 230 | 5 231 | 5 232 | 5 233 | 5 234 | 5 235 | 5 236 | 5 237 | 5 238 | 5 239 | 6 240 | 6 241 | 6 242 | 6 243 | 6 244 | 6 245 | 6 246 | 6 247 | 6 248 | 6 249 | 6 250 | 6 251 | 6 252 | 6 253 | 6 254 | 6 255 | 6 256 | 6 257 | 7 258 | 7 259 | 7 260 | 7 261 | 7 262 | 7 263 | 7 264 | 7 265 | 7 266 | 7 267 | 7 268 | 7 269 | 7 270 | 7 271 | 7 272 | 7 273 | 7 274 | 7 275 | 7 276 | 7 277 | 7 278 | 1 279 | 1 280 | 1 281 | 1 282 | 1 283 | 1 284 | 1 285 | 1 286 | 1 287 | 1 288 | 2 289 | 2 290 | 2 291 | 2 292 | 2 293 | 2 294 | 2 295 | 2 296 | 2 297 | 2 298 | 2 299 | 3 300 | 3 301 | 3 302 | 3 303 | 3 304 | 3 305 | 3 306 | 3 307 | 3 308 | 3 309 | 3 310 | 3 311 | 4 312 | 4 
313 | 4 314 | 4 315 | 4 316 | 4 317 | 4 318 | 4 319 | 4 320 | 4 321 | 4 322 | 4 323 | 4 324 | 4 325 | 4 326 | 5 327 | 5 328 | 5 329 | 5 330 | 5 331 | 5 332 | 5 333 | 5 334 | 5 335 | 5 336 | 5 337 | 5 338 | 5 339 | 5 340 | 5 341 | 5 342 | 5 343 | 5 344 | 6 345 | 6 346 | 6 347 | 6 348 | 6 349 | 6 350 | 6 351 | 6 352 | 6 353 | 6 354 | 6 355 | 6 356 | 6 357 | 6 358 | 1 359 | 1 360 | 1 361 | 1 362 | 1 363 | 1 364 | 1 365 | 1 366 | 1 367 | 1 368 | 1 369 | 1 370 | 1 371 | 2 372 | 2 373 | 2 374 | 2 375 | 2 376 | 2 377 | 2 378 | 2 379 | 2 380 | 2 381 | 2 382 | 2 383 | 2 384 | 2 385 | 3 386 | 3 387 | 3 388 | 3 389 | 3 390 | 3 391 | 3 392 | 3 393 | 3 394 | 3 395 | 4 396 | 4 397 | 4 398 | 4 399 | 4 400 | 4 401 | 4 402 | 4 403 | 4 404 | 4 405 | 4 406 | 4 407 | 4 408 | 4 409 | 4 410 | 5 411 | 5 412 | 5 413 | 5 414 | 5 415 | 5 416 | 5 417 | 5 418 | 5 419 | 5 420 | 5 421 | 5 422 | 5 423 | 6 424 | 6 425 | 6 426 | 6 427 | 6 428 | 6 429 | 6 430 | 6 431 | 6 432 | 6 433 | 6 434 | 6 435 | 7 436 | 7 437 | 7 438 | 7 439 | 7 440 | 7 441 | 7 442 | 7 443 | 7 444 | 7 445 | 7 446 | 7 447 | 7 448 | 7 449 | 7 450 | 7 451 | 7 452 | 1 453 | 1 454 | 1 455 | 1 456 | 1 457 | 1 458 | 1 459 | 1 460 | 1 461 | 1 462 | 1 463 | 1 464 | 1 465 | 1 466 | 1 467 | 2 468 | 2 469 | 2 470 | 2 471 | 2 472 | 2 473 | 2 474 | 2 475 | 2 476 | 2 477 | 2 478 | 2 479 | 2 480 | 2 481 | 2 482 | 2 483 | 2 484 | 2 485 | 3 486 | 3 487 | 3 488 | 3 489 | 3 490 | 3 491 | 3 492 | 3 493 | 3 494 | 3 495 | 3 496 | 3 497 | 3 498 | 3 499 | 3 500 | 3 501 | 3 502 | 4 503 | 4 504 | 4 505 | 4 506 | 4 507 | 4 508 | 4 509 | 4 510 | 4 511 | 4 512 | 4 513 | 4 514 | 4 515 | 5 516 | 5 517 | 5 518 | 5 519 | 5 520 | 5 521 | 5 522 | 5 523 | 5 524 | 5 525 | 5 526 | 5 527 | 5 528 | 5 529 | 5 530 | 5 531 | -------------------------------------------------------------------------------- /example.dat: -------------------------------------------------------------------------------- 1 | 1 -1 2 -1 3 -1 4 -1 -2 2 | 3 -1 5 -1 6 -1 4 -1 -2 3 | 3 -1 4 -1 -2 4 
| 3 -1 5 -1 6 -1 7 -1 8 -1 4 -1 -2 5 | 3 -1 4 -1 -2 6 | 3 -1 4 -1 -2 7 | 5 -1 6 -1 3 -1 4 -1 -2 8 | 3 -1 5 -1 6 -1 4 -1 -2 9 | 5 -1 6 -1 3 -1 9 -1 10 -1 5 -1 6 -1 4 -1 7 -1 1 -1 2 -1 3 -1 8 -1 4 -1 -2 10 | 3 -1 7 -1 8 -1 4 -1 -2 11 | 3 -1 4 -1 -2 12 | 3 -1 5 -1 6 -1 4 -1 -2 13 | 3 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 14 | 5 -1 6 -1 9 -1 3 -1 7 -1 8 -1 10 -1 5 -1 4 -1 6 -1 -2 15 | 3 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 16 | 3 -1 5 -1 6 -1 4 -1 -2 17 | 3 -1 4 -1 1 -1 2 -1 3 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 18 | 1 -1 2 -1 3 -1 4 -1 -2 19 | 3 -1 4 -1 -2 20 | 3 -1 4 -1 -2 21 | 3 -1 7 -1 4 -1 1 -1 2 -1 3 -1 8 -1 4 -1 -2 22 | 3 -1 7 -1 8 -1 4 -1 -2 23 | 1 -1 2 -1 3 -1 7 -1 8 -1 4 -1 -2 24 | 3 -1 7 -1 8 -1 4 -1 -2 25 | 3 -1 7 -1 8 -1 4 -1 -2 26 | 1 -1 2 -1 3 -1 7 -1 8 -1 11 -1 12 -1 7 -1 8 -1 4 -1 -2 27 | 3 -1 7 -1 8 -1 4 -1 -2 28 | 3 -1 7 -1 8 -1 4 -1 -2 29 | 3 -1 7 -1 8 -1 4 -1 13 -1 14 -1 -2 30 | 3 -1 7 -1 8 -1 4 -1 -2 31 | 3 -1 7 -1 8 -1 4 -1 -2 32 | 1 -1 2 -1 3 -1 7 -1 8 -1 4 -1 -2 33 | 3 -1 7 -1 8 -1 4 -1 -2 34 | 13 -1 14 -1 3 -1 7 -1 4 -1 1 -1 2 -1 3 -1 4 -1 1 -1 2 -1 3 -1 8 -1 4 -1 -2 35 | 3 -1 7 -1 8 -1 4 -1 -2 36 | 3 -1 7 -1 8 -1 5 -1 4 -1 6 -1 -2 37 | 3 -1 7 -1 8 -1 4 -1 -2 38 | 1 -1 2 -1 3 -1 7 -1 4 -1 1 -1 2 -1 3 -1 8 -1 11 -1 4 -1 12 -1 7 -1 8 -1 -2 39 | 3 -1 7 -1 8 -1 4 -1 -2 40 | 13 -1 14 -1 3 -1 7 -1 8 -1 4 -1 5 -1 6 -1 -2 41 | 3 -1 7 -1 8 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 42 | 3 -1 7 -1 8 -1 4 -1 -2 43 | 3 -1 7 -1 8 -1 4 -1 1 -1 2 -1 3 -1 4 -1 5 -1 6 -1 -2 44 | 9 -1 3 -1 4 -1 1 -1 2 -1 3 -1 10 -1 5 -1 6 -1 4 -1 -2 45 | 3 -1 7 -1 4 -1 1 -1 2 -1 8 -1 3 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 46 | 3 -1 7 -1 8 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 47 | 3 -1 7 -1 8 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 48 | 3 -1 7 -1 8 -1 4 -1 5 -1 6 -1 -2 49 | 9 -1 10 -1 5 -1 6 -1 3 -1 7 -1 8 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 50 | 3 -1 7 -1 8 -1 4 -1 -2 51 | 3 -1 4 -1 1 -1 2 -1 7 -1 3 -1 8 -1 4 -1 -2 52 | 3 -1 4 -1 1 -1 2 -1 7 -1 8 -1 3 -1 4 -1 5 -1 6 -1 -2 53 | 3 -1 7 -1 8 -1 4 -1 -2 54 | 3 -1 7 -1 8 -1 4 -1 -2 55 | 
3 -1 4 -1 -2 56 | 3 -1 7 -1 8 -1 4 -1 1 -1 2 -1 3 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 57 | 3 -1 5 -1 6 -1 4 -1 -2 58 | 3 -1 4 -1 7 -1 1 -1 2 -1 3 -1 8 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 59 | 3 -1 7 -1 8 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 60 | 3 -1 7 -1 8 -1 4 -1 5 -1 6 -1 -2 61 | 3 -1 4 -1 -2 62 | 3 -1 4 -1 -2 63 | 3 -1 4 -1 5 -1 6 -1 9 -1 10 -1 5 -1 6 -1 -2 64 | 3 -1 4 -1 -2 65 | 3 -1 4 -1 -2 66 | 1 -1 2 -1 3 -1 5 -1 6 -1 4 -1 -2 67 | 3 -1 4 -1 -2 68 | 3 -1 4 -1 -2 69 | 3 -1 4 -1 -2 70 | 3 -1 4 -1 -2 71 | 3 -1 4 -1 1 -1 2 -1 7 -1 8 -1 3 -1 4 -1 -2 72 | 3 -1 4 -1 -2 73 | 3 -1 4 -1 -2 74 | 5 -1 3 -1 6 -1 9 -1 10 -1 5 -1 6 -1 9 -1 10 -1 5 -1 4 -1 6 -1 -2 75 | 13 -1 14 -1 15 -1 3 -1 16 -1 13 -1 14 -1 4 -1 -2 76 | 3 -1 5 -1 6 -1 9 -1 10 -1 5 -1 6 -1 9 -1 10 -1 5 -1 6 -1 9 -1 10 -1 5 -1 6 -1 4 -1 -2 77 | 1 -1 2 -1 3 -1 4 -1 -2 78 | 3 -1 13 -1 14 -1 5 -1 15 -1 6 -1 16 -1 9 -1 10 -1 7 -1 8 -1 13 -1 14 -1 5 -1 6 -1 4 -1 -2 79 | 3 -1 5 -1 6 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 80 | 5 -1 3 -1 6 -1 4 -1 -2 81 | 13 -1 14 -1 3 -1 7 -1 8 -1 4 -1 -2 82 | 3 -1 7 -1 8 -1 4 -1 -2 83 | 3 -1 4 -1 7 -1 1 -1 2 -1 3 -1 4 -1 1 -1 2 -1 3 -1 8 -1 4 -1 -2 84 | 1 -1 2 -1 3 -1 7 -1 8 -1 4 -1 -2 85 | 3 -1 7 -1 8 -1 4 -1 5 -1 6 -1 -2 86 | 3 -1 7 -1 8 -1 4 -1 -2 87 | 3 -1 7 -1 8 -1 4 -1 -2 88 | 3 -1 7 -1 8 -1 4 -1 5 -1 6 -1 -2 89 | 3 -1 7 -1 8 -1 4 -1 -2 90 | 3 -1 7 -1 8 -1 4 -1 -2 91 | 3 -1 7 -1 8 -1 4 -1 5 -1 6 -1 -2 92 | 3 -1 7 -1 8 -1 4 -1 -2 93 | 3 -1 7 -1 8 -1 4 -1 -2 94 | 1 -1 2 -1 3 -1 7 -1 8 -1 4 -1 -2 95 | 13 -1 14 -1 3 -1 7 -1 8 -1 4 -1 -2 96 | 3 -1 7 -1 5 -1 6 -1 8 -1 4 -1 -2 97 | 3 -1 7 -1 8 -1 4 -1 -2 98 | 3 -1 7 -1 8 -1 4 -1 -2 99 | 3 -1 7 -1 8 -1 4 -1 -2 100 | 3 -1 7 -1 4 -1 1 -1 2 -1 3 -1 4 -1 1 -1 2 -1 3 -1 8 -1 4 -1 -2 101 | 1 -1 2 -1 3 -1 7 -1 8 -1 5 -1 6 -1 4 -1 -2 102 | 1 -1 2 -1 3 -1 7 -1 5 -1 6 -1 8 -1 4 -1 -2 103 | 3 -1 7 -1 5 -1 8 -1 6 -1 4 -1 -2 104 | 3 -1 7 -1 8 -1 5 -1 6 -1 9 -1 10 -1 5 -1 6 -1 9 -1 10 -1 5 -1 6 -1 4 -1 -2 105 | 1 -1 2 -1 3 -1 5 -1 6 -1 9 -1 10 -1 5 -1 6 -1 4 -1 -2 106 
| 3 -1 7 -1 5 -1 6 -1 9 -1 10 -1 5 -1 8 -1 6 -1 9 -1 10 -1 4 -1 5 -1 6 -1 -2 107 | 3 -1 5 -1 6 -1 9 -1 10 -1 5 -1 6 -1 9 -1 10 -1 5 -1 6 -1 4 -1 9 -1 10 -1 5 -1 6 -1 -2 108 | 3 -1 7 -1 5 -1 6 -1 9 -1 10 -1 5 -1 6 -1 8 -1 4 -1 -2 109 | 1 -1 2 -1 3 -1 5 -1 6 -1 4 -1 -2 110 | 3 -1 7 -1 4 -1 1 -1 2 -1 3 -1 5 -1 6 -1 9 -1 10 -1 5 -1 6 -1 9 -1 10 -1 5 -1 6 -1 8 -1 4 -1 -2 111 | 3 -1 7 -1 8 -1 11 -1 5 -1 12 -1 7 -1 8 -1 6 -1 9 -1 10 -1 5 -1 6 -1 4 -1 -2 112 | 3 -1 7 -1 5 -1 6 -1 8 -1 4 -1 -2 113 | 3 -1 5 -1 6 -1 4 -1 -2 114 | 3 -1 7 -1 8 -1 5 -1 6 -1 9 -1 10 -1 5 -1 6 -1 4 -1 -2 115 | 3 -1 7 -1 5 -1 8 -1 6 -1 4 -1 -2 116 | 3 -1 7 -1 5 -1 6 -1 9 -1 10 -1 5 -1 6 -1 8 -1 9 -1 10 -1 5 -1 6 -1 4 -1 -2 117 | 3 -1 7 -1 8 -1 5 -1 6 -1 9 -1 10 -1 5 -1 6 -1 4 -1 -2 118 | 3 -1 7 -1 5 -1 8 -1 6 -1 4 -1 -2 119 | 1 -1 2 -1 3 -1 7 -1 8 -1 4 -1 -2 120 | 3 -1 5 -1 6 -1 4 -1 -2 121 | 3 -1 4 -1 -2 122 | 3 -1 4 -1 -2 123 | 3 -1 4 -1 -2 124 | 3 -1 4 -1 5 -1 6 -1 -2 125 | 3 -1 4 -1 -2 126 | 3 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 127 | 3 -1 4 -1 -2 128 | 3 -1 5 -1 6 -1 4 -1 -2 129 | 1 -1 2 -1 3 -1 4 -1 -2 130 | 3 -1 4 -1 -2 131 | 3 -1 4 -1 -2 132 | 3 -1 4 -1 -2 133 | 13 -1 14 -1 3 -1 4 -1 -2 134 | 1 -1 2 -1 3 -1 4 -1 -2 135 | 3 -1 4 -1 -2 136 | 3 -1 4 -1 5 -1 6 -1 -2 137 | 3 -1 4 -1 -2 138 | 3 -1 4 -1 -2 139 | 3 -1 4 -1 -2 140 | 3 -1 5 -1 6 -1 4 -1 -2 141 | 9 -1 1 -1 2 -1 3 -1 4 -1 10 -1 -2 142 | 5 -1 3 -1 6 -1 9 -1 10 -1 5 -1 4 -1 6 -1 -2 143 | 9 -1 3 -1 4 -1 10 -1 5 -1 6 -1 -2 144 | 3 -1 5 -1 4 -1 6 -1 -2 145 | 3 -1 4 -1 -2 146 | 3 -1 4 -1 -2 147 | 3 -1 4 -1 -2 148 | 3 -1 4 -1 -2 149 | 3 -1 4 -1 -2 150 | 3 -1 4 -1 -2 151 | 3 -1 4 -1 -2 152 | 3 -1 4 -1 -2 153 | 3 -1 4 -1 -2 154 | 3 -1 4 -1 5 -1 6 -1 -2 155 | 3 -1 4 -1 -2 156 | 3 -1 4 -1 -2 157 | 3 -1 4 -1 -2 158 | 3 -1 4 -1 -2 159 | 3 -1 4 -1 -2 160 | 3 -1 4 -1 -2 161 | 3 -1 4 -1 -2 162 | 3 -1 4 -1 -2 163 | 3 -1 4 -1 -2 164 | 3 -1 4 -1 -2 165 | 5 -1 3 -1 6 -1 4 -1 -2 166 | 3 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 167 | 3 -1 4 -1 1 -1 2 -1 13 -1 14 -1 3 -1 4 
-1 -2 168 | 3 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 169 | 3 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 170 | 3 -1 4 -1 -2 171 | 3 -1 4 -1 1 -1 2 -1 3 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 172 | 3 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 173 | 1 -1 2 -1 3 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 174 | 3 -1 4 -1 -2 175 | 3 -1 5 -1 6 -1 4 -1 -2 176 | 1 -1 2 -1 3 -1 4 -1 -2 177 | 3 -1 4 -1 -2 178 | 3 -1 4 -1 5 -1 6 -1 -2 179 | 3 -1 4 -1 -2 180 | 3 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 181 | 3 -1 7 -1 8 -1 4 -1 1 -1 2 -1 3 -1 4 -1 1 -1 2 -1 3 -1 4 -1 5 -1 6 -1 -2 182 | 3 -1 4 -1 -2 183 | 3 -1 7 -1 8 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 184 | 3 -1 7 -1 8 -1 5 -1 4 -1 6 -1 -2 185 | 3 -1 7 -1 8 -1 4 -1 -2 186 | 1 -1 2 -1 3 -1 7 -1 8 -1 4 -1 -2 187 | 3 -1 7 -1 8 -1 4 -1 -2 188 | 1 -1 2 -1 3 -1 7 -1 8 -1 4 -1 -2 189 | 3 -1 7 -1 8 -1 4 -1 -2 190 | 3 -1 4 -1 -2 191 | 3 -1 7 -1 8 -1 4 -1 -2 192 | 3 -1 7 -1 8 -1 4 -1 -2 193 | 3 -1 7 -1 8 -1 4 -1 -2 194 | 3 -1 4 -1 5 -1 6 -1 -2 195 | 3 -1 7 -1 8 -1 4 -1 -2 196 | 1 -1 2 -1 3 -1 7 -1 8 -1 4 -1 5 -1 6 -1 -2 197 | 3 -1 7 -1 8 -1 4 -1 1 -1 2 -1 3 -1 4 -1 5 -1 6 -1 -2 198 | 3 -1 7 -1 8 -1 4 -1 5 -1 6 -1 -2 199 | 3 -1 7 -1 4 -1 1 -1 2 -1 3 -1 8 -1 4 -1 1 -1 2 -1 3 -1 4 -1 -2 200 | 3 -1 7 -1 8 -1 11 -1 4 -1 1 -1 2 -1 12 -1 -2 201 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | codemining 5 | ism 6 | pom 7 | 1.0 8 | Interesting Sequence Mining 9 | 10 | 11 | 3.0.0 12 | 5.7 13 | 2.12.4 14 | 2.5.2 15 | UTF-8 16 | 17 | 18 | 19 | 20 | 21 | org.codehaus.mojo 22 | findbugs-maven-plugin 23 | ${findbugs.version} 24 | 25 | 26 | org.codehaus.mojo 27 | cobertura-maven-plugin 28 | ${cobertura.version} 29 | 30 | 31 | xml 32 | 33 | 34 | 35 | 36 | maven-compiler-plugin 37 | 3.1 38 | 39 | 1.8 40 | 1.8 41 | 42 | 43 | 44 | 45 | 46 | 47 | sequence-mining 48 | sequence-miner 49 | 50 | 51 | 52 | 53 | 54 | maven-repo 55 | 
https://github.com/mast-group/maven-repo/raw/master/repository/ 56 | 57 | 58 | 59 | 60 | 61 | 62 | org.apache.maven.plugins 63 | maven-surefire-report-plugin 64 | ${surefire.reportplugin.version} 65 | 66 | 67 | org.apache.maven.plugins 68 | maven-pmd-plugin 69 | 3.2 70 | 71 | false 72 | 73 | 74 | 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /run-ISM-all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for db in gazelle alice jmlr sign parallel 3 | do 4 | java -cp sequence-mining/target/sequence-mining-1.0.jar sequencemining.main.SequenceMining -f datasets/$db.dat 5 | done 6 | for db in context auslan2 pioneer aslbu skating aslgt 7 | do 8 | java -cp sequence-mining/target/sequence-mining-1.0.jar sequencemining.main.SequenceMining -f datasets/classification/$db.dat 9 | done 10 | -------------------------------------------------------------------------------- /run-SQS.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | java -cp sequence-mining/target/sequence-mining-1.0.jar sequencemining.eval.StatisticalSequenceMining 3 | -------------------------------------------------------------------------------- /run-local.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | java -cp sequence-mining/target/sequence-mining-1.0.jar sequencemining.main.SequenceMining $* 3 | -------------------------------------------------------------------------------- /run-pr.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | java -cp sequence-mining/target/sequence-mining-1.0.jar sequencemining.eval.PrecisionRecallBackground 3 | -------------------------------------------------------------------------------- /run-scaling.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | java
-cp sequence-mining/target/sequence-mining-1.0.jar sequencemining.eval.SequenceScaling 3 | -------------------------------------------------------------------------------- /scripts/intervals.py: -------------------------------------------------------------------------------- 1 | ''' Plot Classification Results ''' 2 | import matplotlib.pyplot as plt 3 | from matplotlib import rc 4 | 5 | rc('ps', fonttype=42) 6 | rc('pdf', fonttype=42) 7 | 8 | rc('xtick', labelsize=16) 9 | rc('ytick', labelsize=16) 10 | 11 | path = '/afs/inf.ed.ac.uk/user/j/jfowkes/Code/Sequences/Classification/' 12 | datasets = ['aslbu','aslgt','auslan2','context','pioneer','skating'] 13 | algs = ['ISM','SQS','GoKrimp','BIDE','Singletons'] 14 | cols = ['b','g','m','r','k'] 15 | 16 | for db in datasets: 17 | 18 | lines = open(path+db+'.txt').readlines() 19 | x = map(int,lines[1].strip().split(': ')[1].replace('[','').replace(']','').split(', ')) 20 | 21 | noseqs = {} 22 | for i in range(2,7): 23 | line = lines[i].strip().split(' ') 24 | noseqs[line[1]] = int(line[0]) 25 | 26 | n = 1 27 | for suffix in ['','_SVM']: 28 | for alg in algs: 29 | plt.figure(n) 30 | plt.subplot(2,3,datasets.index(db)+1) 31 | name = alg+suffix 32 | for i in range(7,17): 33 | line = lines[i].strip().split(': ') 34 | if name != line[0]: 35 | continue 36 | y = map(float,line[1].replace('[','').replace(']','').split(', ')) 37 | 38 | xx = [] 39 | yy = [] 40 | nseqs = noseqs[name.replace('_SVM','')] 41 | if(x[0] > nseqs): 42 | xx.append(nseqs) 43 | yy.append(y[0]) 44 | for k in range(0,len(x)): 45 | if(x[k] > nseqs): 46 | break 47 | xx.append(x[k]) 48 | yy.append(y[k]) 49 | 50 | if n == 2: 51 | yy = map(lambda y:0.01*y,yy) 52 | plt.figure(n) 53 | plt.plot(xx,yy,'.-',linewidth=2,markersize=12,color=cols[algs.index(alg)],clip_on=False) 54 | 55 | plt.figure(n) 56 | #if(n == 1): 57 | # plt.suptitle('Naive Bayes') 58 | #else: 59 | # plt.suptitle('Linear SVM') 60 | plt.title(db,fontsize=16) 61 | if(datasets.index(db)==0): 62 | 
plt.legend(algs,'lower right') 63 | plt.xlabel('top k',fontsize=16) 64 | plt.ylabel('Classification Accuracy',fontsize=16) 65 | plt.xlim([0,100]) 66 | plt.grid() 67 | n+=1 68 | 69 | plt.show() 70 | -------------------------------------------------------------------------------- /scripts/pr.py: -------------------------------------------------------------------------------- 1 | # Plot itemset precision-recall 2 | import matplotlib.pyplot as plt 3 | from matplotlib import rc 4 | import numpy as np 5 | 6 | rc('ps', fonttype=42) 7 | rc('pdf', fonttype=42) 8 | 9 | rc('xtick', labelsize=16) 10 | rc('ytick', labelsize=16) 11 | 12 | def main(): 13 | 14 | path = '/afs/inf.ed.ac.uk/user/j/jfowkes/Code/Sequences/PrecisionRecall/Background/' 15 | probname = 'Background' 16 | cols = ['b','g','m','r'] 17 | prefixes = ['ISM','SQS','GoKrimp','BIDE'] 18 | 19 | for prefix in prefixes: 20 | 21 | precision, recall = readdata(open(path+prefix+'_'+probname+'_pr.txt')) 22 | col = cols[prefixes.index(prefix)] 23 | 24 | # Calculate interpolated precision 25 | pt_recall = np.arange(0,1.1,0.1) 26 | interp_precision = [pinterp(zip(precision,recall),r) for r in pt_recall] 27 | plotfigpr(interp_precision,pt_recall,prefix,col,1) 28 | 29 | plt.figure(1) 30 | plt.legend(prefixes,'lower right') 31 | plt.show() 32 | 33 | # Interpolate precision 34 | def pinterp(prarray,recall): 35 | 36 | m = [p for (p,r) in prarray if r >= recall] 37 | if(len(m)==0): 38 | return np.nan 39 | else: 40 | return max(m) 41 | 42 | def plotfigpr(precision,recall,name,col,figno): 43 | 44 | # sort 45 | ind = np.array(recall).argsort() 46 | r_d = np.array(recall)[ind] 47 | p_d = np.array(precision)[ind] 48 | 49 | # zorder 50 | zo = 5 51 | if name == 'SQS': 52 | zo = 10 53 | 54 | plt.figure(figno) 55 | plt.hold(True) 56 | plt.plot(r_d,p_d,'.-',color=col,linewidth=2,markersize=12,clip_on=False,zorder=zo) 57 | plt.xlabel('Recall',fontsize=16) 58 | plt.ylabel('Precision',fontsize=16) 59 | plt.xlim([0,1]) 60 | plt.ylim([0,1]) 61 | 
plt.grid(True) 62 | 63 | def readdata(fl): 64 | 65 | for line in fl: 66 | if 'Precision' in line: 67 | pre = line.strip().split(': ')[1].replace('[','').replace(']','').split(', ') 68 | if 'Recall' in line: 69 | rec = line.strip().split(': ')[1].replace('[','').replace(']','').split(', ') 70 | 71 | return (map(float,pre),map(float,rec)) 72 | 73 | main() 74 | -------------------------------------------------------------------------------- /scripts/pr_par.py: -------------------------------------------------------------------------------- 1 | # Plot itemset precision-recall 2 | import matplotlib.pyplot as plt 3 | from matplotlib import rc 4 | import numpy as np 5 | 6 | rc('ps', fonttype=42) 7 | rc('pdf', fonttype=42) 8 | 9 | rc('xtick', labelsize=16) 10 | rc('ytick', labelsize=16) 11 | 12 | def main(): 13 | 14 | path = '/afs/inf.ed.ac.uk/user/j/jfowkes/Code/Sequences/PrecisionRecall/Parallel/' 15 | probname = 'parallel' 16 | cols = ['b','g','m'] 17 | prefixes = ['ISM','SQS', 'GoKrimp'] 18 | 19 | for prefix in prefixes: 20 | 21 | precision, recall = readdata(open(path+prefix+'_'+probname+'_pr.txt')) 22 | col = cols[prefixes.index(prefix)] 23 | 24 | plt.figure(2) 25 | plt.hold(True) 26 | plt.plot(range(1,len(recall)+1),precision,'.-',color=col,linewidth=2,markersize=12,clip_on=False) 27 | plt.xlabel('top k',fontsize=16) 28 | plt.ylabel('Precision',fontsize=16) 29 | plt.xlim([0,50]) 30 | plt.ylim([0,1]) 31 | plt.grid(True) 32 | 33 | plt.figure(3) 34 | plt.hold(True) 35 | plt.plot(range(1,len(recall)+1),recall,'.-',color=col,linewidth=2,markersize=12,clip_on=False) 36 | plt.xlabel('top k',fontsize=16) 37 | plt.ylabel('Recall',fontsize=16) 38 | plt.xlim([0,50]) 39 | plt.ylim([0,1]) 40 | plt.grid(True) 41 | 42 | # Calculate interpolated precision 43 | pt_recall = np.arange(0,1.1,0.1) 44 | interp_precision = [pinterp(zip(precision,recall),r) for r in pt_recall] 45 | plotfigpr(interp_precision,pt_recall,probname,col,1) 46 | 47 | plt.figure(1) 48 | plt.legend(prefixes,'lower 
right') 49 | 50 | plt.figure(2) 51 | plt.legend(prefixes,'lower right') 52 | 53 | plt.figure(3) 54 | plt.legend(prefixes,'lower right') 55 | 56 | plt.show() 57 | 58 | 59 | # Interpolate precision 60 | def pinterp(prarray,recall): 61 | 62 | m = [p for (p,r) in prarray if r >= recall] 63 | if(len(m)==0): 64 | return np.nan 65 | else: 66 | return max(m) 67 | 68 | def plotfigpr(precision,recall,probname,col,figno): 69 | 70 | # sort 71 | ind = np.array(recall).argsort() 72 | r_d = np.array(recall)[ind] 73 | p_d = np.array(precision)[ind] 74 | 75 | plt.figure(figno) 76 | plt.hold(True) 77 | plt.plot(r_d,p_d,'.-',color=col,linewidth=2,markersize=12,clip_on=False) 78 | #plt.title(probname+' top-k precision-recall') 79 | plt.xlabel('Recall',fontsize=16) 80 | plt.ylabel('Precision',fontsize=16) 81 | plt.xlim([0,1]) 82 | plt.ylim([0,1]) 83 | plt.grid(True) 84 | 85 | def readdata(fl): 86 | 87 | for line in fl: 88 | if 'Precision' in line: 89 | pre = line.strip().split(': ')[1].replace('[','').replace(']','').split(', ') 90 | if 'Recall' in line: 91 | rec = line.strip().split(': ')[1].replace('[','').replace(']','').split(', ') 92 | 93 | return (map(float,pre),map(float,rec)) 94 | 95 | main() 96 | -------------------------------------------------------------------------------- /scripts/scaling.py: -------------------------------------------------------------------------------- 1 | # Plot itemset scaling CPU vs Spark 2 | import matplotlib.pyplot as plt 3 | from matplotlib import rc 4 | import numpy as np 5 | 6 | rc('ps', fonttype=42) 7 | rc('pdf', fonttype=42) 8 | 9 | rc('xtick', labelsize=16) 10 | rc('ytick', labelsize=16) 11 | 12 | trans = [1E3, 1E4, 1E5, 1E6] 13 | linear_trans = [1E1, 1E2, 1E3, 1E4] 14 | s32_time = [71.062, 292.757, 3712.463, 40283.949] 15 | 16 | plt.figure(1) 17 | plt.hold(True) 18 | plt.plot(trans,s32_time,'.-',linewidth=2,markersize=12,clip_on=False) 19 | plt.plot(trans,linear_trans,'k--',linewidth=2) 20 | plt.gca().set_xscale('log') 21 |
plt.gca().set_yscale('log') 22 | #plt.title('Transaction Scaling') 23 | plt.xlabel('No. Sequences',fontsize=16) 24 | plt.ylabel('Time (s)',fontsize=16) 25 | #plt.legend(['1 Core','4 Cores','16 Cores','64 Cores','128 Cores','Linear'],'upper left') 26 | #plt.axis('equal') 27 | plt.xlim([min(trans),max(trans)]) 28 | #plt.ylim([-250,max(spark_time)]) 29 | plt.grid(True) 30 | 31 | plt.show() 32 | -------------------------------------------------------------------------------- /sequence-miner/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | codemining 5 | sequence-miner 6 | jar 7 | 1.0 8 | Interesting Sequence Miner 9 | 10 | 11 | 3.0.0 12 | 5.7 13 | 2.12.4 14 | 2.5.2 15 | UTF-8 16 | 17 | 18 | 19 | 20 | 21 | org.codehaus.mojo 22 | findbugs-maven-plugin 23 | ${findbugs.version} 24 | 25 | 26 | org.codehaus.mojo 27 | cobertura-maven-plugin 28 | ${cobertura.version} 29 | 30 | 31 | xml 32 | 33 | 34 | 35 | 36 | maven-compiler-plugin 37 | 3.1 38 | 39 | 1.8 40 | 1.8 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | maven-repo 50 | https://github.com/mast-group/maven-repo/raw/master/repository/ 51 | 52 | 53 | 54 | 55 | 56 | com.google.guava 57 | guava 58 | 18.0 59 | 60 | 61 | commons-io 62 | commons-io 63 | 2.4 64 | 65 | 66 | org.apache.commons 67 | commons-math3 68 | 3.3 69 | 70 | 71 | junit 72 | junit 73 | 4.11 74 | test 75 | 76 | 77 | codemining.deps 78 | spmf 79 | 0.98c 80 | 81 | 82 | com.beust 83 | jcommander 84 | 1.35 85 | 86 | 88 | org.apache.spark 89 | spark-core_2.10 90 | 1.1.0 91 | 92 | 93 | org.apache.hadoop 94 | hadoop-client 95 | 1.0.4 96 | 97 | ?> 98 | 99 | 100 | 101 | 102 | 103 | org.apache.maven.plugins 104 | maven-surefire-report-plugin 105 | ${surefire.reportplugin.version} 106 | 107 | 108 | org.apache.maven.plugins 109 | maven-pmd-plugin 110 | 3.2 111 | 112 | false 113 | 114 | 115 | 116 | 117 | 118 | 119 | -------------------------------------------------------------------------------- 
/sequence-miner/src/main/java/sequencemining/eval/ExclusiveSequences.java: -------------------------------------------------------------------------------- 1 | package sequencemining.eval; 2 | 3 | import java.io.File; 4 | import java.io.FileOutputStream; 5 | import java.io.IOException; 6 | import java.io.PrintStream; 7 | import java.util.HashSet; 8 | import java.util.List; 9 | import java.util.Map; 10 | import java.util.Map.Entry; 11 | import java.util.Set; 12 | 13 | import org.apache.commons.io.FileUtils; 14 | import org.apache.commons.io.output.TeeOutputStream; 15 | 16 | import sequencemining.main.SequenceMiningCore; 17 | import sequencemining.sequence.Sequence; 18 | 19 | public class ExclusiveSequences { 20 | 21 | public static void main(final String[] args) throws IOException { 22 | 23 | final int topN = 20; 24 | final String baseDir = "/afs/inf.ed.ac.uk/user/j/jfowkes/Code/Sequences/"; 25 | // final String dataset = "jmlr"; 26 | // final String seqLabels = baseDir + "Datasets/JMLR/jmlr.lab"; 27 | final String dataset = "alice_punc"; 28 | final String seqLabels = baseDir + "Datasets/Alice/WithPunctuation/alice_punc.lab"; 29 | 30 | // Set up logging 31 | final FileOutputStream outFile = new FileOutputStream(baseDir + dataset + "_exclusive.txt"); 32 | final TeeOutputStream out = new TeeOutputStream(System.out, outFile); 33 | final PrintStream ps = new PrintStream(out); 34 | System.setOut(ps); 35 | 36 | final Map ismSeqs = SequenceMiningCore 37 | .readISMSequences(new File(baseDir + "Logs/" + dataset + ".log")); 38 | final Map sqsSeqs = StatisticalSequenceMining 39 | .readSQSSequences(new File(baseDir + "SQS/" + dataset + ".txt")); 40 | final Map gokrimpSeqs = StatisticalSequenceMining 41 | .readGoKrimpSequences(new File(baseDir + "GoKrimp/" + dataset + ".txt")); 42 | 43 | final Set ISMnotSQSorGoKrimp = getExclusiveSequences(ismSeqs.keySet(), sqsSeqs.keySet(), 44 | gokrimpSeqs.keySet()); 45 | final Set SQSnotISMorGoKrimp = getExclusiveSequences(sqsSeqs.keySet(), 
ismSeqs.keySet(), 46 | gokrimpSeqs.keySet()); 47 | final Set GoKrimpnotISMorSQS = getExclusiveSequences(gokrimpSeqs.keySet(), ismSeqs.keySet(), 48 | sqsSeqs.keySet()); 49 | 50 | final List dict = FileUtils.readLines(new File(seqLabels)); 51 | 52 | // Print top N exclusive sequences 53 | System.out.print("\n============= ISM not SQS/GoKrimp =============\n"); 54 | printTopExclusiveSequences(topN, ismSeqs, ISMnotSQSorGoKrimp, dict); 55 | System.out.print("\n============= SQS not ISM/GoKrimp =============\n"); 56 | printTopExclusiveSequences(topN, sqsSeqs, SQSnotISMorGoKrimp, dict); 57 | System.out.print("\n============= GoKrimp not ISM/SQS =============\n"); 58 | printTopExclusiveSequences(topN, gokrimpSeqs, GoKrimpnotISMorSQS, dict); 59 | 60 | } 61 | 62 | /** 63 | * Set A \ B u C 64 | *

65 | * Note: slow but Guava contains/Set.difference doesn't work here 66 | */ 67 | private static Set getExclusiveSequences(final Set setA, final Set setB, 68 | final Set setC) { 69 | final Set exclSeqs = new HashSet<>(); 70 | outer: for (final Sequence seqA : setA) { 71 | for (final Sequence seqB : setB) { 72 | if (seqA.equals(seqB)) 73 | continue outer; 74 | } 75 | for (final Sequence seqC : setC) { 76 | if (seqA.equals(seqC)) 77 | continue outer; 78 | } 79 | exclSeqs.add(seqA); 80 | } 81 | return exclSeqs; 82 | } 83 | 84 | private static void printTopExclusiveSequences(final int topN, final Map seqs, 85 | final Set exclusiveSeqs, final List dict) { 86 | int count = 0; 87 | for (final Entry entry : seqs.entrySet()) { 88 | final Sequence set = entry.getKey(); 89 | if (set.size() > 1 && exclusiveSeqs.contains(set)) { 90 | System.out.print(String.format("%s\tprob: %1.5f %n", decode(entry.getKey(), dict), entry.getValue())); 91 | count++; 92 | if (count == topN) 93 | break; 94 | } 95 | } 96 | System.out.println(); 97 | } 98 | 99 | private static String decode(final Sequence seq, final List dict) { 100 | String prefix = ""; 101 | final StringBuilder sb = new StringBuilder(); 102 | for (final Integer item : seq) { 103 | sb.append(prefix); 104 | sb.append(dict.get(item - 1)); 105 | prefix = ", "; 106 | } 107 | return sb.toString(); 108 | } 109 | 110 | } 111 | -------------------------------------------------------------------------------- /sequence-miner/src/main/java/sequencemining/eval/FrequentSequenceMining.java: -------------------------------------------------------------------------------- 1 | package sequencemining.eval; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.util.HashMap; 6 | import java.util.List; 7 | import java.util.SortedMap; 8 | 9 | import org.apache.commons.io.FileUtils; 10 | import org.apache.commons.io.LineIterator; 11 | 12 | import com.google.common.base.Functions; 13 | import 
com.google.common.collect.ImmutableSortedMap; 14 | import com.google.common.collect.Ordering; 15 | 16 | import ca.pfv.spmf.algorithms.sequentialpatterns.BIDE_and_prefixspan.AlgoBIDEPlus; 17 | import ca.pfv.spmf.algorithms.sequentialpatterns.BIDE_and_prefixspan.AlgoPrefixSpan; 18 | import ca.pfv.spmf.algorithms.sequentialpatterns.BIDE_and_prefixspan.SequentialPattern; 19 | import ca.pfv.spmf.algorithms.sequentialpatterns.BIDE_and_prefixspan.SequentialPatterns; 20 | import ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.AlgoSPADE; 21 | import ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.AlgoSPAM_AGP; 22 | import ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.candidatePatternsGeneration.CandidateGenerator; 23 | import ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.candidatePatternsGeneration.CandidateGenerator_Qualitative; 24 | import ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.dataStructures.creators.AbstractionCreator; 25 | import ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.dataStructures.creators.AbstractionCreator_Qualitative; 26 | import ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.idLists.creators.IdListCreator_FatBitmap; 27 | import ca.pfv.spmf.input.sequence_database_list_integers.SequenceDatabase; 28 | import ca.pfv.spmf.patterns.itemset_list_integers_without_support.Itemset; 29 | import sequencemining.sequence.Sequence; 30 | 31 | public class FrequentSequenceMining { 32 | 33 | public static void main(final String[] args) throws IOException { 34 | 35 | // Datasets and parameters 36 | final String[] datasets = { "alice_punc", "GAZELLE1", "jmlr", "SIGN", "auslan2", "pioneer", "aslbu", "skating", 37 | "aslgt", "context" }; 38 | final double[] minSupps = new double[] { 0.02, 0.004, 0.15, 0.45, 0.0001, 0.1, 0.04, 0.43, 0.25, 0.49 }; 39 | 40 | for (int i = 0; i < datasets.length; i++) { 41 | final String dbPath = "/afs/inf.ed.ac.uk/user/j/jfowkes/Code/Sequences/Datasets/Paper/" + 
datasets[i] 42 | + ".dat"; 43 | final String saveFile = "/afs/inf.ed.ac.uk/user/j/jfowkes/Code/Sequences/BIDE/" + datasets[i] + ".txt"; 44 | mineClosedFrequentSequencesBIDE(dbPath, saveFile, minSupps[i]); 45 | } 46 | } 47 | 48 | /** Run PrefixSpan algorithm */ 49 | public static SortedMap mineFrequentSequencesPrefixSpan(final String dataset, 50 | final String saveFile, final double minSupp) throws IOException { 51 | 52 | final SequenceDatabase sequenceDatabase = new SequenceDatabase(); 53 | sequenceDatabase.loadFile(dataset); 54 | 55 | final AlgoPrefixSpan algo = new AlgoPrefixSpan(); 56 | algo.setShowSequenceIdentifiers(false); 57 | final SequentialPatterns patterns = algo.runAlgorithm(sequenceDatabase, minSupp, saveFile); 58 | // algo.printStatistics(sequenceDatabase.size()); 59 | 60 | return toMap(patterns); 61 | } 62 | 63 | /** Run SPADE algorithm */ 64 | public static SortedMap mineFrequentSequencesSPADE(final String dataset, final String saveFile, 65 | final double minSupp) throws IOException { 66 | 67 | final boolean verbose = true; 68 | 69 | final AbstractionCreator abstractionCreator = AbstractionCreator_Qualitative.getInstance(); 70 | final ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.dataStructures.database.SequenceDatabase sequenceDatabase = new ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.dataStructures.database.SequenceDatabase( 71 | abstractionCreator, IdListCreator_FatBitmap.getInstance()); 72 | sequenceDatabase.loadFile(dataset, minSupp); 73 | 74 | final AlgoSPADE algo = new AlgoSPADE(minSupp, true, abstractionCreator); 75 | final CandidateGenerator candidateGenerator = CandidateGenerator_Qualitative.getInstance(); 76 | algo.runAlgorithmParallelized(sequenceDatabase, candidateGenerator, true, verbose, saveFile, false); 77 | // algo.printStatistics(); 78 | 79 | return null; 80 | } 81 | 82 | /** Run SPAM algorithm */ 83 | public static SortedMap mineFrequentSequencesSPAM(final String dataset, final String saveFile, 84 | final 
double minSupp) throws IOException { 85 | 86 | final boolean verbose = true; 87 | 88 | final AbstractionCreator abstractionCreator = AbstractionCreator_Qualitative.getInstance(); 89 | final ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.dataStructures.database.SequenceDatabase sequenceDatabase = new ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.dataStructures.database.SequenceDatabase( 90 | abstractionCreator, IdListCreator_FatBitmap.getInstance()); 91 | sequenceDatabase.loadFile(dataset, minSupp); 92 | 93 | final AlgoSPAM_AGP algorithm = new AlgoSPAM_AGP(minSupp); 94 | algorithm.runAlgorithm(sequenceDatabase, true, verbose, saveFile, false); 95 | // algo.printStatistics(); 96 | 97 | return null; 98 | } 99 | 100 | /** Run BIDE algorithm */ 101 | public static SortedMap mineClosedFrequentSequencesBIDE(final String dataset, 102 | final String saveFile, final double minSupp) throws IOException { 103 | 104 | final SequenceDatabase sequenceDatabase = new SequenceDatabase(); 105 | sequenceDatabase.loadFile(dataset); 106 | 107 | // Convert to absolute support (rounding down) 108 | final int absMinSupp = (int) (sequenceDatabase.size() * minSupp); 109 | 110 | final AlgoBIDEPlus algo = new AlgoBIDEPlus(); 111 | algo.setShowSequenceIdentifiers(false); 112 | final SequentialPatterns patterns = algo.runAlgorithm(sequenceDatabase, saveFile, absMinSupp); 113 | // algo.printStatistics(sequenceDatabase.size()); 114 | 115 | return toMap(patterns); 116 | } 117 | 118 | /** Convert frequent sequences to sorted Map */ 119 | public static SortedMap toMap(final SequentialPatterns patterns) { 120 | if (patterns == null) { 121 | return null; 122 | } else { 123 | final HashMap sequences = new HashMap<>(); 124 | for (final List level : patterns.levels) { 125 | for (final SequentialPattern pattern : level) { 126 | final Sequence seq = new Sequence(); 127 | for (final Itemset set : pattern.getItemsets()) 128 | seq.add(set.get(0)); // Assumes a seq is just singleton 129 | // 
itemsets 130 | sequences.put(seq, pattern.getAbsoluteSupport()); 131 | } 132 | } 133 | // Sort patterns by support 134 | final Ordering comparator = Ordering.natural().reverse().onResultOf(Functions.forMap(sequences)) 135 | .compound(Ordering.usingToString()); 136 | return ImmutableSortedMap.copyOf(sequences, comparator); 137 | } 138 | } 139 | 140 | /** Read in frequent sequences (sorted by support) */ 141 | public static SortedMap readFrequentSequences(final File output) throws IOException { 142 | final HashMap sequences = new HashMap<>(); 143 | 144 | final LineIterator it = FileUtils.lineIterator(output); 145 | while (it.hasNext()) { 146 | final String line = it.nextLine(); 147 | if (!line.trim().isEmpty()) { 148 | final String[] splitLine = line.split("#SUP:"); 149 | final String[] items = splitLine[0].trim().split("-1"); 150 | final Sequence seq = new Sequence(); 151 | for (final String item : items) 152 | seq.add(Integer.parseInt(item.trim())); 153 | final int supp = Integer.parseInt(splitLine[1].trim()); 154 | sequences.put(seq, supp); 155 | } 156 | } 157 | // Sort sequences by support 158 | final Ordering comparator = Ordering.natural().reverse().onResultOf(Functions.forMap(sequences)) 159 | .compound(Ordering.usingToString()); 160 | return ImmutableSortedMap.copyOf(sequences, comparator); 161 | } 162 | 163 | } 164 | -------------------------------------------------------------------------------- /sequence-miner/src/main/java/sequencemining/eval/IntervalClassification.java: -------------------------------------------------------------------------------- 1 | package sequencemining.eval; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.io.PrintWriter; 6 | import java.io.Writer; 7 | import java.lang.ProcessBuilder.Redirect; 8 | import java.util.Arrays; 9 | import java.util.HashMap; 10 | import java.util.HashSet; 11 | import java.util.Map; 12 | import java.util.Map.Entry; 13 | import java.util.Set; 14 | 15 | import 
org.apache.commons.io.FileUtils; 16 | 17 | import com.google.common.base.Charsets; 18 | import com.google.common.base.Functions; 19 | import com.google.common.collect.ArrayListMultimap; 20 | import com.google.common.collect.ImmutableSortedMap; 21 | import com.google.common.collect.Multimap; 22 | import com.google.common.collect.Ordering; 23 | import com.google.common.collect.Table; 24 | import com.google.common.io.Files; 25 | 26 | import sequencemining.main.SequenceMining; 27 | import sequencemining.sequence.Sequence; 28 | import sequencemining.transaction.Transaction; 29 | import sequencemining.transaction.TransactionList; 30 | 31 | public class IntervalClassification { 32 | 33 | public static void main(final String[] args) throws IOException { 34 | 35 | final String[] datasets = new String[] { "context", "auslan2", "pioneer", "aslbu", "skating", "aslgt" }; 36 | final int[] topNs = new int[] { 10, 40, 70, 100 }; 37 | final String baseFolder = "/afs/inf.ed.ac.uk/user/j/jfowkes/Code/Sequences/"; 38 | final String datasetFolder = baseFolder + "Datasets/Intervals/"; 39 | final String outFolder = baseFolder + "Classification/"; 40 | 41 | for (int i = 0; i < datasets.length; i++) { 42 | final String dataset = datasets[i]; 43 | 44 | System.out.println("===== Dataset: " + dataset + " ====="); 45 | final File outFile = new File(outFolder + dataset + ".txt"); 46 | final Writer writer = Files.newWriter(outFile, Charsets.UTF_8); 47 | writer.write("===== " + dataset + " =====\n"); 48 | writer.write("topN: " + Arrays.toString(topNs) + "\n"); 49 | 50 | // Read dataset 51 | final File dbFile = new File(datasetFolder + dataset + "/" + dataset + ".dat"); 52 | final TransactionList dbTrans = SequenceMining.readTransactions(dbFile); 53 | final File labelFile = new File(datasetFolder + dataset + "/" + dataset + ".lab"); 54 | 55 | // Read SQS seqs 56 | final File outSQS = new File(baseFolder + "SQS/" + dataset + ".txt"); 57 | final Map seqsSQS = 
StatisticalSequenceMining.readSQSSequences(outSQS); 58 | // seqsSQS = removeSingletons(seqsSQS); 59 | System.out.println("SQS: " + seqsSQS); 60 | writer.write(seqsSQS.size() + " SQS seqs \n"); 61 | 62 | // Read GOKRIMP seqs 63 | final File outGOKRIMP = new File(baseFolder + "GoKrimp/" + dataset + ".txt"); 64 | final Map seqsGOKRIMP = StatisticalSequenceMining.readGoKrimpSequences(outGOKRIMP); 65 | // seqsGOKRIMP = removeSingletons(seqsGOKRIMP); 66 | System.out.println("GoKrimp: " + seqsGOKRIMP); 67 | writer.write(seqsGOKRIMP.size() + " GoKrimp seqs \n"); 68 | 69 | // Read ISM seqs 70 | final File outISM = new File(baseFolder + "Logs/" + dataset + ".log"); 71 | final Map seqsISM = SequenceMining.readISMSequences(outISM); 72 | System.out.println("ISM: " + seqsISM); 73 | writer.write(seqsISM.size() + " ISM seqs \n"); 74 | 75 | // Read BIDE seqs 76 | final File outBIDE = new File(baseFolder + "BIDE/" + dataset + ".txt"); 77 | final Map seqsBIDE = FrequentSequenceMining.readFrequentSequences(outBIDE); 78 | // seqsBIDE = removeSingletons(seqsBIDE); 79 | System.out.println("BIDE: " + seqsBIDE); 80 | writer.write(seqsBIDE.size() + " BIDE seqs \n"); 81 | 82 | // Generate simple features 83 | Map seqsSingleton = new HashMap<>(); 84 | final Table singletons = SequenceMining 85 | .scanDatabaseToDetermineInitialProbabilities(dbFile); 86 | for (final Sequence seq : singletons.rowKeySet()) 87 | seqsSingleton.put(seq, 1 - singletons.get(seq, 0)); 88 | // Sort by support 89 | final Ordering comparator = Ordering.natural().reverse() 90 | .onResultOf(Functions.forMap(seqsSingleton)).compound(Ordering.usingToString()); 91 | seqsSingleton = ImmutableSortedMap.copyOf(seqsSingleton, comparator); 92 | System.out.println("Singletons: " + seqsSingleton); 93 | writer.write(seqsSingleton.size() + " Singletons seqs \n"); 94 | 95 | // Classify 96 | final Multimap accuracy = ArrayListMultimap.create(); 97 | for (final int n : topNs) { 98 | // Run MALLET Naive Bayes classifier 99 |
accuracy.put("SQS", classify(n, seqsSQS, dbTrans, labelFile)); 100 | accuracy.put("GoKrimp", classify(n, seqsGOKRIMP, dbTrans, labelFile)); 101 | accuracy.put("ISM", classify(n, seqsISM, dbTrans, labelFile)); 102 | accuracy.put("BIDE", classify(n, seqsBIDE, dbTrans, labelFile)); 103 | accuracy.put("Singletons", classify(n, seqsSingleton, dbTrans, labelFile)); 104 | // Run libSVM Linear classifier 105 | accuracy.put("SQS_SVM", classifySVM(n, seqsSQS, dbTrans, labelFile)); 106 | accuracy.put("GoKrimp_SVM", classifySVM(n, seqsGOKRIMP, dbTrans, labelFile)); 107 | accuracy.put("ISM_SVM", classifySVM(n, seqsISM, dbTrans, labelFile)); 108 | accuracy.put("BIDE_SVM", classifySVM(n, seqsBIDE, dbTrans, labelFile)); 109 | accuracy.put("Singletons_SVM", classifySVM(n, seqsSingleton, dbTrans, labelFile)); 110 | } 111 | for (final String alg : accuracy.keySet()) 112 | writer.write(alg + ": " + accuracy.get(alg) + "\n"); 113 | writer.close(); 114 | } 115 | } 116 | 117 | /** Classify using MALLET Naive Bayes */ 118 | static Double classify(final int topN, final Map seqs, final TransactionList dbTrans, 119 | final File labelFile) throws IOException { 120 | if (seqs.size() == 0) 121 | return 0.; 122 | 123 | // Create temp files 124 | final File featureFile = File.createTempFile("features_temp", ".txt"); 125 | final File tmpFile = File.createTempFile("mallet_temp", ".txt"); 126 | final File outFile = File.createTempFile("mallet_output_temp", ".txt"); 127 | 128 | // Generate features 129 | generateFeatures(topN, seqs, dbTrans, featureFile, labelFile); 130 | 131 | // Convert to binary MALLET format 132 | final String cmd[] = new String[4]; 133 | cmd[0] = "/afs/inf.ed.ac.uk/user/j/jfowkes/Packages/mallet-2.0.7/bin/mallet"; 134 | cmd[1] = "import-svmlight"; 135 | cmd[2] = "--input " + featureFile; 136 | cmd[3] = "--output " + tmpFile; 137 | runScript(cmd, null); 138 | 139 | // Classify 140 | final String cmd2[] = new String[5]; 141 | cmd2[0] = 
"/afs/inf.ed.ac.uk/user/j/jfowkes/Packages/mallet-2.0.7/bin/mallet"; 142 | cmd2[1] = "train-classifier"; 143 | cmd2[2] = "--input " + tmpFile; 144 | cmd2[3] = "--cross-validation 10"; 145 | cmd2[4] = "--report test:accuracy"; 146 | runScript(cmd2, outFile); 147 | 148 | // Print output to screen 149 | final String cmd3[] = new String[3]; 150 | cmd3[0] = "tail"; 151 | cmd3[1] = "-n 2"; 152 | cmd3[2] = "" + outFile; 153 | runScript(cmd3, null); 154 | 155 | // Get accuracy 156 | final String[] lines = FileUtils.readFileToString(outFile).split("\n"); 157 | final double accuracy = Double.parseDouble(lines[lines.length - 1].split(" ")[5]); 158 | 159 | // Remove temp files 160 | featureFile.delete(); 161 | tmpFile.delete(); 162 | outFile.delete(); 163 | 164 | return accuracy; 165 | } 166 | 167 | /** Classify using libSVM linear kernel */ 168 | static Double classifySVM(final int topN, final Map seqs, final TransactionList dbTrans, 169 | final File labelFile) throws IOException { 170 | if (seqs.size() == 0) 171 | return 0.; 172 | 173 | // Create temp files 174 | final File featureFile = File.createTempFile("features_temp", ".txt"); 175 | final File outFile = File.createTempFile("libsvm_output_temp", ".txt"); 176 | 177 | // Generate features 178 | generateFeatures(topN, seqs, dbTrans, featureFile, labelFile); 179 | 180 | // Classify 181 | final String cmd[] = new String[4]; 182 | cmd[0] = "/afs/inf.ed.ac.uk/user/j/jfowkes/Packages/libsvm/svm.sh"; 183 | cmd[1] = "-t 0"; // Linear kernel 184 | cmd[2] = "-v 10"; // 10-fold cross-validation 185 | cmd[3] = "" + featureFile; 186 | runScript(cmd, outFile); 187 | 188 | // Print output to screen 189 | final String cmd2[] = new String[3]; 190 | cmd2[0] = "tail"; 191 | cmd2[1] = "-n 2"; 192 | cmd2[2] = "" + outFile; 193 | runScript(cmd2, null); 194 | 195 | // Get accuracy 196 | final String[] lines = FileUtils.readFileToString(outFile).split("\n"); 197 | final double accuracy = Double.parseDouble(lines[lines.length - 1].split(" 
")[4].replace("%", "")); 198 | 199 | // Remove temp files 200 | featureFile.delete(); 201 | outFile.delete(); 202 | 203 | return accuracy; 204 | } 205 | 206 | private static boolean generateFeatures(final int topN, final Map<Sequence, Double> sequences, 207 | final TransactionList dbTrans, final File featureFile, final File labelFile) throws IOException { 208 | 209 | // Get topN sequences 210 | final Set<Sequence> topSeqs = getTopSequences(sequences, topN); 211 | 212 | // Set output file 213 | final PrintWriter out = new PrintWriter(featureFile, "UTF-8"); 214 | 215 | // Read transaction labels 216 | final String[] labels = FileUtils.readFileToString(labelFile).split("\n"); 217 | 218 | // Generate features 219 | int count = 0; 220 | for (final Transaction trans : dbTrans.getTransactionList()) { 221 | out.print(labels[count] + " "); 222 | int fNum = 0; 223 | for (final Sequence seq : topSeqs) { 224 | if (trans.contains(seq)) 225 | out.print(fNum + ":1 "); 226 | else 227 | out.print(fNum + ":0 "); 228 | fNum++; 229 | } 230 | out.println(); 231 | count++; 232 | } 233 | out.close(); 234 | 235 | return true; 236 | } 237 | 238 | /** Get top sequences */ 239 | private static Set<Sequence> getTopSequences(final Map<Sequence, Double> sequences, final int topN) { 240 | 241 | int count = 0; 242 | final Set<Sequence> topItemsets = new HashSet<>(); 243 | for (final Sequence set : sequences.keySet()) { 244 | topItemsets.add(set); 245 | count++; 246 | if (count == topN) 247 | break; 248 | } 249 | if (count < topN) 250 | System.out.println("WARNING: not enough sequences in set: " + count); 251 | 252 | return topItemsets; 253 | } 254 | 255 | @SuppressWarnings("unused") 256 | private static Map<Sequence, Double> removeSingletons(final Map<Sequence, Double> oldSeqs) { 257 | final Map<Sequence, Double> newSeqs = new HashMap<>(); 258 | for (final Entry<Sequence, Double> entry : oldSeqs.entrySet()) { 259 | if (entry.getKey().size() > 1) 260 | newSeqs.put(entry.getKey(), entry.getValue()); 261 | } 262 | return newSeqs; 263 | } 264 | 265 | /** Run shell script with command line arguments */ 266 | public static void 
runScript(final String cmd[], final File outFile) { 267 | 268 | try { 269 | final ProcessBuilder pb = new ProcessBuilder(cmd); 270 | if (outFile != null) 271 | pb.redirectOutput(outFile); 272 | else 273 | pb.redirectOutput(Redirect.INHERIT); 274 | pb.redirectError(Redirect.INHERIT); 275 | final Process process = pb.start(); 276 | process.waitFor(); 277 | process.destroy(); 278 | } catch (final Exception e) { 279 | e.printStackTrace(); 280 | } 281 | 282 | } 283 | 284 | } 285 | -------------------------------------------------------------------------------- /sequence-miner/src/main/java/sequencemining/eval/PrecisionRecallBackground.java: -------------------------------------------------------------------------------- 1 | package sequencemining.eval; 2 | 3 | import java.io.File; 4 | import java.io.FileOutputStream; 5 | import java.io.IOException; 6 | import java.io.PrintStream; 7 | import java.util.Arrays; 8 | import java.util.HashMap; 9 | import java.util.Map; 10 | import java.util.Map.Entry; 11 | import java.util.Set; 12 | 13 | import org.apache.commons.io.FilenameUtils; 14 | import org.apache.commons.io.output.TeeOutputStream; 15 | 16 | import com.google.common.collect.Sets; 17 | import com.google.common.collect.Table; 18 | 19 | import sequencemining.main.InferenceAlgorithms.InferGreedy; 20 | import sequencemining.main.SequenceMining; 21 | import sequencemining.main.SequenceMiningCore; 22 | import sequencemining.sequence.Sequence; 23 | import sequencemining.transaction.TransactionGenerator; 24 | import sequencemining.util.Logging; 25 | 26 | public class PrecisionRecallBackground { 27 | 28 | /** Main Settings */ 29 | private static final File dbFile = new File("/disk/data1/jfowkes/sequence.txt"); 30 | private static final File saveDir = new File("/disk/data1/jfowkes/logs/"); 31 | 32 | /** FSM Issues to incorporate */ 33 | private static final String name = "Background"; 34 | private static final int noIterations = 5_000; 35 | 36 | /** Previously mined Sequences to 
use for background distribution */ 37 | private static final File sequenceLog = new File("/afs/inf.ed.ac.uk/user/j/jfowkes/Code/Sequences/Logs/SIGN.log"); 38 | private static final int noTransactions = 10_000; 39 | 40 | /** Sequence Miner Settings */ 41 | private static final int maxStructureSteps = 100_000; 42 | private static final double minSup = 0.05; 43 | 44 | public static void main(final String[] args) throws IOException, ClassNotFoundException { 45 | 46 | // Read in background distribution 47 | final Map<Sequence, Double> backgroundSequences = SequenceMiningCore.readISMSequences(sequenceLog); 48 | 49 | // Read in associated sequence count distribution 50 | @SuppressWarnings("unchecked") 51 | final Table<Sequence, Integer, Double> countDist = (Table<Sequence, Integer, Double>) Logging 52 | .deserializeFrom(FilenameUtils.removeExtension(sequenceLog.getAbsolutePath()) + ".dist"); 53 | 54 | final HashMap<Sequence, Double> sequences = TransactionGenerator 55 | .generateTransactionDatabase(backgroundSequences, countDist, noTransactions, dbFile); 56 | System.out.print("\n============= ACTUAL SEQUENCES =============\n"); 57 | for (final Entry<Sequence, Double> entry : sequences.entrySet()) { 58 | System.out.print(String.format("%s\tprob: %1.5f %n", entry.getKey(), entry.getValue())); 59 | } 60 | System.out.println("\nNo sequences: " + sequences.size()); 61 | SequenceScaling.printTransactionDBStats(dbFile); 62 | 63 | // Set up logging 64 | final FileOutputStream outFile = new FileOutputStream(saveDir + "/" + name + "_pr.txt"); 65 | final TeeOutputStream out = new TeeOutputStream(System.out, outFile); 66 | final PrintStream ps = new PrintStream(out); 67 | System.setOut(ps); 68 | 69 | precisionRecall(sequences, "GoKrimp"); 70 | precisionRecall(sequences, "SQS"); 71 | precisionRecall(sequences, "BIDE"); 72 | precisionRecall(sequences, "ISM"); 73 | 74 | } 75 | 76 | public static void precisionRecall(final Map<Sequence, Double> sequences, final String algorithm) 77 | throws IOException { 78 | 79 | // Mine sequences 80 | Set<Sequence> minedSequences = null; 81 | final File logFile = 
Logging.getLogFileName(algorithm, true, saveDir, dbFile); 82 | final long startTime = System.currentTimeMillis(); 83 | if (algorithm.equals("BIDE")) { 84 | FrequentSequenceMining.mineClosedFrequentSequencesBIDE(dbFile.getAbsolutePath(), logFile.getAbsolutePath(), 85 | minSup); 86 | minedSequences = FrequentSequenceMining.readFrequentSequences(logFile).keySet(); 87 | } else if (algorithm.equals("ISM")) { 88 | minedSequences = SequenceMining 89 | .mineSequences(dbFile, new InferGreedy(), maxStructureSteps, noIterations, logFile, false).keySet(); 90 | } else if (algorithm.equals("GoKrimp")) { 91 | minedSequences = StatisticalSequenceMining.mineGoKrimpSequences(dbFile, logFile).keySet(); 92 | } else if (algorithm.equals("SQS")) { 93 | minedSequences = StatisticalSequenceMining.mineSQSSequences(dbFile, logFile, 1).keySet(); 94 | } else 95 | throw new RuntimeException("Incorrect algorithm name."); 96 | final long endTime = System.currentTimeMillis(); 97 | final double time = (endTime - startTime) / (double) 1000; 98 | 99 | // Calculate sorted precision and recall 100 | final int len = minedSequences.size(); 101 | final double[] precision = new double[len]; 102 | final double[] recall = new double[len]; 103 | for (int k = 1; k <= minedSequences.size(); k++) { 104 | 105 | final Set topKMined = Sets.newHashSet(); 106 | for (final Sequence seq : minedSequences) { 107 | topKMined.add(seq); 108 | if (topKMined.size() == k) 109 | break; 110 | } 111 | 112 | final double noInBoth = Sets.intersection(sequences.keySet(), topKMined).size(); 113 | final double pr = noInBoth / (double) topKMined.size(); 114 | final double rec = noInBoth / (double) sequences.size(); 115 | precision[k - 1] = pr; 116 | recall[k - 1] = rec; 117 | } 118 | 119 | // Output precision and recall 120 | System.out.println("\n======== " + algorithm + " ========"); 121 | System.out.println("No. 
mined sequences: " + len); 122 | System.out.println("Time: " + time); 123 | System.out.println("Precision (all): " + Arrays.toString(precision)); 124 | System.out.println("Recall (all): " + Arrays.toString(recall)); 125 | 126 | } 127 | 128 | } 129 | -------------------------------------------------------------------------------- /sequence-miner/src/main/java/sequencemining/eval/PrecisionRecallParallel.java: -------------------------------------------------------------------------------- 1 | package sequencemining.eval; 2 | 3 | import java.io.File; 4 | import java.io.FileNotFoundException; 5 | import java.io.FileOutputStream; 6 | import java.io.IOException; 7 | import java.io.PrintStream; 8 | import java.io.PrintWriter; 9 | import java.io.UnsupportedEncodingException; 10 | import java.util.Arrays; 11 | import java.util.HashSet; 12 | import java.util.Map; 13 | import java.util.Random; 14 | import java.util.Set; 15 | 16 | import org.apache.commons.io.output.TeeOutputStream; 17 | 18 | import sequencemining.main.SequenceMining; 19 | import sequencemining.sequence.Sequence; 20 | 21 | public class PrecisionRecallParallel { 22 | 23 | public static void main(final String[] args) throws IOException, ClassNotFoundException { 24 | 25 | final String baseFolder = "/afs/inf.ed.ac.uk/user/j/jfowkes/Code/Sequences/"; 26 | // final File dbFile = new File(baseFolder + "Datasets/parallel", 27 | // ".dat"); 28 | // generateParallelDataset(dbFile); 29 | 30 | // Set up logging 31 | final FileOutputStream outFile = new FileOutputStream(baseFolder + "PrecisionRecall/parallel_pr.txt"); 32 | final TeeOutputStream out = new TeeOutputStream(System.out, outFile); 33 | final PrintStream ps = new PrintStream(out); 34 | System.setOut(ps); 35 | 36 | // Read SQS sequences 37 | final File outSQS = new File(baseFolder + "SQS/parallel_partial.txt"); 38 | final Map seqsSQS = StatisticalSequenceMining.readSQSSequences(outSQS); 39 | 40 | // Read GoKrimp sequences 41 | final File outGOKRIMP = new 
File(baseFolder + "GoKrimp/parallel.txt"); 42 | final Map<Sequence, Double> seqsGOKRIMP = StatisticalSequenceMining.readGoKrimpSequences(outGOKRIMP); 43 | 44 | // Read ISM sequences 45 | final File outISM = new File(baseFolder + "Logs/parallel.log"); 46 | final Map<Sequence, Double> seqsISM = SequenceMining.readISMSequences(outISM); 47 | 48 | // Precision-recall 49 | precisionRecall(seqsSQS, "SQS"); 50 | precisionRecall(seqsGOKRIMP, "GoKrimp"); 51 | precisionRecall(seqsISM, "ISM"); 52 | 53 | } 54 | 55 | private static void precisionRecall(final Map<Sequence, Double> seqs, final String alg) { 56 | 57 | // Calculate sorted precision and recall 58 | final int len = seqs.size(); 59 | final double[] precision = new double[len]; 60 | final double[] recall = new double[len]; 61 | for (int k = 1; k <= seqs.size(); k++) { 62 | 63 | final Set<Sequence> topKMined = new HashSet<>(); 64 | for (final Sequence seq : seqs.keySet()) { 65 | topKMined.add(seq); 66 | if (topKMined.size() == k) 67 | break; 68 | } 69 | 70 | // Calculate number of right patterns 71 | double right = 0; 72 | final Set<Integer> procs = new HashSet<>(); 73 | seqLoop: for (final Sequence seq : topKMined) { 74 | final int proc = seq.get(0) / 10; 75 | for (int i = 1; i < seq.size(); i++) { 76 | if (seq.get(i) / 10 != proc) 77 | continue seqLoop; // skip sequences that span processes 78 | } 79 | right++; 80 | procs.add(proc); 81 | } 82 | 83 | precision[k - 1] = right / topKMined.size(); 84 | recall[k - 1] = procs.size() / 5.; 85 | } 86 | 87 | // Output precision and recall 88 | System.out.println("\n======== " + alg + " ========"); 89 | System.out.println("No. 
mined sequences: " + len); 90 | System.out.println("Precision: " + Arrays.toString(precision)); 91 | System.out.println("Recall: " + Arrays.toString(recall)); 92 | 93 | } 94 | 95 | /** Generate parallel dataset */ 96 | @SuppressWarnings("unused") 97 | private static void generateParallelDataset(final File dbFile) 98 | throws FileNotFoundException, UnsupportedEncodingException { 99 | final Random rand = new Random(1); 100 | final int[] states = new int[] { 0, 0, 0, 0, 0 }; 101 | final PrintWriter db = new PrintWriter(dbFile, "UTF-8"); 102 | for (int j = 1; j <= 1_000_000; j++) { 103 | final int proc = rand.nextInt(5); 104 | final int lab1 = proc + 1; 105 | final int lab2 = (states[proc] % 5) + 1; 106 | states[proc] += 1; 107 | db.write(lab1 + "" + lab2 + " -1 "); 108 | if (j % 100 == 0) 109 | db.write("-2\n"); 110 | } 111 | db.close(); 112 | } 113 | 114 | } 115 | -------------------------------------------------------------------------------- /sequence-miner/src/main/java/sequencemining/eval/SequenceScaling.java: -------------------------------------------------------------------------------- 1 | package sequencemining.eval; 2 | 3 | import java.io.File; 4 | import java.io.FileOutputStream; 5 | import java.io.IOException; 6 | import java.io.PrintStream; 7 | import java.text.DecimalFormat; 8 | import java.util.Arrays; 9 | import java.util.HashSet; 10 | import java.util.Map; 11 | import java.util.Map.Entry; 12 | import java.util.Set; 13 | 14 | import org.apache.commons.io.FileUtils; 15 | import org.apache.commons.io.FilenameUtils; 16 | import org.apache.commons.io.LineIterator; 17 | import org.apache.commons.io.output.TeeOutputStream; 18 | 19 | import com.google.common.collect.Table; 20 | 21 | import sequencemining.main.InferenceAlgorithms.InferGreedy; 22 | import sequencemining.main.SequenceMining; 23 | import sequencemining.main.SequenceMiningCore; 24 | import sequencemining.sequence.Sequence; 25 | import sequencemining.transaction.TransactionGenerator; 26 | import 
sequencemining.util.Logging; 27 | 28 | public class SequenceScaling { 29 | 30 | /** Main Settings */ 31 | private static final File dbFile = new File("/disk/data1/jfowkes/sequence.txt"); 32 | private static final File saveDir = new File("/disk/data1/jfowkes/logs/"); 33 | 34 | /** Set of mined itemsets to use for background */ 35 | private static final String name = "SIGN-based"; 36 | private static final File sequenceLog = new File("/afs/inf.ed.ac.uk/user/j/jfowkes/Code/Sequences/Logs/SIGN.log"); 37 | 38 | /** Spark Settings */ 39 | private static final long MAX_RUNTIME = 24 * 60; // 24hrs (in minutes) 40 | private static final int maxStructureSteps = 100_000; 41 | private static final int maxEMIterations = 100; 42 | 43 | public static void main(final String[] args) throws IOException, ClassNotFoundException { 44 | 45 | // Run 46 | scalingTransactions(32, new int[] { 1_000, 10_000, 100_000, 1_000_000 }); 47 | } 48 | 49 | public static void scalingTransactions(final int noCores, final int[] trans) 50 | throws IOException, ClassNotFoundException { 51 | 52 | final double[] time = new double[trans.length]; 53 | final DecimalFormat formatter = new DecimalFormat("0.0E0"); 54 | 55 | // Save to file 56 | final FileOutputStream outFile = new FileOutputStream(saveDir + "/" + name + "_scaling_" + noCores + ".txt"); 57 | final TeeOutputStream out = new TeeOutputStream(System.out, outFile); 58 | final PrintStream ps = new PrintStream(out); 59 | System.setOut(ps); 60 | 61 | // Read in previously mined sequences 62 | final Map<Sequence, Double> sequences = SequenceMiningCore.readISMSequences(sequenceLog); 63 | System.out.print("\n============= ACTUAL SEQUENCES =============\n"); 64 | for (final Entry<Sequence, Double> entry : sequences.entrySet()) { 65 | System.out.print(String.format("%s\tprob: %1.5f %n", entry.getKey(), entry.getValue())); 66 | } 67 | System.out.println("\nNo sequences: " + sequences.size()); 68 | System.out.println("No items: " + countNoItems(sequences.keySet())); 69 | 70 | // Read in associated sequence 
count distribution 71 | @SuppressWarnings("unchecked") 72 | final Table<Sequence, Integer, Double> countDist = (Table<Sequence, Integer, Double>) Logging 73 | .deserializeFrom(FilenameUtils.removeExtension(sequenceLog.getAbsolutePath()) + ".dist"); 74 | 75 | transloop: for (int i = 0; i < trans.length; i++) { 76 | 77 | final int tran = trans[i]; 78 | System.out.println("\n========= " + formatter.format(tran) + " Transactions"); 79 | 80 | // Generate transaction database 81 | TransactionGenerator.generateTransactionDatabase(sequences, countDist, tran, dbFile); 82 | SequenceScaling.printTransactionDBStats(dbFile); 83 | 84 | // Mine sequences 85 | final File logFile = Logging.getLogFileName("ISM", true, saveDir, dbFile); 86 | final long startTime = System.currentTimeMillis(); 87 | SequenceMining.mineSequences(dbFile, new InferGreedy(), maxStructureSteps, maxEMIterations, logFile, false); 88 | 89 | final long endTime = System.currentTimeMillis(); 90 | final double tim = (endTime - startTime) / (double) 1000; 91 | time[i] += tim; 92 | 93 | System.out.printf("Time (s): %.2f%n", tim); 94 | 95 | if (tim > MAX_RUNTIME * 60) 96 | break transloop; 97 | 98 | } 99 | 100 | // Print time 101 | System.out.println("\n========" + name + "========"); 102 | System.out.println("Transactions: " + Arrays.toString(trans)); 103 | System.out.println("Time: " + Arrays.toString(time)); 104 | 105 | // and save to file 106 | out.close(); 107 | } 108 | 109 | /** 110 | * Count the number of items in the sequences (sequences need not be 111 | * independent) 112 | */ 113 | public static int countNoItems(final Set<Sequence> sequences) { 114 | final Set<Integer> items = new HashSet<>(); 115 | for (final Sequence sequence : sequences) 116 | items.addAll(sequence); 117 | return items.size(); 118 | } 119 | 120 | /** Print useful statistics for the transaction database */ 121 | public static void printTransactionDBStats(final File dbFile) throws IOException { 122 | 123 | int noTransactions = 0; 124 | double sparsity = 0; 125 | final Set<Integer> singletons = new HashSet<>(); 
final LineIterator it = FileUtils.lineIterator(dbFile, "UTF-8"); 127 | while (it.hasNext()) { 128 | final String[] items = it.nextLine().replace("-2", "").split(" -1 "); 129 | for (final String item : items) 130 | singletons.add(Integer.parseInt(item)); 131 | sparsity += items.length; 132 | noTransactions++; 133 | } 134 | LineIterator.closeQuietly(it); 135 | 136 | System.out.println("\nDatabase: " + dbFile); 137 | System.out.println("Items: " + singletons.size()); 138 | System.out.println("Transactions: " + noTransactions); 139 | System.out.println("Avg. items per transaction: " + sparsity / noTransactions + "\n"); 140 | 141 | } 142 | 143 | } 144 | -------------------------------------------------------------------------------- /sequence-miner/src/main/java/sequencemining/eval/SequenceSymmetricDistance.java: -------------------------------------------------------------------------------- 1 | package sequencemining.eval; 2 | 3 | import java.io.File; 4 | import java.io.FileOutputStream; 5 | import java.io.IOException; 6 | import java.io.PrintStream; 7 | import java.io.Writer; 8 | import java.util.HashSet; 9 | import java.util.Map; 10 | import java.util.Set; 11 | 12 | import org.apache.commons.io.output.TeeOutputStream; 13 | 14 | import com.google.common.base.Charsets; 15 | import com.google.common.io.Files; 16 | 17 | import sequencemining.main.SequenceMiningCore; 18 | import sequencemining.sequence.Sequence; 19 | 20 | public class SequenceSymmetricDistance { 21 | 22 | public static void main(final String[] args) throws IOException { 23 | 24 | // TODO re-run BIDE... 
25 | final int topN = 50; 26 | final String baseDir = "/afs/inf.ed.ac.uk/user/j/jfowkes/Code/Sequences/"; 27 | final String[] datasets = new String[] { "alice_punc", "GAZELLE1", "jmlr", "SIGN", "aslbu", "aslgt", "auslan2", 28 | "context", "pioneer", "skating" }; 29 | 30 | // Set up logging 31 | final FileOutputStream outFile = new FileOutputStream(baseDir + "redundancy.txt"); 32 | final TeeOutputStream out = new TeeOutputStream(System.out, outFile); 33 | final PrintStream ps = new PrintStream(out); 34 | System.setOut(ps); 35 | 36 | final Writer writer = Files.newWriter(new File(baseDir + "redundancy.tex"), Charsets.UTF_8); 37 | 38 | for (int i = 0; i < datasets.length; i++) { 39 | 40 | System.out.println("===== Dataset: " + datasets[i]); 41 | 42 | // ISM sequences 43 | final Map<Sequence, Double> intSequences = SequenceMiningCore 44 | .readISMSequences(new File(baseDir + "Logs/" + datasets[i] + ".log")); 45 | calculateRedundancyStats("ISM", intSequences, topN, writer); 46 | 47 | // SQS sequences 48 | final Map<Sequence, Double> sqsSequences = StatisticalSequenceMining 49 | .readSQSSequences(new File(baseDir + "SQS/" + datasets[i] + ".txt")); 50 | calculateRedundancyStats("SQS", sqsSequences, topN, writer); 51 | 52 | // GoKrimp sequences 53 | final Map<Sequence, Double> gokrimpSequences = StatisticalSequenceMining 54 | .readGoKrimpSequences(new File(baseDir + "GoKrimp/" + datasets[i] + ".txt")); 55 | calculateRedundancyStats("GoKrimp", gokrimpSequences, topN, writer); 56 | 57 | // BIDE sequences 58 | final Map<Sequence, Double> bideSequences = FrequentSequenceMining 59 | .readFrequentSequences(new File(baseDir + "BIDE/" + datasets[i] + ".txt")); 60 | calculateRedundancyStats("BIDE", bideSequences, topN, writer); 61 | 62 | System.out.println(); 63 | } 64 | writer.close(); 65 | 66 | } 67 | 68 | private static void calculateRedundancyStats(final String name, final Map<Sequence, Double> intSequences, 69 | final int topN, final Writer writer) throws IOException { 70 | System.out.println("\n" + name + " Sequences\n-----------"); 71 | System.out.println("No. 
sequences: " + intSequences.size()); 72 | if (name.equals("ISM")) 73 | System.out.println( 74 | "No. non-singleton sequences: " + filterSingletons(intSequences, Integer.MAX_VALUE).size()); 75 | System.out.println("No. items: " + countNoItems(intSequences.keySet())); 76 | 77 | // Get top sequences and calculate stats 78 | final Set topIntSequences = filterSingletons(intSequences, topN); 79 | 80 | final double avgMinDiff = calculateRedundancy(topIntSequences); 81 | System.out.println("\nAvg. min edit dist: " + avgMinDiff); 82 | writer.write("$" + avgMinDiff + "$ & "); 83 | 84 | // Calculate spuriousness 85 | final double avgMaxSpur = calculateSpuriousness(topIntSequences); 86 | System.out.println("Avg. no. subseq: " + avgMaxSpur); 87 | writer.write("$" + avgMaxSpur + "$ & "); 88 | 89 | // Calculate no. items 90 | final int noItems = countNoItems(topIntSequences); 91 | System.out.println("No. items: " + noItems); 92 | writer.write("$" + noItems + "$ & "); 93 | 94 | // Calculate size 95 | final double avgSize = calculateAverageSize(topIntSequences); 96 | System.out.println("Avg. 
subseq size: " + avgSize); 97 | 98 | writer.write("\n"); 99 | } 100 | 101 | private static double calculateRedundancy(final Set topSequences) { 102 | 103 | double avgMinDiff = 0; 104 | for (final Sequence seq1 : topSequences) { 105 | 106 | int minDiff = Integer.MAX_VALUE; 107 | for (final Sequence seq2 : topSequences) { 108 | if (!seq1.equals(seq2)) { 109 | final int diff = editDistance(seq1, seq2); 110 | if (diff < minDiff) 111 | minDiff = diff; 112 | } 113 | } 114 | avgMinDiff += minDiff; 115 | } 116 | avgMinDiff /= topSequences.size(); 117 | 118 | return avgMinDiff; 119 | } 120 | 121 | /** 122 | * Calculate the Levenshtein distance between two sequences using the 123 | * Wagner-Fischer algorithm 124 | * 125 | * @see http://en.wikipedia.org/wiki/Levenshtein_distance 126 | */ 127 | private static int editDistance(final Sequence s, final Sequence t) { 128 | final int m = s.size(); 129 | final int n = t.size(); 130 | 131 | // for all i and j, d[i,j] will hold the Levenshtein distance between 132 | // the first i characters of s and the first j characters of t; 133 | final int[][] d = new int[m + 1][n + 1]; 134 | 135 | // the distance of any first string to an empty second string 136 | for (int i = 1; i <= m; i++) 137 | d[i][0] = i; 138 | 139 | // the distance of any second string to an empty first string 140 | for (int j = 1; j <= n; j++) 141 | d[0][j] = j; 142 | 143 | for (int j = 1; j <= n; j++) { 144 | for (int i = 1; i <= m; i++) { 145 | if (s.get(i - 1) == t.get(j - 1)) { 146 | d[i][j] = d[i - 1][j - 1]; // no operation required 147 | } else { 148 | d[i][j] = Math.min(d[i - 1][j] + 1, // a deletion 149 | Math.min(d[i][j - 1] + 1, // an insertion 150 | d[i - 1][j - 1] + 1)); // a substitution 151 | } 152 | } 153 | } 154 | 155 | return d[m][n]; 156 | } 157 | 158 | /** 159 | * Count the number of distinct items in the set of sequences 160 | */ 161 | public static int countNoItems(final Set sequences) { 162 | final Set items = new HashSet<>(); 163 | for (final 
Sequence seq : sequences) 164 | items.addAll(seq.getItems()); 165 | return items.size(); 166 | } 167 | 168 | private static double calculateAverageSize(final Set<Sequence> topSequences) { 169 | 170 | double avgSize = 0; 171 | for (final Sequence seq : topSequences) 172 | avgSize += seq.size(); 173 | return avgSize / topSequences.size(); 174 | } 175 | 176 | private static double calculateSpuriousness(final Set<Sequence> topSequences) { 177 | 178 | double avgSubseq = 0; 179 | for (final Sequence seq1 : topSequences) { 180 | for (final Sequence seq2 : topSequences) { 181 | if (!seq1.equals(seq2)) 182 | avgSubseq += isSubseq(seq1, seq2); 183 | } 184 | } 185 | avgSubseq /= topSequences.size(); 186 | 187 | return avgSubseq; 188 | } 189 | 190 | /** Filter out singletons */ 191 | static Set<Sequence> filterSingletons(final Map<Sequence, Double> seqs, final int topN) { 192 | 193 | int count = 0; 194 | final Set<Sequence> topSeqs = new HashSet<>(); 195 | for (final Sequence seq : seqs.keySet()) { 196 | if (seq.size() != 1) { 197 | topSeqs.add(seq); 198 | count++; 199 | } 200 | if (count == topN) 201 | break; 202 | } 203 | if (topN != Integer.MAX_VALUE && count < topN) 204 | System.out.println("WARNING: not enough non-singleton sequences in set: " + count); 205 | 206 | return topSeqs; 207 | } 208 | 209 | private static int isSubseq(final Sequence seq1, final Sequence seq2) { 210 | if (seq2.contains(seq1)) 211 | return 1; 212 | return 0; 213 | } 214 | 215 | } -------------------------------------------------------------------------------- /sequence-miner/src/main/java/sequencemining/eval/StatisticalSequenceMining.java: -------------------------------------------------------------------------------- 1 | package sequencemining.eval; 2 | 3 | import java.io.BufferedWriter; 4 | import java.io.File; 5 | import java.io.FileWriter; 6 | import java.io.IOException; 7 | import java.lang.ProcessBuilder.Redirect; 8 | import java.util.LinkedHashMap; 9 | 10 | import org.apache.commons.io.FileUtils; 11 | import org.apache.commons.io.LineIterator; 12 
| 13 | import ca.pfv.spmf.algorithms.sequentialpatterns.goKrimp.AlgoGoKrimp; 14 | import ca.pfv.spmf.algorithms.sequentialpatterns.goKrimp.DataReader; 15 | import sequencemining.sequence.Sequence; 16 | 17 | public class StatisticalSequenceMining { 18 | 19 | public static void main(final String[] args) throws IOException { 20 | 21 | // Datasets 22 | final String[] datasets = new String[] { "GAZELLE1" }; 23 | for (int i = 0; i < datasets.length; i++) { 24 | final File dbPath = new File( 25 | "/afs/inf.ed.ac.uk/user/j/jfowkes/Code/Sequences/Datasets/Paper/" + datasets[i] + ".dat"); 26 | 27 | // Run GoKRIMP 28 | //final File saveFileGoKRIMP = new File( 29 | // "/afs/inf.ed.ac.uk/user/j/jfowkes/Code/Sequences/GoKrimp/" + datasets[i] + ".txt"); 30 | //mineGoKrimpSequences(dbPath, saveFileGoKRIMP); 31 | 32 | // Run SQS 33 | final File saveFileSQS = new File( 34 | "/afs/inf.ed.ac.uk/user/j/jfowkes/Code/Sequences/SQS/" + datasets[i] + ".txt"); 35 | mineSQSSequences(dbPath, saveFileSQS, 1); 36 | } 37 | 38 | } 39 | 40 | public static LinkedHashMap<Sequence, Double> mineGoKrimpSequences(final File dataset, final File saveFile) 41 | throws IOException { 42 | 43 | // Convert to GoKrimp dataset format 44 | final File TMPDB = File.createTempFile("gokrimp-dataset", ".dat"); 45 | convertDatasetGoKrimpFormat(dataset, TMPDB); 46 | 47 | // Set GoKrimp settings 48 | final String cmd[] = new String[2]; 49 | cmd[0] = "/afs/inf.ed.ac.uk/user/j/jfowkes/Packages/gokrimp/gokrimp.sh"; 50 | cmd[1] = TMPDB.toString().replace(".dat", ""); 51 | runScript(cmd, saveFile); 52 | 53 | TMPDB.delete(); 54 | 55 | return readGoKrimpSequences(saveFile); 56 | } 57 | 58 | public static LinkedHashMap<Sequence, Double> mineSQSSequences(final File dataset, final File saveFile, 59 | final int minUsage) throws IOException { 60 | 61 | // Convert to SQS dataset format 62 | final File TMPDB = File.createTempFile("sqs-dataset", ".dat"); 63 | convertDatasetSQSFormat(dataset, TMPDB); 64 | 65 | // Set SQS settings 66 | final String cmd[] = new String[5]; 67 | 
cmd[0] = "/afs/inf.ed.ac.uk/user/j/jfowkes/Packages/sqs/sqs.sh"; 68 | cmd[1] = "-i " + TMPDB; 69 | cmd[2] = "-t " + minUsage; // default is 1 70 | cmd[3] = "-o " + saveFile; 71 | cmd[4] = "-m search"; // search - scan db directly, order - compress 72 | // given patterns 73 | // cmd[5] = "-p"; // patterns file (for order method) 74 | runScript(cmd, null); 75 | 76 | TMPDB.delete(); 77 | 78 | return readSQSSequences(saveFile); 79 | } 80 | 81 | /** Convert dataset from SPMF format to SQS format */ 82 | private static void convertDatasetSQSFormat(final File inputDB, final File outputDB) throws IOException { 83 | 84 | // Output DB 85 | final BufferedWriter db = new BufferedWriter(new FileWriter(outputDB)); 86 | 87 | // for each line (transaction) until the end of file 88 | boolean newSeq = false; 89 | final LineIterator it = FileUtils.lineIterator(inputDB, "UTF-8"); 90 | while (it.hasNext()) { 91 | 92 | final String line = it.nextLine(); 93 | // if the line is a comment, is empty or is a 94 | // kind of metadata 95 | if (line.isEmpty() == true || line.charAt(0) == '#' || line.charAt(0) == '%' || line.charAt(0) == '@') { 96 | continue; 97 | } 98 | 99 | // sequence separator 100 | if (newSeq) 101 | db.write("-1 "); 102 | 103 | // split the transaction into items 104 | final String[] lineSplited = line.split(" "); 105 | 106 | for (int i = 0; i < lineSplited.length; i++) { 107 | if (lineSplited[i].equals("-1")) { // end of item 108 | 109 | } else if (lineSplited[i].equals("-2")) { // end of sequence 110 | newSeq = true; 111 | } else { // extract the value for an item 112 | db.write(lineSplited[i] + " "); 113 | } 114 | } 115 | 116 | } 117 | db.newLine(); 118 | db.close(); 119 | 120 | // close the input file 121 | LineIterator.closeQuietly(it); 122 | 123 | } 124 | 125 | /** Convert dataset from SPMF format to GoKrimp format */ 126 | private static void convertDatasetGoKrimpFormat(final File inputDB, final File outputDB) throws IOException { 127 | 128 | // Output DB 129 | final 
BufferedWriter db = new BufferedWriter(new FileWriter(outputDB)); 130 | 131 | // for each line (transaction) until the end of file 132 | boolean newSeq = false; 133 | final LineIterator it = FileUtils.lineIterator(inputDB, "UTF-8"); 134 | while (it.hasNext()) { 135 | 136 | final String line = it.nextLine(); 137 | // if the line is a comment, is empty or is a 138 | // kind of metadata 139 | if (line.isEmpty() == true || line.charAt(0) == '#' || line.charAt(0) == '%' || line.charAt(0) == '@') { 140 | continue; 141 | } 142 | 143 | // sequence separator 144 | if (newSeq) 145 | db.write("\n"); 146 | 147 | // split the transaction into items 148 | final String[] lineSplited = line.split(" "); 149 | 150 | for (int i = 0; i < lineSplited.length; i++) { 151 | if (lineSplited[i].equals("-1")) { // end of item 152 | 153 | } else if (lineSplited[i].equals("-2")) { // end of sequence 154 | newSeq = true; 155 | } else { // extract the value for an item 156 | db.write(lineSplited[i] + " "); 157 | } 158 | } 159 | 160 | } 161 | db.newLine(); 162 | db.close(); 163 | 164 | // close the input file 165 | LineIterator.closeQuietly(it); 166 | 167 | } 168 | 169 | /** Read in SQS sequences (sorted by worth) */ 170 | public static LinkedHashMap<Sequence, Double> readSQSSequences(final File output) throws IOException { 171 | final LinkedHashMap<Sequence, Double> sequences = new LinkedHashMap<>(); 172 | 173 | final LineIterator it = FileUtils.lineIterator(output); 174 | while (it.hasNext()) { 175 | final String line = it.nextLine(); 176 | if (!line.trim().isEmpty()) { 177 | final String[] splitLine = line.split(" "); 178 | final String[] items = splitLine[0].split(" "); 179 | final Sequence seq = new Sequence(); 180 | for (final String item : items) 181 | seq.add(Integer.parseInt(item)); 182 | final double worth = Double.parseDouble(splitLine[1].split(" ")[1]); 183 | sequences.put(seq, worth); 184 | } 185 | } 186 | 187 | return sequences; 188 | } 189 | 190 | /** Read in GoKrimp sequences (sorted by compression benefit) */ 191 |
public static LinkedHashMap<Sequence, Double> readGoKrimpSequences(final File output) throws IOException { 192 | final LinkedHashMap<Sequence, Double> sequences = new LinkedHashMap<>(); 193 | 194 | final LineIterator it = FileUtils.lineIterator(output); 195 | while (it.hasNext()) { 196 | final String line = it.nextLine(); 197 | if (!line.trim().isEmpty() && line.charAt(0) == '[') { 198 | final String[] splitLine = line.split(" "); 199 | final double worth = Double.parseDouble(splitLine[splitLine.length - 1]); 200 | final Sequence seq = new Sequence(); 201 | for (int i = 1; i < splitLine.length - 2; i++) 202 | seq.add(Integer.parseInt(splitLine[i])); 203 | sequences.put(seq, worth); 204 | } 205 | } 206 | 207 | return sequences; 208 | } 209 | 210 | /** 211 | * @deprecated gives slightly different results to reference implementation 212 | */ 213 | @Deprecated 214 | public static LinkedHashMap<Sequence, Double> mineGoKrimpSequencesSPMF(final File dataset, final File saveFile) 215 | throws IOException { 216 | 217 | final DataReader d = new DataReader(); 218 | final AlgoGoKrimp g = d.readData_SPMF(dataset.getAbsolutePath(), ""); 219 | // g.printData(); 220 | g.setOutputFilePath(saveFile.getAbsolutePath()); 221 | g.gokrimp(); 222 | 223 | return readGoKrimpSequencesSPMF(saveFile); 224 | } 225 | 226 | /** 227 | * Read in GOKRIMP sequences (sorted by compression benefit) 228 | * 229 | * @deprecated gives slightly different results to reference implementation 230 | */ 231 | @Deprecated 232 | public static LinkedHashMap<Sequence, Double> readGoKrimpSequencesSPMF(final File output) throws IOException { 233 | final LinkedHashMap<Sequence, Double> sequences = new LinkedHashMap<>(); 234 | 235 | final LineIterator it = FileUtils.lineIterator(output); 236 | while (it.hasNext()) { 237 | final String line = it.nextLine(); 238 | if (!line.trim().isEmpty()) { 239 | final String[] splitLine = line.split("#SUP:"); 240 | final String[] items = splitLine[0].trim().split(" "); 241 | final Sequence seq = new Sequence(); 242 | for (final String item : items) 243 |
seq.add(Integer.parseInt(item.trim())); 244 | final double compressionBenefit = Double.parseDouble(splitLine[1].trim()); 245 | sequences.put(seq, compressionBenefit); 246 | } 247 | } 248 | 249 | return sequences; 250 | } 251 | 252 | /** Run shell script with command line arguments */ 253 | public static void runScript(final String cmd[], final File outFile) { 254 | 255 | try { 256 | final ProcessBuilder pb = new ProcessBuilder(cmd); 257 | if (outFile != null) 258 | pb.redirectOutput(outFile); 259 | else 260 | pb.redirectOutput(Redirect.INHERIT); 261 | pb.redirectError(Redirect.INHERIT); 262 | final Process process = pb.start(); 263 | process.waitFor(); 264 | process.destroy(); 265 | } catch (final Exception e) { 266 | e.printStackTrace(); 267 | } 268 | 269 | } 270 | 271 | } 272 | -------------------------------------------------------------------------------- /sequence-miner/src/main/java/sequencemining/main/EMStep.java: -------------------------------------------------------------------------------- 1 | package sequencemining.main; 2 | 3 | import static java.util.function.Function.identity; 4 | import static java.util.stream.Collectors.counting; 5 | import static java.util.stream.Collectors.groupingBy; 6 | 7 | import java.util.Collections; 8 | import java.util.HashMap; 9 | import java.util.HashSet; 10 | import java.util.List; 11 | import java.util.Map; 12 | import java.util.Set; 13 | 14 | import com.google.common.collect.HashBasedTable; 15 | import com.google.common.collect.Multiset; 16 | import com.google.common.collect.Table; 17 | 18 | import sequencemining.main.InferenceAlgorithms.InferenceAlgorithm; 19 | import sequencemining.sequence.Sequence; 20 | import sequencemining.transaction.Transaction; 21 | import sequencemining.transaction.TransactionDatabase; 22 | import sequencemining.util.Tuple2; 23 | 24 | /** Class to hold the various transaction EM Steps */ 25 | public class EMStep { 26 | 27 | /** Initialize cached sequences */ 28 | static void 
initializeCachedSequences(final TransactionDatabase transactions, 29 | final Table<Sequence, Integer, Double> initProbs) { 30 | transactions.getTransactionList().parallelStream().forEach(t -> t.initializeCachedSequences(initProbs)); 31 | } 32 | 33 | /** EM-step for hard EM */ 34 | static Table<Sequence, Integer, Double> hardEMStep(final List<Transaction> transactions, 35 | final InferenceAlgorithm inferenceAlgorithm) { 36 | final double noTransactions = transactions.size(); 37 | 38 | // E-step 39 | final Map<Multiset.Entry<Sequence>, Long> coveringWithCounts = transactions.parallelStream().map(t -> { 40 | final Multiset<Sequence> covering = inferenceAlgorithm.infer(t); 41 | t.setCachedCovering(covering); 42 | return covering.entrySet(); 43 | }).flatMap(Set::stream).collect(groupingBy(identity(), counting())); 44 | 45 | // M-step 46 | final Table<Sequence, Integer, Double> newSequences = coveringWithCounts.entrySet().parallelStream().collect( 47 | HashBasedTable::create, 48 | (t, e) -> t.put(e.getKey().getElement(), e.getKey().getCount(), e.getValue() / noTransactions), 49 | Table::putAll); 50 | newSequences.rowKeySet().parallelStream().forEach(seq -> { 51 | // Pad with zero counts for non-occurrences 52 | final int maxOccur = Collections.max(newSequences.row(seq).keySet()); 53 | for (int occur = 1; occur <= maxOccur; occur++) { 54 | if (!newSequences.contains(seq, occur)) 55 | newSequences.put(seq, occur, 0.); 56 | } // Add probabilities for zero occurrences 57 | double rowSum = 0; 58 | for (final Double count : newSequences.row(seq).values()) 59 | rowSum += count; 60 | newSequences.put(seq, 0, 1 - rowSum); 61 | }); 62 | 63 | // Update cached sequences 64 | transactions.parallelStream().forEach(t -> t.updateCachedSequences(newSequences)); 65 | 66 | return newSequences; 67 | } 68 | 69 | /** Get average cost of last EM-step */ 70 | static double calculateAverageCost(final TransactionDatabase transactions) { 71 | final double noTransactions = transactions.size(); 72 | return transactions.getTransactionList().parallelStream().mapToDouble(Transaction::getCachedCost).sum() 73 | / noTransactions;
74 | } 75 | 76 | /** EM-step for structural EM */ 77 | static Tuple2<Double, Map<Integer, Double>> structuralEMStep(final TransactionDatabase transactions, 78 | final InferenceAlgorithm inferenceAlgorithm, final Sequence candidate) { 79 | final double noTransactions = transactions.size(); 80 | 81 | // Calculate max. no. of candidate occurrences 82 | final int maxReps = transactions.getTransactionList().parallelStream().mapToInt(t -> t.repetitions(candidate)) 83 | .max().getAsInt(); 84 | final Map<Integer, Double> initProb = new HashMap<>(); 85 | initProb.put(0, 0.); 86 | for (int occur = 1; occur <= maxReps; occur++) 87 | initProb.put(occur, 1.); 88 | 89 | // E-step (adding candidate to transactions that support it) 90 | final Map<Multiset.Entry<Sequence>, Long> coveringWithCounts = transactions.getTransactionList() 91 | .parallelStream().map(t -> { 92 | if (t.contains(candidate)) { 93 | t.addSequenceCache(candidate, initProb); 94 | final Multiset<Sequence> covering = inferenceAlgorithm.infer(t); 95 | t.setTempCachedCovering(covering); 96 | return covering.entrySet(); 97 | } 98 | return t.getCachedCovering().entrySet(); 99 | }).flatMap(Set::stream).collect(groupingBy(identity(), counting())); 100 | 101 | // M-step 102 | final Table<Sequence, Integer, Double> newSequences = coveringWithCounts.entrySet().parallelStream().collect( 103 | HashBasedTable::create, 104 | (t, e) -> t.put(e.getKey().getElement(), e.getKey().getCount(), e.getValue() / noTransactions), 105 | Table::putAll); 106 | newSequences.rowKeySet().parallelStream().forEach(seq -> { 107 | // Pad with zero counts for non-occurrences 108 | final int maxOccur = Collections.max(newSequences.row(seq).keySet()); 109 | for (int occur = 1; occur <= maxOccur; occur++) { 110 | if (!newSequences.contains(seq, occur)) 111 | newSequences.put(seq, occur, 0.); 112 | } // Add probabilities for zero occurrences 113 | double rowSum = 0; 114 | for (final Double count : newSequences.row(seq).values()) 115 | rowSum += count; 116 | newSequences.put(seq, 0, 1 - rowSum); 117 | }); 118 | 119 | // Get average cost (removing candidate
from supported transactions) 120 | final double averageCost = transactions.getTransactionList().parallelStream().mapToDouble(t -> { 121 | double cost; 122 | if (t.contains(candidate)) 123 | cost = t.getTempCachedCost(newSequences); 124 | else 125 | cost = t.getCachedCost(newSequences); 126 | t.removeSequenceCache(candidate); 127 | return cost; 128 | }).sum() / noTransactions; 129 | 130 | // Get candidate prob 131 | final Map<Integer, Double> prob = newSequences.row(candidate); 132 | 133 | return new Tuple2<Double, Map<Integer, Double>>(averageCost, prob); 134 | } 135 | 136 | /** Add accepted candidate sequence to cache */ 137 | static Table<Sequence, Integer, Double> addAcceptedCandidateCache(final TransactionDatabase transactions, 138 | final Sequence candidate, final Map<Integer, Double> prob) { 139 | final double noTransactions = transactions.size(); 140 | 141 | // Cached E-step (adding candidate to transactions that support it) 142 | final Map<Multiset.Entry<Sequence>, Long> coveringWithCounts = transactions.getTransactionList() 143 | .parallelStream().map(t -> { 144 | if (t.contains(candidate)) { 145 | t.addSequenceCache(candidate, prob); 146 | final Multiset<Sequence> covering = t.getTempCachedCovering(); 147 | t.setCachedCovering(covering); 148 | return covering.entrySet(); 149 | } 150 | return t.getCachedCovering().entrySet(); 151 | }).flatMap(Set::stream).collect(groupingBy(identity(), counting())); 152 | 153 | // M-step 154 | final Table<Sequence, Integer, Double> newSequences = coveringWithCounts.entrySet().parallelStream().collect( 155 | HashBasedTable::create, 156 | (t, e) -> t.put(e.getKey().getElement(), e.getKey().getCount(), e.getValue() / noTransactions), 157 | Table::putAll); 158 | newSequences.rowKeySet().parallelStream().forEach(seq -> { 159 | // Pad with zero counts for non-occurrences 160 | final int maxOccur = Collections.max(newSequences.row(seq).keySet()); 161 | for (int occur = 1; occur <= maxOccur; occur++) { 162 | if (!newSequences.contains(seq, occur)) 163 | newSequences.put(seq, occur, 0.); 164 | } // Add probabilities for zero occurrences 165 | double rowSum = 0; 166 | for (final
Double count : newSequences.row(seq).values()) 167 | rowSum += count; 168 | newSequences.put(seq, 0, 1 - rowSum); 169 | }); 170 | 171 | // Update cached sequences 172 | transactions.getTransactionList().parallelStream().forEach(t -> t.updateCachedSequences(newSequences)); 173 | 174 | return newSequences; 175 | } 176 | 177 | /** Get the support of given sequences */ 178 | static Map<Sequence, Long> getSupportsOfSequences(final TransactionDatabase transactions, 179 | final Set<Sequence> sequences) { 180 | return transactions.getTransactionList().parallelStream().map(t -> { 181 | final HashSet<Sequence> supportedSeqs = new HashSet<>(); 182 | for (final Sequence seq : sequences) { 183 | if (t.contains(seq)) 184 | supportedSeqs.add(seq); 185 | } 186 | return supportedSeqs; 187 | }).flatMap(Set::stream).collect(groupingBy(identity(), counting())); 188 | } 189 | 190 | private EMStep() { 191 | } 192 | 193 | } 194 | -------------------------------------------------------------------------------- /sequence-miner/src/main/java/sequencemining/main/InferenceAlgorithms.java: -------------------------------------------------------------------------------- 1 | package sequencemining.main; 2 | 3 | import java.io.Serializable; 4 | import java.util.BitSet; 5 | import java.util.Map; 6 | 7 | import com.google.common.collect.HashMultiset; 8 | import com.google.common.collect.Multiset; 9 | import com.google.common.collect.Table; 10 | 11 | import sequencemining.sequence.Sequence; 12 | import sequencemining.transaction.Transaction; 13 | 14 | /** Container class for Inference Algorithms */ 15 | public class InferenceAlgorithms { 16 | 17 | /** Interface for the different inference algorithms */ 18 | public interface InferenceAlgorithm { 19 | public Multiset<Sequence> infer(final Transaction transaction); 20 | } 21 | 22 | /** 23 | * Infer ML parameters to explain transaction using greedy algorithm and 24 | * store in covering. Sequences *may not* overlap. 25 | *

26 | * !! Assumes *no overlap* !! i.e. subseqs in covering are pairwise disjoint 27 | */ 28 | public static class InferGreedy implements InferenceAlgorithm, Serializable { 29 | private static final long serialVersionUID = 9173178089235828142L; 30 | 31 | @Override 32 | public Multiset<Sequence> infer(final Transaction transaction) { 33 | 34 | final Multiset<Sequence> covering = HashMultiset.create(); 35 | int lenCovering = 0; 36 | final int transactionSize = transaction.size(); 37 | final BitSet coveredItems = new BitSet(transactionSize); 38 | 39 | final Table<Sequence, Integer, Double> cachedSequences = transaction.getCachedSequences(); 40 | while (coveredItems.cardinality() != transactionSize) { 41 | 42 | double minCostPerItem = Double.POSITIVE_INFINITY; 43 | Sequence bestSeq = null; 44 | BitSet bestSeqCoveredItems = null; 45 | 46 | for (final Sequence seq : cachedSequences.rowKeySet()) { 47 | 48 | // How many additional items does sequence cover? 49 | final BitSet seqCoveredItems = transaction.getCovered(seq, coveredItems); 50 | // Ignore sequences which don't cover anything 51 | if (seqCoveredItems.isEmpty()) 52 | continue; 53 | 54 | // Get seq multiplicity in covering 55 | final int occur = covering.count(seq); 56 | 57 | // TODO triple check that this is right!!! 58 | // Calculate f(CuS) - f(C) 59 | Double prob1 = cachedSequences.get(seq, occur + 1); 60 | if (prob1 == null) 61 | prob1 = 0.; // Empty multiplicities have zero prob 62 | else if (prob1 == 0. && isInnerProb(occur + 1, cachedSequences.row(seq))) 63 | prob1 = Double.MIN_VALUE; // Smooth zero inner probs 64 | double prob = cachedSequences.get(seq, occur); 65 | if (prob == 0.
&& isInnerProb(occur, cachedSequences.row(seq))) 66 | prob = Double.MIN_VALUE; // Smooth zero inner probs 67 | final double cost = -Math.log(prob1) + Math.log(prob) 68 | + sumLogRange(lenCovering + 1, lenCovering + seq.size()); 69 | final double costPerItem = cost / seq.size(); 70 | 71 | if (costPerItem < minCostPerItem) { 72 | minCostPerItem = costPerItem; 73 | bestSeq = seq; 74 | bestSeqCoveredItems = seqCoveredItems; 75 | } 76 | 77 | } 78 | 79 | if (bestSeq != null) { 80 | // final int firstItemCovered = bestSeqCoveredItems 81 | // .nextSetBit(0); 82 | // covering.put(bestSeq, firstItemCovered); 83 | covering.add(bestSeq); 84 | lenCovering += bestSeq.size(); 85 | coveredItems.or(bestSeqCoveredItems); 86 | } else { // Fill in incomplete coverings with singletons 87 | int index = 0; 88 | while (coveredItems.cardinality() != transactionSize) { 89 | index = coveredItems.nextClearBit(index); 90 | final Sequence seq = new Sequence(transaction.get(index)); 91 | covering.add(seq); 92 | coveredItems.set(index); 93 | } 94 | return covering; 95 | } 96 | 97 | } 98 | return covering; 99 | } 100 | 101 | private boolean isInnerProb(final int probIndex, final Map<Integer, Double> probVec) { 102 | for (int i = probIndex + 1; i < probVec.size(); i++) { 103 | if (probVec.get(i) != 0.) 104 | return true; 105 | } 106 | return false; 107 | } 108 | 109 | private double sumLogRange(final int a, final int b) { 110 | double sum = 0; 111 | for (int i = a; i <= b; i++) 112 | sum += Math.log(i); 113 | return sum; 114 | } 115 | 116 | } 117 | 118 | // /** 119 | // * Infer ML parameters to explain transaction using greedy algorithm and 120 | // * store in covering. Sequences may overlap. 121 | // *

122 | // * This is an O(log(n))-approximation algorithm where n is the number of 123 | // * elements in the transaction. 124 | // */ 125 | // public static class InferGreedyOld implements InferenceAlgorithm, 126 | // Serializable { 127 | // private static final long serialVersionUID = 9173178089235828142L; 128 | // 129 | // @Override 130 | // public HashSet<Sequence> infer(final Transaction transaction) { 131 | // 132 | // final HashSet<Sequence> covering = new HashSet<>(); 133 | // final int transactionSize = transaction.size(); 134 | // final BitSet coveredItems = new BitSet(transactionSize); 135 | // 136 | // final HashMap<Sequence, Double> cachedSequences = transaction 137 | // .getCachedSequences(); 138 | // while (coveredItems.cardinality() != transactionSize) { 139 | // 140 | // double minCostPerItem = Double.POSITIVE_INFINITY; 141 | // Sequence bestSeq = null; 142 | // BitSet bestSeqCoveredItems = null; 143 | // 144 | // for (final Entry<Sequence, Double> entry : cachedSequences 145 | // .entrySet()) { 146 | // 147 | // // Ignore sequences which already cover 148 | // if (covering.contains(entry.getKey())) 149 | // continue; 150 | // 151 | // // How many additional items does sequence cover?
152 | // final BitSet seqCoveredItems = transaction.getCovered( 153 | // entry.getKey(), coveredItems); 154 | // // Ignore sequences which don't cover anything 155 | // if (seqCoveredItems.isEmpty()) 156 | // continue; 157 | // final BitSet newlyCoveredItems = (BitSet) seqCoveredItems 158 | // .clone(); 159 | // newlyCoveredItems.or(coveredItems); 160 | // final int notCovered = newlyCoveredItems.cardinality() 161 | // - coveredItems.cardinality(); 162 | // 163 | // final double cost = -Math.log(entry.getValue()); 164 | // final double costPerItem = cost / notCovered; 165 | // 166 | // if (costPerItem < minCostPerItem) { 167 | // minCostPerItem = costPerItem; 168 | // bestSeq = entry.getKey(); 169 | // bestSeqCoveredItems = seqCoveredItems; 170 | // } 171 | // 172 | // } 173 | // 174 | // if (bestSeq != null) { 175 | // // final int firstItemCovered = bestSeqCoveredItems 176 | // // .nextSetBit(0); 177 | // // covering.put(bestSeq, firstItemCovered); 178 | // covering.add(bestSeq); 179 | // coveredItems.or(bestSeqCoveredItems); 180 | // } else { // Allow incomplete coverings 181 | // break; 182 | // } 183 | // 184 | // } 185 | // return covering; 186 | // } 187 | // 188 | // } 189 | 190 | private InferenceAlgorithms() { 191 | 192 | } 193 | 194 | } 195 | -------------------------------------------------------------------------------- /sequence-miner/src/main/java/sequencemining/main/SequenceMining.java: -------------------------------------------------------------------------------- 1 | package sequencemining.main; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.text.SimpleDateFormat; 6 | import java.util.ArrayList; 7 | import java.util.Collections; 8 | import java.util.Date; 9 | import java.util.HashMap; 10 | import java.util.List; 11 | import java.util.Map; 12 | import java.util.Map.Entry; 13 | import java.util.logging.Level; 14 | 15 | import org.apache.commons.io.FileUtils; 16 | import org.apache.commons.io.FilenameUtils; 17 | import 
org.apache.commons.io.LineIterator; 18 | 19 | import com.beust.jcommander.IStringConverter; 20 | import com.beust.jcommander.JCommander; 21 | import com.beust.jcommander.Parameter; 22 | import com.beust.jcommander.ParameterException; 23 | import com.google.common.base.Charsets; 24 | import com.google.common.collect.HashBasedTable; 25 | import com.google.common.collect.HashMultiset; 26 | import com.google.common.collect.Multiset; 27 | import com.google.common.collect.Table; 28 | import com.google.common.io.Files; 29 | 30 | import sequencemining.main.InferenceAlgorithms.InferGreedy; 31 | import sequencemining.main.InferenceAlgorithms.InferenceAlgorithm; 32 | import sequencemining.sequence.Sequence; 33 | import sequencemining.transaction.Transaction; 34 | import sequencemining.transaction.TransactionList; 35 | import sequencemining.util.Logging; 36 | 37 | public class SequenceMining extends SequenceMiningCore { 38 | 39 | /** Main function parameters */ 40 | public static class Parameters { 41 | 42 | @Parameter(names = { "-f", "--file" }, description = "Dataset filename") 43 | private final File dataset = new File( 44 | "/afs/inf.ed.ac.uk/user/j/jfowkes/Code/Sequences/Datasets/Paper/jmlr.dat"); 45 | 46 | @Parameter(names = { "-s", "--maxSteps" }, description = "Max structure steps") 47 | int maxStructureSteps = 100_000; 48 | 49 | @Parameter(names = { "-i", "--iterations" }, description = "Max iterations") 50 | int maxEMIterations = 1_000; 51 | 52 | @Parameter(names = { "-l", "--log-level" }, description = "Log level", converter = LogLevelConverter.class) 53 | Level logLevel = Level.FINE; 54 | 55 | @Parameter(names = { "-r", "--runtime" }, description = "Max Runtime (min)") 56 | long maxRunTime = 72 * 60; // 72hrs 57 | 58 | @Parameter(names = { "-t", "--timestamp" }, description = "Timestamp Logfile", arity = 1) 59 | boolean timestampLog = true; 60 | 61 | @Parameter(names = { "-d", "--dist" }, description = "Save sequence count distribution") 62 | private boolean
saveCountDist = false; 63 | 64 | @Parameter(names = { "-v", "--verbose" }, description = "Print to console instead of logfile") 65 | private boolean verbose = false; 66 | } 67 | 68 | public static void main(final String[] args) throws IOException { 69 | 70 | // Main fixed parameters 71 | final InferenceAlgorithm inferenceAlg = new InferGreedy(); 72 | 73 | // Runtime parameters 74 | final Parameters params = new Parameters(); 75 | final JCommander jc = new JCommander(params); 76 | 77 | try { 78 | jc.parse(args); 79 | 80 | // Set loglevel, runtime, timestamp and log file 81 | LOG_LEVEL = params.logLevel; 82 | MAX_RUNTIME = params.maxRunTime * 60 * 1_000; 83 | File logFile = null; 84 | if (!params.verbose) 85 | logFile = Logging.getLogFileName("ISM", params.timestampLog, LOG_DIR, params.dataset); 86 | 87 | // Mine interesting sequences 88 | mineSequences(params.dataset, inferenceAlg, params.maxStructureSteps, params.maxEMIterations, logFile, 89 | params.saveCountDist); 90 | 91 | } catch (final ParameterException e) { 92 | System.out.println(e.getMessage()); 93 | jc.usage(); 94 | } 95 | 96 | System.exit(0); // Required to prevent waiting for Runnable completion 97 | 98 | } 99 | 100 | /** Mine interesting sequences */ 101 | public static Map<Sequence, Double> mineSequences(final File inputFile, final InferenceAlgorithm inferenceAlgorithm, 102 | final int maxStructureSteps, final int maxEMIterations, final File logFile, final boolean saveCountDist) 103 | throws IOException { 104 | 105 | // Set up logging 106 | if (logFile != null) 107 | Logging.setUpFileLogger(logger, LOG_LEVEL, logFile); 108 | else 109 | Logging.setUpConsoleLogger(logger, LOG_LEVEL); 110 | 111 | // Echo input parameters 112 | logger.info("========== INTERESTING SEQUENCE MINING ============"); 113 | logger.info("\n Time: " + new SimpleDateFormat("dd.MM.yyyy-HH:mm:ss").format(new Date())); 114 | logger.info("\n Inputs: -f " + inputFile + " -s " + maxStructureSteps + " -i " + maxEMIterations + " -r " 115 | + MAX_RUNTIME /
60_000); 116 | 117 | // Read in transaction database 118 | final TransactionList transactions = readTransactions(inputFile); 119 | 120 | // Determine initial probabilities 121 | final Table<Sequence, Integer, Double> initProbs = scanDatabaseToDetermineInitialProbabilities(inputFile); 122 | 123 | // Run inference to find interesting sequences 124 | logger.fine("\n============= SEQUENCE INFERENCE =============\n"); 125 | final Table<Sequence, Integer, Double> sequences = structuralEM(transactions, initProbs, inferenceAlgorithm, 126 | maxStructureSteps, maxEMIterations); 127 | if (LOG_LEVEL.equals(Level.FINEST)) 128 | logger.finest( 129 | "\n======= Transaction Database =======\n" + Files.toString(inputFile, Charsets.UTF_8) + "\n"); 130 | 131 | // Calculate probabilities: p(S \in X) = p(z_S >= 1) = 1 - \pi_S_0 132 | final HashMap<Sequence, Double> sequenceMap = new HashMap<>(); 133 | for (final Sequence seq : sequences.rowKeySet()) 134 | sequenceMap.put(seq, 1 - sequences.get(seq, 0)); 135 | 136 | // Sort sequences by interestingness 137 | final HashMap<Sequence, Double> intMap = calculateInterestingness(sequenceMap, transactions); 138 | final Map<Sequence, Double> sortedSequences = sortSequences(sequenceMap, intMap); 139 | 140 | logger.info("\n============= INTERESTING SEQUENCES =============\n"); 141 | for (final Entry<Sequence, Double> entry : sortedSequences.entrySet()) { 142 | logger.info(String.format("%s\tprob: %1.5f \tint: %1.5f %n", entry.getKey(), entry.getValue(), 143 | intMap.get(entry.getKey()))); 144 | } 145 | logger.info("\n"); 146 | 147 | // Optionally save sequence count distribution 148 | if (saveCountDist) { 149 | Logging.serialize(sequences, FilenameUtils.removeExtension(logFile.getAbsolutePath()) + ".dist"); 150 | } 151 | 152 | return sortedSequences; 153 | } 154 | 155 | public static TransactionList readTransactions(final File inputFile) throws IOException { 156 | 157 | final List<Transaction> transactions = new ArrayList<>(); 158 | 159 | // for each line (transaction) until the end of file 160 | final LineIterator it = FileUtils.lineIterator(inputFile, "UTF-8"); 161 | while
(it.hasNext()) { 162 | 163 | final String line = it.nextLine(); 164 | // if the line is a comment, is empty or is a 165 | // kind of metadata 166 | if (line.isEmpty() == true || line.charAt(0) == '#' || line.charAt(0) == '%' || line.charAt(0) == '@') { 167 | continue; 168 | } 169 | 170 | // split the transaction into items 171 | final String[] lineSplited = line.split(" "); 172 | // convert to Transaction class and add it to the structure 173 | transactions.add(getTransaction(lineSplited)); 174 | 175 | } 176 | // close the input file 177 | LineIterator.closeQuietly(it); 178 | 179 | return new TransactionList(transactions); 180 | } 181 | 182 | /** 183 | * Create and add the Transaction in the String array 184 | * 185 | * @param integers 186 | * one line of integers in the sequence database 187 | */ 188 | public static Transaction getTransaction(final String[] integers) { 189 | final Transaction sequence = new Transaction(); 190 | 191 | for (int i = 0; i < integers.length; i++) { 192 | if (integers[i].equals("-1")) { // end of item 193 | 194 | } else if (integers[i].equals("-2")) { // end of sequence 195 | return sequence; 196 | } else { // extract the value for an item 197 | sequence.add(Integer.parseInt(integers[i])); 198 | } 199 | } 200 | throw new RuntimeException("Corrupt sequence database."); 201 | } 202 | 203 | /** 204 | * This method scans the input database to determine the initial 205 | * probabilities of single items 206 | * 207 | * @param inputFile 208 | * the input file 209 | * @return class storing the support of every occurrence of each singleton 210 | */ 211 | public static Table<Sequence, Integer, Double> scanDatabaseToDetermineInitialProbabilities(final File inputFile) 212 | throws IOException { 213 | 214 | // Sequence x occurrence x count 215 | final Table<Sequence, Integer, Double> supports = HashBasedTable.create(); 216 | 217 | // for each line (transaction) until the end of file 218 | int noTransactions = 0; 219 | final LineIterator it = FileUtils.lineIterator(inputFile, "UTF-8"); 220 | while
(it.hasNext()) { 221 | 222 | final String line = it.nextLine(); 223 | // if the line is a comment, is empty or is a kind of metadata 224 | if (line.isEmpty() == true || line.charAt(0) == '#' || line.charAt(0) == '%' || line.charAt(0) == '@') { 225 | continue; 226 | } 227 | 228 | // split the line into items 229 | final String[] lineSplit = line.split(" "); 230 | // for each item 231 | final Multiset<Sequence> seenItems = HashMultiset.create(); 232 | for (final String itemString : lineSplit) { 233 | final int item = Integer.parseInt(itemString); 234 | if (item >= 0) // ignore end of itemset/sequence tags 235 | seenItems.add(new Sequence(item)); 236 | } 237 | // increase the support count of the items 238 | for (final Sequence seq : seenItems.elementSet()) { 239 | final int occur = seenItems.count(seq); 240 | if (supports.contains(seq, occur)) { 241 | final double supp = supports.get(seq, occur); 242 | supports.put(seq, occur, supp + 1); 243 | } else { 244 | supports.put(seq, occur, 1.); 245 | } 246 | } 247 | 248 | noTransactions++; 249 | } 250 | 251 | // close the input file 252 | LineIterator.closeQuietly(it); 253 | 254 | for (final Sequence seq : supports.rowKeySet()) { 255 | // Pad with zero counts for non-occurrences 256 | final int maxOccur = Collections.max(supports.row(seq).keySet()); 257 | for (int occur = 1; occur <= maxOccur; occur++) { 258 | if (!supports.contains(seq, occur)) 259 | supports.put(seq, occur, 0.); 260 | } // Add counts for zero occurrences 261 | double rowSum = 0; 262 | for (final Double count : supports.row(seq).values()) 263 | rowSum += count; 264 | supports.put(seq, 0, noTransactions - rowSum); 265 | } 266 | 267 | // Normalize 268 | for (final Sequence seq : supports.rowKeySet()) { 269 | double rowSum = 0; 270 | for (final Double prob : supports.row(seq).values()) 271 | rowSum += prob; 272 | for (final Integer occur : supports.row(seq).keySet()) { 273 | final double normProb = supports.get(seq, occur) / rowSum; 274 | supports.put(seq, occur,
normProb); 275 | } 276 | } 277 | 278 | return supports; 279 | } 280 | 281 | /** Convert string level to level class */ 282 | public static class LogLevelConverter implements IStringConverter<Level> { 283 | @Override 284 | public Level convert(final String value) { 285 | if (value.equals("SEVERE")) 286 | return Level.SEVERE; 287 | else if (value.equals("WARNING")) 288 | return Level.WARNING; 289 | else if (value.equals("INFO")) 290 | return Level.INFO; 291 | else if (value.equals("CONFIG")) 292 | return Level.CONFIG; 293 | else if (value.equals("FINE")) 294 | return Level.FINE; 295 | else if (value.equals("FINER")) 296 | return Level.FINER; 297 | else if (value.equals("FINEST")) 298 | return Level.FINEST; 299 | else 300 | throw new RuntimeException("Incorrect Log Level."); 301 | } 302 | } 303 | 304 | } -------------------------------------------------------------------------------- /sequence-miner/src/main/java/sequencemining/main/SparkEMStep.java: -------------------------------------------------------------------------------- 1 | package sequencemining.main; 2 | //package itemsetmining.main; 3 | // 4 | //import itemsetmining.itemset.Itemset; 5 | //import itemsetmining.main.InferenceAlgorithms.InferenceAlgorithm; 6 | //import itemsetmining.transaction.Transaction; 7 | //import itemsetmining.transaction.TransactionDatabase; 8 | // 9 | //import java.util.HashSet; 10 | //import java.util.List; 11 | //import java.util.Map; 12 | //import java.util.Set; 13 | //import java.util.stream.Collectors; 14 | // 15 | //import org.apache.spark.api.java.JavaPairRDD; 16 | //import org.apache.spark.api.java.JavaRDD; 17 | // 18 | //import scala.Tuple2; 19 | // 20 | //import com.google.common.collect.Multiset; 21 | // 22 | ///** Class to hold the various transaction EM Steps for Spark */ 23 | //public class SparkEMStep { 24 | // 25 | // /** Initialize cached itemsets */ 26 | // static void initializeCachedItemsets( 27 | // final TransactionDatabase transactions, 28 | // final Multiset<Itemset>
singletons) { 29 | // final long noTransactions = transactions.size(); 30 | // final JavaRDD updatedTransactions = transactions 31 | // .getTransactionRDD().map(t -> { 32 | // t.initializeCachedItemsets(singletons, noTransactions); 33 | // return t; 34 | // }); 35 | // 36 | // // Update cache reference 37 | // transactions.updateTransactionCache(updatedTransactions); 38 | // } 39 | // 40 | // /** EM-step for hard EM */ 41 | // static Map hardEMStep( 42 | // final TransactionDatabase transactions, 43 | // final InferenceAlgorithm inferenceAlgorithm) { 44 | // final double noTransactions = transactions.size(); 45 | // 46 | // // E-step: map and cache covering 47 | // final JavaPairRDD> transactionWithCovering = transactions 48 | // .getTransactionRDD() 49 | // .mapToPair( 50 | // t -> { 51 | // final HashSet covering = inferenceAlgorithm 52 | // .infer(t); 53 | // t.setCachedCovering(covering); 54 | // return new Tuple2>(t, 55 | // covering); 56 | // }); 57 | // 58 | // // E-step: reduce and get itemset counts 59 | // final List> coveringWithCounts = transactionWithCovering 60 | // .values().flatMap(s -> s) 61 | // .mapToPair(s -> new Tuple2(s, 1)) 62 | // .reduceByKey((a, b) -> a + b).collect(); 63 | // 64 | // // M-step 65 | // final Map newItemsets = coveringWithCounts 66 | // .parallelStream().collect( 67 | // Collectors 68 | // .toMap(Tuple2::_1, t -> t._2 / noTransactions)); 69 | // 70 | // // Update cached itemsets 71 | // final JavaRDD updatedTransactions = transactionWithCovering 72 | // .keys().map(t -> { 73 | // t.updateCachedItemsets(newItemsets); 74 | // return t; 75 | // }); 76 | // 77 | // // Update cache reference 78 | // transactions.updateTransactionCache(updatedTransactions); 79 | // 80 | // return newItemsets; 81 | // } 82 | // 83 | // /** Get average cost of last EM-step */ 84 | // static void calculateAndSetAverageCost( 85 | // final TransactionDatabase transactions) { 86 | // final double noTransactions = transactions.size(); 87 | // final 
double averageCost = transactions.getTransactionRDD() 88 | // .map(Transaction::getCachedCost).reduce((a, b) -> a + b) 89 | // / noTransactions; 90 | // transactions.setAverageCost(averageCost); 91 | // } 92 | // 93 | // /** EM-step for structural EM */ 94 | // static Tuple2 structuralEMStep( 95 | // final TransactionDatabase transactions, 96 | // final InferenceAlgorithm inferenceAlgorithm, final Itemset candidate) { 97 | // final double noTransactions = transactions.size(); 98 | // 99 | // // E-step: map candidate to supported transactions and cache covering 100 | // final JavaPairRDD> transactionWithCovering = transactions 101 | // .getTransactionRDD() 102 | // .mapToPair( 103 | // t -> { 104 | // if (t.contains(candidate)) { 105 | // t.addItemsetCache(candidate, 1.0); 106 | // final HashSet covering = inferenceAlgorithm 107 | // .infer(t); 108 | // t.setTempCachedCovering(covering); 109 | // return new Tuple2>(t, 110 | // covering); 111 | // } 112 | // return new Tuple2>(t, t 113 | // .getCachedCovering()); 114 | // }); 115 | // 116 | // // E-step: reduce and get itemset counts 117 | // final List> coveringWithCounts = transactionWithCovering 118 | // .values().flatMap(s -> s) 119 | // .mapToPair(s -> new Tuple2(s, 1)) 120 | // .reduceByKey((a, b) -> a + b).collect(); 121 | // 122 | // // M-step 123 | // final Map newItemsets = coveringWithCounts 124 | // .parallelStream().collect( 125 | // Collectors 126 | // .toMap(Tuple2::_1, t -> t._2 / noTransactions)); 127 | // 128 | // // Get cost per transaction 129 | // final JavaPairRDD transactionWithCost = transactionWithCovering 130 | // .keys().mapToPair(t -> { 131 | // double cost; 132 | // if (t.contains(candidate)) 133 | // cost = t.getTempCachedCost(newItemsets); 134 | // else 135 | // cost = t.getCachedCost(newItemsets); 136 | // t.removeItemsetCache(candidate); 137 | // return new Tuple2(t, cost); 138 | // }); 139 | // 140 | // // Get average cost 141 | // final double averageCost = 
transactionWithCost.values().reduce( 142 | // (a, b) -> a + b) 143 | // / noTransactions; 144 | // 145 | // // Get candidate prob 146 | // Double prob = newItemsets.get(candidate); 147 | // if (prob == null) 148 | // prob = 0.; 149 | // 150 | // // Update cache reference 151 | // transactions.updateTransactionCache(transactionWithCost.keys()); 152 | // 153 | // return new Tuple2(averageCost, prob); 154 | // } 155 | // 156 | // /** Add accepted candidate itemset to cache */ 157 | // static Map addAcceptedCandidateCache( 158 | // final TransactionDatabase transactions, final Itemset candidate, 159 | // final double prob) { 160 | // final double noTransactions = transactions.size(); 161 | // 162 | // // Cached E-step: map candidate to supported transactions and cache 163 | // final JavaPairRDD> transactionWithCovering = transactions 164 | // .getTransactionRDD().mapToPair( 165 | // t -> { 166 | // if (t.contains(candidate)) { 167 | // t.addItemsetCache(candidate, prob); 168 | // final HashSet covering = t 169 | // .getTempCachedCovering(); 170 | // t.setCachedCovering(covering); 171 | // return new Tuple2>(t, 172 | // covering); 173 | // } 174 | // return new Tuple2>(t, t 175 | // .getCachedCovering()); 176 | // }); 177 | // 178 | // // E-step: reduce and get itemset counts 179 | // final List> coveringWithCounts = transactionWithCovering 180 | // .values().flatMap(s -> s) 181 | // .mapToPair(s -> new Tuple2(s, 1)) 182 | // .reduceByKey((a, b) -> a + b).collect(); 183 | // 184 | // // M-step 185 | // final Map newItemsets = coveringWithCounts 186 | // .parallelStream().collect( 187 | // Collectors 188 | // .toMap(Tuple2::_1, t -> t._2 / noTransactions)); 189 | // 190 | // // Update cached itemsets 191 | // final JavaRDD updatedTransactions = transactionWithCovering 192 | // .keys().map(t -> { 193 | // t.updateCachedItemsets(newItemsets); 194 | // return t; 195 | // }); 196 | // 197 | // // Update cache reference 198 | // 
transactions.updateTransactionCache(updatedTransactions); 199 | // 200 | // return newItemsets; 201 | // } 202 | // 203 | // private SparkEMStep() { 204 | // } 205 | // 206 | // } 207 | -------------------------------------------------------------------------------- /sequence-miner/src/main/java/sequencemining/main/SparkSequenceMining.java: -------------------------------------------------------------------------------- 1 | package sequencemining.main; 2 | 3 | // import itemsetmining.itemset.Itemset; 4 | // import itemsetmining.itemset.ItemsetTree; 5 | // import itemsetmining.main.InferenceAlgorithms.InferGreedy; 6 | // import itemsetmining.main.InferenceAlgorithms.InferenceAlgorithm; 7 | // import itemsetmining.transaction.Transaction; 8 | // import itemsetmining.transaction.TransactionRDD; 9 | // import itemsetmining.util.Logging; 10 | // 11 | // import java.io.File; 12 | // import java.io.FileInputStream; 13 | // import java.io.IOException; 14 | // import java.text.SimpleDateFormat; 15 | // import java.util.Date; 16 | // import java.util.HashMap; 17 | // import java.util.Map; 18 | // import java.util.Map.Entry; 19 | // import java.util.logging.Level; 20 | // import java.util.Properties; 21 | // 22 | // import org.apache.hadoop.conf.Configuration; 23 | // import org.apache.hadoop.fs.FileSystem; 24 | // import org.apache.hadoop.fs.Path; 25 | // import org.apache.spark.SparkConf; 26 | // import org.apache.spark.api.java.JavaRDD; 27 | // import org.apache.spark.api.java.JavaSparkContext; 28 | // import org.apache.spark.api.java.function.Function; 29 | // 30 | // import scala.Tuple2; 31 | // 32 | // import com.beust.jcommander.IStringConverter; 33 | // import com.beust.jcommander.JCommander; 34 | // import com.beust.jcommander.Parameter; 35 | // import com.beust.jcommander.ParameterException; 36 | // import com.google.common.collect.HashMultiset; 37 | // import com.google.common.collect.Multiset; 38 | // 39 | // public class SparkItemsetMining extends 
ItemsetMiningCore { 40 | // 41 | // /** Main function parameters */ 42 | // public static class Parameters { 43 | // 44 | // @Parameter(names = { "-f", "--file" }, description = "Dataset filename") 45 | // private final File dataset = new File("example.dat"); 46 | // 47 | // @Parameter(names = { "-j", "--jar" }, description = "IIM Standalone jar") 48 | // private final String IIMJar = "itemset-mining/target/itemset-mining-1.0.jar"; 49 | // 50 | // @Parameter(names = { "-s", "--maxSteps" }, description = "Max structure 51 | // steps") 52 | // int maxStructureSteps = 100_000; 53 | // 54 | // @Parameter(names = { "-i", "--iterations" }, description = "Max iterations") 55 | // int maxEMIterations = 1_000; 56 | // 57 | // @Parameter(names = { "-c", "--cores" }, description = "No cores") 58 | // int noCores = 16; 59 | // 60 | // @Parameter(names = { "-l", "--log-level" }, description = "Log level", 61 | // converter = LogLevelConverter.class) 62 | // Level logLevel = Level.FINE; 63 | // 64 | // @Parameter(names = { "-r", "--runtime" }, description = "Max Runtime (min)") 65 | // long maxRunTime = 12 * 60; // 12hrs 66 | // 67 | // @Parameter(names = { "-t", "--timestamp" }, description = "Timestamp 68 | // Logfile", arity = 1) 69 | // boolean timestampLog = true; 70 | // 71 | // @Parameter(names = { "-v", "--verbose" }, description = "Print to console 72 | // instead of logfile") 73 | // private boolean verbose = false; 74 | // } 75 | // 76 | // public static void main(final String[] args) throws IOException { 77 | // 78 | // // Use greedy inference algorithm for Spark 79 | // final InferenceAlgorithm inferenceAlg = new InferGreedy(); 80 | // 81 | // final Parameters params = new Parameters(); 82 | // final JCommander jc = new JCommander(params); 83 | // 84 | // try { 85 | // jc.parse(args); 86 | // 87 | // // Set up spark and HDFS 88 | // final JavaSparkContext sc = setUpSpark(params.dataset.getName(), 89 | // params.IIMJar, params.noCores); 90 | // final FileSystem hdfs 
= setUpHDFS(); 91 | // 92 | // // Set loglevel, runtime, timestamp and log file 93 | // LOG_LEVEL = params.logLevel; 94 | // MAX_RUNTIME = params.maxRunTime * 60 * 1_000; 95 | // File logFile = null; 96 | // if(!params.verbose) 97 | // logFile = Logging.getLogFileName("IIM", 98 | // params.timestampLog, LOG_DIR, params.dataset); 99 | // 100 | // mineItemsets(params.dataset, hdfs, sc, inferenceAlg, 101 | // params.maxStructureSteps, params.maxEMIterations, logFile); 102 | // 103 | // } catch (final ParameterException e) { 104 | // System.out.println(e.getMessage()); 105 | // jc.usage(); 106 | // } 107 | // 108 | // } 109 | // 110 | // public static Map mineItemsets(final File inputFile, 111 | // final FileSystem hdfs, final JavaSparkContext sc, 112 | // final InferenceAlgorithm inferenceAlg, final int maxStructureSteps, 113 | // final int maxEMIterations, final File logFile) throws IOException { 114 | // 115 | // // Set up logging 116 | // if (logFile != null) 117 | // Logging.setUpFileLogger(logger, LOG_LEVEL, logFile); 118 | // else 119 | // Logging.setUpConsoleLogger(logger, LOG_LEVEL); 120 | // 121 | // // Echo input parameters 122 | // logger.info("========== SPARK INTERESTING ITEMSET MINING ============"); 123 | // logger.info("\n Time: " 124 | // + new SimpleDateFormat("dd.MM.yyyy-HH:mm:ss") 125 | // .format(new Date())); 126 | // logger.info("\n Inputs: -f " + inputFile + " -s " + maxStructureSteps 127 | // + " -i " + maxEMIterations + " -c " 128 | // + sc.getLocalProperty("spark.cores.max") + " -r " + MAX_RUNTIME 129 | // / 60_000 + "\n"); 130 | // 131 | // // Load Spark and HDFS Properties 132 | // Properties prop = new Properties(); 133 | // prop.load(SparkItemsetMining.class.getResourceAsStream("/spark.properties")); 134 | // 135 | // // Copy transaction database to hdfs 136 | // final String datasetPath = prop.getProperty("HDFSMaster") 137 | // + inputFile.getName(); 138 | // hdfs.copyFromLocalFile(new Path(inputFile.getAbsolutePath()), new Path( 139 | 
// datasetPath)); 140 | // hdfs.setReplication(new Path(datasetPath), 141 | // Short.parseShort(prop.getProperty("MachinesInCluster"))); 142 | // try { // Wait for file to replicate 143 | // Thread.sleep(10 * 1000); 144 | // } catch (final InterruptedException e) { 145 | // e.printStackTrace(); 146 | // } 147 | // 148 | // // Read in transaction database 149 | // final int noCores = Integer.parseInt(sc.getConf() 150 | // .get("spark.cores.max")); 151 | // final JavaRDD db = sc.textFile(datasetPath, 2 * noCores) 152 | // .map(new ParseTransaction()).cache(); 153 | // 154 | // // Determine most frequent singletons 155 | // final Map singletonsMap = db.flatMap(t -> t) 156 | // .mapToPair(i -> new Tuple2(i, 1)) 157 | // .reduceByKey((a, b) -> a + b).collectAsMap(); 158 | // 159 | // // Convert singletons map to Multiset (as Spark map is not serializable) 160 | // final Multiset singletons = HashMultiset.create(); 161 | // for (final Entry entry : singletonsMap.entrySet()) 162 | // singletons.add(entry.getKey(), entry.getValue()); 163 | // 164 | // // Apply the algorithm to build the itemset tree 165 | // final ItemsetTree tree = new ItemsetTree(singletons); 166 | // tree.buildTree(datasetPath, hdfs); 167 | // if (LOG_LEVEL.equals(Level.FINE)) 168 | // tree.printStatistics(logger); 169 | // 170 | // // Run inference to find interesting itemsets 171 | // final TransactionRDD transactions = new TransactionRDD(db, db.count()); 172 | // logger.fine("\n============= ITEMSET INFERENCE =============\n"); 173 | // final HashMap itemsets = structuralEM(transactions, 174 | // singletons, tree, inferenceAlg, maxStructureSteps, 175 | // maxEMIterations); 176 | // 177 | // // Sort itemsets by interestingness 178 | // final HashMap intMap = calculateInterestingness( 179 | // itemsets, transactions, tree); 180 | // final Map sortedItemsets = sortItemsets(itemsets, 181 | // intMap); 182 | // 183 | // logger.info("\n============= INTERESTING ITEMSETS =============\n"); 184 | // for 
(final Entry entry : sortedItemsets.entrySet()) { 185 | // logger.info(String.format("%s\tprob: %1.5f \tint: %1.5f %n", 186 | // entry.getKey(), entry.getValue(), 187 | // intMap.get(entry.getKey()))); 188 | // } 189 | // logger.info("\n"); 190 | // 191 | // return sortedItemsets; 192 | // } 193 | // 194 | // /** Set up Spark */ 195 | // public static JavaSparkContext setUpSpark(final String dataset, final String 196 | // IIMJar, 197 | // final int noCores) throws IOException { 198 | // 199 | // // Load Spark and HDFS Properties 200 | // Properties prop = new Properties(); 201 | // prop.load(SparkItemsetMining.class.getResourceAsStream("/spark.properties")); 202 | // 203 | // final SparkConf conf = new SparkConf(); 204 | // conf.setMaster(prop.getProperty("SparkMaster")) 205 | // .setAppName("Itemset Mining: " + dataset) 206 | // .setSparkHome(prop.getProperty("SparkHome")) 207 | // .setJars(new String[] {IIMJar}); 208 | // conf.set("spark.cores.max", Integer.toString(noCores)); 209 | // conf.set("spark.executor.memory", "20g"); 210 | // conf.set("spark.default.parallelism", "8"); 211 | // conf.set("spark.shuffle.manager", "SORT"); 212 | // // conf.set("spark.eventLog.enabled", "true"); uses GB of space!!! 213 | // 214 | // // Use Kryo for serialization - much faster! 
215 | // conf.set("spark.serializer", 216 | // "org.apache.spark.serializer.KryoSerializer"); 217 | // conf.set("spark.kryo.registrator", 218 | // "itemsetmining.util.ClassRegistrator"); 219 | // 220 | // final JavaSparkContext sc = new JavaSparkContext(conf); 221 | // sc.setCheckpointDir(prop.getProperty("HDFSMaster") 222 | // + "checkpoint/"); 223 | // return sc; 224 | // } 225 | // 226 | // /** Set up HDFS */ 227 | // public static FileSystem setUpHDFS() throws IOException { 228 | // 229 | // // Load Spark and HDFS Properties 230 | // Properties prop = new Properties(); 231 | // prop.load(SparkItemsetMining.class.getResourceAsStream("/spark.properties")); 232 | // 233 | // final Configuration conf = new Configuration(); 234 | // conf.addResource(new Path(prop.getProperty("HDFSConfFile"))); 235 | // return FileSystem.get(conf); 236 | // } 237 | // 238 | // /** Read in transactions */ 239 | // private static class ParseTransaction implements 240 | // Function { 241 | // private static final long serialVersionUID = -9092218383491621520L; 242 | // 243 | // @Override 244 | // public Transaction call(final String line) { 245 | // 246 | // // create a structure for storing the transaction 247 | // final Transaction transaction = new Transaction(); 248 | // 249 | // // split the transaction into items 250 | // final String[] lineSplit = line.split(" "); 251 | // 252 | // // for each item in the transaction 253 | // for (int i = 0; i < lineSplit.length; i++) { 254 | // // convert the item to integer and add it to the structure 255 | // transaction.add(Integer.parseInt(lineSplit[i])); 256 | // } 257 | // 258 | // return transaction; 259 | // } 260 | // } 261 | // 262 | // /** Convert string level to level class */ 263 | // public static class LogLevelConverter implements IStringConverter { 264 | // @Override 265 | // public Level convert(final String value) { 266 | // if (value.equals("SEVERE")) 267 | // return Level.SEVERE; 268 | // else if (value.equals("WARNING")) 269 
| // return Level.WARNING; 270 | // else if (value.equals("INFO")) 271 | // return Level.INFO; 272 | // else if (value.equals("CONFIG")) 273 | // return Level.CONFIG; 274 | // else if (value.equals("FINE")) 275 | // return Level.FINE; 276 | // else if (value.equals("FINER")) 277 | // return Level.FINER; 278 | // else if (value.equals("FINEST")) 279 | // return Level.FINEST; 280 | // else 281 | // throw new RuntimeException("Incorrect Log Level."); 282 | // } 283 | // } 284 | // 285 | // } 286 | -------------------------------------------------------------------------------- /sequence-miner/src/main/java/sequencemining/sequence/AbstractSequence.java: -------------------------------------------------------------------------------- 1 | package sequencemining.sequence; 2 | 3 | import java.io.Serializable; 4 | import java.util.AbstractCollection; 5 | import java.util.BitSet; 6 | import java.util.Collection; 7 | import java.util.Iterator; 8 | import java.util.List; 9 | 10 | public abstract class AbstractSequence extends AbstractCollection<Integer> implements Serializable { 11 | private static final long serialVersionUID = 686688001826219278L; 12 | 13 | protected List<Integer> items; 14 | 15 | /** 16 | * Add given item to this sequence 17 | * 18 | * @param item 19 | * an item that should be added to this sequence 20 | */ 21 | @Override 22 | public boolean add(final Integer item) { 23 | return this.items.add(item); 24 | } 25 | 26 | /** 27 | * Get item at specified position in this sequence 28 | * 29 | * @param index 30 | * index of the element to return 31 | */ 32 | public int get(final int index) { 33 | return this.items.get(index); 34 | } 35 | 36 | /** 37 | * Add given items to this sequence 38 | * 39 | * @param items 40 | * a collection of items that should be added to this sequence 41 | */ 42 | @Override 43 | public boolean addAll(final Collection<? extends Integer> items) { 44 | return this.items.addAll(items); 45 | } 46 | 47 | /** 48 | * Get the items in this sequence 49 | * 50 | * @return the items 51 | */
52 | public List<Integer> getItems() { 53 | return this.items; 54 | } 55 | 56 | /** 57 | * Add items to this sequence 58 | * 59 | * @param items 60 | * an array of items that should be added to this sequence 61 | */ 62 | public void add(final Integer... items) { 63 | for (final Integer set : items) 64 | this.items.add(set); 65 | } 66 | 67 | /** Code for covering sequences *with gaps* */ 68 | 69 | /** 70 | * Check if this sequence contains given sequence (allowing gaps) 71 | * 72 | * @param sequence 73 | */ 74 | public boolean contains(final Sequence seq) { 75 | int pos = 0; 76 | boolean containsItem; 77 | for (final int item : seq.items) { 78 | containsItem = false; 79 | for (int i = pos; i < this.items.size(); i++) { 80 | if (this.items.get(i) == item) { 81 | pos = i + 1; 82 | containsItem = true; 83 | break; 84 | } 85 | } 86 | if (!containsItem) 87 | return false; 88 | } 89 | return true; 90 | } 91 | 92 | /** 93 | * Return number of times this sequence contains given sequence (allowing 94 | * gaps) 95 | * 96 | * @param sequence 97 | * @return number of times given sequence is contained in this one 98 | */ 99 | public int repetitions(final Sequence seq) { 100 | int count = 0; 101 | int pos = 0; 102 | while (true) { 103 | boolean containsItem; 104 | for (final int item : seq.items) { 105 | containsItem = false; 106 | for (int i = pos; i < this.items.size(); i++) { 107 | if (this.items.get(i) == item) { 108 | pos = i + 1; 109 | containsItem = true; 110 | break; 111 | } 112 | } 113 | if (!containsItem) 114 | return count; 115 | } 116 | count++; 117 | } 118 | } 119 | 120 | /** Code for covering sequences *without gaps* */ 121 | // 122 | // /** 123 | // * Check if this sequence contains given sequence (without gaps) 124 | // * 125 | // * @param sequence 126 | // */ 127 | // public int contains(final Sequence seq) { 128 | // outer: for (int i = 0; i < this.items.size() 129 | // - seq.items.size() + 1; i++) { 130 | // if (this.items.get(i).equals(seq.items.get(0))) { 131 | // for
(int j = 1; j < seq.items.size(); j++) { 132 | // if (!this.items.get(i + j).equals(seq.items.get(j))) 133 | // continue outer; 134 | // } 135 | // return true; 136 | // } 137 | // } 138 | // return false; 139 | // } 140 | 141 | /** Code for covering sequences *with gaps* but *without overlap* */ 142 | 143 | /** 144 | * Return items in this sequence covered by given sequence (with gaps, 145 | * without overlap) 146 | * 147 | * @param sequence 148 | * @return BitSet of items in order with the covered items set true 149 | */ 150 | public BitSet getCovered(final AbstractSequence seq, final BitSet alreadyCoveredItems) { 151 | int pos = 0; 152 | boolean containsItem; 153 | final BitSet coveredItems = new BitSet(this.size()); 154 | for (final int item : seq.items) { 155 | containsItem = false; 156 | for (int i = pos; i < this.items.size(); i++) { 157 | if (!alreadyCoveredItems.get(i) && this.items.get(i) == item) { 158 | coveredItems.set(i); 159 | pos = i + 1; 160 | containsItem = true; 161 | break; 162 | } 163 | } 164 | if (!containsItem) { 165 | coveredItems.clear(); 166 | return coveredItems; 167 | } 168 | } 169 | return coveredItems; 170 | } 171 | 172 | /** 173 | * Code for covering sequences *without gaps* and *without overlap* !! 174 | * Remember to change subsequence contains and support function !! 
175 | */ 176 | // 177 | // /** 178 | // * Return the items in this sequence covered (without gaps, without 179 | // overlap) 180 | // * by the given sequence 181 | // * 182 | // * @param sequence 183 | // * @return BitSet of items in order with the covered items set true 184 | // */ 185 | // public BitSet getCovered(final AbstractSequence seq, 186 | // final BitSet alreadyCoveredItems) { 187 | // final BitSet coveredItems = new BitSet(this.size()); 188 | // outer: for (int i = 0; i < this.items.size() - seq.items.size() + 1; i++) 189 | // { 190 | // if (!alreadyCoveredItems.get(i) 191 | // && this.items.get(i).equals(seq.items.get(0))) { 192 | // for (int j = 1; j < seq.items.size(); j++) { 193 | // if (alreadyCoveredItems.get(i + j) 194 | // || !this.items.get(i + j).equals(seq.items.get(j))) 195 | // continue outer; 196 | // } 197 | // for (int j = 0; j < seq.items.size(); j++) 198 | // coveredItems.set(i + j); 199 | // return coveredItems; 200 | // } 201 | // } 202 | // coveredItems.clear(); 203 | // return coveredItems; 204 | // } 205 | 206 | /** 207 | * Code for covering sequences *with gaps* but *with overlap* !! Remember to 208 | * change greedy algorithm and subsequence contains and support function !! 209 | */ 210 | // 211 | // /** 212 | // * Check if first BitSet contains second BitSet 213 | // */ 214 | // public boolean contains(final BitSet set1, final BitSet set2) { 215 | // final BitSet copy = (BitSet) set2.clone(); 216 | // copy.and(set1); 217 | // return copy.equals(set2); 218 | // } 219 | // 220 | // /** 221 | // * Return the items in this sequence covered by the given sequence (with 222 | // * gaps, with overlap), allowing for multiple covering matches if the 223 | // * first match is already fully covered 224 | // * 225 | // *
225 | // * <p>
226 | // * This is intended to allow the covering of 1 2 1 2 1 2 by 1 2. 227 | // * 228 | // * @param sequence 229 | // * @return BitSet of items in order with the covered items set true 230 | // */ 231 | // public BitSet getCovered(final AbstractSequence seq, 232 | // final BitSet alreadyCoveredItems) { 233 | // 234 | // int index = 0; 235 | // while (true) { 236 | // final BitSet coveredItems = getCovered(seq, index); 237 | // if (coveredItems.isEmpty()) 238 | // return coveredItems; 239 | // if (contains(alreadyCoveredItems, coveredItems)) 240 | // index = coveredItems.nextSetBit(index) + 1; 241 | // else 242 | // return coveredItems; 243 | // } 244 | // 245 | // } 246 | // 247 | // /** 248 | // * Return the items in this sequence covered by the given sequence (with 249 | // * gaps, with overlap) 250 | // * 251 | // * @param sequence 252 | // * @return BitSet of items in order with the covered items set true 253 | // */ 254 | // public BitSet getCovered(final AbstractSequence seq, final int 255 | // startIndex) { 256 | // int pos = startIndex; 257 | // boolean containsItem; 258 | // final BitSet coveredItems = new BitSet(this.size()); 259 | // for (final int item : seq.items) { 260 | // containsItem = false; 261 | // for (int i = pos; i < this.items.size(); i++) { 262 | // if (this.items.get(i) == item) { 263 | // coveredItems.set(i); 264 | // pos = i + 1; 265 | // containsItem = true; 266 | // break; 267 | // } 268 | // } 269 | // if (!containsItem) { 270 | // coveredItems.clear(); 271 | // return coveredItems; 272 | // } 273 | // } 274 | // return coveredItems; 275 | // } 276 | 277 | /** 278 | * Code for covering sequences *without gaps* but *with overlap* !! Remember 279 | * to change greedy algorithm and subsequence contains and support function 280 | * !! 
281 | */ 282 | // 283 | // /** 284 | // * Return the items in this sequence covered (without gaps, with 285 | // * overlap) by the given sequence 286 | // * 287 | // * @param sequence 288 | // * @return BitSet of items in order with the covered items set true 289 | // */ 290 | // public BitSet getCovered(final AbstractSequence seq, final int 291 | // startIndex) { 292 | // final BitSet coveredItems = new BitSet(this.size()); 293 | // outer: for (int i = startIndex; i < this.items.size() 294 | // - seq.items.size() + 1; i++) { 295 | // if (this.items.get(i).equals(seq.items.get(0))) { 296 | // for (int j = 1; j < seq.items.size(); j++) { 297 | // if (!this.items.get(i + j).equals(seq.items.get(j))) 298 | // continue outer; 299 | // } 300 | // for (int j = 0; j < seq.items.size(); j++) 301 | // coveredItems.set(i + j); 302 | // return coveredItems; 303 | // } 304 | // } 305 | // coveredItems.clear(); 306 | // return coveredItems; 307 | // } 308 | 309 | /** 310 | * Number of items in this sequence 311 | */ 312 | @Override 313 | public int size() { 314 | return this.items.size(); 315 | } 316 | 317 | @Override 318 | public boolean isEmpty() { 319 | return items.isEmpty(); 320 | } 321 | 322 | @Override 323 | public String toString() { 324 | return items.toString(); 325 | } 326 | 327 | @Override 328 | public int hashCode() { 329 | return items.hashCode(); 330 | } 331 | 332 | @Override 333 | public boolean equals(final Object obj) { 334 | if (this == obj) 335 | return true; 336 | if (!(obj instanceof AbstractSequence)) 337 | return false; 338 | final AbstractSequence other = (AbstractSequence) obj; 339 | return items.equals(other.items); 340 | } 341 | 342 | @Override 343 | public Iterator<Integer> iterator() { 344 | return items.iterator(); 345 | } 346 | 347 | } 348 | -------------------------------------------------------------------------------- /sequence-miner/src/main/java/sequencemining/sequence/Sequence.java:
-------------------------------------------------------------------------------- 1 | package sequencemining.sequence; 2 | 3 | import java.io.Serializable; 4 | import java.util.ArrayList; 5 | import java.util.Arrays; 6 | import java.util.List; 7 | 8 | public class Sequence extends AbstractSequence implements Serializable { 9 | private static final long serialVersionUID = -2766830126344921771L; 10 | 11 | /** 12 | * Constructor 13 | */ 14 | public Sequence() { 15 | this.items = new ArrayList<>(); 16 | } 17 | 18 | /** 19 | * Shallow Copy Constructor 20 | * 21 | * @param seq 22 | * sequence to shallow copy 23 | */ 24 | public Sequence(final Sequence seq) { 25 | this.items = seq.items; 26 | } 27 | 28 | /** 29 | * Constructor 30 | * 31 | * @param items 32 | * a list of items that should be added to the new sequence 33 | */ 34 | public Sequence(final List<Integer> items) { 35 | this.items = new ArrayList<>(items); 36 | } 37 | 38 | /** 39 | * Constructor 40 | * 41 | * @param items 42 | * an array of items that should be added to the new sequence 43 | */ 44 | public Sequence(final Integer...
items) { 45 | this.items = new ArrayList<>(Arrays.asList(items)); 46 | } 47 | 48 | /** 49 | * Join Constructor 50 | * 51 | * @param seqs 52 | * two sequences that should be joined 53 | */ 54 | public Sequence(final Sequence seq1, final Sequence seq2) { 55 | this.items = new ArrayList<>(seq1.items); 56 | this.items.addAll(seq2.items); 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /sequence-miner/src/main/java/sequencemining/transaction/Transaction.java: -------------------------------------------------------------------------------- 1 | package sequencemining.transaction; 2 | 3 | import java.io.Serializable; 4 | import java.util.ArrayList; 5 | import java.util.Arrays; 6 | import java.util.Collection; 7 | import java.util.Iterator; 8 | import java.util.List; 9 | import java.util.Map; 10 | import java.util.Map.Entry; 11 | 12 | import com.google.common.collect.HashBasedTable; 13 | import com.google.common.collect.Multiset; 14 | import com.google.common.collect.Table; 15 | 16 | import sequencemining.sequence.AbstractSequence; 17 | import sequencemining.sequence.Sequence; 18 | 19 | /** A transaction is an ordered list of items */ 20 | public class Transaction extends AbstractSequence implements Serializable { 21 | private static final long serialVersionUID = 3327396055332538091L; 22 | 23 | /** Cached sequences and probabilities for this transaction */ 24 | private Table<Sequence, Integer, Double> cachedSequences; 25 | 26 | /** Cached covering for this transaction */ 27 | private Multiset<Sequence> cachedCovering; 28 | private Multiset<Sequence> tempCachedCovering; 29 | 30 | public void initializeCachedSequences(final Table<Sequence, Integer, Double> initProbs) { 31 | final Table<Sequence, Integer, Double> probs = HashBasedTable.create(); 32 | for (final Sequence seq : initProbs.rowKeySet()) { 33 | if (this.contains(seq)) 34 | probs.row(seq).putAll(initProbs.row(seq)); 35 | } 36 | cachedSequences = probs; 37 | } 38 | 39 | public Table<Sequence, Integer, Double> getCachedSequences() { 40 | return cachedSequences; 41 | } 42 | 43 | public void addSequenceCache(final Sequence candidate, final Map<Integer, Double> prob) { 44 | cachedSequences.row(candidate).putAll(prob); 45 | } 46 | 47 | public void removeSequenceCache(final Sequence candidate) { 48 | cachedSequences.row(candidate).clear(); 49 | } 50 | 51 | public void updateCachedSequences(final Table<Sequence, Integer, Double> newSequences) { 52 | for (final Iterator<Sequence> it = cachedSequences.rowKeySet().iterator(); it.hasNext();) { 53 | final Sequence seq = it.next(); 54 | if (newSequences.containsRow(seq)) { // TODO zeros to clear ok? 55 | for (final Entry<Integer, Double> entry : cachedSequences.row(seq).entrySet()) 56 | entry.setValue(0.); 57 | cachedSequences.row(seq).putAll(newSequences.row(seq)); 58 | } else if (seq.size() == 1) { 59 | for (final Entry<Integer, Double> entry : cachedSequences.row(seq).entrySet()) 60 | entry.setValue(0.); // so we can fill incomplete coverings 61 | } else 62 | it.remove(); 63 | } 64 | } 65 | 66 | /** Get cost of cached covering for hard EM-step */ 67 | public double getCachedCost() { 68 | double totalCost = 0; 69 | int lenCovering = 0; 70 | // TODO triple check that this is right!!! 71 | // Calculate (3.3) 72 | for (final Sequence seq : cachedSequences.rowKeySet()) { 73 | if (cachedCovering.contains(seq)) { 74 | final int occur = cachedCovering.count(seq); 75 | totalCost += -Math.log(cachedSequences.get(seq, occur)); 76 | for (int m = 1; m <= occur; m++) { 77 | totalCost += sumLogRange(lenCovering + 1, lenCovering + seq.size()); 78 | lenCovering += seq.size(); 79 | } 80 | } else if (seq.size() == 1 && sum(cachedSequences.row(seq).values()) == 0.) { 81 | continue; // ignore singletons used to fill incomplete coverings 82 | } else { 83 | totalCost += -Math.log(cachedSequences.get(seq, 0)); 84 | } 85 | } 86 | return totalCost; 87 | } 88 | 89 | /** Get cost of cached covering for structural EM-step */ 90 | public double getCachedCost(final Table<Sequence, Integer, Double> sequences) { 91 | return calculateCachedCost(sequences, cachedCovering); 92 | } 93 | 94 | /** Get cost of temp. cached covering for structural EM-step */ 95 | public double getTempCachedCost(final Table<Sequence, Integer, Double> sequences) { 96 | return calculateCachedCost(sequences, tempCachedCovering); 97 | } 98 | 99 | /** Calculate cached cost for structural EM-step */ 100 | private double calculateCachedCost(final Table<Sequence, Integer, Double> sequences, 101 | final Multiset<Sequence> covering) { 102 | double totalCost = 0; 103 | int lenCovering = 0; 104 | for (final Sequence seq : cachedSequences.rowKeySet()) { 105 | if (sequences.containsRow(seq)) { 106 | if (covering.contains(seq)) { 107 | final int occur = covering.count(seq); 108 | totalCost += -Math.log(sequences.get(seq, occur)); 109 | for (int m = 1; m <= occur; m++) { 110 | totalCost += sumLogRange(lenCovering + 1, lenCovering + seq.size()); 111 | lenCovering += seq.size(); 112 | } 113 | } else if (seq.size() == 1 && sum(cachedSequences.row(seq).values()) == 0.) { 114 | continue; // ignore seqs used to fill incomplete coverings 115 | } else { 116 | totalCost += -Math.log(sequences.get(seq, 0)); 117 | } 118 | } 119 | } 120 | return totalCost; 121 | } 122 | 123 | private double sum(final Collection<Double> elems) { 124 | double sum = 0; 125 | for (final double elem : elems) 126 | sum += elem; 127 | return sum; 128 | } 129 | 130 | private double sumLogRange(final int a, final int b) { 131 | double sum = 0; 132 | for (int i = a; i <= b; i++) 133 | sum += Math.log(i); 134 | return sum; 135 | } 136 | 137 | public void setCachedCovering(final Multiset<Sequence> covering) { 138 | cachedCovering = covering; 139 | } 140 | 141 | public Multiset<Sequence> getCachedCovering() { 142 | return cachedCovering; 143 | } 144 | 145 | public void setTempCachedCovering(final Multiset<Sequence> covering) { 146 | tempCachedCovering = covering; 147 | } 148 | 149 | public Multiset<Sequence> getTempCachedCovering() { 150 | return tempCachedCovering; 151 | } 152 | 153 | /** 154 | * Constructor 155 | */ 156 | public Transaction() { 157 | this.items = new ArrayList<>(); 158 | } 159 | 160 | /** 161 | * Constructor 162 | * 163 | * @param items 164 |
* an array of items that should be added to the new sequence 165 | */ 166 | public Transaction(final Integer... items) { 167 | this.items = new ArrayList<>(Arrays.asList(items)); 168 | } 169 | 170 | /** 171 | * Constructor 172 | * 173 | * @param items 174 | * a List of items that should be added to the new sequence 175 | */ 176 | public Transaction(final List items) { 177 | this.items = new ArrayList<>(items); 178 | } 179 | 180 | } -------------------------------------------------------------------------------- /sequence-miner/src/main/java/sequencemining/transaction/TransactionDatabase.java: -------------------------------------------------------------------------------- 1 | package sequencemining.transaction; 2 | 3 | import java.util.List; 4 | 5 | /** Wrapper class for storing a database of transactions */ 6 | public abstract class TransactionDatabase { 7 | 8 | /** Set to true if candidate generation iteration limit exceeded */ 9 | private boolean iterationLimitExceeded = false; 10 | 11 | /** Average cost across the transactions */ 12 | private double averageCost = Double.POSITIVE_INFINITY; 13 | 14 | /** Set the average cost */ 15 | public void setAverageCost(final double averageCost) { 16 | this.averageCost = averageCost; 17 | } 18 | 19 | /** Get the average cost */ 20 | public double getAverageCost() { 21 | return averageCost; 22 | } 23 | 24 | public void setIterationLimitExceeded() { 25 | iterationLimitExceeded = true; 26 | } 27 | 28 | public boolean getIterationLimitExceeded() { 29 | return iterationLimitExceeded; 30 | } 31 | 32 | /** Get a list of transactions */ 33 | public abstract List getTransactionList(); 34 | 35 | // /** Get a JavaRDD of transactions */ 36 | // public abstract JavaRDD getTransactionRDD(); 37 | // 38 | // /** Update the transaction cache */ 39 | // public abstract void updateTransactionCache( 40 | // final JavaRDD updatedTransactions); 41 | 42 | /** Get the number of transactions in this database */ 43 | public abstract int size(); 44 | 
45 | } 46 | -------------------------------------------------------------------------------- /sequence-miner/src/main/java/sequencemining/transaction/TransactionGenerator.java: -------------------------------------------------------------------------------- 1 | package sequencemining.transaction; 2 | 3 | import java.io.File; 4 | import java.io.FileReader; 5 | import java.io.IOException; 6 | import java.io.PrintWriter; 7 | import java.util.ArrayList; 8 | import java.util.HashMap; 9 | import java.util.List; 10 | import java.util.Map; 11 | import java.util.Map.Entry; 12 | import java.util.Random; 13 | 14 | import org.apache.commons.io.LineIterator; 15 | import org.apache.commons.math3.distribution.EnumeratedIntegerDistribution; 16 | import org.apache.commons.math3.random.JDKRandomGenerator; 17 | import org.apache.commons.math3.random.RandomGenerator; 18 | 19 | import com.google.common.collect.HashMultiset; 20 | import com.google.common.collect.Multiset; 21 | import com.google.common.collect.Table; 22 | import com.google.common.primitives.Doubles; 23 | import com.google.common.primitives.Ints; 24 | 25 | import sequencemining.sequence.Sequence; 26 | 27 | public class TransactionGenerator { 28 | 29 | private static final boolean VERBOSE = false; 30 | 31 | /** 32 | * Generate transactions from a set of interesting sequences 33 | * 34 | * @return set of sequences actually added to the transactions 35 | */ 36 | public static HashMap<Sequence, Double> generateTransactionDatabase(final Map<Sequence, Double> sequences, 37 | final Table<Sequence, Integer, Double> probabilities, final int noTransactions, final File outFile) 38 | throws IOException { 39 | 40 | // Set random number seeds 41 | final Random random = new Random(1); 42 | final Random randomI = new Random(10); 43 | final RandomGenerator randomC = new JDKRandomGenerator(); 44 | randomC.setSeed(100); 45 | 46 | // Storage for sequences actually added 47 | final HashMap<Sequence, Double> addedSequences = new HashMap<>(); 48 | 49 | // Set output file 50 | final PrintWriter out = new PrintWriter(outFile, "UTF-8"); 51 | 52 | // Add to distribution class for easy sampling 53 | final Map<Sequence, EnumeratedIntegerDistribution> dists = new HashMap<>(); 54 | for (final Sequence seq : sequences.keySet()) { 55 | final List<Integer> singletons = new ArrayList<>(); 56 | final List<Double> probs = new ArrayList<>(); 57 | for (final Entry<Integer, Double> entry : probabilities.row(seq).entrySet()) { 58 | singletons.add(entry.getKey()); 59 | probs.add(entry.getValue()); 60 | } 61 | final EnumeratedIntegerDistribution dist = new EnumeratedIntegerDistribution(randomC, 62 | Ints.toArray(singletons), Doubles.toArray(probs)); 63 | dists.put(seq, dist); 64 | } 65 | 66 | // Generate transaction database 67 | int count = 0; 68 | while (count < noTransactions) { 69 | 70 | // Generate transaction from distribution 71 | final Transaction transaction = sampleFromDistribution(random, sequences, dists, addedSequences, randomI); 72 | for (final int item : transaction) { 73 | out.print(item + " -1 "); 74 | } 75 | if (!transaction.isEmpty()) { 76 | out.print("-2"); 77 | out.println(); 78 | count++; 79 | } 80 | 81 | } 82 | out.close(); 83 | 84 | // Print file to screen 85 | if (VERBOSE) { 86 | final FileReader reader = new FileReader(outFile); 87 | final LineIterator it = new LineIterator(reader); 88 | while (it.hasNext()) { 89 | System.out.println(it.nextLine()); 90 | } 91 | LineIterator.closeQuietly(it); 92 | } 93 | 94 | return addedSequences; 95 | } 96 | 97 | /** 98 | * Randomly generate a transaction with its probability, randomly interleaving 99 | * subsequences 100 | */ 101 | public static Transaction sampleFromDistribution(final Random random, final Map<Sequence, Double> sequences, 102 | final Map<Sequence, EnumeratedIntegerDistribution> probabilities, 103 | final HashMap<Sequence, Double> addedSequences, final Random randomI) { 104 | 105 | // Sample counts for interesting sequences 106 | final Multiset<Sequence> seqsWithRep = HashMultiset.create(); 107 | for (final Sequence seq : sequences.keySet()) { 108 | final int count = probabilities.get(seq).sample(); 109 | seqsWithRep.add(seq, count); 110 | } 111 | 112 | final ArrayList<Integer> transaction = new ArrayList<>(); 113 | for (final Sequence seq : seqsWithRep) { 114 | if (random.nextDouble() < sequences.get(seq)) { 115 | interleave(transaction, seq, randomI); 116 | addedSequences.put(seq, sequences.get(seq)); 117 | } 118 | } 119 | 120 | return new Transaction(transaction); 121 | } 122 | 123 | /** Randomly interleave sequence into transaction */ 124 | private static void interleave(final ArrayList<Integer> transaction, final Sequence seq, final Random randomI) { 125 | if (transaction.size() == 0) { 126 | transaction.addAll(seq); 127 | } else { 128 | int prev = 0; 129 | for (final Integer item : seq) { 130 | final int insertionPoint = randomI.nextInt((transaction.size() - prev) + 1) + prev; 131 | transaction.add(insertionPoint, item); 132 | prev = insertionPoint + 1; 133 | } 134 | } 135 | } 136 | 137 | } 138 | -------------------------------------------------------------------------------- /sequence-miner/src/main/java/sequencemining/transaction/TransactionList.java: -------------------------------------------------------------------------------- 1 | package sequencemining.transaction; 2 | 3 | import java.util.List; 4 | 5 | /** Wrapper class for storing transaction database as a list of transactions */ 6 | public class TransactionList extends TransactionDatabase { 7 | 8 | private final List<Transaction> transactions; 9 | 10 | public TransactionList(final List<Transaction> transactions) { 11 | this.transactions = transactions; 12 | } 13 | 14 | @Override 15 | public List<Transaction> getTransactionList() { 16 | return transactions; 17 | } 18 | 19 | // @Override 20 | // public JavaRDD<Transaction> getTransactionRDD() { 21 | // throw new UnsupportedOperationException("This is a List, not an RDD!"); 22 | // } 23 | 24 | @Override 25 | public int size() { 26 | return transactions.size(); 27 | } 28 | 29 | // @Override 30 | // public void updateTransactionCache( 31 | // final JavaRDD<Transaction> updatedTransactions) { 32 | // throw new UnsupportedOperationException("This is a List, not an RDD!"); 33 | // } 34 | 35 | } 36 |
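The `interleave` step above is the heart of the generator: it splices a subsequence into a transaction at random positions while preserving the subsequence's internal order, because the `prev` pointer only ever moves right past each insertion point. A standalone sketch of the same logic, not part of the repository (the class name `InterleaveDemo` and the `inOrder` helper are illustrative only):

```java
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;

public class InterleaveDemo {

    /** Mirrors TransactionGenerator.interleave: insert seq's items at random
     *  points, the prev pointer keeping their relative order intact. */
    static void interleave(final ArrayList<Integer> transaction, final List<Integer> seq, final Random randomI) {
        if (transaction.isEmpty()) {
            transaction.addAll(seq);
        } else {
            int prev = 0;
            for (final Integer item : seq) {
                final int insertionPoint = randomI.nextInt((transaction.size() - prev) + 1) + prev;
                transaction.add(insertionPoint, item);
                prev = insertionPoint + 1;
            }
        }
    }

    /** Greedy check: is seq a (non-contiguous) subsequence of transaction? */
    static boolean inOrder(final List<Integer> transaction, final List<Integer> seq) {
        int i = 0;
        for (final int item : transaction)
            if (i < seq.size() && item == seq.get(i))
                i++;
        return i == seq.size();
    }

    public static void main(final String[] args) {
        final ArrayList<Integer> t = new ArrayList<>(Arrays.asList(1, 2, 3));
        interleave(t, Arrays.asList(8, 9), new Random(10));
        System.out.println(t); // a 5-item interleaving of 1,2,3 and 8,9
        // Both the transaction's and the subsequence's orders survive:
        System.out.println(inOrder(t, Arrays.asList(8, 9)) && inOrder(t, Arrays.asList(1, 2, 3))); // true
    }
}
```

Whatever the seed, both `inOrder` checks hold; this order-preservation is exactly what `PartitionTest.testInterleavingGenerator` relies on when it counts distinct interleavings.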
-------------------------------------------------------------------------------- /sequence-miner/src/main/java/sequencemining/transaction/TransactionRDD.java: -------------------------------------------------------------------------------- 1 | package sequencemining.transaction; 2 | 3 | //import java.util.List; 4 | // 5 | //import org.apache.spark.api.java.JavaRDD; 6 | // 7 | ///** Wrapper class for storing transaction database as a Spark RDD */ 8 | //public class TransactionRDD extends TransactionDatabase { 9 | // 10 | // private JavaRDD<Transaction> transactions; 11 | // private final long noTransactions; 12 | // private final String[] cachedDB; 13 | // 14 | // public TransactionRDD(final JavaRDD<Transaction> transactions, 15 | // final long noTransactions, final String[] cachedDB) { 16 | // this.transactions = transactions; 17 | // this.noTransactions = noTransactions; 18 | // this.cachedDB = cachedDB; 19 | // } 20 | // 21 | // @Override 22 | // public List<Transaction> getTransactionList() { 23 | // throw new UnsupportedOperationException("This is an RDD, not a List!"); 24 | // } 25 | // 26 | // @Override 27 | // public JavaRDD<Transaction> getTransactionRDD() { 28 | // return transactions; 29 | // } 30 | // 31 | // @Override 32 | // public void updateTransactionCache( 33 | // final JavaRDD<Transaction> updatedTransactions) { 34 | // transactions = updatedTransactions; 35 | // } 36 | // 37 | // @Override 38 | // public long size() { 39 | // return noTransactions; 40 | // } 41 | // 42 | // @Override 43 | // public String[] getCachedDB() { 44 | // return cachedDB; 45 | // } 46 | // 47 | // } 48 | -------------------------------------------------------------------------------- /sequence-miner/src/main/java/sequencemining/util/ClassRegistrator.java: -------------------------------------------------------------------------------- 1 | package sequencemining.util; 2 | 3 | //import itemsetmining.itemset.AbstractSequence; 4 | //import itemsetmining.itemset.Sequence; 5 | //import itemsetmining.main.InferenceAlgorithms.InferGreedy; 6 |
//import itemsetmining.transaction.Transaction; 7 | // 8 | //import org.apache.spark.serializer.KryoRegistrator; 9 | // 10 | //import com.esotericsoftware.kryo.Kryo; 11 | // 12 | ///** Register custom classes for Spark Kryo serialization */ 13 | //public class ClassRegistrator implements KryoRegistrator { 14 | // 15 | // @Override 16 | // public void registerClasses(final Kryo kryo) { 17 | // kryo.register(Transaction.class); 18 | // kryo.register(AbstractSequence.class); 19 | // kryo.register(Sequence.class); 20 | // kryo.register(InferGreedy.class); 21 | // } 22 | // 23 | // } 24 | -------------------------------------------------------------------------------- /sequence-miner/src/main/java/sequencemining/util/Logging.java: -------------------------------------------------------------------------------- 1 | package sequencemining.util; 2 | 3 | import java.io.File; 4 | import java.io.FileInputStream; 5 | import java.io.FileOutputStream; 6 | import java.io.IOException; 7 | import java.io.ObjectInputStream; 8 | import java.io.ObjectOutputStream; 9 | import java.io.OutputStream; 10 | import java.text.SimpleDateFormat; 11 | import java.util.Date; 12 | import java.util.logging.ConsoleHandler; 13 | import java.util.logging.FileHandler; 14 | import java.util.logging.Formatter; 15 | import java.util.logging.Handler; 16 | import java.util.logging.Level; 17 | import java.util.logging.LogManager; 18 | import java.util.logging.LogRecord; 19 | import java.util.logging.Logger; 20 | 21 | import org.apache.commons.io.FilenameUtils; 22 | 23 | public class Logging { 24 | 25 | /** Set up logging to console */ 26 | public static void setUpConsoleLogger(final Logger logger, final Level logLevel) { 27 | LogManager.getLogManager().reset(); 28 | logger.setLevel(logLevel); 29 | final Handler handler = setUpConsoleHandler(); 30 | logger.addHandler(handler); 31 | } 32 | 33 | /** Set up logging to file */ 34 | public static void setUpFileLogger(final Logger logger, final Level logLevel, 
final File logFile) { 35 | LogManager.getLogManager().reset(); 36 | logger.setLevel(logLevel); 37 | final Handler handler = setUpFileHandler(logFile.getAbsolutePath()); 38 | logger.addHandler(handler); 39 | } 40 | 41 | /** Set up logging to console and file */ 42 | public static void setUpConsoleAndFileLogger(final Logger logger, final Level logLevel, final File logFile) { 43 | LogManager.getLogManager().reset(); 44 | logger.setLevel(logLevel); 45 | final Handler chandler = setUpConsoleHandler(); 46 | final Handler fhandler = setUpFileHandler(logFile.getAbsolutePath()); 47 | logger.addHandler(chandler); 48 | logger.addHandler(fhandler); 49 | } 50 | 51 | /** Set the log file name */ 52 | public static File getLogFileName(final String algorithm, final boolean timeStampLog, final File logDir, 53 | final File dataset) { 54 | String timeStamp = ""; 55 | if (timeStampLog) 56 | timeStamp = "-" + new SimpleDateFormat("dd.MM.yyyy-HH:mm:ss").format(new Date()); 57 | return new File(logDir + File.separator + algorithm + "-" + FilenameUtils.getBaseName(dataset.getName()) 58 | + timeStamp + ".log"); 59 | } 60 | 61 | /** Set up console handler */ 62 | public static Handler setUpConsoleHandler() { 63 | final ConsoleHandler handler = new ConsoleHandler() { 64 | @Override 65 | protected void setOutputStream(final OutputStream out) throws SecurityException { 66 | super.setOutputStream(System.out); 67 | } 68 | }; 69 | handler.setLevel(Level.ALL); 70 | final Formatter formatter = new Formatter() { 71 | @Override 72 | public String format(final LogRecord record) { 73 | return record.getMessage(); 74 | } 75 | }; 76 | handler.setFormatter(formatter); 77 | return handler; 78 | } 79 | 80 | /** Set up file handler */ 81 | public static Handler setUpFileHandler(final String path) { 82 | FileHandler handler = null; 83 | try { 84 | handler = new FileHandler(path, 104857600, 1); 85 | } catch (SecurityException | IOException e) { 86 | e.printStackTrace(); 87 | } 88 | handler.setLevel(Level.ALL); 
89 | final Formatter formatter = new Formatter() { 90 | @Override 91 | public String format(final LogRecord record) { 92 | return record.getMessage(); 93 | } 94 | }; 95 | handler.setFormatter(formatter); 96 | return handler; 97 | } 98 | 99 | /** Serialize object to file */ 100 | public static void serialize(final Object obj, final String filename) throws IOException { 101 | final FileOutputStream fos = new FileOutputStream(filename); 102 | final ObjectOutputStream oos = new ObjectOutputStream(fos); 103 | oos.writeObject(obj); 104 | oos.close(); 105 | } 106 | 107 | /** Deserialize object from file */ 108 | public static Object deserializeFrom(final String filename) throws IOException, ClassNotFoundException { 109 | final FileInputStream fisM = new FileInputStream(filename); 110 | final ObjectInputStream oisM = new ObjectInputStream(fisM); 111 | final Object obj = oisM.readObject(); 112 | oisM.close(); 113 | return obj; 114 | } 115 | 116 | private Logging() { 117 | } 118 | 119 | } 120 | -------------------------------------------------------------------------------- /sequence-miner/src/main/java/sequencemining/util/MemoryLogger.java: -------------------------------------------------------------------------------- 1 | package sequencemining.util; 2 | 3 | /* 4 | * Copyright (c) 2008-2012 Philippe Fournier-Viger 5 | * 6 | * This file is part of the SPMF DATA MINING SOFTWARE 7 | * (http://www.philippe-fournier-viger.com/spmf). 8 | * 9 | * SPMF is free software: you can redistribute it and/or modify 10 | * it under the terms of the GNU General Public License as published by 11 | * the Free Software Foundation, either version 3 of the License, or 12 | * (at your option) any later version. 13 | * 14 | * SPMF is distributed in the hope that it will be useful, 15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | * GNU General Public License for more details. 
18 | * 19 | * You should have received a copy of the GNU General Public License 20 | * along with SPMF. If not, see <http://www.gnu.org/licenses/>. 21 | */ 22 | 23 | /** 24 | * This class is used to record the maximum memory usage of an algorithm during 25 | * a given execution. It is implemented by using the "singleton" design pattern. 26 | * 27 | */ 28 | public class MemoryLogger { 29 | 30 | // the only instance of this class (this is the "singleton" design pattern) 31 | private static MemoryLogger instance = new MemoryLogger(); 32 | 33 | // variable to store the maximum memory usage 34 | private double maxMemory = 0; 35 | 36 | /** 37 | * Method to obtain the only instance of this class 38 | * 39 | * @return instance of MemoryLogger 40 | */ 41 | public static MemoryLogger getInstance() { 42 | return instance; 43 | } 44 | 45 | /** 46 | * To get the maximum amount of memory used until now 47 | * 48 | * @return a double value indicating memory in megabytes 49 | */ 50 | public double getMaxMemory() { 51 | return maxMemory; 52 | } 53 | 54 | /** 55 | * Reset the maximum amount of memory recorded. 56 | */ 57 | public void reset() { 58 | maxMemory = 0; 59 | } 60 | 61 | /** 62 | * Check the current memory usage and record it if it is higher than the 63 | * amount of memory previously recorded.
64 | */ 65 | public void checkMemory() { 66 | final double currentMemory = (Runtime.getRuntime().totalMemory() - Runtime 67 | .getRuntime().freeMemory()) / 1024d / 1024d; 68 | if (currentMemory > maxMemory) { 69 | maxMemory = currentMemory; 70 | } 71 | } 72 | 73 | } 74 | -------------------------------------------------------------------------------- /sequence-miner/src/main/java/sequencemining/util/PartialLogFixer.java: -------------------------------------------------------------------------------- 1 | package sequencemining.util; 2 | 3 | import java.io.File; 4 | import java.io.FileWriter; 5 | import java.io.IOException; 6 | import java.util.HashMap; 7 | import java.util.Map; 8 | import java.util.Map.Entry; 9 | import java.util.regex.Matcher; 10 | import java.util.regex.Pattern; 11 | 12 | import org.apache.commons.io.input.ReversedLinesFileReader; 13 | 14 | import sequencemining.main.SequenceMining; 15 | import sequencemining.main.SequenceMiningCore; 16 | import sequencemining.sequence.Sequence; 17 | import sequencemining.transaction.TransactionList; 18 | 19 | /** 20 | * Read the last EM step of a partial sequence log, then append the interesting 21 | * sequences, with their interestingness and probability, to the end of the log file. 22 | */ 23 | public class PartialLogFixer { 24 | 25 | public static void main(final String[] args) throws IOException { 26 | if (args.length != 2) { 27 | System.err.println("Usage <transactionDB> <logFile>"); 28 | System.exit(-1); 29 | } 30 | 31 | System.out.println("Reading sequences from last parameter EM step for " + args[1] + "..."); 32 | final HashMap<Sequence, Double> sequences = readLastEMStepSequences(new File(args[1])); 33 | System.out.println("done. Number of sequences: " + sequences.size()); 34 | 35 | System.out.println("\nWriting sorted sequences to " + args[1] + "..."); 36 | sortSequencesInterestingness(sequences, new File(args[0]), new File(args[1])); 37 | System.out.println("All done. Exiting."); 38 | 39 | } 40 | 41 | public static HashMap<Sequence, Double> readLastEMStepSequences(final File logFile) throws IOException { 42 | final HashMap<Sequence, Double> sequences = new HashMap<>(); 43 | 44 | final ReversedLinesFileReader reader = new ReversedLinesFileReader(logFile); 45 | String line = reader.readLine(); 46 | while (line != null) { 47 | 48 | if (line.contains("Parameter Optimal Sequences:")) { 49 | final Matcher m = Pattern 50 | .compile( 51 | "\\[((?:[0-9]|,| )+?)\\]=\\(((?:(?:[-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?)|,)+?)\\)") 52 | .matcher(line); 53 | while (m.find()) { 54 | final Sequence sequence = new Sequence(); 55 | final String[] items = m.group(1).split(", "); 56 | for (final String item : items) 57 | sequence.add(Integer.parseInt(item)); 58 | final double prob = 1 - Double.parseDouble(m.group(2).split(",")[0]); 59 | sequences.put(sequence, prob); 60 | } 61 | break; 62 | } 63 | line = reader.readLine(); 64 | 65 | } 66 | reader.close(); 67 | 68 | return sequences; 69 | } 70 | 71 | public static void sortSequencesInterestingness(final HashMap<Sequence, Double> sequences, final File transactionDB, 72 | final File logFile) throws IOException { 73 | 74 | // Read in transaction database 75 | final TransactionList transactions = SequenceMining.readTransactions(transactionDB); 76 | 77 | // Sort sequences by interestingness 78 | System.out.println("Sorting sequences by interestingness..."); 79 | final HashMap<Sequence, Double> intMap = SequenceMiningCore.calculateInterestingness(sequences, transactions); 80 | final Map<Sequence, Double> sortedSequences = SequenceMiningCore.sortSequences(sequences, intMap); 81 | 82 | System.out.println("Writing out to file..."); 83 | final FileWriter out = new FileWriter(logFile, true); 84 | out.write("\n============= INTERESTING SEQUENCES =============\n"); 85 | for (final Entry<Sequence, Double> entry : sortedSequences.entrySet()) { 86 | out.write(String.format("%s\tprob: %1.5f \tint: %1.5f %n", entry.getKey(), entry.getValue(), 87 | intMap.get(entry.getKey()))); 88 | } 89 | out.write("\n"); 90 | out.close(); 91 | System.out.println("done."); 92 | 93 | } 94 | 95 | } 96 | -------------------------------------------------------------------------------- /sequence-miner/src/main/java/sequencemining/util/Tuple2.java: -------------------------------------------------------------------------------- 1 | package sequencemining.util; 2 | 3 | public class Tuple2<T1, T2> { 4 | public final T1 _1; 5 | public final T2 _2; 6 | 7 | public Tuple2(final T1 _1, final T2 _2) { 8 | this._1 = _1; 9 | this._2 = _2; 10 | } 11 | 12 | @Override 13 | public String toString() { 14 | return "(" + _1 + "," + _2 + ")"; 15 | } 16 | 17 | @Override 18 | public int hashCode() { 19 | final int prime = 31; 20 | int result = 1; 21 | result = prime * result + ((_1 == null) ? 0 : _1.hashCode()); 22 | result = prime * result + ((_2 == null) ? 0 : _2.hashCode()); 23 | return result; 24 | } 25 | 26 | @Override 27 | public boolean equals(final Object obj) { 28 | if (this == obj) 29 | return true; 30 | if (!(obj instanceof Tuple2)) 31 | return false; 32 | final Tuple2<?, ?> other = (Tuple2<?, ?>) obj; 33 | return (_1 == null ? other._1 == null : _1.equals(other._1)) 34 | && (_2 == null ?
other._2 == null : _2.equals(other._2)); 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /sequence-miner/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=WARN, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.err 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 7 | 8 | # Settings to quiet third party logs that are too verbose 9 | log4j.logger.org.eclipse.jetty=WARN 10 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 11 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 12 | -------------------------------------------------------------------------------- /sequence-miner/src/main/resources/spark.properties: -------------------------------------------------------------------------------- 1 | # Main Spark Parameters 2 | SparkHome=/disk/data1/jfowkes/spark-1.1.0-bin-hadoop1 3 | SparkMaster=spark://cup04.inf.ed.ac.uk:7077 4 | MachinesInCluster=8 5 | 6 | # Main HDFS Parameters 7 | HDFSMaster=hdfs://cup04.inf.ed.ac.uk:54310/ 8 | HDFSConfFile=/disk/data1/jfowkes/hadoop-1.0.4/conf/core-site.xml 9 | -------------------------------------------------------------------------------- /sequence-miner/src/test/java/sequencemining/main/InitialProbabilitiesTest.java: -------------------------------------------------------------------------------- 1 | package sequencemining.main; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.io.UnsupportedEncodingException; 6 | import java.net.URL; 7 | 8 | import org.junit.Test; 9 | 10 | import com.google.common.collect.Table; 11 | 12 | import sequencemining.sequence.Sequence; 13 | 14 | public class InitialProbabilitiesTest { 15 
| 16 | @Test 17 | public void testScanDatabaseToDetermineInitialProbabilities() throws IOException { 18 | 19 | final File input = getTestFile("TOY.txt"); // database 20 | final Table<Sequence, Integer, Double> probs = SequenceMining 21 | .scanDatabaseToDetermineInitialProbabilities(input); 22 | System.out.println(SequenceMiningCore.probsToString(probs)); 23 | 24 | } 25 | 26 | public File getTestFile(final String filename) throws UnsupportedEncodingException { 27 | final URL url = this.getClass().getClassLoader().getResource(filename); 28 | return new File(java.net.URLDecoder.decode(url.getPath(), "UTF-8")); 29 | } 30 | 31 | } 32 | -------------------------------------------------------------------------------- /sequence-miner/src/test/java/sequencemining/main/SequenceMiningTest.java: -------------------------------------------------------------------------------- 1 | package sequencemining.main; 2 | 3 | import static org.junit.Assert.assertEquals; 4 | 5 | import java.util.HashMap; 6 | import java.util.Map; 7 | 8 | import org.junit.Test; 9 | 10 | import com.google.common.collect.HashBasedTable; 11 | import com.google.common.collect.HashMultiset; 12 | import com.google.common.collect.Multiset; 13 | 14 | import sequencemining.main.InferenceAlgorithms.InferGreedy; 15 | import sequencemining.main.InferenceAlgorithms.InferenceAlgorithm; 16 | import sequencemining.sequence.Sequence; 17 | import sequencemining.transaction.Transaction; 18 | 19 | public class SequenceMiningTest { 20 | 21 | @Test 22 | public void testDoInference() { 23 | 24 | // TODO better tests?? 25 | 26 | // Subsequences 27 | final Sequence s1 = new Sequence(3, 4, 5, 8); 28 | final Map<Integer, Double> p1 = new HashMap<>(); 29 | p1.put(0, 0.6); 30 | p1.put(1, 0.4); 31 | final Sequence s2 = new Sequence(7, 9); 32 | final Map<Integer, Double> p2 = new HashMap<>(); 33 | p2.put(0, 0.7); 34 | p2.put(1, 0.3); 35 | // final Sequence s3 = new Sequence(8, 4, 5, 6); // with overlap 36 | // final double p3 = 0.2; 37 | final Sequence s3 = new Sequence(8, 6); 38 | final Map<Integer, Double> p3 = new HashMap<>(); 39 | p3.put(0, 0.8); 40 | p3.put(1, 0.2); 41 | 42 | // Transaction #1 43 | final Transaction transaction1 = new Transaction(7, 3, 8, 9, 4, 5, 6, 8); 44 | transaction1.initializeCachedSequences(HashBasedTable.create()); 45 | transaction1.addSequenceCache(s1, p1); 46 | transaction1.addSequenceCache(s2, p2); 47 | transaction1.addSequenceCache(s3, p3); 48 | 49 | // Expected solution #1 50 | final Multiset<Sequence> expected1 = HashMultiset.create(); 51 | expected1.add(s1); 52 | expected1.add(s2); 53 | expected1.add(s3); 54 | // final HashSet<Integer> order1 = new HashSet<>(); 55 | // order1.add(0); 56 | // order1.add(1); 57 | // order1.add(2); 58 | 59 | // Test greedy 60 | final InferenceAlgorithm inferGreedy = new InferGreedy(); 61 | final Multiset<Sequence> actual = inferGreedy.infer(transaction1); 62 | System.out.println(actual); 63 | assertEquals(expected1, actual); 64 | // assertTrue(order1.containsAll(actual.values())); 65 | 66 | // Subsequences 67 | final Sequence s4 = new Sequence(1, 2); 68 | final Map<Integer, Double> p4 = new HashMap<>(); 69 | p4.put(0, 0.5); 70 | p4.put(1, 0.3); 71 | p4.put(2, 0.1); 72 | p4.put(3, 0.1); 73 | 74 | // Transaction #2 75 | final Transaction transaction2 = new Transaction(1, 2, 1, 2, 1, 2); 76 | transaction2.initializeCachedSequences(HashBasedTable.create()); 77 | transaction2.addSequenceCache(s4, p4); 78 | 79 | // Expected solution #2 80 | final Multiset<Sequence> expected2 = HashMultiset.create(); 81 | expected2.add(s4, 3); 82 | // final HashSet<Integer> order2 = new HashSet<>(); 83 | // order2.add(0); 84 | // order2.add(2); 85 | // order2.add(4); 86 | 87 | int lenCovering = 0; 88 | final int occur = 3; 89 | double expectedCost2 = -Math.log(p4.get(occur)); 90 | for (int m = 1; m <= 3; m++) { 91 | expectedCost2 += sumLogRange(lenCovering + 1, lenCovering + s4.size()); 92 | lenCovering += s4.size(); 93 | } 94 | 95 | // Test greedy 96 | final Multiset<Sequence> actual2 = inferGreedy.infer(transaction2); 97 | System.out.println(actual2); 98 | assertEquals(expected2, actual2); 99 | // assertTrue(order2.containsAll(actual2.values())); 100 | transaction2.setCachedCovering(actual2); 101 | assertEquals(expectedCost2, transaction2.getCachedCost(), 1e-15); 102 | 103 | } 104 | 105 | private double sumLogRange(final int a, final int b) { 106 | double sum = 0; 107 | for (int i = a; i <= b; i++) 108 | sum += Math.log(i); 109 | return sum; 110 | } 111 | 112 | // @Test 113 | // public void testCombLoop() { 114 | // 115 | // final ArrayList<Sequence> sequences = new ArrayList<>(); 116 | // for (int i = 1; i < 10; i++) 117 | // sequences.add(new Sequence(i)); 118 | // 119 | // final int len = sequences.size(); 120 | // for (int k = 0; k < 2 * len - 2; k++) { 121 | // for (int i = 0; i < len && i < k + 1; i++) { 122 | // for (int j = 0; j < len && i + j < k + 1; j++) { 123 | // if (k <= i + j && i != j) { 124 | // final Sequence s1 = sequences.get(i); 125 | // final Sequence s2 = sequences.get(j); 126 | // System.out.println(s1.toString() + s2.toString()); 127 | // } 128 | // } 129 | // } 130 | // } 131 | // 132 | // } 133 | 134 | } 135 | -------------------------------------------------------------------------------- /sequence-miner/src/test/java/sequencemining/main/SupportCountingTest.java: -------------------------------------------------------------------------------- 1 | package sequencemining.main; 2 | 3 | import static org.junit.Assert.assertEquals; 4 | 5 | import java.io.File; 6 | import java.io.IOException; 7 | import java.io.UnsupportedEncodingException; 8 | import java.net.URL; 9 | import java.util.HashSet; 10 | 11 |
import org.junit.Test; 12 | 13 | import sequencemining.sequence.Sequence; 14 | import sequencemining.transaction.TransactionList; 15 | 16 | public class SupportCountingTest { 17 | 18 | @Test 19 | public void testSupportCounting() throws IOException { 20 | 21 | final File input = getTestFile("TOY.txt"); // database 22 | final TransactionList transactions = SequenceMining.readTransactions(input); 23 | final Sequence seq = new Sequence(7, 3); 24 | final HashSet seqs = new HashSet<>(); 25 | seqs.add(seq); 26 | final long supp = EMStep.getSupportsOfSequences(transactions, seqs).get(seq); 27 | assertEquals(1, supp); 28 | } 29 | 30 | public File getTestFile(final String filename) throws UnsupportedEncodingException { 31 | final URL url = this.getClass().getClassLoader().getResource(filename); 32 | return new File(java.net.URLDecoder.decode(url.getPath(), "UTF-8")); 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /sequence-miner/src/test/java/sequencemining/sequence/PartitionTest.java: -------------------------------------------------------------------------------- 1 | package sequencemining.sequence; 2 | 3 | import static org.junit.Assert.assertEquals; 4 | 5 | import java.util.HashMap; 6 | import java.util.HashSet; 7 | import java.util.Iterator; 8 | import java.util.Map; 9 | import java.util.Random; 10 | import java.util.Set; 11 | 12 | import org.apache.commons.math3.distribution.EnumeratedIntegerDistribution; 13 | import org.apache.commons.math3.random.JDKRandomGenerator; 14 | import org.apache.commons.math3.random.RandomGenerator; 15 | import org.junit.Test; 16 | 17 | import com.google.common.collect.HashMultiset; 18 | import com.google.common.collect.Multiset; 19 | 20 | import sequencemining.transaction.Transaction; 21 | import sequencemining.transaction.TransactionGenerator; 22 | 23 | public class PartitionTest { 24 | 25 | double EPS = 1E-15; // Approx. 
machine epsilon 26 | 27 | @Test 28 | public void testNormalisingConstant() { 29 | 30 | final Set<Sequence> seqs = new HashSet<>(); 31 | 32 | // Test #1 33 | seqs.add(new Sequence(1, 2)); 34 | seqs.add(new Sequence(3)); 35 | assertEquals(3, modP(seqs.iterator()), EPS); 36 | 37 | // Test #2 38 | seqs.add(new Sequence(4)); 39 | assertEquals(12, modP(seqs.iterator()), EPS); 40 | 41 | // Test #3 42 | seqs.clear(); 43 | seqs.add(new Sequence(1, 2, 3)); 44 | seqs.add(new Sequence(4)); 45 | assertEquals(4, modP(seqs.iterator()), EPS); 46 | 47 | // Test #4 48 | seqs.clear(); 49 | seqs.add(new Sequence(1)); 50 | seqs.add(new Sequence(2)); 51 | seqs.add(new Sequence(3)); 52 | seqs.add(new Sequence(4)); 53 | assertEquals(24, modP(seqs.iterator()), EPS); // 4Perm4 = 24 54 | 55 | // Test #5 56 | seqs.clear(); 57 | seqs.add(new Sequence(1, 2, 3, 4)); 58 | assertEquals(1, modP(seqs.iterator()), EPS); // 4Comb4 = 1 59 | 60 | } 61 | 62 | @Test 63 | public void testInterleavingGenerator() { 64 | 65 | final Random random = new Random(1); 66 | final Random randomI = new Random(10); 67 | final RandomGenerator randomC = new JDKRandomGenerator(); 68 | randomC.setSeed(100); 69 | 70 | final Multiset<Sequence> seqsI = HashMultiset.create(); 71 | seqsI.add(new Sequence(1, 2, 3)); 72 | seqsI.add(new Sequence(4, 5)); 73 | seqsI.add(new Sequence(6)); 74 | seqsI.add(new Sequence(7)); 75 | 76 | final HashMap<Sequence, Double> seqsG = new HashMap<>(); 77 | for (final Sequence seq : seqsI.elementSet()) { 78 | seqsG.put(seq, 1.0); 79 | } 80 | 81 | final Map<Sequence, EnumeratedIntegerDistribution> countDists = new HashMap<>(); 82 | final EnumeratedIntegerDistribution oneRepeat = new EnumeratedIntegerDistribution(randomC, new int[] { 1 }, 83 | new double[] { 1.0 }); 84 | countDists.put(new Sequence(1, 2, 3), oneRepeat); 85 | countDists.put(new Sequence(4, 5), oneRepeat); 86 | countDists.put(new Sequence(6), oneRepeat); 87 | countDists.put(new Sequence(7), oneRepeat); 88 | 89 | final HashSet<Transaction> transG = new HashSet<>(); 90 | for (int i = 0; i < 700000; i++) 91 | transG.add( 92 |
TransactionGenerator.sampleFromDistribution(random, seqsG, countDists, new HashMap<>(), randomI)); 93 | // Note that upper bound is exact when there are no repetitions 94 | assertEquals(transG.size(), modP(seqsI.iterator()), EPS); 95 | } 96 | 97 | /** 98 | * Calculate upper bound on interleaving model normalization constant for the given set of 99 | * sequences (bound is exact when there are no repetitions) 100 | */ 101 | public static double modP(final Iterator<Sequence> iterator) { 102 | double prod = 1; 103 | int ln = 1; 104 | while (iterator.hasNext()) { 105 | final int seqSize = iterator.next().size(); 106 | prod *= nMk(ln, seqSize); 107 | ln += seqSize; 108 | } 109 | return prod; 110 | } 111 | 112 | /** N multichoose K (combinations with repetition) **/ 113 | public static double nMk(final int n, final int k) { 114 | double prodNum = 1; 115 | for (int i = n; i <= n + k - 1; i++) 116 | prodNum *= i; 117 | double prodDenom = 1; 118 | for (int i = 2; i <= k; i++) 119 | prodDenom *= i; 120 | return prodNum / prodDenom; 121 | } 122 | 123 | // /** 124 | // * Calculate interleaving model normalization constant for the given set 125 | // of 126 | // * sequences 127 | // */ 128 | // public static double modP(final Set<Sequence> sequences) { 129 | // int freeSlots = 0; 130 | // for (final Sequence seq : sequences) 131 | // freeSlots += seq.size(); 132 | // 133 | // double prod = 1; 134 | // for (final Sequence seq : sequences) { 135 | // final int seqSize = seq.size(); 136 | // prod *= nestedSum(freeSlots, seqSize); 137 | // freeSlots -= seqSize; 138 | // } 139 | // return prod; 140 | // } 141 | // 142 | // private static double nestedSum(final int n, final int e) { 143 | // if (e == 1) 144 | // return n - e + 1; 145 | // double sum = 0; 146 | // for (int i = 1; i <= n - e + 1; i++) 147 | // sum += nestedSum(n - i, e - 1); 148 | // return sum; 149 | // } 150 | 151 | } 152 | --------------------------------------------------------------------------------
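For reference, the combinatorics exercised by `PartitionTest` can be checked in isolation: `nMk` computes "n multichoose k", i.e. C(n + k - 1, k), and `modP` multiplies it along the cumulative sequence lengths to bound the interleaving normalisation constant. A minimal standalone sketch (the class name `MultichooseSketch` is illustrative, not part of the repo) reproduces the test values:

```java
public class MultichooseSketch {

    /** N multichoose K, i.e. C(n + k - 1, k): mirrors PartitionTest.nMk */
    static double nMk(final int n, final int k) {
        double num = 1;
        for (int i = n; i <= n + k - 1; i++)
            num *= i;
        double den = 1;
        for (int i = 2; i <= k; i++)
            den *= i;
        return num / den;
    }

    /** Upper bound on the interleaving normalisation constant, given only sequence sizes */
    static double modP(final int... seqSizes) {
        double prod = 1;
        int ln = 1;
        for (final int size : seqSizes) {
            prod *= nMk(ln, size);
            ln += size;
        }
        return prod;
    }

    public static void main(final String[] args) {
        System.out.println(modP(2, 1));       // Test #1: sizes {2, 1} -> 3 interleavings
        System.out.println(modP(1, 1, 1, 1)); // Test #4: four singletons -> 4! = 24
        System.out.println(modP(4));          // Test #5: one sequence of length 4 -> 1
    }
}
```

Note that only the sequence sizes matter to the bound, which is why the sketch can drop the `Sequence` objects entirely.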
/sequence-miner/src/test/java/sequencemining/sequence/SequenceTest.java: -------------------------------------------------------------------------------- 1 | package sequencemining.sequence; 2 | 3 | import static org.junit.Assert.assertEquals; 4 | 5 | import java.util.BitSet; 6 | 7 | import org.junit.Test; 8 | 9 | import sequencemining.transaction.Transaction; 10 | 11 | public class SequenceTest { 12 | 13 | // @Test 14 | // public void testGetSupportOfSequenceWithGaps() throws IOException { 15 | // final File inputFile = new File( 16 | // "/afs/inf.ed.ac.uk/user/j/jfowkes/Code/Sequences/Datasets/TOY.txt"); 17 | // final TransactionList dBase = ItemsetMining.readTransactions(inputFile); 18 | // 19 | // final Sequence seq = new Sequence(1, 3); 20 | // int expectedSupp = 4; 21 | // assertEquals(expectedSupp, 22 | // ItemsetMiningCore.getSupportOfSequence(dBase, seq)); 23 | // 24 | // final Sequence seq2 = new Sequence(1, 3); 25 | // seq2.incrementOccurence(); 26 | // expectedSupp = 1; 27 | // assertEquals(expectedSupp, 28 | // ItemsetMiningCore.getSupportOfSequence(dBase, seq2)); 29 | // 30 | // final Sequence seq3 = new Sequence(1, 3); 31 | // seq3.incrementOccurence(); 32 | // seq3.incrementOccurence(); 33 | // expectedSupp = 0; 34 | // assertEquals(expectedSupp, 35 | // ItemsetMiningCore.getSupportOfSequence(dBase, seq3)); 36 | // 37 | // } 38 | 39 | // @Test 40 | // public void testGetSupportOfSequenceWithoutGaps() throws IOException { 41 | // final File inputFile = new File( 42 | // "/afs/inf.ed.ac.uk/user/j/jfowkes/Code/Sequences/Datasets/TOY.txt"); 43 | // final TransactionList dBase = ItemsetMining.readTransactions(inputFile); 44 | // 45 | // final Sequence seq = new Sequence(1, 3); 46 | // int expectedSupp = 1; 47 | // assertEquals(expectedSupp, 48 | // ItemsetMiningCore.getSupportOfSequence(dBase, seq)); 49 | // 50 | // final Sequence seq2 = new Sequence(2, 3); 51 | // expectedSupp = 3; 52 | // assertEquals(expectedSupp, 53 | // 
ItemsetMiningCore.getSupportOfSequence(dBase, seq2)); 54 | // 55 | // } 56 | 57 | @Test 58 | public void testSequenceRepetitionsWithGaps() { 59 | 60 | // Test transaction contains repetitions 61 | final Transaction trans1 = new Transaction(1, 2, 1, 2); 62 | final Transaction trans2 = new Transaction(3, 1, 4, 2, 5); 63 | final Transaction trans3 = new Transaction(3, 1, 4, 2, 5, 1, 6, 2, 7); 64 | final Transaction trans4 = new Transaction(1, 4, 5); 65 | final Sequence seq = new Sequence(1, 2); 66 | 67 | assertEquals(2, trans1.repetitions(seq)); 68 | assertEquals(1, trans2.repetitions(seq)); 69 | assertEquals(2, trans3.repetitions(seq)); 70 | assertEquals(0, trans4.repetitions(seq)); 71 | 72 | } 73 | 74 | @Test 75 | public void testSequenceContainsWithGaps() { 76 | 77 | // Example from Agrawal paper 78 | final Sequence seq1 = new Sequence(3, 4, 5, 8); 79 | final Sequence seq2 = new Sequence(7, 3, 8, 9, 4, 5, 6, 8); 80 | final Sequence seq3 = new Sequence(3, 3, 8); 81 | 82 | assertEquals(true, seq2.contains(seq1)); 83 | assertEquals(false, seq1.contains(seq2)); 84 | assertEquals(false, seq1.contains(seq3)); 85 | 86 | // Test transaction contains repetitions 87 | final Transaction trans1 = new Transaction(1, 2, 1, 2); 88 | final Transaction trans2 = new Transaction(3, 1, 4, 2, 5); 89 | final Transaction trans3 = new Transaction(3, 1, 4, 2, 5, 1, 6, 2, 7); 90 | final Sequence seq = new Sequence(1, 2); 91 | 92 | assertEquals(true, trans1.contains(seq)); 93 | assertEquals(true, trans2.contains(seq)); 94 | assertEquals(true, trans3.contains(seq)); 95 | 96 | } 97 | 98 | // @Test 99 | // public void testSequenceContainsWithoutGaps() { 100 | // 101 | // final Sequence seq1 = new Sequence(3, 4, 5, 8); 102 | // final Sequence seq2 = new Sequence(3, 3, 4, 5, 8, 6); 103 | // final Sequence seq3 = new Sequence(3, 4, 8, 9, 4, 5, 6, 8); 104 | // 105 | // assertEquals(true, seq2.contains(seq1)); 106 | // assertEquals(false, seq1.contains(seq2)); 107 | // assertEquals(false, 
seq3.contains(seq1)); 108 | // 109 | // final Sequence transI = new Sequence(12763, 12823, 34913); 110 | // final Sequence seqI = new Sequence(34913); 111 | // 112 | // assertEquals(true, transI.contains(seqI)); 113 | // 114 | // final Sequence transL = new Sequence(1, 2, 3, 4); 115 | // final Sequence seqL1 = new Sequence(4, 5); 116 | // final Sequence seqL2 = new Sequence(3, 4); 117 | // 118 | // assertEquals(false, transL.contains(seqL1)); 119 | // assertEquals(true, transL.contains(seqL2)); 120 | // 121 | // // Test transaction contains repetitions 122 | // final Transaction trans1 = new Transaction(1, 2, 1, 2); 123 | // final Transaction trans2 = new Transaction(3, 1, 2, 4); 124 | // final Transaction trans3 = new Transaction(3, 1, 2, 4, 1, 2, 5); 125 | // final Sequence seq = new Sequence(1, 2); 126 | // 127 | // assertEquals(true, trans1.contains(seq)); 128 | // assertEquals(true, trans2.contains(seq)); 129 | // assertEquals(true, trans3.contains(seq)); 130 | // 131 | // } 132 | 133 | @Test 134 | public void testSequenceGetCoveredWithGapsWithoutOverlap() { 135 | 136 | final Sequence trans = new Sequence(7, 3, 8, 9, 4, 5, 6, 8); 137 | 138 | final Sequence seq1 = new Sequence(3, 4, 5, 8); 139 | final BitSet expected1 = new BitSet(trans.size()); 140 | expected1.set(1); 141 | expected1.set(4); 142 | expected1.set(5); 143 | expected1.set(7); 144 | 145 | final Sequence seq2 = new Sequence(7, 9); 146 | final BitSet expected2 = new BitSet(trans.size()); 147 | expected2.set(0); 148 | expected2.set(3); 149 | 150 | final Sequence seq3 = new Sequence(8, 4, 5); 151 | final BitSet expected3 = new BitSet(trans.size()); 152 | expected3.set(2); 153 | expected3.set(4); 154 | expected3.set(5); 155 | 156 | // Seq not contained in trans 157 | final Sequence seq4 = new Sequence(3, 3, 8); 158 | final BitSet expected4 = new BitSet(trans.size()); 159 | 160 | assertEquals(expected1, trans.getCovered(seq1, new BitSet())); 161 | assertEquals(expected2, trans.getCovered(seq2, new 
BitSet())); 162 | assertEquals(expected2, trans.getCovered(seq2, expected1)); 163 | assertEquals(expected3, trans.getCovered(seq3, new BitSet())); 164 | assertEquals(expected3, trans.getCovered(seq3, expected2)); 165 | assertEquals(expected4, trans.getCovered(seq4, new BitSet())); 166 | 167 | // Test covering without overlap 168 | assertEquals(new BitSet(), trans.getCovered(seq3, expected1)); 169 | 170 | // Test double covering 171 | final Sequence transC = new Sequence(1, 2, 1, 2, 1, 2); 172 | final Sequence seqC = new Sequence(1, 2); 173 | final BitSet expectedC1 = new BitSet(transC.size()); 174 | expectedC1.set(0); 175 | expectedC1.set(1); 176 | final BitSet expectedC2 = new BitSet(transC.size()); 177 | expectedC2.set(2); 178 | expectedC2.set(3); 179 | final BitSet expectedC3 = new BitSet(transC.size()); 180 | expectedC3.set(4); 181 | expectedC3.set(5); 182 | 183 | assertEquals(expectedC1, transC.getCovered(seqC, new BitSet())); 184 | assertEquals(expectedC2, transC.getCovered(seqC, expectedC1)); 185 | expectedC2.or(expectedC1); 186 | assertEquals(expectedC3, transC.getCovered(seqC, expectedC2)); 187 | 188 | // Test covering with single item sequence 189 | final Sequence transI = new Sequence(12763, 12823, 34913); 190 | final Sequence seqI = new Sequence(34913); 191 | final BitSet expectedI = new BitSet(transI.size()); 192 | expectedI.set(2); 193 | 194 | assertEquals(expectedI, transI.getCovered(seqI, new BitSet())); 195 | 196 | } 197 | 198 | // @Test 199 | // public void testSequenceGetCoveredWithoutGapsWithoutOverlap() { 200 | // 201 | // final Sequence trans = new Sequence(3, 3, 4, 5, 8, 6); 202 | // 203 | // final Sequence seq1 = new Sequence(3, 4, 5, 8); 204 | // final BitSet expected1 = new BitSet(trans.size()); 205 | // expected1.set(1); 206 | // expected1.set(2); 207 | // expected1.set(3); 208 | // expected1.set(4); 209 | // 210 | // final Sequence seq2 = new Sequence(8, 6); 211 | // final BitSet expected2 = new BitSet(trans.size()); 212 | // 
expected2.set(4); 213 | // expected2.set(5); 214 | // 215 | // final Sequence seq3 = new Sequence(3, 4, 5); 216 | // final BitSet expected3 = new BitSet(trans.size()); 217 | // expected3.set(1); 218 | // expected3.set(2); 219 | // expected3.set(3); 220 | // 221 | // // Seq not contained in trans 222 | // final Sequence seq4 = new Sequence(3, 3, 8); 223 | // final BitSet expected4 = new BitSet(trans.size()); 224 | // 225 | // assertEquals(expected1, trans.getCovered(seq1, new BitSet())); 226 | // assertEquals(expected2, trans.getCovered(seq2, new BitSet())); 227 | // assertEquals(expected3, trans.getCovered(seq3, new BitSet())); 228 | // assertEquals(expected3, trans.getCovered(seq3, expected2)); 229 | // assertEquals(expected4, trans.getCovered(seq4, new BitSet())); 230 | // 231 | // // Test covering without overlap 232 | // assertEquals(new BitSet(), trans.getCovered(seq2, expected1)); 233 | // assertEquals(new BitSet(), trans.getCovered(seq3, expected1)); 234 | // assertEquals(new BitSet(), trans.getCovered(seq4, expected1)); 235 | // 236 | // // Test double covering 237 | // final Sequence transC = new Sequence(1, 2, 1, 2, 1, 2); 238 | // final Sequence seqC = new Sequence(1, 2); 239 | // final BitSet expectedC1 = new BitSet(transC.size()); 240 | // expectedC1.set(0); 241 | // expectedC1.set(1); 242 | // final BitSet expectedC2 = new BitSet(transC.size()); 243 | // expectedC2.set(2); 244 | // expectedC2.set(3); 245 | // final BitSet expectedC3 = new BitSet(transC.size()); 246 | // expectedC3.set(4); 247 | // expectedC3.set(5); 248 | // 249 | // assertEquals(expectedC1, transC.getCovered(seqC, new BitSet())); 250 | // assertEquals(expectedC2, transC.getCovered(seqC, expectedC1)); 251 | // expectedC2.or(expectedC1); 252 | // assertEquals(expectedC3, transC.getCovered(seqC, expectedC2)); 253 | // 254 | // // Test covering with single item sequence 255 | // final Sequence transI = new Sequence(12763, 12823, 34913); 256 | // final Sequence seqI = new Sequence(34913); 
257 | // final BitSet expectedI = new BitSet(transI.size()); 258 | // expectedI.set(2); 259 | // 260 | // assertEquals(expectedI, transI.getCovered(seqI, new BitSet())); 261 | // 262 | // } 263 | 264 | // @Test 265 | // public void testSequenceGetCoveredWithGapsWithOverlap() { 266 | // 267 | // final Sequence trans = new Sequence(7, 3, 8, 9, 4, 5, 6, 8); 268 | // 269 | // final Sequence seq1 = new Sequence(3, 4, 5, 8); 270 | // final BitSet expected1 = new BitSet(trans.size()); 271 | // expected1.set(1); 272 | // expected1.set(4); 273 | // expected1.set(5); 274 | // expected1.set(7); 275 | // 276 | // final Sequence seq2 = new Sequence(7, 9); 277 | // final BitSet expected2 = new BitSet(trans.size()); 278 | // expected2.set(0); 279 | // expected2.set(3); 280 | // 281 | // final Sequence seq3 = new Sequence(8, 4, 5); 282 | // final BitSet expected3 = new BitSet(trans.size()); 283 | // expected3.set(2); 284 | // expected3.set(4); 285 | // expected3.set(5); 286 | // 287 | // // Seq not contained in trans 288 | // final Sequence seq4 = new Sequence(3, 3, 8); 289 | // final BitSet expected4 = new BitSet(trans.size()); 290 | // 291 | // assertEquals(expected1, trans.getCovered(seq1, new BitSet())); 292 | // assertEquals(expected2, trans.getCovered(seq2, new BitSet())); 293 | // assertEquals(expected2, trans.getCovered(seq2, expected1)); 294 | // assertEquals(expected3, trans.getCovered(seq3, new BitSet())); 295 | // assertEquals(expected3, trans.getCovered(seq3, expected2)); 296 | // assertEquals(expected4, trans.getCovered(seq4, new BitSet())); 297 | // 298 | // // Test covering with overlap 299 | // assertEquals(expected3, trans.getCovered(seq3, expected1)); 300 | // 301 | // // Test double covering 302 | // final Sequence transC = new Sequence(1, 2, 1, 2, 1, 2); 303 | // final Sequence seqC = new Sequence(1, 2); 304 | // final BitSet expectedC1 = new BitSet(transC.size()); 305 | // expectedC1.set(0); 306 | // expectedC1.set(1); 307 | // final BitSet expectedC2 = new 
BitSet(transC.size()); 308 | // expectedC2.set(2); 309 | // expectedC2.set(3); 310 | // final BitSet expectedC3 = new BitSet(transC.size()); 311 | // expectedC3.set(4); 312 | // expectedC3.set(5); 313 | // 314 | // assertEquals(expectedC1, transC.getCovered(seqC, new BitSet())); 315 | // assertEquals(expectedC2, transC.getCovered(seqC, expectedC1)); 316 | // expectedC2.or(expectedC1); 317 | // assertEquals(expectedC3, transC.getCovered(seqC, expectedC2)); 318 | // 319 | // // Test covering with single item sequence 320 | // final Sequence transI = new Sequence(12763, 12823, 34913); 321 | // final Sequence seqI = new Sequence(34913); 322 | // final BitSet expectedI = new BitSet(transI.size()); 323 | // expectedI.set(2); 324 | // 325 | // assertEquals(expectedI, transI.getCovered(seqI, new BitSet())); 326 | // 327 | // } 328 | 329 | // @Test 330 | // public void testSequenceGetCoveredWithoutGapsWithOverlap() { 331 | // 332 | // final Sequence trans = new Sequence(3, 3, 4, 5, 8, 6); 333 | // 334 | // final Sequence seq1 = new Sequence(3, 4, 5, 8); 335 | // final BitSet expected1 = new BitSet(trans.size()); 336 | // expected1.set(1); 337 | // expected1.set(2); 338 | // expected1.set(3); 339 | // expected1.set(4); 340 | // 341 | // final Sequence seq2 = new Sequence(8, 6); 342 | // final BitSet expected2 = new BitSet(trans.size()); 343 | // expected2.set(4); 344 | // expected2.set(5); 345 | // 346 | // final Sequence seq3 = new Sequence(3, 4, 5); 347 | // final BitSet expected3 = new BitSet(trans.size()); 348 | // expected3.set(1); 349 | // expected3.set(2); 350 | // expected3.set(3); 351 | // 352 | // // Seq not contained in trans 353 | // final Sequence seq4 = new Sequence(3, 3, 8); 354 | // final BitSet expected4 = new BitSet(trans.size()); 355 | // 356 | // assertEquals(expected1, trans.getCovered(seq1, new BitSet())); 357 | // assertEquals(expected2, trans.getCovered(seq2, new BitSet())); 358 | // assertEquals(expected2, trans.getCovered(seq2, expected1)); 359 | // 
assertEquals(expected3, trans.getCovered(seq3, new BitSet())); 360 | // assertEquals(expected3, trans.getCovered(seq3, expected2)); 361 | // assertEquals(expected4, trans.getCovered(seq4, new BitSet())); 362 | // 363 | // // Test double covering 364 | // final Sequence transC = new Sequence(1, 2, 1, 2, 1, 2); 365 | // final Sequence seqC = new Sequence(1, 2); 366 | // final BitSet expectedC1 = new BitSet(transC.size()); 367 | // expectedC1.set(0); 368 | // expectedC1.set(1); 369 | // final BitSet expectedC2 = new BitSet(transC.size()); 370 | // expectedC2.set(2); 371 | // expectedC2.set(3); 372 | // final BitSet expectedC3 = new BitSet(transC.size()); 373 | // expectedC3.set(4); 374 | // expectedC3.set(5); 375 | // 376 | // assertEquals(expectedC1, transC.getCovered(seqC, new BitSet())); 377 | // assertEquals(expectedC2, transC.getCovered(seqC, expectedC1)); 378 | // expectedC2.or(expectedC1); 379 | // assertEquals(expectedC3, transC.getCovered(seqC, expectedC2)); 380 | // 381 | // // Test covering with single item sequence 382 | // final Sequence transI = new Sequence(12763, 12823, 34913); 383 | // final Sequence seqI = new Sequence(34913); 384 | // final BitSet expectedI = new BitSet(transI.size()); 385 | // expectedI.set(2); 386 | // 387 | // assertEquals(expectedI, transI.getCovered(seqI, new BitSet())); 388 | // 389 | // } 390 | 391 | } 392 | -------------------------------------------------------------------------------- /sequence-miner/src/test/resources/TOY.txt: -------------------------------------------------------------------------------- 1 | 1 -1 1 -1 2 -1 3 -1 1 -1 3 -1 4 -1 3 -1 6 -1 -2 2 | 1 -1 4 -1 3 -1 2 -1 3 -1 1 -1 5 -1 -2 3 | 5 -1 6 -1 1 -1 2 -1 4 -1 6 -1 3 -1 2 -1 -2 4 | 5 -1 7 -1 1 -1 6 -1 3 -1 2 -1 3 -1 -2 -------------------------------------------------------------------------------- /sequence-mining/pom.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 3 | <modelVersion>4.0.0</modelVersion> 4 | <groupId>codemining</groupId> 5 | <artifactId>sequence-mining</artifactId> 6 | <packaging>jar</packaging> 7 | <version>1.0</version> 8 | <name>ISM Standalone Jar</name> 9 | 10 | <build> 11 | <plugins> 12 | <plugin> 13 | <artifactId>maven-compiler-plugin</artifactId> 14 | <version>3.1</version> 15 | <configuration> 16 | <source>1.8</source> 17 | <target>1.8</target> 18 | </configuration> 19 | </plugin> 20 | <plugin> 21 | <groupId>org.apache.maven.plugins</groupId> 22 | <artifactId>maven-shade-plugin</artifactId> 23 | <version>2.2</version> 24 | <executions> 25 | <execution> 26 | <phase>package</phase> 27 | <goals> 28 | <goal>shade</goal> 29 | </goals> 30 | <configuration> 31 | <finalName>${project.artifactId}-${project.version}</finalName> 32 | <shadedArtifactAttached>false</shadedArtifactAttached> 33 | <createDependencyReducedPom>false</createDependencyReducedPom> 34 | <filters> 35 | <filter> 36 | <artifact>*:*</artifact> 37 | <excludes> 38 | <exclude>META-INF/*.SF</exclude> 39 | <exclude>META-INF/*.DSA</exclude> 40 | <exclude>META-INF/*.RSA</exclude> 41 | </excludes> 42 | </filter> 43 | </filters> 44 | <transformers> 45 | <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer"> 46 | <resource>reference.conf</resource> 47 | </transformer> 48 | <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer"> 49 | <mainClass>sequencemining.main.SequenceMining</mainClass> 50 | </transformer> 51 | </transformers> 52 | </configuration> 53 | </execution> 54 | </executions> 55 | </plugin> 56 | </plugins> 57 | </build> 58 | 59 | <dependencies> 60 | <dependency> 61 | <groupId>codemining</groupId> 62 | <artifactId>sequence-miner</artifactId> 63 | <version>${project.version}</version> 64 | </dependency> 65 | </dependencies> 66 | </project> 67 | --------------------------------------------------------------------------------