dict = FileUtils.readLines(new File(seqLabels));

        // Print top ten
        System.out.print("\n============= ISM not SQS/GoKrimp =============\n");
        printTopExclusiveSequences(topN, ismSeqs, ISMnotSQSorGoKrimp, dict);
        System.out.print("\n============= SQS not ISM/GoKrimp =============\n");
        printTopExclusiveSequences(topN, sqsSeqs, SQSnotISMorGoKrimp, dict);
        System.out.print("\n============= GoKrimp not ISM/SQS =============\n");
        printTopExclusiveSequences(topN, gokrimpSeqs, GoKrimpnotISMorSQS, dict);

    }

    /**
     * Set A \ (B ∪ C)
     *
     * Note: slow, but Guava's contains/Sets.difference doesn't work here
     */
    private static Set<Sequence> getExclusiveSequences(final Set<Sequence> setA, final Set<Sequence> setB,
            final Set<Sequence> setC) {
        final Set<Sequence> exclSeqs = new HashSet<>();
        outer: for (final Sequence seqA : setA) {
            for (final Sequence seqB : setB) {
                if (seqA.equals(seqB))
                    continue outer;
            }
            for (final Sequence seqC : setC) {
                if (seqA.equals(seqC))
                    continue outer;
            }
            exclSeqs.add(seqA);
        }
        return exclSeqs;
    }

    private static void printTopExclusiveSequences(final int topN, final Map<Sequence, Double> seqs,
            final Set<Sequence> exclusiveSeqs, final List<String> dict) {
        int count = 0;
        for (final Entry<Sequence, Double> entry : seqs.entrySet()) {
            final Sequence set = entry.getKey();
            if (set.size() > 1 && exclusiveSeqs.contains(set)) {
                System.out.print(String.format("%s\tprob: %1.5f %n", decode(entry.getKey(), dict), entry.getValue()));
                count++;
                if (count == topN)
                    break;
            }
        }
        System.out.println();
    }

    private static String decode(final Sequence seq, final List<String> dict) {
        String prefix = "";
        final StringBuilder sb = new StringBuilder();
        for (final Integer item : seq) {
            sb.append(prefix);
            sb.append(dict.get(item - 1));
            prefix = ", ";
        }
        return sb.toString();
    }

}
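The exclusive-sequence computation above is plain set difference, A \ (B ∪ C), under Sequence equality. A minimal self-contained sketch of the same operation over integers, using only java.util (the class name and values are illustrative):

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

public class ExclusiveSetDemo {
    public static void main(final String[] args) {
        final Set<Integer> setA = new HashSet<>(Arrays.asList(1, 2, 3, 4));
        final Set<Integer> setB = new HashSet<>(Arrays.asList(2));
        final Set<Integer> setC = new HashSet<>(Arrays.asList(4, 5));

        // A \ (B u C): start from A and remove everything in B or C
        final Set<Integer> excl = new HashSet<>(setA);
        excl.removeAll(setB);
        excl.removeAll(setC);

        System.out.println(excl); // prints [1, 3]
    }
}

With consistent equals/hashCode this is exactly what Guava's Sets.difference composes to; the quadratic equals-loop above is only needed if Sequence hashing cannot be relied on.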
--------------------------------------------------------------------------------
/sequence-miner/src/main/java/sequencemining/eval/FrequentSequenceMining.java:
--------------------------------------------------------------------------------
package sequencemining.eval;

import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.SortedMap;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;

import com.google.common.base.Functions;
import com.google.common.collect.ImmutableSortedMap;
import com.google.common.collect.Ordering;

import ca.pfv.spmf.algorithms.sequentialpatterns.BIDE_and_prefixspan.AlgoBIDEPlus;
import ca.pfv.spmf.algorithms.sequentialpatterns.BIDE_and_prefixspan.AlgoPrefixSpan;
import ca.pfv.spmf.algorithms.sequentialpatterns.BIDE_and_prefixspan.SequentialPattern;
import ca.pfv.spmf.algorithms.sequentialpatterns.BIDE_and_prefixspan.SequentialPatterns;
import ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.AlgoSPADE;
import ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.AlgoSPAM_AGP;
import ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.candidatePatternsGeneration.CandidateGenerator;
import ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.candidatePatternsGeneration.CandidateGenerator_Qualitative;
import ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.dataStructures.creators.AbstractionCreator;
import ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.dataStructures.creators.AbstractionCreator_Qualitative;
import ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.idLists.creators.IdListCreator_FatBitmap;
import ca.pfv.spmf.input.sequence_database_list_integers.SequenceDatabase;
import ca.pfv.spmf.patterns.itemset_list_integers_without_support.Itemset;
import sequencemining.sequence.Sequence;

public class FrequentSequenceMining {

    public static void main(final String[] args) throws IOException {

        // Datasets and parameters
        final String[] datasets = { "alice_punc", "GAZELLE1", "jmlr", "SIGN", "auslan2", "pioneer", "aslbu", "skating",
                "aslgt", "context" };
        final double[] minSupps = new double[] { 0.02, 0.004, 0.15, 0.45, 0.0001, 0.1, 0.04, 0.43, 0.25, 0.49 };

        for (int i = 0; i < datasets.length; i++) {
            final String dbPath = "/afs/inf.ed.ac.uk/user/j/jfowkes/Code/Sequences/Datasets/Paper/" + datasets[i]
                    + ".dat";
            final String saveFile = "/afs/inf.ed.ac.uk/user/j/jfowkes/Code/Sequences/BIDE/" + datasets[i] + ".txt";
            mineClosedFrequentSequencesBIDE(dbPath, saveFile, minSupps[i]);
        }
    }

    /** Run PrefixSpan algorithm */
    public static SortedMap<Sequence, Integer> mineFrequentSequencesPrefixSpan(final String dataset,
            final String saveFile, final double minSupp) throws IOException {

        final SequenceDatabase sequenceDatabase = new SequenceDatabase();
        sequenceDatabase.loadFile(dataset);

        final AlgoPrefixSpan algo = new AlgoPrefixSpan();
        algo.setShowSequenceIdentifiers(false);
        final SequentialPatterns patterns = algo.runAlgorithm(sequenceDatabase, minSupp, saveFile);
        // algo.printStatistics(sequenceDatabase.size());

        return toMap(patterns);
    }

    /** Run SPADE algorithm */
    public static SortedMap<Sequence, Integer> mineFrequentSequencesSPADE(final String dataset, final String saveFile,
            final double minSupp) throws IOException {

        final boolean verbose = true;

        final AbstractionCreator abstractionCreator = AbstractionCreator_Qualitative.getInstance();
        final ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.dataStructures.database.SequenceDatabase sequenceDatabase = new ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.dataStructures.database.SequenceDatabase(
                abstractionCreator, IdListCreator_FatBitmap.getInstance());
        sequenceDatabase.loadFile(dataset, minSupp);

        final AlgoSPADE algo = new AlgoSPADE(minSupp, true, abstractionCreator);
        final CandidateGenerator candidateGenerator = CandidateGenerator_Qualitative.getInstance();
        algo.runAlgorithmParallelized(sequenceDatabase, candidateGenerator, true, verbose, saveFile, false);
        // algo.printStatistics();

        return null;
    }

    /** Run SPAM algorithm */
    public static SortedMap<Sequence, Integer> mineFrequentSequencesSPAM(final String dataset, final String saveFile,
            final double minSupp) throws IOException {

        final boolean verbose = true;

        final AbstractionCreator abstractionCreator = AbstractionCreator_Qualitative.getInstance();
        final ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.dataStructures.database.SequenceDatabase sequenceDatabase = new ca.pfv.spmf.algorithms.sequentialpatterns.spade_spam_AGP.dataStructures.database.SequenceDatabase(
                abstractionCreator, IdListCreator_FatBitmap.getInstance());
        sequenceDatabase.loadFile(dataset, minSupp);

        final AlgoSPAM_AGP algorithm = new AlgoSPAM_AGP(minSupp);
        algorithm.runAlgorithm(sequenceDatabase, true, verbose, saveFile, false);
        // algorithm.printStatistics();

        return null;
    }

    /** Run BIDE algorithm */
    public static SortedMap<Sequence, Integer> mineClosedFrequentSequencesBIDE(final String dataset,
            final String saveFile, final double minSupp) throws IOException {

        final SequenceDatabase sequenceDatabase = new SequenceDatabase();
        sequenceDatabase.loadFile(dataset);

        // Convert to absolute support (rounding down)
        final int absMinSupp = (int) (sequenceDatabase.size() * minSupp);

        final AlgoBIDEPlus algo = new AlgoBIDEPlus();
        algo.setShowSequenceIdentifiers(false);
        final SequentialPatterns patterns = algo.runAlgorithm(sequenceDatabase, saveFile, absMinSupp);
        // algo.printStatistics(sequenceDatabase.size());

        return toMap(patterns);
    }

    /** Convert frequent sequences to a sorted Map */
    public static SortedMap<Sequence, Integer> toMap(final SequentialPatterns patterns) {
        if (patterns == null) {
            return null;
        } else {
            final HashMap<Sequence, Integer> sequences = new HashMap<>();
            for (final List<SequentialPattern> level : patterns.levels) {
                for (final SequentialPattern pattern : level) {
                    final Sequence seq = new Sequence();
                    for (final Itemset set : pattern.getItemsets())
                        seq.add(set.get(0)); // Assumes each itemset in the seq is a singleton
                    sequences.put(seq, pattern.getAbsoluteSupport());
                }
            }
            // Sort patterns by support
            final Ordering<Sequence> comparator = Ordering.natural().reverse().onResultOf(Functions.forMap(sequences))
                    .compound(Ordering.usingToString());
            return ImmutableSortedMap.copyOf(sequences, comparator);
        }
    }

    /** Read in frequent sequences (sorted by support) */
    public static SortedMap<Sequence, Integer> readFrequentSequences(final File output) throws IOException {
        final HashMap<Sequence, Integer> sequences = new HashMap<>();

        final LineIterator it = FileUtils.lineIterator(output);
        while (it.hasNext()) {
            final String line = it.nextLine();
            if (!line.trim().isEmpty()) {
                final String[] splitLine = line.split("#SUP:");
                final String[] items = splitLine[0].trim().split("-1");
                final Sequence seq = new Sequence();
                for (final String item : items)
                    seq.add(Integer.parseInt(item.trim()));
                final int supp = Integer.parseInt(splitLine[1].trim());
                sequences.put(seq, supp);
            }
        }
        // Sort sequences by support
        final Ordering<Sequence> comparator = Ordering.natural().reverse().onResultOf(Functions.forMap(sequences))
                .compound(Ordering.usingToString());
        return ImmutableSortedMap.copyOf(sequences, comparator);
    }

}
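readFrequentSequences above parses SPMF's output format, in which -1 separates items and the pattern's support follows a #SUP: marker. A self-contained walk-through of that parse on a single line (the sample line and class name are invented for illustration):

public class SpmfOutputParseDemo {
    public static void main(final String[] args) {
        final String line = "3 -1 5 -1 #SUP: 7"; // hypothetical SPMF output line

        final String[] splitLine = line.split("#SUP:");
        for (final String item : splitLine[0].trim().split("-1"))
            System.out.println("item: " + Integer.parseInt(item.trim())); // 3, then 5
        System.out.println("support: " + Integer.parseInt(splitLine[1].trim())); // 7
    }
}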
--------------------------------------------------------------------------------
/sequence-miner/src/main/java/sequencemining/eval/IntervalClassification.java:
--------------------------------------------------------------------------------
package sequencemining.eval;

import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Writer;
import java.lang.ProcessBuilder.Redirect;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.commons.io.FileUtils;

import com.google.common.base.Charsets;
import com.google.common.base.Functions;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.ImmutableSortedMap;
import com.google.common.collect.Multimap;
import com.google.common.collect.Ordering;
import com.google.common.collect.Table;
import com.google.common.io.Files;

import sequencemining.main.SequenceMining;
import sequencemining.sequence.Sequence;
import sequencemining.transaction.Transaction;
import sequencemining.transaction.TransactionList;

public class IntervalClassification {

    public static void main(final String[] args) throws IOException {

        final String[] datasets = new String[] { "context", "auslan2", "pioneer", "aslbu", "skating", "aslgt" };
        final int[] topNs = new int[] { 10, 40, 70, 100 };
        final String baseFolder = "/afs/inf.ed.ac.uk/user/j/jfowkes/Code/Sequences/";
        final String datasetFolder = baseFolder + "Datasets/Intervals/";
        final String outFolder = baseFolder + "Classification/";

        for (int i = 0; i < datasets.length; i++) {
            final String dataset = datasets[i];

            System.out.println("===== Dataset: " + dataset + " =====");
            final File outFile = new File(outFolder + dataset + ".txt");
            final Writer writer = Files.newWriter(outFile, Charsets.UTF_8);
            writer.write("===== " + dataset + " =====\n");
            writer.write("topN: " + Arrays.toString(topNs) + "\n");

            // Read dataset
            final File dbFile = new File(datasetFolder + dataset + "/" + dataset + ".dat");
            final TransactionList dbTrans = SequenceMining.readTransactions(dbFile);
            final File labelFile = new File(datasetFolder + dataset + "/" + dataset + ".lab");

            // Read SQS seqs
            final File outSQS = new File(baseFolder + "SQS/" + dataset + ".txt");
            final Map<Sequence, Double> seqsSQS = StatisticalSequenceMining.readSQSSequences(outSQS);
            // seqsSQS = removeSingletons(seqsSQS);
            System.out.println("SQS: " + seqsSQS);
            writer.write(seqsSQS.size() + " SQS seqs \n");

            // Read GOKRIMP seqs
            final File outGOKRIMP = new File(baseFolder + "GoKrimp/" + dataset + ".txt");
            final Map<Sequence, Double> seqsGOKRIMP = StatisticalSequenceMining.readGoKrimpSequences(outGOKRIMP);
            // seqsGOKRIMP = removeSingletons(seqsGOKRIMP);
            System.out.println("GoKrimp: " + seqsGOKRIMP);
            writer.write(seqsGOKRIMP.size() + " GoKrimp seqs \n");

            // Read ISM seqs
            final File outISM = new File(baseFolder + "Logs/" + dataset + ".log");
            final Map<Sequence, Double> seqsISM = SequenceMining.readISMSequences(outISM);
            System.out.println("ISM: " + seqsISM);
            writer.write(seqsISM.size() + " ISM seqs \n");

            // Read BIDE seqs
            final File outBIDE = new File(baseFolder + "BIDE/" + dataset + ".txt");
            final Map<Sequence, Integer> seqsBIDE = FrequentSequenceMining.readFrequentSequences(outBIDE);
            // seqsBIDE = removeSingletons(seqsBIDE);
            System.out.println("BIDE: " + seqsBIDE);
            writer.write(seqsBIDE.size() + " BIDE seqs \n");

            // Generate simple features
            Map<Sequence, Double> seqsSingleton = new HashMap<>();
            final Table<Sequence, Integer, Double> singletons = SequenceMining
                    .scanDatabaseToDetermineInitialProbabilities(dbFile);
            for (final Sequence seq : singletons.rowKeySet())
                seqsSingleton.put(seq, 1 - singletons.get(seq, 0));
            // Sort by support
            final Ordering<Sequence> comparator = Ordering.natural().reverse()
                    .onResultOf(Functions.forMap(seqsSingleton)).compound(Ordering.usingToString());
            seqsSingleton = ImmutableSortedMap.copyOf(seqsSingleton, comparator);
            System.out.println("Singletons: " + seqsSingleton);
            writer.write(seqsSingleton.size() + " Singletons seqs \n");

            // Classify
            final Multimap<String, Double> accuracy = ArrayListMultimap.create();
            for (final int n : topNs) {
                // Run MALLET Naive Bayes classifier
                accuracy.put("SQS", classify(n, seqsSQS, dbTrans, labelFile));
                accuracy.put("GoKrimp", classify(n, seqsGOKRIMP, dbTrans, labelFile));
                accuracy.put("ISM", classify(n, seqsISM, dbTrans, labelFile));
                accuracy.put("BIDE", classify(n, seqsBIDE, dbTrans, labelFile));
                accuracy.put("Singletons", classify(n, seqsSingleton, dbTrans, labelFile));
                // Run libSVM Linear classifier
                accuracy.put("SQS_SVM", classifySVM(n, seqsSQS, dbTrans, labelFile));
                accuracy.put("GoKrimp_SVM", classifySVM(n, seqsGOKRIMP, dbTrans, labelFile));
                accuracy.put("ISM_SVM", classifySVM(n, seqsISM, dbTrans, labelFile));
                accuracy.put("BIDE_SVM", classifySVM(n, seqsBIDE, dbTrans, labelFile));
                accuracy.put("Singletons_SVM", classifySVM(n, seqsSingleton, dbTrans, labelFile));
            }
            for (final String alg : accuracy.keySet())
                writer.write(alg + ": " + accuracy.get(alg) + "\n");
            writer.close();
        }
    }

    /** Classify using MALLET Naive Bayes */
    static Double classify(final int topN, final Map<Sequence, ?> seqs, final TransactionList dbTrans,
            final File labelFile) throws IOException {
        if (seqs.size() == 0)
            return 0.;

        // Create temp files
        final File featureFile = File.createTempFile("features_temp", ".txt");
        final File tmpFile = File.createTempFile("mallet_temp", ".txt");
        final File outFile = File.createTempFile("mallet_output_temp", ".txt");

        // Generate features
        generateFeatures(topN, seqs, dbTrans, featureFile, labelFile);

        // Convert to binary MALLET format
        final String cmd[] = new String[4];
        cmd[0] = "/afs/inf.ed.ac.uk/user/j/jfowkes/Packages/mallet-2.0.7/bin/mallet";
        cmd[1] = "import-svmlight";
        cmd[2] = "--input " + featureFile;
        cmd[3] = "--output " + tmpFile;
        runScript(cmd, null);

        // Classify
        final String cmd2[] = new String[5];
        cmd2[0] = "/afs/inf.ed.ac.uk/user/j/jfowkes/Packages/mallet-2.0.7/bin/mallet";
        cmd2[1] = "train-classifier";
        cmd2[2] = "--input " + tmpFile;
        cmd2[3] = "--cross-validation 10";
        cmd2[4] = "--report test:accuracy";
        runScript(cmd2, outFile);

        // Print output to screen
        final String cmd3[] = new String[3];
        cmd3[0] = "tail";
        cmd3[1] = "-n 2";
        cmd3[2] = "" + outFile;
        runScript(cmd3, null);

        // Get accuracy
        final String[] lines = FileUtils.readFileToString(outFile).split("\n");
        final double accuracy = Double.parseDouble(lines[lines.length - 1].split(" ")[5]);

        // Remove temp files
        featureFile.delete();
        tmpFile.delete();
        outFile.delete();

        return accuracy;
    }

    /** Classify using libSVM linear kernel */
    static Double classifySVM(final int topN, final Map<Sequence, ?> seqs, final TransactionList dbTrans,
            final File labelFile) throws IOException {
        if (seqs.size() == 0)
            return 0.;

        // Create temp files
        final File featureFile = File.createTempFile("features_temp", ".txt");
        final File outFile = File.createTempFile("libsvm_output_temp", ".txt");

        // Generate features
        generateFeatures(topN, seqs, dbTrans, featureFile, labelFile);

        // Classify
        final String cmd[] = new String[4];
        cmd[0] = "/afs/inf.ed.ac.uk/user/j/jfowkes/Packages/libsvm/svm.sh";
        cmd[1] = "-t 0"; // Linear kernel
        cmd[2] = "-v 10"; // 10-fold cross-validation
        cmd[3] = "" + featureFile;
        runScript(cmd, outFile);

        // Print output to screen
        final String cmd2[] = new String[3];
        cmd2[0] = "tail";
        cmd2[1] = "-n 2";
        cmd2[2] = "" + outFile;
        runScript(cmd2, null);

        // Get accuracy
        final String[] lines = FileUtils.readFileToString(outFile).split("\n");
        final double accuracy = Double.parseDouble(lines[lines.length - 1].split(" ")[4].replace("%", ""));

        // Remove temp files
        featureFile.delete();
        outFile.delete();

        return accuracy;
    }

    private static boolean generateFeatures(final int topN, final Map<Sequence, ?> sequences,
            final TransactionList dbTrans, final File featureFile, final File labelFile) throws IOException {

        // Get topN sequences
        final Set<Sequence> topSeqs = getTopSequences(sequences, topN);

        // Set output file
        final PrintWriter out = new PrintWriter(featureFile, "UTF-8");

        // Read transaction labels
        final String[] labels = FileUtils.readFileToString(labelFile).split("\n");

        // Generate features
        int count = 0;
        for (final Transaction trans : dbTrans.getTransactionList()) {
            out.print(labels[count] + " ");
            int fNum = 0;
            for (final Sequence seq : topSeqs) {
                if (trans.contains(seq))
                    out.print(fNum + ":1 ");
                else
                    out.print(fNum + ":0 ");
                fNum++;
            }
            out.println();
            count++;
        }
        out.close();

        return true;
    }

    /** Get top sequences */
    private static Set<Sequence> getTopSequences(final Map<Sequence, ?> sequences, final int topN) {

        int count = 0;
        final Set<Sequence> topItemsets = new HashSet<>();
        for (final Sequence set : sequences.keySet()) {
            topItemsets.add(set);
            count++;
            if (count == topN)
                break;
        }
        if (count < topN)
            System.out.println("WARNING: not enough sequences in set: " + count);

        return topItemsets;
    }

    @SuppressWarnings("unused")
    private static Map<Sequence, Double> removeSingletons(final Map<Sequence, Double> oldSeqs) {
        final Map<Sequence, Double> newSeqs = new HashMap<>();
        for (final Entry<Sequence, Double> entry : oldSeqs.entrySet()) {
            if (entry.getKey().size() > 1)
                newSeqs.put(entry.getKey(), entry.getValue());
        }
        return newSeqs;
    }

    /** Run shell script with command line arguments */
    public static void runScript(final String cmd[], final File outFile) {

        try {
            final ProcessBuilder pb = new ProcessBuilder(cmd);
            if (outFile != null)
                pb.redirectOutput(outFile);
            else
                pb.redirectOutput(Redirect.INHERIT);
            pb.redirectError(Redirect.INHERIT);
            final Process process = pb.start();
            process.waitFor();
            process.destroy();
        } catch (final Exception e) {
            e.printStackTrace();
        }

    }

}
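generateFeatures writes one SVMlight-style line per transaction: the class label followed by fNum:1 or fNum:0 indicator features, one per top sequence. A minimal sketch of the line format, with booleans standing in for trans.contains(seq) (class name and values illustrative):

public class FeatureLineDemo {
    public static void main(final String[] args) {
        final String label = "2"; // class label of this transaction
        final boolean[] containsTopSeq = { true, false, true }; // stand-in for trans.contains(seq)

        final StringBuilder sb = new StringBuilder(label);
        for (int fNum = 0; fNum < containsTopSeq.length; fNum++)
            sb.append(' ').append(fNum).append(containsTopSeq[fNum] ? ":1" : ":0");

        System.out.println(sb); // 2 0:1 1:0 2:1
    }
}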
--------------------------------------------------------------------------------
/sequence-miner/src/main/java/sequencemining/eval/PrecisionRecallBackground.java:
--------------------------------------------------------------------------------
package sequencemining.eval;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.output.TeeOutputStream;

import com.google.common.collect.Sets;
import com.google.common.collect.Table;

import sequencemining.main.InferenceAlgorithms.InferGreedy;
import sequencemining.main.SequenceMining;
import sequencemining.main.SequenceMiningCore;
import sequencemining.sequence.Sequence;
import sequencemining.transaction.TransactionGenerator;
import sequencemining.util.Logging;

public class PrecisionRecallBackground {

    /** Main Settings */
    private static final File dbFile = new File("/disk/data1/jfowkes/sequence.txt");
    private static final File saveDir = new File("/disk/data1/jfowkes/logs/");

    /** FSM Issues to incorporate */
    private static final String name = "Background";
    private static final int noIterations = 5_000;

    /** Previously mined Sequences to use for background distribution */
    private static final File sequenceLog = new File("/afs/inf.ed.ac.uk/user/j/jfowkes/Code/Sequences/Logs/SIGN.log");
    private static final int noTransactions = 10_000;

    /** Sequence Miner Settings */
    private static final int maxStructureSteps = 100_000;
    private static final double minSup = 0.05;

    public static void main(final String[] args) throws IOException, ClassNotFoundException {

        // Read in background distribution
        final Map<Sequence, Double> backgroundSequences = SequenceMiningCore.readISMSequences(sequenceLog);

        // Read in associated sequence count distribution
        @SuppressWarnings("unchecked")
        final Table<Sequence, Integer, Double> countDist = (Table<Sequence, Integer, Double>) Logging
                .deserializeFrom(FilenameUtils.removeExtension(sequenceLog.getAbsolutePath()) + ".dist");

        final HashMap<Sequence, Double> sequences = TransactionGenerator
                .generateTransactionDatabase(backgroundSequences, countDist, noTransactions, dbFile);
        System.out.print("\n============= ACTUAL SEQUENCES =============\n");
        for (final Entry<Sequence, Double> entry : sequences.entrySet()) {
            System.out.print(String.format("%s\tprob: %1.5f %n", entry.getKey(), entry.getValue()));
        }
        System.out.println("\nNo sequences: " + sequences.size());
        SequenceScaling.printTransactionDBStats(dbFile);

        // Set up logging
        final FileOutputStream outFile = new FileOutputStream(saveDir + "/" + name + "_pr.txt");
        final TeeOutputStream out = new TeeOutputStream(System.out, outFile);
        final PrintStream ps = new PrintStream(out);
        System.setOut(ps);

        precisionRecall(sequences, "GoKrimp");
        precisionRecall(sequences, "SQS");
        precisionRecall(sequences, "BIDE");
        precisionRecall(sequences, "ISM");

    }

    public static void precisionRecall(final Map<Sequence, Double> sequences, final String algorithm)
            throws IOException {

        // Mine sequences
        Set<Sequence> minedSequences = null;
        final File logFile = Logging.getLogFileName(algorithm, true, saveDir, dbFile);
        final long startTime = System.currentTimeMillis();
        if (algorithm.equals("BIDE")) {
            FrequentSequenceMining.mineClosedFrequentSequencesBIDE(dbFile.getAbsolutePath(), logFile.getAbsolutePath(),
                    minSup);
            minedSequences = FrequentSequenceMining.readFrequentSequences(logFile).keySet();
        } else if (algorithm.equals("ISM")) {
            minedSequences = SequenceMining
                    .mineSequences(dbFile, new InferGreedy(), maxStructureSteps, noIterations, logFile, false).keySet();
        } else if (algorithm.equals("GoKrimp")) {
            minedSequences = StatisticalSequenceMining.mineGoKrimpSequences(dbFile, logFile).keySet();
        } else if (algorithm.equals("SQS")) {
            minedSequences = StatisticalSequenceMining.mineSQSSequences(dbFile, logFile, 1).keySet();
        } else
            throw new RuntimeException("Incorrect algorithm name.");
        final long endTime = System.currentTimeMillis();
        final double time = (endTime - startTime) / (double) 1000;

        // Calculate sorted precision and recall
        final int len = minedSequences.size();
        final double[] precision = new double[len];
        final double[] recall = new double[len];
        for (int k = 1; k <= minedSequences.size(); k++) {

            final Set<Sequence> topKMined = Sets.newHashSet();
            for (final Sequence seq : minedSequences) {
                topKMined.add(seq);
                if (topKMined.size() == k)
                    break;
            }

            final double noInBoth = Sets.intersection(sequences.keySet(), topKMined).size();
            final double pr = noInBoth / (double) topKMined.size();
            final double rec = noInBoth / (double) sequences.size();
            precision[k - 1] = pr;
            recall[k - 1] = rec;
        }

        // Output precision and recall
        System.out.println("\n======== " + algorithm + " ========");
        System.out.println("No. mined sequences: " + len);
        System.out.println("Time: " + time);
        System.out.println("Precision (all): " + Arrays.toString(precision));
        System.out.println("Recall (all): " + Arrays.toString(recall));

    }

}
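The precision/recall computation above is the standard ranked-retrieval one: at cut-off k, precision is |topK ∩ actual| / k and recall is |topK ∩ actual| / |actual|. A small worked example with integers standing in for patterns (toy data, class name illustrative):

import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Set;

public class PrecisionRecallDemo {
    public static void main(final String[] args) {
        final Set<Integer> actual = new HashSet<>(Arrays.asList(1, 2, 3, 4));
        // mined patterns in ranked order
        final Set<Integer> mined = new LinkedHashSet<>(Arrays.asList(1, 9, 2));

        int k = 0;
        final Set<Integer> topK = new LinkedHashSet<>();
        for (final Integer pattern : mined) {
            topK.add(pattern);
            k++;
            final Set<Integer> both = new HashSet<>(topK);
            both.retainAll(actual); // intersection, as Sets.intersection above
            System.out.printf("k=%d precision=%.2f recall=%.2f%n", k,
                    both.size() / (double) k, both.size() / (double) actual.size());
        }
        // k=1: 1.00/0.25, k=2: 0.50/0.25, k=3: 0.67/0.50
    }
}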
--------------------------------------------------------------------------------
/sequence-miner/src/main/java/sequencemining/eval/PrecisionRecallParallel.java:
--------------------------------------------------------------------------------
package sequencemining.eval;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Map;
import java.util.Random;
import java.util.Set;

import org.apache.commons.io.output.TeeOutputStream;

import sequencemining.main.SequenceMining;
import sequencemining.sequence.Sequence;

public class PrecisionRecallParallel {

    public static void main(final String[] args) throws IOException, ClassNotFoundException {

        final String baseFolder = "/afs/inf.ed.ac.uk/user/j/jfowkes/Code/Sequences/";
        // final File dbFile = new File(baseFolder + "Datasets/parallel", ".dat");
        // generateParallelDataset(dbFile);

        // Set up logging
        final FileOutputStream outFile = new FileOutputStream(baseFolder + "PrecisionRecall/parallel_pr.txt");
        final TeeOutputStream out = new TeeOutputStream(System.out, outFile);
        final PrintStream ps = new PrintStream(out);
        System.setOut(ps);

        // Read SQS sequences
        final File outSQS = new File(baseFolder + "SQS/parallel_partial.txt");
        final Map<Sequence, Double> seqsSQS = StatisticalSequenceMining.readSQSSequences(outSQS);

        // Read GoKrimp sequences
        final File outGOKRIMP = new File(baseFolder + "GoKrimp/parallel.txt");
        final Map<Sequence, Double> seqsGOKRIMP = StatisticalSequenceMining.readGoKrimpSequences(outGOKRIMP);

        // Read ISM sequences
        final File outISM = new File(baseFolder + "Logs/parallel.log");
        final Map<Sequence, Double> seqsISM = SequenceMining.readISMSequences(outISM);

        // Precision-recall
        precisionRecall(seqsSQS, "SQS");
        precisionRecall(seqsGOKRIMP, "GoKrimp");
        precisionRecall(seqsISM, "ISM");

    }

    private static void precisionRecall(final Map<Sequence, Double> seqs, final String alg) {

        // Calculate sorted precision and recall
        final int len = seqs.size();
        final double[] precision = new double[len];
        final double[] recall = new double[len];
        for (int k = 1; k <= seqs.size(); k++) {

            final Set<Sequence> topKMined = new HashSet<>();
            for (final Sequence seq : seqs.keySet()) {
                topKMined.add(seq);
                if (topKMined.size() == k)
                    break;
            }

            // Calculate number of right patterns
            double right = 0;
            final Set<Integer> procs = new HashSet<>();
            outer: for (final Sequence seq : topKMined) {
                final int proc = seq.get(0) / 10;
                for (int i = 1; i < seq.size(); i++) {
                    if (seq.get(i) / 10 != proc)
                        continue outer; // skip patterns that mix processes
                }
                right++;
                procs.add(proc);
            }

            precision[k - 1] = right / topKMined.size();
            recall[k - 1] = procs.size() / 5.;
        }

        // Output precision and recall
        System.out.println("\n======== " + alg + " ========");
        System.out.println("No. mined sequences: " + len);
        System.out.println("Precision: " + Arrays.toString(precision));
        System.out.println("Recall: " + Arrays.toString(recall));

    }

    /** Generate parallel dataset */
    @SuppressWarnings("unused")
    private static void generateParallelDataset(final File dbFile)
            throws FileNotFoundException, UnsupportedEncodingException {
        final Random rand = new Random(1);
        final int[] states = new int[] { 0, 0, 0, 0, 0 };
        final PrintWriter db = new PrintWriter(dbFile, "UTF-8");
        for (int j = 1; j <= 1_000_000; j++) {
            final int proc = rand.nextInt(5);
            final int lab1 = proc + 1;
            final int lab2 = (states[proc] % 5) + 1;
            states[proc] += 1;
            db.write(lab1 + "" + lab2 + " -1 ");
            if (j % 100 == 0)
                db.write("-2\n");
        }
        db.close();
    }

}
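In the generated parallel dataset each item encodes its process in the tens digit (lab1 in 1..5) and its state in the units digit (lab2 in 1..5), which is why precisionRecall recovers the process of a pattern from seq.get(0) / 10. A two-line check of the encoding (class name illustrative):

public class ParallelEncodingDemo {
    public static void main(final String[] args) {
        final int item = Integer.parseInt(3 + "" + 4); // lab1 = 3, lab2 = 4 -> item 34
        System.out.println(item / 10); // process id: 3
        System.out.println(item % 10); // state: 4
    }
}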
--------------------------------------------------------------------------------
/sequence-miner/src/main/java/sequencemining/eval/SequenceScaling.java:
--------------------------------------------------------------------------------
package sequencemining.eval;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.text.DecimalFormat;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.LineIterator;
import org.apache.commons.io.output.TeeOutputStream;

import com.google.common.collect.Table;

import sequencemining.main.InferenceAlgorithms.InferGreedy;
import sequencemining.main.SequenceMining;
import sequencemining.main.SequenceMiningCore;
import sequencemining.sequence.Sequence;
import sequencemining.transaction.TransactionGenerator;
import sequencemining.util.Logging;

public class SequenceScaling {

    /** Main Settings */
    private static final File dbFile = new File("/disk/data1/jfowkes/sequence.txt");
    private static final File saveDir = new File("/disk/data1/jfowkes/logs/");

    /** Set of mined itemsets to use for background */
    private static final String name = "SIGN-based";
    private static final File sequenceLog = new File("/afs/inf.ed.ac.uk/user/j/jfowkes/Code/Sequences/Logs/SIGN.log");

    /** Spark Settings */
    private static final long MAX_RUNTIME = 24 * 60; // 24hrs (in minutes)
    private static final int maxStructureSteps = 100_000;
    private static final int maxEMIterations = 100;

    public static void main(final String[] args) throws IOException, ClassNotFoundException {

        // Run
        scalingTransactions(32, new int[] { 1_000, 10_000, 100_000, 1_000_000 });
    }

    public static void scalingTransactions(final int noCores, final int[] trans)
            throws IOException, ClassNotFoundException {

        final double[] time = new double[trans.length];
        final DecimalFormat formatter = new DecimalFormat("0.0E0");

        // Save to file
        final FileOutputStream outFile = new FileOutputStream(saveDir + "/" + name + "_scaling_" + noCores + ".txt");
        final TeeOutputStream out = new TeeOutputStream(System.out, outFile);
        final PrintStream ps = new PrintStream(out);
        System.setOut(ps);

        // Read in previously mined sequences
        final Map<Sequence, Double> sequences = SequenceMiningCore.readISMSequences(sequenceLog);
        System.out.print("\n============= ACTUAL SEQUENCES =============\n");
        for (final Entry<Sequence, Double> entry : sequences.entrySet()) {
            System.out.print(String.format("%s\tprob: %1.5f %n", entry.getKey(), entry.getValue()));
        }
        System.out.println("\nNo sequences: " + sequences.size());
        System.out.println("No items: " + countNoItems(sequences.keySet()));

        // Read in associated sequence count distribution
        @SuppressWarnings("unchecked")
        final Table<Sequence, Integer, Double> countDist = (Table<Sequence, Integer, Double>) Logging
                .deserializeFrom(FilenameUtils.removeExtension(sequenceLog.getAbsolutePath()) + ".dist");

        transloop: for (int i = 0; i < trans.length; i++) {

            final int tran = trans[i];
            System.out.println("\n========= " + formatter.format(tran) + " Transactions");

            // Generate transaction database
            TransactionGenerator.generateTransactionDatabase(sequences, countDist, tran, dbFile);
            SequenceScaling.printTransactionDBStats(dbFile);

            // Mine sequences
            final File logFile = Logging.getLogFileName("ISM", true, saveDir, dbFile);
            final long startTime = System.currentTimeMillis();
            SequenceMining.mineSequences(dbFile, new InferGreedy(), maxStructureSteps, maxEMIterations, logFile, false);

            final long endTime = System.currentTimeMillis();
            final double tim = (endTime - startTime) / (double) 1000;
            time[i] += tim;

            System.out.printf("Time (s): %.2f%n", tim);

            if (tim > MAX_RUNTIME * 60)
                break transloop;

        }

        // Print time
        System.out.println("\n========" + name + "========");
        System.out.println("Transactions:" + Arrays.toString(trans));
        System.out.println("Time: " + Arrays.toString(time));

        // and save to file
        out.close();
    }

    /**
     * Count the number of items in the sequences (sequences need not be
     * independent)
     */
    public static int countNoItems(final Set<Sequence> sequences) {
        final Set<Integer> items = new HashSet<>();
        for (final Sequence sequence : sequences)
            items.addAll(sequence);
        return items.size();
    }

    /** Print useful statistics for the transaction database */
    public static void printTransactionDBStats(final File dbFile) throws IOException {

        int noTransactions = 0;
        double sparsity = 0;
        final Set<Integer> singletons = new HashSet<>();
        final LineIterator it = FileUtils.lineIterator(dbFile, "UTF-8");
        while (it.hasNext()) {
            final String[] items = it.nextLine().replace("-2", "").split(" -1 ");
            for (final String item : items)
                singletons.add(Integer.parseInt(item));
            sparsity += items.length;
            noTransactions++;
        }
        LineIterator.closeQuietly(it);

        System.out.println("\nDatabase: " + dbFile);
        System.out.println("Items: " + singletons.size());
        System.out.println("Transactions: " + noTransactions);
        System.out.println("Avg. items per transaction: " + sparsity / noTransactions + "\n");

    }

}
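printTransactionDBStats parses the SPMF transaction format, where -1 separates items and -2 marks the end of a transaction. A self-contained sketch of the per-line parse (sample line invented):

import java.util.HashSet;
import java.util.Set;

public class DbLineStatsDemo {
    public static void main(final String[] args) {
        final String line = "12 -1 7 -1 12 -1 -2"; // invented sample transaction

        // Same parse as printTransactionDBStats: drop the -2 marker, split on separators
        final String[] items = line.replace("-2", "").split(" -1 ");
        final Set<Integer> singletons = new HashSet<>();
        for (final String item : items)
            singletons.add(Integer.parseInt(item));

        System.out.println("items in transaction: " + items.length); // 3
        System.out.println("distinct items: " + singletons.size()); // 2 (12 and 7)
    }
}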
--------------------------------------------------------------------------------
/sequence-miner/src/main/java/sequencemining/eval/SequenceSymmetricDistance.java:
--------------------------------------------------------------------------------
package sequencemining.eval;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.io.Writer;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.apache.commons.io.output.TeeOutputStream;

import com.google.common.base.Charsets;
import com.google.common.io.Files;

import sequencemining.main.SequenceMiningCore;
import sequencemining.sequence.Sequence;

public class SequenceSymmetricDistance {

    public static void main(final String[] args) throws IOException {

        // TODO re-run BIDE...
        final int topN = 50;
        final String baseDir = "/afs/inf.ed.ac.uk/user/j/jfowkes/Code/Sequences/";
        final String[] datasets = new String[] { "alice_punc", "GAZELLE1", "jmlr", "SIGN", "aslbu", "aslgt", "auslan2",
                "context", "pioneer", "skating" };

        // Set up logging
        final FileOutputStream outFile = new FileOutputStream(baseDir + "redundancy.txt");
        final TeeOutputStream out = new TeeOutputStream(System.out, outFile);
        final PrintStream ps = new PrintStream(out);
        System.setOut(ps);

        final Writer writer = Files.newWriter(new File(baseDir + "redundancy.tex"), Charsets.UTF_8);

        for (int i = 0; i < datasets.length; i++) {

            System.out.println("===== Dataset: " + datasets[i]);

            // ISM sequences
            final Map<Sequence, Double> intSequences = SequenceMiningCore
                    .readISMSequences(new File(baseDir + "Logs/" + datasets[i] + ".log"));
            calculateRedundancyStats("ISM", intSequences, topN, writer);

            // SQS sequences
            final Map<Sequence, Double> sqsSequences = StatisticalSequenceMining
                    .readSQSSequences(new File(baseDir + "SQS/" + datasets[i] + ".txt"));
            calculateRedundancyStats("SQS", sqsSequences, topN, writer);

            // GoKrimp sequences
            final Map<Sequence, Double> gokrimpSequences = StatisticalSequenceMining
                    .readGoKrimpSequences(new File(baseDir + "GoKrimp/" + datasets[i] + ".txt"));
            calculateRedundancyStats("GoKrimp", gokrimpSequences, topN, writer);

            // BIDE sequences
            final Map<Sequence, Integer> bideSequences = FrequentSequenceMining
                    .readFrequentSequences(new File(baseDir + "BIDE/" + datasets[i] + ".txt"));
            calculateRedundancyStats("BIDE", bideSequences, topN, writer);

            System.out.println();
        }
        writer.close();

    }

    private static void calculateRedundancyStats(final String name, final Map<Sequence, ?> intSequences,
            final int topN, final Writer writer) throws IOException {
        System.out.println("\n" + name + " Sequences\n-----------");
        System.out.println("No. sequences: " + intSequences.size());
        if (name.equals("ISM"))
            System.out.println(
                    "No. non-singleton sequences: " + filterSingletons(intSequences, Integer.MAX_VALUE).size());
        System.out.println("No. items: " + countNoItems(intSequences.keySet()));

        // Get top sequences and calculate stats
        final Set<Sequence> topIntSequences = filterSingletons(intSequences, topN);

        final double avgMinDiff = calculateRedundancy(topIntSequences);
        System.out.println("\nAvg. min edit dist: " + avgMinDiff);
        writer.write("$" + avgMinDiff + "$ & ");

        // Calculate spuriousness
        final double avgMaxSpur = calculateSpuriousness(topIntSequences);
        System.out.println("Avg. no. subseq: " + avgMaxSpur);
        writer.write("$" + avgMaxSpur + "$ & ");

        // Calculate no. items
        final int noItems = countNoItems(topIntSequences);
        System.out.println("No. items: " + noItems);
        writer.write("$" + noItems + "$ & ");

        // Calculate size
        final double avgSize = calculateAverageSize(topIntSequences);
        System.out.println("Avg. subseq size: " + avgSize);

        writer.write("\n");
    }

    private static double calculateRedundancy(final Set<Sequence> topSequences) {

        double avgMinDiff = 0;
        for (final Sequence seq1 : topSequences) {

            int minDiff = Integer.MAX_VALUE;
            for (final Sequence seq2 : topSequences) {
                if (!seq1.equals(seq2)) {
                    final int diff = editDistance(seq1, seq2);
                    if (diff < minDiff)
                        minDiff = diff;
                }
            }
            avgMinDiff += minDiff;
        }
        avgMinDiff /= topSequences.size();

        return avgMinDiff;
    }

    /**
     * Calculate the Levenshtein distance between two sequences using the
     * Wagner-Fischer algorithm
     *
     * @see http://en.wikipedia.org/wiki/Levenshtein_distance
     */
    private static int editDistance(final Sequence s, final Sequence t) {
        final int m = s.size();
        final int n = t.size();

        // for all i and j, d[i,j] will hold the Levenshtein distance between
        // the first i characters of s and the first j characters of t
        final int[][] d = new int[m + 1][n + 1];

        // the distance of any first string to an empty second string
        for (int i = 1; i <= m; i++)
            d[i][0] = i;

        // the distance of any second string to an empty first string
        for (int j = 1; j <= n; j++)
            d[0][j] = j;

        for (int j = 1; j <= n; j++) {
            for (int i = 1; i <= m; i++) {
                if (s.get(i - 1).equals(t.get(j - 1))) { // compare boxed Integers by value, not reference
                    d[i][j] = d[i - 1][j - 1]; // no operation required
                } else {
                    d[i][j] = Math.min(d[i - 1][j] + 1, // a deletion
                            Math.min(d[i][j - 1] + 1, // an insertion
                                    d[i - 1][j - 1] + 1)); // a substitution
                }
            }
        }

        return d[m][n];
    }
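
    // Worked example for the DP above: s = [1, 2, 3], t = [1, 3] fills the table
    //
    //            ""   1   3
    //        ""   0   1   2
    //         1   1   0   1
    //         2   2   1   1
    //         3   3   2   1
    //
    // so editDistance(s, t) = d[3][2] = 1: deleting the 2 turns s into t.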

    /**
     * Count the number of distinct items in the set of sequences
     */
    public static int countNoItems(final Set<Sequence> sequences) {
        final Set<Integer> items = new HashSet<>();
        for (final Sequence seq : sequences)
            items.addAll(seq.getItems());
        return items.size();
    }

    private static double calculateAverageSize(final Set<Sequence> topSequences) {

        double avgSize = 0;
        for (final Sequence seq : topSequences)
            avgSize += seq.size();
        return avgSize / topSequences.size();
    }

    private static double calculateSpuriousness(final Set<Sequence> topSequences) {

        double avgSubseq = 0;
        for (final Sequence seq1 : topSequences) {
            for (final Sequence seq2 : topSequences) {
                if (!seq1.equals(seq2))
                    avgSubseq += isSubseq(seq1, seq2);
            }
        }
        avgSubseq /= topSequences.size();

        return avgSubseq;
    }

    /** Filter out singletons */
    static Set<Sequence> filterSingletons(final Map<Sequence, ?> seqs, final int topN) {

        int count = 0;
        final Set<Sequence> topSeqs = new HashSet<>();
        for (final Sequence seq : seqs.keySet()) {
            if (seq.size() != 1) {
                topSeqs.add(seq);
                count++;
            }
            if (count == topN)
                break;
        }
        if (topN != Integer.MAX_VALUE && count < topN)
            System.out.println("WARNING: not enough non-singleton sequences in set: " + count);

        return topSeqs;
    }

    private static int isSubseq(final Sequence seq1, final Sequence seq2) {
        if (seq2.contains(seq1))
            return 1;
        return 0;
    }

}
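
isSubseq above delegates to Sequence.contains(Sequence); in this setting that is presumably (possibly gapped) subsequence containment. A standalone sketch of that check over integer lists, under that assumption:

import java.util.Arrays;
import java.util.List;

public class SubseqDemo {
    /** True if small occurs in big as a (possibly gapped) subsequence */
    static boolean isSubsequence(final List<Integer> small, final List<Integer> big) {
        int i = 0;
        for (final Integer item : big) {
            if (i < small.size() && small.get(i).equals(item))
                i++;
        }
        return i == small.size();
    }

    public static void main(final String[] args) {
        System.out.println(isSubsequence(Arrays.asList(1, 3), Arrays.asList(1, 2, 3))); // true
        System.out.println(isSubsequence(Arrays.asList(3, 1), Arrays.asList(1, 2, 3))); // false
    }
}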
--------------------------------------------------------------------------------
/sequence-miner/src/main/java/sequencemining/eval/StatisticalSequenceMining.java:
--------------------------------------------------------------------------------
package sequencemining.eval;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.lang.ProcessBuilder.Redirect;
import java.util.LinkedHashMap;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;

import ca.pfv.spmf.algorithms.sequentialpatterns.goKrimp.AlgoGoKrimp;
import ca.pfv.spmf.algorithms.sequentialpatterns.goKrimp.DataReader;
import sequencemining.sequence.Sequence;

public class StatisticalSequenceMining {

    public static void main(final String[] args) throws IOException {

        // Datasets
        final String[] datasets = new String[] { "GAZELLE1" };
        for (int i = 0; i < datasets.length; i++) {
            final File dbPath = new File(
                    "/afs/inf.ed.ac.uk/user/j/jfowkes/Code/Sequences/Datasets/Paper/" + datasets[i] + ".dat");

            // Run GoKRIMP
            // final File saveFileGoKRIMP = new File(
            //         "/afs/inf.ed.ac.uk/user/j/jfowkes/Code/Sequences/GoKrimp/" + datasets[i] + ".txt");
            // mineGoKrimpSequences(dbPath, saveFileGoKRIMP);

            // Run SQS
            final File saveFileSQS = new File(
                    "/afs/inf.ed.ac.uk/user/j/jfowkes/Code/Sequences/SQS/" + datasets[i] + ".txt");
            mineSQSSequences(dbPath, saveFileSQS, 1);
        }

    }

    public static LinkedHashMap<Sequence, Double> mineGoKrimpSequences(final File dataset, final File saveFile)
            throws IOException {

        // Convert to GoKrimp dataset format
        final File TMPDB = File.createTempFile("gokrimp-dataset", ".dat");
        convertDatasetGoKrimpFormat(dataset, TMPDB);

        // Set GoKrimp settings
        final String cmd[] = new String[2];
        cmd[0] = "/afs/inf.ed.ac.uk/user/j/jfowkes/Packages/gokrimp/gokrimp.sh";
        cmd[1] = TMPDB.toString().replace(".dat", "");
        runScript(cmd, saveFile);

        TMPDB.delete();

        return readGoKrimpSequences(saveFile);
    }

    public static LinkedHashMap<Sequence, Double> mineSQSSequences(final File dataset, final File saveFile,
            final int minUsage) throws IOException {

        // Convert to SQS dataset format
        final File TMPDB = File.createTempFile("sqs-dataset", ".dat");
        convertDatasetSQSFormat(dataset, TMPDB);

        // Set SQS settings
        final String cmd[] = new String[5];
        cmd[0] = "/afs/inf.ed.ac.uk/user/j/jfowkes/Packages/sqs/sqs.sh";
        cmd[1] = "-i " + TMPDB;
        cmd[2] = "-t " + minUsage; // default is 1
        cmd[3] = "-o " + saveFile;
        cmd[4] = "-m search"; // search - scan db directly, order - compress given patterns
        // cmd[5] = "-p"; // patterns file (for order method)
        runScript(cmd, null);

        TMPDB.delete();

        return readSQSSequences(saveFile);
    }

    /** Convert dataset from SPMF format to SQS format */
    private static void convertDatasetSQSFormat(final File inputDB, final File outputDB) throws IOException {

        // Output DB
        final BufferedWriter db = new BufferedWriter(new FileWriter(outputDB));

        // for each line (transaction) until the end of file
        boolean newSeq = false;
        final LineIterator it = FileUtils.lineIterator(inputDB, "UTF-8");
        while (it.hasNext()) {

            final String line = it.nextLine();
            // skip the line if it is a comment, empty or metadata
            if (line.isEmpty() || line.charAt(0) == '#' || line.charAt(0) == '%' || line.charAt(0) == '@') {
                continue;
            }

            // sequence separator
            if (newSeq)
                db.write("-1 ");

            // split the transaction into items
            final String[] lineSplited = line.split(" ");

            for (int i = 0; i < lineSplited.length; i++) {
                if (lineSplited[i].equals("-1")) { // end of item: skip separator
                } else if (lineSplited[i].equals("-2")) { // end of sequence
                    newSeq = true;
                } else { // extract the value for an item
                    db.write(lineSplited[i] + " ");
                }
            }

        }
        db.newLine();
        db.close();

        // close the input file
        LineIterator.closeQuietly(it);

    }

    /** Convert dataset from SPMF format to GoKrimp format */
    private static void convertDatasetGoKrimpFormat(final File inputDB, final File outputDB) throws IOException {

        // Output DB
        final BufferedWriter db = new BufferedWriter(new FileWriter(outputDB));

        // for each line (transaction) until the end of file
        boolean newSeq = false;
        final LineIterator it = FileUtils.lineIterator(inputDB, "UTF-8");
        while (it.hasNext()) {

            final String line = it.nextLine();
            // skip the line if it is a comment, empty or metadata
            if (line.isEmpty() || line.charAt(0) == '#' || line.charAt(0) == '%' || line.charAt(0) == '@') {
                continue;
            }

            // sequence separator
            if (newSeq)
                db.write("\n");

            // split the transaction into items
            final String[] lineSplited = line.split(" ");

            for (int i = 0; i < lineSplited.length; i++) {
                if (lineSplited[i].equals("-1")) { // end of item: skip separator
                } else if (lineSplited[i].equals("-2")) { // end of sequence
                    newSeq = true;
                } else { // extract the value for an item
                    db.write(lineSplited[i] + " ");
                }
            }

        }
        db.newLine();
        db.close();

        // close the input file
        LineIterator.closeQuietly(it);

    }

    /** Read in SQS sequences (sorted by worth) */
    public static LinkedHashMap<Sequence, Double> readSQSSequences(final File output) throws IOException {
        final LinkedHashMap<Sequence, Double> sequences = new LinkedHashMap<>();

        final LineIterator it = FileUtils.lineIterator(output);
        while (it.hasNext()) {
            final String line = it.nextLine();
            if (!line.trim().isEmpty()) {
                // NB: the pattern/worth delimiter on the next line was garbled in this
                // listing; '@' is an assumption about the SQS output format
                final String[] splitLine = line.split("@");
                final String[] items = splitLine[0].trim().split(" ");
                final Sequence seq = new Sequence();
                for (final String item : items)
                    seq.add(Integer.parseInt(item));
                final double worth = Double.parseDouble(splitLine[1].trim().split(" ")[0]);
                sequences.put(seq, worth);
            }
        }

        return sequences;
    }

    /** Read in GoKrimp sequences (sorted by compression benefit) */
    public static LinkedHashMap<Sequence, Double> readGoKrimpSequences(final File output) throws IOException {
        final LinkedHashMap<Sequence, Double> sequences = new LinkedHashMap<>();

        final LineIterator it = FileUtils.lineIterator(output);
        while (it.hasNext()) {
            final String line = it.nextLine();
            if (!line.trim().isEmpty() && line.charAt(0) == '[') {
                final String[] splitLine = line.split(" ");
                final double worth = Double.parseDouble(splitLine[splitLine.length - 1]);
                final Sequence seq = new Sequence();
                for (int i = 1; i < splitLine.length - 2; i++)
                    seq.add(Integer.parseInt(splitLine[i]));
                sequences.put(seq, worth);
            }
        }

        return sequences;
    }

    /**
     * @deprecated gives slightly different results to the reference implementation
     */
    @Deprecated
    public static LinkedHashMap<Sequence, Double> mineGoKrimpSequencesSPMF(final File dataset, final File saveFile)
            throws IOException {

        final DataReader d = new DataReader();
        final AlgoGoKrimp g = d.readData_SPMF(dataset.getAbsolutePath(), "");
        // g.printData();
        g.setOutputFilePath(saveFile.getAbsolutePath());
        g.gokrimp();

        return readGoKrimpSequencesSPMF(saveFile);
    }

    /**
     * Read in GOKRIMP sequences (sorted by compression benefit)
     *
     * @deprecated gives slightly different results to the reference implementation
     */
    @Deprecated
    public static LinkedHashMap<Sequence, Double> readGoKrimpSequencesSPMF(final File output) throws IOException {
        final LinkedHashMap<Sequence, Double> sequences = new LinkedHashMap<>();

        final LineIterator it = FileUtils.lineIterator(output);
        while (it.hasNext()) {
            final String line = it.nextLine();
            if (!line.trim().isEmpty()) {
                final String[] splitLine = line.split("#SUP:");
                final String[] items = splitLine[0].trim().split(" ");
                final Sequence seq = new Sequence();
                for (final String item : items)
                    seq.add(Integer.parseInt(item.trim()));
                final double compressionBenefit = Double.parseDouble(splitLine[1].trim());
                sequences.put(seq, compressionBenefit);
            }
        }

        return sequences;
    }

    /** Run shell script with command line arguments */
    public static void runScript(final String cmd[], final File outFile) {

        try {
            final ProcessBuilder pb = new ProcessBuilder(cmd);
            if (outFile != null)
                pb.redirectOutput(outFile);
            else
                pb.redirectOutput(Redirect.INHERIT);
            pb.redirectError(Redirect.INHERIT);
            final Process process = pb.start();
            process.waitFor();
            process.destroy();
        } catch (final Exception e) {
            e.printStackTrace();
        }

    }

}

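convertDatasetSQSFormat above drops SPMF's -1 item separators and rewrites the -2 end-of-sequence markers as ' -1 ' separators in SQS's single-stream format. A compact sketch of that transformation on two sample lines (data and class name invented):

public class SqsFormatDemo {
    public static void main(final String[] args) {
        final String[] spmfLines = { "1 -1 2 -1 -2", "3 -1 4 -1 -2" };

        final StringBuilder sqs = new StringBuilder();
        boolean newSeq = false;
        for (final String line : spmfLines) {
            if (newSeq)
                sqs.append("-1 "); // sequence separator in SQS format
            for (final String token : line.split(" ")) {
                if (token.equals("-2"))
                    newSeq = true;
                else if (!token.equals("-1"))
                    sqs.append(token).append(' ');
            }
        }
        System.out.println(sqs); // 1 2 -1 3 4
    }
}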
--------------------------------------------------------------------------------
/sequence-miner/src/main/java/sequencemining/main/EMStep.java:
--------------------------------------------------------------------------------
package sequencemining.main;

import static java.util.function.Function.identity;
import static java.util.stream.Collectors.counting;
import static java.util.stream.Collectors.groupingBy;

import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import com.google.common.collect.HashBasedTable;
import com.google.common.collect.Multiset;
import com.google.common.collect.Table;

import sequencemining.main.InferenceAlgorithms.InferenceAlgorithm;
import sequencemining.sequence.Sequence;
import sequencemining.transaction.Transaction;
import sequencemining.transaction.TransactionDatabase;
import sequencemining.util.Tuple2;

/** Class to hold the various transaction EM Steps */
public class EMStep {

    /** Initialize cached sequences */
    static void initializeCachedSequences(final TransactionDatabase transactions,
            final Table<Sequence, Integer, Double> initProbs) {
        transactions.getTransactionList().parallelStream().forEach(t -> t.initializeCachedSequences(initProbs));
    }

    /** EM-step for hard EM */
    static Table<Sequence, Integer, Double> hardEMStep(final List<Transaction> transactions,
            final InferenceAlgorithm inferenceAlgorithm) {
        final double noTransactions = transactions.size();

        // E-step
        final Map<Multiset.Entry<Sequence>, Long> coveringWithCounts = transactions.parallelStream().map(t -> {
            final Multiset<Sequence> covering = inferenceAlgorithm.infer(t);
            t.setCachedCovering(covering);
            return covering.entrySet();
        }).flatMap(Set::stream).collect(groupingBy(identity(), counting()));

        // M-step
        final Table<Sequence, Integer, Double> newSequences = coveringWithCounts.entrySet().parallelStream().collect(
                HashBasedTable::create,
                (t, e) -> t.put(e.getKey().getElement(), e.getKey().getCount(), e.getValue() / noTransactions),
                Table::putAll);
        newSequences.rowKeySet().parallelStream().forEach(seq -> {
            // Pad with zero counts for non-occurrences
            final int maxOccur = Collections.max(newSequences.row(seq).keySet());
            for (int occur = 1; occur <= maxOccur; occur++) {
                if (!newSequences.contains(seq, occur))
                    newSequences.put(seq, occur, 0.);
            }
            // Add probabilities for zero occurrences
            double rowSum = 0;
            for (final Double count : newSequences.row(seq).values())
                rowSum += count;
            newSequences.put(seq, 0, 1 - rowSum);
        });

        // Update cached sequences
        transactions.parallelStream().forEach(t -> t.updateCachedSequences(newSequences));

        return newSequences;
    }
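
    // Note on the M-step above: for each sequence S, newSequences.row(S) maps an
    // occurrence count c >= 1 to the fraction of transactions whose covering used
    // S exactly c times, and the c = 0 cell is set to 1 minus the row sum. Each
    // row is thus a probability distribution over occurrence counts, e.g. a
    // sequence used once in 30% and twice in 10% of transactions gets the row
    // {0: 0.6, 1: 0.3, 2: 0.1}.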
68 |
69 | /** Get average cost of last EM-step */
70 | static double calculateAverageCost(final TransactionDatabase transactions) {
71 | final double noTransactions = transactions.size();
72 | return transactions.getTransactionList().parallelStream().mapToDouble(Transaction::getCachedCost).sum()
73 | / noTransactions;
74 | }
75 |
76 | /** EM-step for structural EM */
77 | static Tuple2> structuralEMStep(final TransactionDatabase transactions,
78 | final InferenceAlgorithm inferenceAlgorithm, final Sequence candidate) {
79 | final double noTransactions = transactions.size();
80 |
81 | // Calculate max. no. of candidate occurrences
82 | final int maxReps = transactions.getTransactionList().parallelStream().mapToInt(t -> t.repetitions(candidate))
83 | .max().getAsInt();
84 | final Map initProb = new HashMap<>();
85 | initProb.put(0, 0.);
86 | for (int occur = 1; occur <= maxReps; occur++)
87 | initProb.put(occur, 1.);
88 |
89 | // E-step (adding candidate to transactions that support it)
90 | final Map<Multiset.Entry<Sequence>, Long> coveringWithCounts = transactions.getTransactionList()
91 | .parallelStream().map(t -> {
92 | if (t.contains(candidate)) {
93 | t.addSequenceCache(candidate, initProb);
94 | final Multiset<Sequence> covering = inferenceAlgorithm.infer(t);
95 | t.setTempCachedCovering(covering);
96 | return covering.entrySet();
97 | }
98 | return t.getCachedCovering().entrySet();
99 | }).flatMap(Set::stream).collect(groupingBy(identity(), counting()));
100 |
101 | // M-step
102 | final Table<Sequence, Integer, Double> newSequences = coveringWithCounts.entrySet().parallelStream().collect(
103 | HashBasedTable::create,
104 | (t, e) -> t.put(e.getKey().getElement(), e.getKey().getCount(), e.getValue() / noTransactions),
105 | Table::putAll);
106 | newSequences.rowKeySet().parallelStream().forEach(seq -> {
107 | // Pad with zero counts for non-occurrences
108 | final int maxOccur = Collections.max(newSequences.row(seq).keySet());
109 | for (int occur = 1; occur <= maxOccur; occur++) {
110 | if (!newSequences.contains(seq, occur))
111 | newSequences.put(seq, occur, 0.);
112 | } // Add probabilities for zero occurrences
113 | double rowSum = 0;
114 | for (final Double count : newSequences.row(seq).values())
115 | rowSum += count;
116 | newSequences.put(seq, 0, 1 - rowSum);
117 | });
118 |
119 | // Get average cost (removing candidate from supported transactions)
120 | final double averageCost = transactions.getTransactionList().parallelStream().mapToDouble(t -> {
121 | double cost;
122 | if (t.contains(candidate))
123 | cost = t.getTempCachedCost(newSequences);
124 | else
125 | cost = t.getCachedCost(newSequences);
126 | t.removeSequenceCache(candidate);
127 | return cost;
128 | }).sum() / noTransactions;
129 |
130 | // Get candidate prob
131 | final Map<Integer, Double> prob = newSequences.row(candidate);
132 |
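// Note: the candidate's cached probabilities were removed again in the cost
// loop above, so this step only scores the candidate; a caller that accepts
// it (improved average cost) re-adds it via addAcceptedCandidateCache(),
// which reuses the temp cached coverings computed here.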
133 | return new Tuple2<Double, Map<Integer, Double>>(averageCost, prob);
134 | }
135 |
136 | /** Add accepted candidate sequence to cache */
137 | static Table<Sequence, Integer, Double> addAcceptedCandidateCache(final TransactionDatabase transactions,
138 | final Sequence candidate, final Map<Integer, Double> prob) {
139 | final double noTransactions = transactions.size();
140 |
141 | // Cached E-step (adding candidate to transactions that support it)
142 | final Map<Multiset.Entry<Sequence>, Long> coveringWithCounts = transactions.getTransactionList()
143 | .parallelStream().map(t -> {
144 | if (t.contains(candidate)) {
145 | t.addSequenceCache(candidate, prob);
146 | final Multiset<Sequence> covering = t.getTempCachedCovering();
147 | t.setCachedCovering(covering);
148 | return covering.entrySet();
149 | }
150 | return t.getCachedCovering().entrySet();
151 | }).flatMap(Set::stream).collect(groupingBy(identity(), counting()));
152 |
153 | // M-step
154 | final Table<Sequence, Integer, Double> newSequences = coveringWithCounts.entrySet().parallelStream().collect(
155 | HashBasedTable::create,
156 | (t, e) -> t.put(e.getKey().getElement(), e.getKey().getCount(), e.getValue() / noTransactions),
157 | Table::putAll);
158 | newSequences.rowKeySet().parallelStream().forEach(seq -> {
159 | // Pad with zero counts for non-occurrences
160 | final int maxOccur = Collections.max(newSequences.row(seq).keySet());
161 | for (int occur = 1; occur <= maxOccur; occur++) {
162 | if (!newSequences.contains(seq, occur))
163 | newSequences.put(seq, occur, 0.);
164 | } // Add probabilities for zero occurrences
165 | double rowSum = 0;
166 | for (final Double count : newSequences.row(seq).values())
167 | rowSum += count;
168 | newSequences.put(seq, 0, 1 - rowSum);
169 | });
170 |
171 | // Update cached sequences
172 | transactions.getTransactionList().parallelStream().forEach(t -> t.updateCachedSequences(newSequences));
173 |
174 | return newSequences;
175 | }
176 |
177 | /** Get the support of given sequences */
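// (A transaction contributes at most 1 to a sequence's support, however often
// the sequence occurs in it; e.g. [1, 2, 1, 2] adds 1 to the support of [1, 2].)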
178 | static Map<Sequence, Long> getSupportsOfSequences(final TransactionDatabase transactions,
179 | final Set<Sequence> sequences) {
180 | return transactions.getTransactionList().parallelStream().map(t -> {
181 | final HashSet<Sequence> supportedSeqs = new HashSet<>();
182 | for (final Sequence seq : sequences) {
183 | if (t.contains(seq))
184 | supportedSeqs.add(seq);
185 | }
186 | return supportedSeqs;
187 | }).flatMap(Set::stream).collect(groupingBy(identity(), counting()));
188 | }
189 |
190 | private EMStep() {
191 | }
192 |
193 | }
194 |
--------------------------------------------------------------------------------
/sequence-miner/src/main/java/sequencemining/main/InferenceAlgorithms.java:
--------------------------------------------------------------------------------
1 | package sequencemining.main;
2 |
3 | import java.io.Serializable;
4 | import java.util.BitSet;
5 | import java.util.Map;
6 |
7 | import com.google.common.collect.HashMultiset;
8 | import com.google.common.collect.Multiset;
9 | import com.google.common.collect.Table;
10 |
11 | import sequencemining.sequence.Sequence;
12 | import sequencemining.transaction.Transaction;
13 |
14 | /** Container class for Inference Algorithms */
15 | public class InferenceAlgorithms {
16 |
17 | /** Interface for the different inference algorithms */
18 | public interface InferenceAlgorithm {
19 | public Multiset<Sequence> infer(final Transaction transaction);
20 | }
21 |
22 | /**
23 | * Infer ML parameters to explain transaction using greedy algorithm and
24 | * store in covering. Sequences *may not* overlap.
25 | *
26 | * !! Assumes *no overlap* !! i.e. subseqs in covering are pairwise disjoint
27 | */
28 | public static class InferGreedy implements InferenceAlgorithm, Serializable {
29 | private static final long serialVersionUID = 9173178089235828142L;
30 |
31 | @Override
32 | public Multiset<Sequence> infer(final Transaction transaction) {
33 |
34 | final Multiset<Sequence> covering = HashMultiset.create();
35 | int lenCovering = 0;
36 | final int transactionSize = transaction.size();
37 | final BitSet coveredItems = new BitSet(transactionSize);
38 |
39 | final Table<Sequence, Integer, Double> cachedSequences = transaction.getCachedSequences();
40 | while (coveredItems.cardinality() != transactionSize) {
41 |
42 | double minCostPerItem = Double.POSITIVE_INFINITY;
43 | Sequence bestSeq = null;
44 | BitSet bestSeqCoveredItems = null;
45 |
46 | for (final Sequence seq : cachedSequences.rowKeySet()) {
47 |
48 | // How many additional items does sequence cover?
49 | final BitSet seqCoveredItems = transaction.getCovered(seq, coveredItems);
50 | // Ignore sequences which don't cover anything
51 | if (seqCoveredItems.isEmpty())
52 | continue;
53 |
54 | // Get seq multiplicity in covering
55 | final int occur = covering.count(seq);
56 |
57 | // TODO triple check that this is right!!!
58 | // Calculate f(CuS) - f(C)
59 | Double prob1 = cachedSequences.get(seq, occur + 1);
60 | if (prob1 == null)
61 | prob1 = 0.; // Empty multiplicities have zero prob
62 | else if (prob1 == 0. && isInnerProb(occur + 1, cachedSequences.row(seq)))
63 | prob1 = Double.MIN_VALUE; // Smooth zero inner probs
64 | double prob = cachedSequences.get(seq, occur);
65 | if (prob == 0. && isInnerProb(occur, cachedSequences.row(seq)))
66 | prob = Double.MIN_VALUE; // Smooth zero inner probs
67 | final double cost = -Math.log(prob1) + Math.log(prob)
68 | + sumLogRange(lenCovering + 1, lenCovering + seq.size());
69 | final double costPerItem = cost / seq.size();
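// i.e. cost = -log[ p(z_S = occur + 1) / p(z_S = occur) ]
//            + log[ (lenCovering + |S|)! / lenCovering! ]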
70 |
71 | if (costPerItem < minCostPerItem) {
72 | minCostPerItem = costPerItem;
73 | bestSeq = seq;
74 | bestSeqCoveredItems = seqCoveredItems;
75 | }
76 |
77 | }
78 |
79 | if (bestSeq != null) {
80 | // final int firstItemCovered = bestSeqCoveredItems
81 | // .nextSetBit(0);
82 | // covering.put(bestSeq, firstItemCovered);
83 | covering.add(bestSeq);
84 | lenCovering += bestSeq.size();
85 | coveredItems.or(bestSeqCoveredItems);
86 | } else { // Fill in incomplete coverings with singletons
87 | int index = 0;
88 | while (coveredItems.cardinality() != transactionSize) {
89 | index = coveredItems.nextClearBit(index);
90 | final Sequence seq = new Sequence(transaction.get(index));
91 | covering.add(seq);
92 | coveredItems.set(index);
93 | }
94 | return covering;
95 | }
96 |
97 | }
98 | return covering;
99 | }
100 |
101 | private boolean isInnerProb(final int probIndex, final Map<Integer, Double> probVec) {
102 | for (int i = probIndex + 1; i < probVec.size(); i++) {
103 | if (probVec.get(i) != 0.)
104 | return true;
105 | }
106 | return false;
107 | }
108 |
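// sumLogRange(a, b) = log a + log(a+1) + ... + log b = log( b! / (a-1)! ),
// e.g. sumLogRange(3, 5) = log(3 * 4 * 5) = log 60.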
109 | private double sumLogRange(final int a, final int b) {
110 | double sum = 0;
111 | for (int i = a; i <= b; i++)
112 | sum += Math.log(i);
113 | return sum;
114 | }
115 |
116 | }
117 |
118 | // /**
119 | // * Infer ML parameters to explain transaction using greedy algorithm and
120 | // * store in covering. Sequences may overlap.
121 | // *
122 | // * This is an O(log(n))-approximation algorithm where n is the number of
123 | // * elements in the transaction.
124 | // */
125 | // public static class InferGreedyOld implements InferenceAlgorithm,
126 | // Serializable {
127 | // private static final long serialVersionUID = 9173178089235828142L;
128 | //
129 | // @Override
130 | // public HashSet<Sequence> infer(final Transaction transaction) {
131 | //
132 | // final HashSet<Sequence> covering = new HashSet<>();
133 | // final int transactionSize = transaction.size();
134 | // final BitSet coveredItems = new BitSet(transactionSize);
135 | //
136 | // final HashMap<Sequence, Double> cachedSequences = transaction
137 | // .getCachedSequences();
138 | // while (coveredItems.cardinality() != transactionSize) {
139 | //
140 | // double minCostPerItem = Double.POSITIVE_INFINITY;
141 | // Sequence bestSeq = null;
142 | // BitSet bestSeqCoveredItems = null;
143 | //
144 | // for (final Entry<Sequence, Double> entry : cachedSequences
145 | // .entrySet()) {
146 | //
147 | // // Ignore sequences which already cover
148 | // if (covering.contains(entry.getKey()))
149 | // continue;
150 | //
151 | // // How many additional items does sequence cover?
152 | // final BitSet seqCoveredItems = transaction.getCovered(
153 | // entry.getKey(), coveredItems);
154 | // // Ignore sequences which don't cover anything
155 | // if (seqCoveredItems.isEmpty())
156 | // continue;
157 | // final BitSet newlyCoveredItems = (BitSet) seqCoveredItems
158 | // .clone();
159 | // newlyCoveredItems.or(coveredItems);
160 | // final int notCovered = newlyCoveredItems.cardinality()
161 | // - coveredItems.cardinality();
162 | //
163 | // final double cost = -Math.log(entry.getValue());
164 | // final double costPerItem = cost / notCovered;
165 | //
166 | // if (costPerItem < minCostPerItem) {
167 | // minCostPerItem = costPerItem;
168 | // bestSeq = entry.getKey();
169 | // bestSeqCoveredItems = seqCoveredItems;
170 | // }
171 | //
172 | // }
173 | //
174 | // if (bestSeq != null) {
175 | // // final int firstItemCovered = bestSeqCoveredItems
176 | // // .nextSetBit(0);
177 | // // covering.put(bestSeq, firstItemCovered);
178 | // covering.add(bestSeq);
179 | // coveredItems.or(bestSeqCoveredItems);
180 | // } else { // Allow incomplete coverings
181 | // break;
182 | // }
183 | //
184 | // }
185 | // return covering;
186 | // }
187 | //
188 | // }
189 |
190 | private InferenceAlgorithms() {
191 |
192 | }
193 |
194 | }
195 |
--------------------------------------------------------------------------------
/sequence-miner/src/main/java/sequencemining/main/SequenceMining.java:
--------------------------------------------------------------------------------
1 | package sequencemining.main;
2 |
3 | import java.io.File;
4 | import java.io.IOException;
5 | import java.text.SimpleDateFormat;
6 | import java.util.ArrayList;
7 | import java.util.Collections;
8 | import java.util.Date;
9 | import java.util.HashMap;
10 | import java.util.List;
11 | import java.util.Map;
12 | import java.util.Map.Entry;
13 | import java.util.logging.Level;
14 |
15 | import org.apache.commons.io.FileUtils;
16 | import org.apache.commons.io.FilenameUtils;
17 | import org.apache.commons.io.LineIterator;
18 |
19 | import com.beust.jcommander.IStringConverter;
20 | import com.beust.jcommander.JCommander;
21 | import com.beust.jcommander.Parameter;
22 | import com.beust.jcommander.ParameterException;
23 | import com.google.common.base.Charsets;
24 | import com.google.common.collect.HashBasedTable;
25 | import com.google.common.collect.HashMultiset;
26 | import com.google.common.collect.Multiset;
27 | import com.google.common.collect.Table;
28 | import com.google.common.io.Files;
29 |
30 | import sequencemining.main.InferenceAlgorithms.InferGreedy;
31 | import sequencemining.main.InferenceAlgorithms.InferenceAlgorithm;
32 | import sequencemining.sequence.Sequence;
33 | import sequencemining.transaction.Transaction;
34 | import sequencemining.transaction.TransactionList;
35 | import sequencemining.util.Logging;
36 |
37 | public class SequenceMining extends SequenceMiningCore {
38 |
39 | /** Main function parameters */
40 | public static class Parameters {
41 |
42 | @Parameter(names = { "-f", "--file" }, description = "Dataset filename")
43 | private final File dataset = new File(
44 | "/afs/inf.ed.ac.uk/user/j/jfowkes/Code/Sequences/Datasets/Paper/jmlr.dat");
45 |
46 | @Parameter(names = { "-s", "--maxSteps" }, description = "Max structure steps")
47 | int maxStructureSteps = 100_000;
48 |
49 | @Parameter(names = { "-i", "--iterations" }, description = "Max iterations")
50 | int maxEMIterations = 1_000;
51 |
52 | @Parameter(names = { "-l", "--log-level" }, description = "Log level", converter = LogLevelConverter.class)
53 | Level logLevel = Level.FINE;
54 |
55 | @Parameter(names = { "-r", "--runtime" }, description = "Max Runtime (min)")
56 | long maxRunTime = 72 * 60; // 72hrs
57 |
58 | @Parameter(names = { "-t", "--timestamp" }, description = "Timestamp Logfile", arity = 1)
59 | boolean timestampLog = true;
60 |
61 | @Parameter(names = { "-d", "--dist" }, description = "Save sequence count distribution")
62 | private boolean saveCountDist = false;
63 |
64 | @Parameter(names = { "-v", "--verbose" }, description = "Print to console instead of logfile")
65 | private boolean verbose = false;
66 | }
67 |
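// Example invocation (jar and dataset names illustrative):
//   java -cp sequence-miner.jar sequencemining.main.SequenceMining \
//       -f datasets/alice.dat -i 100 -r 60 -v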
68 | public static void main(final String[] args) throws IOException {
69 |
70 | // Main fixed parameters
71 | final InferenceAlgorithm inferenceAlg = new InferGreedy();
72 |
73 | // Runtime parameters
74 | final Parameters params = new Parameters();
75 | final JCommander jc = new JCommander(params);
76 |
77 | try {
78 | jc.parse(args);
79 |
80 | // Set loglevel, runtime, timestamp and log file
81 | LOG_LEVEL = params.logLevel;
82 | MAX_RUNTIME = params.maxRunTime * 60 * 1_000;
83 | File logFile = null;
84 | if (!params.verbose)
85 | logFile = Logging.getLogFileName("ISM", params.timestampLog, LOG_DIR, params.dataset);
86 |
87 | // Mine interesting sequences
88 | mineSequences(params.dataset, inferenceAlg, params.maxStructureSteps, params.maxEMIterations, logFile,
89 | params.saveCountDist);
90 |
91 | } catch (final ParameterException e) {
92 | System.out.println(e.getMessage());
93 | jc.usage();
94 | }
95 |
96 | System.exit(0); // Required to prevent waiting for Runnable completion
97 |
98 | }
99 |
100 | /** Mine interesting sequences */
101 | public static Map<Sequence, Double> mineSequences(final File inputFile, final InferenceAlgorithm inferenceAlgorithm,
102 | final int maxStructureSteps, final int maxEMIterations, final File logFile, final boolean saveCountDist)
103 | throws IOException {
104 |
105 | // Set up logging
106 | if (logFile != null)
107 | Logging.setUpFileLogger(logger, LOG_LEVEL, logFile);
108 | else
109 | Logging.setUpConsoleLogger(logger, LOG_LEVEL);
110 |
111 | // Echo input parameters
112 | logger.info("========== INTERESTING SEQUENCE MINING ============");
113 | logger.info("\n Time: " + new SimpleDateFormat("dd.MM.yyyy-HH:mm:ss").format(new Date()));
114 | logger.info("\n Inputs: -f " + inputFile + " -s " + maxStructureSteps + " -i " + maxEMIterations + " -r "
115 | + MAX_RUNTIME / 60_000);
116 |
117 | // Read in transaction database
118 | final TransactionList transactions = readTransactions(inputFile);
119 |
120 | // Determine initial probabilities
121 | final Table<Sequence, Integer, Double> initProbs = scanDatabaseToDetermineInitialProbabilities(inputFile);
122 |
123 | // Run inference to find interesting sequences
124 | logger.fine("\n============= SEQUENCE INFERENCE =============\n");
125 | final Table<Sequence, Integer, Double> sequences = structuralEM(transactions, initProbs, inferenceAlgorithm,
126 | maxStructureSteps, maxEMIterations);
127 | if (LOG_LEVEL.equals(Level.FINEST))
128 | logger.finest(
129 | "\n======= Transaction Database =======\n" + Files.toString(inputFile, Charsets.UTF_8) + "\n");
130 |
131 | // Calculate probabilities: p(S \in X) = p(z_S >= 1) = 1 - \pi_S_0
132 | final HashMap<Sequence, Double> sequenceMap = new HashMap<>();
133 | for (final Sequence seq : sequences.rowKeySet())
134 | sequenceMap.put(seq, 1 - sequences.get(seq, 0));
135 |
136 | // Sort sequences by interestingness
137 | final HashMap<Sequence, Double> intMap = calculateInterestingness(sequenceMap, transactions);
138 | final Map<Sequence, Double> sortedSequences = sortSequences(sequenceMap, intMap);
139 |
140 | logger.info("\n============= INTERESTING SEQUENCES =============\n");
141 | for (final Entry<Sequence, Double> entry : sortedSequences.entrySet()) {
142 | logger.info(String.format("%s\tprob: %1.5f \tint: %1.5f %n", entry.getKey(), entry.getValue(),
143 | intMap.get(entry.getKey())));
144 | }
145 | logger.info("\n");
146 |
147 | // Optionally save sequence count distribution
148 | if (saveCountDist) {
149 | Logging.serialize(sequences, FilenameUtils.removeExtension(logFile.getAbsolutePath()) + ".dist");
150 | }
151 |
152 | return sortedSequences;
153 | }
154 |
155 | public static TransactionList readTransactions(final File inputFile) throws IOException {
156 |
157 | final List<Transaction> transactions = new ArrayList<>();
158 |
159 | // for each line (transaction) until the end of file
160 | final LineIterator it = FileUtils.lineIterator(inputFile, "UTF-8");
161 | while (it.hasNext()) {
162 |
163 | final String line = it.nextLine();
164 | // skip the line if it is empty, a comment,
165 | // or some other kind of metadata
166 | if (line.isEmpty() || line.charAt(0) == '#' || line.charAt(0) == '%' || line.charAt(0) == '@') {
167 | continue;
168 | }
169 |
170 | // split the transaction into items
171 | final String[] lineSplit = line.split(" ");
172 | // convert to Transaction class and add it to the structure
173 | transactions.add(getTransaction(lineSplit));
174 |
175 | }
176 | // close the input file
177 | LineIterator.closeQuietly(it);
178 |
179 | return new TransactionList(transactions);
180 | }
181 |
182 | /**
183 | * Create a Transaction from one line of the sequence database
184 | *
185 | * @param integers
186 | * one line of integers in the sequence database
187 | */
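// Example: the SPMF-format line "1 -1 2 3 -1 -2" yields the transaction
// [1, 2, 3]: the "-1" itemset separators are skipped and "-2" ends the sequence.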
188 | public static Transaction getTransaction(final String[] integers) {
189 | final Transaction sequence = new Transaction();
190 |
191 | for (int i = 0; i < integers.length; i++) {
192 | if (integers[i].equals("-1")) { // end of item
193 |
194 | } else if (integers[i].equals("-2")) { // end of sequence
195 | return sequence;
196 | } else { // extract the value for an item
197 | sequence.add(Integer.parseInt(integers[i]));
198 | }
199 | }
200 | throw new RuntimeException("Corrupt sequence database.");
201 | }
202 |
203 | /**
204 | * This method scans the input database to determine the initial
205 | * probabilities of single items
206 | *
207 | * @param inputFile
208 | * the input file
209 | * @return class storing the support of every occurrence of each singleton
210 | */
211 | public static Table<Sequence, Integer, Double> scanDatabaseToDetermineInitialProbabilities(final File inputFile)
212 | throws IOException {
213 |
214 | // Sequence x occurrence x count
215 | final Table<Sequence, Integer, Double> supports = HashBasedTable.create();
216 |
217 | // for each line (transaction) until the end of file
218 | int noTransactions = 0;
219 | final LineIterator it = FileUtils.lineIterator(inputFile, "UTF-8");
220 | while (it.hasNext()) {
221 |
222 | final String line = it.nextLine();
223 | // skip the line if it is empty, a comment or some other kind of metadata
224 | if (line.isEmpty() || line.charAt(0) == '#' || line.charAt(0) == '%' || line.charAt(0) == '@') {
225 | continue;
226 | }
227 |
228 | // split the line into items
229 | final String[] lineSplit = line.split(" ");
230 | // for each item
231 | final Multiset<Sequence> seenItems = HashMultiset.create();
232 | for (final String itemString : lineSplit) {
233 | final int item = Integer.parseInt(itemString);
234 | if (item >= 0) // ignore end of itemset/sequence tags
235 | seenItems.add(new Sequence(item));
236 | }
237 | // increase the support count of the items
238 | for (final Sequence seq : seenItems.elementSet()) {
239 | final int occur = seenItems.count(seq);
240 | if (supports.contains(seq, occur)) {
241 | final double supp = supports.get(seq, occur);
242 | supports.put(seq, occur, supp + 1);
243 | } else {
244 | supports.put(seq, occur, 1.);
245 | }
246 | }
247 |
248 | noTransactions++;
249 | }
250 |
251 | // close the input file
252 | LineIterator.closeQuietly(it);
253 |
254 | for (final Sequence seq : supports.rowKeySet()) {
255 | // Pad with zero counts for non-occurrences
256 | final int maxOccur = Collections.max(supports.row(seq).keySet());
257 | for (int occur = 1; occur <= maxOccur; occur++) {
258 | if (!supports.contains(seq, occur))
259 | supports.put(seq, occur, 0.);
260 | } // Add counts for zero occurrences
261 | double rowSum = 0;
262 | for (final Double count : supports.row(seq).values())
263 | rowSum += count;
264 | supports.put(seq, 0, noTransactions - rowSum);
265 | }
266 |
267 | // Normalize
268 | for (final Sequence seq : supports.rowKeySet()) {
269 | double rowSum = 0;
270 | for (final Double prob : supports.row(seq).values())
271 | rowSum += prob;
272 | for (final Integer occur : supports.row(seq).keySet()) {
273 | final double normProb = supports.get(seq, occur) / rowSum;
274 | supports.put(seq, occur, normProb);
275 | }
276 | }
277 |
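// e.g. (illustrative) for the two transactions "1 2 1 -2" and "2 -2" the
// normalized rows are [1] -> {0: 0.5, 1: 0.0, 2: 0.5} and [2] -> {0: 0.0, 1: 1.0}.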
278 | return supports;
279 | }
280 |
281 | /** Convert string level to level class */
282 | public static class LogLevelConverter implements IStringConverter<Level> {
283 | @Override
284 | public Level convert(final String value) {
285 | if (value.equals("SEVERE"))
286 | return Level.SEVERE;
287 | else if (value.equals("WARNING"))
288 | return Level.WARNING;
289 | else if (value.equals("INFO"))
290 | return Level.INFO;
291 | else if (value.equals("CONFIG"))
292 | return Level.CONFIG;
293 | else if (value.equals("FINE"))
294 | return Level.FINE;
295 | else if (value.equals("FINER"))
296 | return Level.FINER;
297 | else if (value.equals("FINEST"))
298 | return Level.FINEST;
299 | else
300 | throw new RuntimeException("Incorrect Log Level.");
301 | }
302 | }
303 |
304 | }
--------------------------------------------------------------------------------
/sequence-miner/src/main/java/sequencemining/main/SparkEMStep.java:
--------------------------------------------------------------------------------
1 | package sequencemining.main;
2 | //package itemsetmining.main;
3 | //
4 | //import itemsetmining.itemset.Itemset;
5 | //import itemsetmining.main.InferenceAlgorithms.InferenceAlgorithm;
6 | //import itemsetmining.transaction.Transaction;
7 | //import itemsetmining.transaction.TransactionDatabase;
8 | //
9 | //import java.util.HashSet;
10 | //import java.util.List;
11 | //import java.util.Map;
12 | //import java.util.Set;
13 | //import java.util.stream.Collectors;
14 | //
15 | //import org.apache.spark.api.java.JavaPairRDD;
16 | //import org.apache.spark.api.java.JavaRDD;
17 | //
18 | //import scala.Tuple2;
19 | //
20 | //import com.google.common.collect.Multiset;
21 | //
22 | ///** Class to hold the various transaction EM Steps for Spark */
23 | //public class SparkEMStep {
24 | //
25 | // /** Initialize cached itemsets */
26 | // static void initializeCachedItemsets(
27 | // final TransactionDatabase transactions,
28 | // final Multiset<Integer> singletons) {
29 | // final long noTransactions = transactions.size();
30 | // final JavaRDD<Transaction> updatedTransactions = transactions
31 | // .getTransactionRDD().map(t -> {
32 | // t.initializeCachedItemsets(singletons, noTransactions);
33 | // return t;
34 | // });
35 | //
36 | // // Update cache reference
37 | // transactions.updateTransactionCache(updatedTransactions);
38 | // }
39 | //
40 | // /** EM-step for hard EM */
41 | // static Map<Itemset, Double> hardEMStep(
42 | // final TransactionDatabase transactions,
43 | // final InferenceAlgorithm inferenceAlgorithm) {
44 | // final double noTransactions = transactions.size();
45 | //
46 | // // E-step: map and cache covering
47 | // final JavaPairRDD<Transaction, Set<Itemset>> transactionWithCovering = transactions
48 | // .getTransactionRDD()
49 | // .mapToPair(
50 | // t -> {
51 | // final HashSet<Itemset> covering = inferenceAlgorithm
52 | // .infer(t);
53 | // t.setCachedCovering(covering);
54 | // return new Tuple2<Transaction, Set<Itemset>>(t,
55 | // covering);
56 | // });
57 | //
58 | // // E-step: reduce and get itemset counts
59 | // final List<Tuple2<Itemset, Integer>> coveringWithCounts = transactionWithCovering
60 | // .values().flatMap(s -> s)
61 | // .mapToPair(s -> new Tuple2<Itemset, Integer>(s, 1))
62 | // .reduceByKey((a, b) -> a + b).collect();
63 | //
64 | // // M-step
65 | // final Map<Itemset, Double> newItemsets = coveringWithCounts
66 | // .parallelStream().collect(
67 | // Collectors
68 | // .toMap(Tuple2::_1, t -> t._2 / noTransactions));
69 | //
70 | // // Update cached itemsets
71 | // final JavaRDD<Transaction> updatedTransactions = transactionWithCovering
72 | // .keys().map(t -> {
73 | // t.updateCachedItemsets(newItemsets);
74 | // return t;
75 | // });
76 | //
77 | // // Update cache reference
78 | // transactions.updateTransactionCache(updatedTransactions);
79 | //
80 | // return newItemsets;
81 | // }
82 | //
83 | // /** Get average cost of last EM-step */
84 | // static void calculateAndSetAverageCost(
85 | // final TransactionDatabase transactions) {
86 | // final double noTransactions = transactions.size();
87 | // final double averageCost = transactions.getTransactionRDD()
88 | // .map(Transaction::getCachedCost).reduce((a, b) -> a + b)
89 | // / noTransactions;
90 | // transactions.setAverageCost(averageCost);
91 | // }
92 | //
93 | // /** EM-step for structural EM */
94 | // static Tuple2<Double, Double> structuralEMStep(
95 | // final TransactionDatabase transactions,
96 | // final InferenceAlgorithm inferenceAlgorithm, final Itemset candidate) {
97 | // final double noTransactions = transactions.size();
98 | //
99 | // // E-step: map candidate to supported transactions and cache covering
100 | // final JavaPairRDD<Transaction, Set<Itemset>> transactionWithCovering = transactions
101 | // .getTransactionRDD()
102 | // .mapToPair(
103 | // t -> {
104 | // if (t.contains(candidate)) {
105 | // t.addItemsetCache(candidate, 1.0);
106 | // final HashSet<Itemset> covering = inferenceAlgorithm
107 | // .infer(t);
108 | // t.setTempCachedCovering(covering);
109 | // return new Tuple2<Transaction, Set<Itemset>>(t,
110 | // covering);
111 | // }
112 | // return new Tuple2<Transaction, Set<Itemset>>(t, t
113 | // .getCachedCovering());
114 | // });
115 | //
116 | // // E-step: reduce and get itemset counts
117 | // final List<Tuple2<Itemset, Integer>> coveringWithCounts = transactionWithCovering
118 | // .values().flatMap(s -> s)
119 | // .mapToPair(s -> new Tuple2<Itemset, Integer>(s, 1))
120 | // .reduceByKey((a, b) -> a + b).collect();
121 | //
122 | // // M-step
123 | // final Map<Itemset, Double> newItemsets = coveringWithCounts
124 | // .parallelStream().collect(
125 | // Collectors
126 | // .toMap(Tuple2::_1, t -> t._2 / noTransactions));
127 | //
128 | // // Get cost per transaction
129 | // final JavaPairRDD<Transaction, Double> transactionWithCost = transactionWithCovering
130 | // .keys().mapToPair(t -> {
131 | // double cost;
132 | // if (t.contains(candidate))
133 | // cost = t.getTempCachedCost(newItemsets);
134 | // else
135 | // cost = t.getCachedCost(newItemsets);
136 | // t.removeItemsetCache(candidate);
137 | // return new Tuple2<Transaction, Double>(t, cost);
138 | // });
139 | //
140 | // // Get average cost
141 | // final double averageCost = transactionWithCost.values().reduce(
142 | // (a, b) -> a + b)
143 | // / noTransactions;
144 | //
145 | // // Get candidate prob
146 | // Double prob = newItemsets.get(candidate);
147 | // if (prob == null)
148 | // prob = 0.;
149 | //
150 | // // Update cache reference
151 | // transactions.updateTransactionCache(transactionWithCost.keys());
152 | //
153 | // return new Tuple2<Double, Double>(averageCost, prob);
154 | // }
155 | //
156 | // /** Add accepted candidate itemset to cache */
157 | // static Map<Itemset, Double> addAcceptedCandidateCache(
158 | // final TransactionDatabase transactions, final Itemset candidate,
159 | // final double prob) {
160 | // final double noTransactions = transactions.size();
161 | //
162 | // // Cached E-step: map candidate to supported transactions and cache
163 | // final JavaPairRDD<Transaction, Set<Itemset>> transactionWithCovering = transactions
164 | // .getTransactionRDD().mapToPair(
165 | // t -> {
166 | // if (t.contains(candidate)) {
167 | // t.addItemsetCache(candidate, prob);
168 | // final HashSet<Itemset> covering = t
169 | // .getTempCachedCovering();
170 | // t.setCachedCovering(covering);
171 | // return new Tuple2<Transaction, Set<Itemset>>(t,
172 | // covering);
173 | // }
174 | // return new Tuple2<Transaction, Set<Itemset>>(t, t
175 | // .getCachedCovering());
176 | // });
177 | //
178 | // // E-step: reduce and get itemset counts
179 | // final List<Tuple2<Itemset, Integer>> coveringWithCounts = transactionWithCovering
180 | // .values().flatMap(s -> s)
181 | // .mapToPair(s -> new Tuple2<Itemset, Integer>(s, 1))
182 | // .reduceByKey((a, b) -> a + b).collect();
183 | //
184 | // // M-step
185 | // final Map<Itemset, Double> newItemsets = coveringWithCounts
186 | // .parallelStream().collect(
187 | // Collectors
188 | // .toMap(Tuple2::_1, t -> t._2 / noTransactions));
189 | //
190 | // // Update cached itemsets
191 | // final JavaRDD<Transaction> updatedTransactions = transactionWithCovering
192 | // .keys().map(t -> {
193 | // t.updateCachedItemsets(newItemsets);
194 | // return t;
195 | // });
196 | //
197 | // // Update cache reference
198 | // transactions.updateTransactionCache(updatedTransactions);
199 | //
200 | // return newItemsets;
201 | // }
202 | //
203 | // private SparkEMStep() {
204 | // }
205 | //
206 | // }
207 |
--------------------------------------------------------------------------------
/sequence-miner/src/main/java/sequencemining/main/SparkSequenceMining.java:
--------------------------------------------------------------------------------
1 | package sequencemining.main;
2 |
3 | // import itemsetmining.itemset.Itemset;
4 | // import itemsetmining.itemset.ItemsetTree;
5 | // import itemsetmining.main.InferenceAlgorithms.InferGreedy;
6 | // import itemsetmining.main.InferenceAlgorithms.InferenceAlgorithm;
7 | // import itemsetmining.transaction.Transaction;
8 | // import itemsetmining.transaction.TransactionRDD;
9 | // import itemsetmining.util.Logging;
10 | //
11 | // import java.io.File;
12 | // import java.io.FileInputStream;
13 | // import java.io.IOException;
14 | // import java.text.SimpleDateFormat;
15 | // import java.util.Date;
16 | // import java.util.HashMap;
17 | // import java.util.Map;
18 | // import java.util.Map.Entry;
19 | // import java.util.logging.Level;
20 | // import java.util.Properties;
21 | //
22 | // import org.apache.hadoop.conf.Configuration;
23 | // import org.apache.hadoop.fs.FileSystem;
24 | // import org.apache.hadoop.fs.Path;
25 | // import org.apache.spark.SparkConf;
26 | // import org.apache.spark.api.java.JavaRDD;
27 | // import org.apache.spark.api.java.JavaSparkContext;
28 | // import org.apache.spark.api.java.function.Function;
29 | //
30 | // import scala.Tuple2;
31 | //
32 | // import com.beust.jcommander.IStringConverter;
33 | // import com.beust.jcommander.JCommander;
34 | // import com.beust.jcommander.Parameter;
35 | // import com.beust.jcommander.ParameterException;
36 | // import com.google.common.collect.HashMultiset;
37 | // import com.google.common.collect.Multiset;
38 | //
39 | // public class SparkItemsetMining extends ItemsetMiningCore {
40 | //
41 | // /** Main function parameters */
42 | // public static class Parameters {
43 | //
44 | // @Parameter(names = { "-f", "--file" }, description = "Dataset filename")
45 | // private final File dataset = new File("example.dat");
46 | //
47 | // @Parameter(names = { "-j", "--jar" }, description = "IIM Standalone jar")
48 | // private final String IIMJar = "itemset-mining/target/itemset-mining-1.0.jar";
49 | //
50 | // @Parameter(names = { "-s", "--maxSteps" }, description = "Max structure steps")
51 | //
52 | // int maxStructureSteps = 100_000;
53 | //
54 | // @Parameter(names = { "-i", "--iterations" }, description = "Max iterations")
55 | // int maxEMIterations = 1_000;
56 | //
57 | // @Parameter(names = { "-c", "--cores" }, description = "No cores")
58 | // int noCores = 16;
59 | //
60 | // @Parameter(names = { "-l", "--log-level" }, description = "Log level",
61 | // converter = LogLevelConverter.class)
62 | // Level logLevel = Level.FINE;
63 | //
64 | // @Parameter(names = { "-r", "--runtime" }, description = "Max Runtime (min)")
65 | // long maxRunTime = 12 * 60; // 12hrs
66 | //
67 | // @Parameter(names = { "-t", "--timestamp" }, description = "Timestamp Logfile", arity = 1)
68 | //
69 | // boolean timestampLog = true;
70 | //
71 | // @Parameter(names = { "-v", "--verbose" }, description = "Print to console instead of logfile")
72 | //
73 | // private boolean verbose = false;
74 | // }
75 | //
76 | // public static void main(final String[] args) throws IOException {
77 | //
78 | // // Use greedy inference algorithm for Spark
79 | // final InferenceAlgorithm inferenceAlg = new InferGreedy();
80 | //
81 | // final Parameters params = new Parameters();
82 | // final JCommander jc = new JCommander(params);
83 | //
84 | // try {
85 | // jc.parse(args);
86 | //
87 | // // Set up spark and HDFS
88 | // final JavaSparkContext sc = setUpSpark(params.dataset.getName(),
89 | // params.IIMJar, params.noCores);
90 | // final FileSystem hdfs = setUpHDFS();
91 | //
92 | // // Set loglevel, runtime, timestamp and log file
93 | // LOG_LEVEL = params.logLevel;
94 | // MAX_RUNTIME = params.maxRunTime * 60 * 1_000;
95 | // File logFile = null;
96 | // if(!params.verbose)
97 | // logFile = Logging.getLogFileName("IIM",
98 | // params.timestampLog, LOG_DIR, params.dataset);
99 | //
100 | // mineItemsets(params.dataset, hdfs, sc, inferenceAlg,
101 | // params.maxStructureSteps, params.maxEMIterations, logFile);
102 | //
103 | // } catch (final ParameterException e) {
104 | // System.out.println(e.getMessage());
105 | // jc.usage();
106 | // }
107 | //
108 | // }
109 | //
110 | // public static Map<Itemset, Double> mineItemsets(final File inputFile,
111 | // final FileSystem hdfs, final JavaSparkContext sc,
112 | // final InferenceAlgorithm inferenceAlg, final int maxStructureSteps,
113 | // final int maxEMIterations, final File logFile) throws IOException {
114 | //
115 | // // Set up logging
116 | // if (logFile != null)
117 | // Logging.setUpFileLogger(logger, LOG_LEVEL, logFile);
118 | // else
119 | // Logging.setUpConsoleLogger(logger, LOG_LEVEL);
120 | //
121 | // // Echo input parameters
122 | // logger.info("========== SPARK INTERESTING ITEMSET MINING ============");
123 | // logger.info("\n Time: "
124 | // + new SimpleDateFormat("dd.MM.yyyy-HH:mm:ss")
125 | // .format(new Date()));
126 | // logger.info("\n Inputs: -f " + inputFile + " -s " + maxStructureSteps
127 | // + " -i " + maxEMIterations + " -c "
128 | // + sc.getLocalProperty("spark.cores.max") + " -r " + MAX_RUNTIME
129 | // / 60_000 + "\n");
130 | //
131 | // // Load Spark and HDFS Properties
132 | // Properties prop = new Properties();
133 | // prop.load(SparkItemsetMining.class.getResourceAsStream("/spark.properties"));
134 | //
135 | // // Copy transaction database to hdfs
136 | // final String datasetPath = prop.getProperty("HDFSMaster")
137 | // + inputFile.getName();
138 | // hdfs.copyFromLocalFile(new Path(inputFile.getAbsolutePath()), new Path(
139 | // datasetPath));
140 | // hdfs.setReplication(new Path(datasetPath),
141 | // Short.parseShort(prop.getProperty("MachinesInCluster")));
142 | // try { // Wait for file to replicate
143 | // Thread.sleep(10 * 1000);
144 | // } catch (final InterruptedException e) {
145 | // e.printStackTrace();
146 | // }
147 | //
148 | // // Read in transaction database
149 | // final int noCores = Integer.parseInt(sc.getConf()
150 | // .get("spark.cores.max"));
151 | // final JavaRDD<Transaction> db = sc.textFile(datasetPath, 2 * noCores)
152 | // .map(new ParseTransaction()).cache();
153 | //
154 | // // Determine most frequent singletons
155 | // final Map<Integer, Integer> singletonsMap = db.flatMap(t -> t)
156 | // .mapToPair(i -> new Tuple2<Integer, Integer>(i, 1))
157 | // .reduceByKey((a, b) -> a + b).collectAsMap();
158 | //
159 | // // Convert singletons map to Multiset (as Spark map is not serializable)
160 | // final Multiset<Integer> singletons = HashMultiset.create();
161 | // for (final Entry<Integer, Integer> entry : singletonsMap.entrySet())
162 | // singletons.add(entry.getKey(), entry.getValue());
163 | //
164 | // // Apply the algorithm to build the itemset tree
165 | // final ItemsetTree tree = new ItemsetTree(singletons);
166 | // tree.buildTree(datasetPath, hdfs);
167 | // if (LOG_LEVEL.equals(Level.FINE))
168 | // tree.printStatistics(logger);
169 | //
170 | // // Run inference to find interesting itemsets
171 | // final TransactionRDD transactions = new TransactionRDD(db, db.count());
172 | // logger.fine("\n============= ITEMSET INFERENCE =============\n");
173 | // final HashMap<Itemset, Double> itemsets = structuralEM(transactions,
174 | // singletons, tree, inferenceAlg, maxStructureSteps,
175 | // maxEMIterations);
176 | //
177 | // // Sort itemsets by interestingness
178 | // final HashMap<Itemset, Double> intMap = calculateInterestingness(
179 | // itemsets, transactions, tree);
180 | // final Map<Itemset, Double> sortedItemsets = sortItemsets(itemsets,
181 | // intMap);
182 | //
183 | // logger.info("\n============= INTERESTING ITEMSETS =============\n");
184 | // for (final Entry<Itemset, Double> entry : sortedItemsets.entrySet()) {
185 | // logger.info(String.format("%s\tprob: %1.5f \tint: %1.5f %n",
186 | // entry.getKey(), entry.getValue(),
187 | // intMap.get(entry.getKey())));
188 | // }
189 | // logger.info("\n");
190 | //
191 | // return sortedItemsets;
192 | // }
193 | //
194 | // /** Set up Spark */
195 | // public static JavaSparkContext setUpSpark(final String dataset, final String
196 | // IIMJar,
197 | // final int noCores) throws IOException {
198 | //
199 | // // Load Spark and HDFS Properties
200 | // Properties prop = new Properties();
201 | // prop.load(SparkItemsetMining.class.getResourceAsStream("/spark.properties"));
202 | //
203 | // final SparkConf conf = new SparkConf();
204 | // conf.setMaster(prop.getProperty("SparkMaster"))
205 | // .setAppName("Itemset Mining: " + dataset)
206 | // .setSparkHome(prop.getProperty("SparkHome"))
207 | // .setJars(new String[] {IIMJar});
208 | // conf.set("spark.cores.max", Integer.toString(noCores));
209 | // conf.set("spark.executor.memory", "20g");
210 | // conf.set("spark.default.parallelism", "8");
211 | // conf.set("spark.shuffle.manager", "SORT");
212 | // // conf.set("spark.eventLog.enabled", "true"); uses GB of space!!!
213 | //
214 | // // Use Kryo for serialization - much faster!
215 | // conf.set("spark.serializer",
216 | // "org.apache.spark.serializer.KryoSerializer");
217 | // conf.set("spark.kryo.registrator",
218 | // "itemsetmining.util.ClassRegistrator");
219 | //
220 | // final JavaSparkContext sc = new JavaSparkContext(conf);
221 | // sc.setCheckpointDir(prop.getProperty("HDFSMaster")
222 | // + "checkpoint/");
223 | // return sc;
224 | // }
225 | //
226 | // /** Set up HDFS */
227 | // public static FileSystem setUpHDFS() throws IOException {
228 | //
229 | // // Load Spark and HDFS Properties
230 | // Properties prop = new Properties();
231 | // prop.load(SparkItemsetMining.class.getResourceAsStream("/spark.properties"));
232 | //
233 | // final Configuration conf = new Configuration();
234 | // conf.addResource(new Path(prop.getProperty("HDFSConfFile")));
235 | // return FileSystem.get(conf);
236 | // }
237 | //
238 | // /** Read in transactions */
239 | // private static class ParseTransaction implements
240 | // Function<String, Transaction> {
241 | // private static final long serialVersionUID = -9092218383491621520L;
242 | //
243 | // @Override
244 | // public Transaction call(final String line) {
245 | //
246 | // // create a structure for storing the transaction
247 | // final Transaction transaction = new Transaction();
248 | //
249 | // // split the transaction into items
250 | // final String[] lineSplit = line.split(" ");
251 | //
252 | // // for each item in the transaction
253 | // for (int i = 0; i < lineSplit.length; i++) {
254 | // // convert the item to integer and add it to the structure
255 | // transaction.add(Integer.parseInt(lineSplit[i]));
256 | // }
257 | //
258 | // return transaction;
259 | // }
260 | // }
261 | //
262 | // /** Convert string level to level class */
263 | // public static class LogLevelConverter implements IStringConverter<Level> {
264 | // @Override
265 | // public Level convert(final String value) {
266 | // if (value.equals("SEVERE"))
267 | // return Level.SEVERE;
268 | // else if (value.equals("WARNING"))
269 | // return Level.WARNING;
270 | // else if (value.equals("INFO"))
271 | // return Level.INFO;
272 | // else if (value.equals("CONFIG"))
273 | // return Level.CONFIG;
274 | // else if (value.equals("FINE"))
275 | // return Level.FINE;
276 | // else if (value.equals("FINER"))
277 | // return Level.FINER;
278 | // else if (value.equals("FINEST"))
279 | // return Level.FINEST;
280 | // else
281 | // throw new RuntimeException("Incorrect Log Level.");
282 | // }
283 | // }
284 | //
285 | // }
286 |
--------------------------------------------------------------------------------
/sequence-miner/src/main/java/sequencemining/sequence/AbstractSequence.java:
--------------------------------------------------------------------------------
1 | package sequencemining.sequence;
2 |
3 | import java.io.Serializable;
4 | import java.util.AbstractCollection;
5 | import java.util.BitSet;
6 | import java.util.Collection;
7 | import java.util.Iterator;
8 | import java.util.List;
9 |
10 | public abstract class AbstractSequence extends AbstractCollection<Integer> implements Serializable {
11 | private static final long serialVersionUID = 686688001826219278L;
12 |
13 | protected List<Integer> items;
14 |
15 | /**
16 | * Add the given item to this sequence
17 | *
18 | * @param item
19 | * the item that should be added to this sequence
20 | */
21 | @Override
22 | public boolean add(final Integer item) {
23 | return this.items.add(item);
24 | }
25 |
26 | /**
27 | * Get item at specified position in this sequence
28 | *
29 | * @param index
30 | * index of the element to return
31 | */
32 | public int get(final int index) {
33 | return this.items.get(index);
34 | }
35 |
36 | /**
37 | * Add the given items to this sequence
38 | *
39 | * @param items
40 | * a collection of items that should be added to this sequence
41 | */
42 | @Override
43 | public boolean addAll(final Collection<? extends Integer> items) {
44 | return this.items.addAll(items);
45 | }
46 |
47 | /**
48 | * Get the items in this sequence
49 | *
50 | * @return the items
51 | */
52 | public List<Integer> getItems() {
53 | return this.items;
54 | }
55 |
56 | /**
57 | * Add items to this sequence
58 | *
59 | * @param items
60 | * an array of items that should be added to this sequence
61 | */
62 | public void add(final Integer... items) {
63 | for (final Integer set : items)
64 | this.items.add(set);
65 | }
66 |
67 | /** Code for covering sequences *with gaps* */
68 |
69 | /**
70 | * Check if this sequence contains given sequence (allowing gaps)
71 | *
72 | * @param seq
73 | */
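// e.g. [1, 3, 2].contains([1, 2]) is true (gaps are allowed) but
// [2, 1].contains([1, 2]) is false (order matters)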
74 | public boolean contains(final Sequence seq) {
75 | int pos = 0;
76 | boolean containsItem;
77 | for (final int item : seq.items) {
78 | containsItem = false;
79 | for (int i = pos; i < this.items.size(); i++) {
80 | if (this.items.get(i) == item) {
81 | pos = i + 1;
82 | containsItem = true;
83 | break;
84 | }
85 | }
86 | if (!containsItem)
87 | return false;
88 | }
89 | return true;
90 | }
91 |
92 | /**
93 | * Return number of times this sequence contains given sequence (allowing
94 | * gaps)
95 | *
96 | * @param seq
97 | * @return number of times given sequence is contained in this one
98 | */
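// e.g. [1, 2, 1, 2].repetitions([1, 2]) == 2: successive matches may not
// share items but may contain gaps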
99 | public int repetitions(final Sequence seq) {
100 | int count = 0;
101 | int pos = 0;
102 | while (true) {
103 | boolean containsItem;
104 | for (final int item : seq.items) {
105 | containsItem = false;
106 | for (int i = pos; i < this.items.size(); i++) {
107 | if (this.items.get(i) == item) {
108 | pos = i + 1;
109 | containsItem = true;
110 | break;
111 | }
112 | }
113 | if (!containsItem)
114 | return count;
115 | }
116 | count++;
117 | }
118 | }
119 |
120 | /** Code for covering sequences *without gaps* */
121 | //
122 | // /**
123 | // * Check if this sequence contains given sequence (without gaps)
124 | // *
125 | // * @param sequence
126 | // */
127 | // public boolean contains(final Sequence seq) {
128 | // outer: for (int i = 0; i < this.items.size()
129 | // - seq.items.size() + 1; i++) {
130 | // if (this.items.get(i).equals(seq.items.get(0))) {
131 | // for (int j = 1; j < seq.items.size(); j++) {
132 | // if (!this.items.get(i + j).equals(seq.items.get(j)))
133 | // continue outer;
134 | // }
135 | // return true;
136 | // }
137 | // }
138 | // return false;
139 | // }
140 |
141 | /** Code for covering sequences *with gaps* but *without overlap* */
142 |
143 | /**
144 | * Return items in this sequence covered by given sequence (with gaps,
145 | * without overlap)
146 | *
147 | * @param seq
148 | * @return BitSet of items in order with the covered items set true
149 | */
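// e.g. on [1, 2, 1, 2], getCovered([1, 2], emptyBitSet) sets bits {0, 1};
// called again with {0, 1} already covered it sets {2, 3}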
150 | public BitSet getCovered(final AbstractSequence seq, final BitSet alreadyCoveredItems) {
151 | int pos = 0;
152 | boolean containsItem;
153 | final BitSet coveredItems = new BitSet(this.size());
154 | for (final int item : seq.items) {
155 | containsItem = false;
156 | for (int i = pos; i < this.items.size(); i++) {
157 | if (!alreadyCoveredItems.get(i) && this.items.get(i) == item) {
158 | coveredItems.set(i);
159 | pos = i + 1;
160 | containsItem = true;
161 | break;
162 | }
163 | }
164 | if (!containsItem) {
165 | coveredItems.clear();
166 | return coveredItems;
167 | }
168 | }
169 | return coveredItems;
170 | }
171 |
172 | /**
173 | * Code for covering sequences *without gaps* and *without overlap* !!
174 | * Remember to change subsequence contains and support function !!
175 | */
176 | //
177 | // /**
178 | // * Return the items in this sequence covered (without gaps, without
179 | // overlap)
180 | // * by the given sequence
181 | // *
182 | // * @param sequence
183 | // * @return BitSet of items in order with the covered items set true
184 | // */
185 | // public BitSet getCovered(final AbstractSequence seq,
186 | // final BitSet alreadyCoveredItems) {
187 | // final BitSet coveredItems = new BitSet(this.size());
188 | // outer: for (int i = 0; i < this.items.size() - seq.items.size() + 1; i++)
189 | // {
190 | // if (!alreadyCoveredItems.get(i)
191 | // && this.items.get(i).equals(seq.items.get(0))) {
192 | // for (int j = 1; j < seq.items.size(); j++) {
193 | // if (alreadyCoveredItems.get(i + j)
194 | // || !this.items.get(i + j).equals(seq.items.get(j)))
195 | // continue outer;
196 | // }
197 | // for (int j = 0; j < seq.items.size(); j++)
198 | // coveredItems.set(i + j);
199 | // return coveredItems;
200 | // }
201 | // }
202 | // coveredItems.clear();
203 | // return coveredItems;
204 | // }
205 |
206 | /**
207 | * Code for covering sequences *with gaps* but *with overlap* !! Remember to
208 | * change greedy algorithm and subsequence contains and support function !!
209 | */
210 | //
211 | // /**
212 | // * Check if first BitSet contains second BitSet
213 | // */
214 | // public boolean contains(final BitSet set1, final BitSet set2) {
215 | // final BitSet copy = (BitSet) set2.clone();
216 | // copy.and(set1);
217 | // return copy.equals(set2);
218 | // }
219 | //
220 | // /**
221 | // * Return the items in this sequence covered by the given sequence (with
222 | // * gaps, with overlap), allowing for multiple covering matches if the
223 | // * first match is already fully covered
224 | // *
225 | // *
226 | // * This is intended to allow the covering of 1 2 1 2 1 2 by 1 2.
227 | // *
228 | // * @param sequence
229 | // * @return BitSet of items in order with the covered items set true
230 | // */
231 | // public BitSet getCovered(final AbstractSequence seq,
232 | // final BitSet alreadyCoveredItems) {
233 | //
234 | // int index = 0;
235 | // while (true) {
236 | // final BitSet coveredItems = getCovered(seq, index);
237 | // if (coveredItems.isEmpty())
238 | // return coveredItems;
239 | // if (contains(alreadyCoveredItems, coveredItems))
240 | // index = coveredItems.nextSetBit(index) + 1;
241 | // else
242 | // return coveredItems;
243 | // }
244 | //
245 | // }
246 | //
247 | // /**
248 | // * Return the items in this sequence covered by the given sequence (with
249 | // * gaps, with overlap)
250 | // *
251 | // * @param sequence
252 | // * @return BitSet of items in order with the covered items set true
253 | // */
254 | // public BitSet getCovered(final AbstractSequence seq, final int
255 | // startIndex) {
256 | // int pos = startIndex;
257 | // boolean containsItem;
258 | // final BitSet coveredItems = new BitSet(this.size());
259 | // for (final int item : seq.items) {
260 | // containsItem = false;
261 | // for (int i = pos; i < this.items.size(); i++) {
262 | // if (this.items.get(i) == item) {
263 | // coveredItems.set(i);
264 | // pos = i + 1;
265 | // containsItem = true;
266 | // break;
267 | // }
268 | // }
269 | // if (!containsItem) {
270 | // coveredItems.clear();
271 | // return coveredItems;
272 | // }
273 | // }
274 | // return coveredItems;
275 | // }
276 |
277 | /**
278 | * Code for covering sequences *without gaps* but *with overlap* !! Remember
279 | * to change greedy algorithm and subsequence contains and support function
280 | * !!
281 | */
282 | //
283 | // /**
284 | // * Return the items in this sequence covered (without gaps, with
285 | // * overlap) by the given sequence
286 | // *
287 | // * @param sequence
288 | // * @return BitSet of items in order with the covered items set true
289 | // */
290 | // public BitSet getCovered(final AbstractSequence seq, final int
291 | // startIndex) {
292 | // final BitSet coveredItems = new BitSet(this.size());
293 | // outer: for (int i = startIndex; i < this.items.size()
294 | // - seq.items.size() + 1; i++) {
295 | // if (this.items.get(i).equals(seq.items.get(0))) {
296 | // for (int j = 1; j < seq.items.size(); j++) {
297 | // if (!this.items.get(i + j).equals(seq.items.get(j)))
298 | // continue outer;
299 | // }
300 | // for (int j = 0; j < seq.items.size(); j++)
301 | // coveredItems.set(i + j);
302 | // return coveredItems;
303 | // }
304 | // }
305 | // coveredItems.clear();
306 | // return coveredItems;
307 | // }
308 |
309 | /**
310 | * Number of items in this sequence
311 | */
312 | @Override
313 | public int size() {
314 | return this.items.size();
315 | }
316 |
317 | @Override
318 | public boolean isEmpty() {
319 | return items.isEmpty();
320 | }
321 |
322 | @Override
323 | public String toString() {
324 | return items.toString();
325 | }
326 |
327 | @Override
328 | public int hashCode() {
329 | return items.hashCode();
330 | }
331 |
332 | @Override
333 | public boolean equals(final Object obj) {
334 | if (this == obj)
335 | return true;
336 | if (!(obj instanceof AbstractSequence))
337 | return false;
338 | final AbstractSequence other = (AbstractSequence) obj;
339 | return items.equals(other.items);
340 | }
341 |
342 | @Override
343 | public Iterator<Integer> iterator() {
344 | return items.iterator();
345 | }
346 |
347 | }
348 |
--------------------------------------------------------------------------------
/sequence-miner/src/main/java/sequencemining/sequence/Sequence.java:
--------------------------------------------------------------------------------
1 | package sequencemining.sequence;
2 |
3 | import java.io.Serializable;
4 | import java.util.ArrayList;
5 | import java.util.Arrays;
6 | import java.util.List;
7 |
8 | public class Sequence extends AbstractSequence implements Serializable {
9 | private static final long serialVersionUID = -2766830126344921771L;
10 |
11 | /**
12 | * Constructor
13 | */
14 | public Sequence() {
15 | this.items = new ArrayList<>();
16 | }
17 |
18 | /**
19 | * Shallow Copy Constructor
20 | *
21 | * @param seq
22 | * sequence to shallow copy
23 | */
24 | public Sequence(final Sequence seq) {
25 | this.items = seq.items;
26 | }
27 |
28 | /**
29 | * Constructor
30 | *
31 | * @param items
32 | * a list of items that should be added to the new sequence
33 | */
34 | public Sequence(final List<Integer> items) {
35 | this.items = new ArrayList<>(items);
36 | }
37 |
38 | /**
39 | * Constructor
40 | *
41 | * @param items
42 | * an array of items that should be added to the new sequence
43 | */
44 | public Sequence(final Integer... items) {
45 | this.items = new ArrayList<>(Arrays.asList(items));
46 | }
47 |
48 | /**
49 | * Join Constructor
50 | *
51 | * @param seq1 the first sequence to join
52 | * @param seq2 the second sequence to join
53 | */
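// e.g. new Sequence(new Sequence(1, 2), new Sequence(3)) gives [1, 2, 3]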
54 | public Sequence(final Sequence seq1, final Sequence seq2) {
55 | this.items = new ArrayList<>(seq1.items);
56 | this.items.addAll(seq2.items);
57 | }
58 |
59 | }
60 |
--------------------------------------------------------------------------------
/sequence-miner/src/main/java/sequencemining/transaction/Transaction.java:
--------------------------------------------------------------------------------
1 | package sequencemining.transaction;
2 |
3 | import java.io.Serializable;
4 | import java.util.ArrayList;
5 | import java.util.Arrays;
6 | import java.util.Collection;
7 | import java.util.Iterator;
8 | import java.util.List;
9 | import java.util.Map;
10 | import java.util.Map.Entry;
11 |
12 | import com.google.common.collect.HashBasedTable;
13 | import com.google.common.collect.Multiset;
14 | import com.google.common.collect.Table;
15 |
16 | import sequencemining.sequence.AbstractSequence;
17 | import sequencemining.sequence.Sequence;
18 |
19 | /** A transaction is an ordered list of items */
20 | public class Transaction extends AbstractSequence implements Serializable {
21 | private static final long serialVersionUID = 3327396055332538091L;
22 |
23 | /** Cached sequences and probabilities for this transaction */
24 | private Table<Sequence, Integer, Double> cachedSequences;
25 |
26 | /** Cached covering for this transaction */
27 | private Multiset<Sequence> cachedCovering;
28 | private Multiset<Sequence> tempCachedCovering;
29 |
30 | public void initializeCachedSequences(final Table<Sequence, Integer, Double> initProbs) {
31 | final Table<Sequence, Integer, Double> probs = HashBasedTable.create();
32 | for (final Sequence seq : initProbs.rowKeySet()) {
33 | if (this.contains(seq))
34 | probs.row(seq).putAll(initProbs.row(seq));
35 | }
36 | cachedSequences = probs;
37 | }
38 |
39 | public Table<Sequence, Integer, Double> getCachedSequences() {
40 | return cachedSequences;
41 | }
42 |
43 | public void addSequenceCache(final Sequence candidate, final Map<Integer, Double> prob) {
44 | cachedSequences.row(candidate).putAll(prob);
45 | }
46 |
47 | public void removeSequenceCache(final Sequence candidate) {
48 | cachedSequences.row(candidate).clear();
49 | }
50 |
51 | public void updateCachedSequences(final Table<Sequence, Integer, Double> newSequences) {
52 | for (final Iterator<Sequence> it = cachedSequences.rowKeySet().iterator(); it.hasNext();) {
53 | final Sequence seq = it.next();
54 | if (newSequences.containsRow(seq)) { // TODO zeros to clear ok?
55 | for (final Entry<Integer, Double> entry : cachedSequences.row(seq).entrySet())
56 | entry.setValue(0.);
57 | cachedSequences.row(seq).putAll(newSequences.row(seq));
58 | } else if (seq.size() == 1) {
59 | for (final Entry<Integer, Double> entry : cachedSequences.row(seq).entrySet())
60 | entry.setValue(0.); // so we can fill incomplete coverings
61 | } else
62 | it.remove();
63 | }
64 | }
65 |
66 | /** Get cost of cached covering for hard EM-step */
67 | public double getCachedCost() {
68 | double totalCost = 0;
69 | int lenCovering = 0;
70 | // TODO triple check that this is right!!!
71 | // Calculate (3.3)
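// i.e. cost(t) = sum over used seqs S of [ -log p(z_S = count_S) + log-ordering penalty ]
//              + sum over cached-but-unused seqs S of -log p(z_S = 0)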
72 | for (final Sequence seq : cachedSequences.rowKeySet()) {
73 | if (cachedCovering.contains(seq)) {
74 | final int occur = cachedCovering.count(seq);
75 | totalCost += -Math.log(cachedSequences.get(seq, occur));
76 | for (int m = 1; m <= occur; m++) {
77 | totalCost += sumLogRange(lenCovering + 1, lenCovering + seq.size());
78 | lenCovering += seq.size();
79 | }
80 | } else if (seq.size() == 1 && sum(cachedSequences.row(seq).values()) == 0.) {
81 | continue; // ignore singletons used to fill incomplete coverings
82 | } else {
83 | totalCost += -Math.log(cachedSequences.get(seq, 0));
84 | }
85 | }
86 | return totalCost;
87 | }
88 |
89 | /** Get cost of cached covering for structural EM-step */
90 | public double getCachedCost(final Table<Sequence, Integer, Double> sequences) {
91 | return calculateCachedCost(sequences, cachedCovering);
92 | }
93 |
94 | /** Get cost of temp. cached covering for structural EM-step */
95 | public double getTempCachedCost(final Table<Sequence, Integer, Double> sequences) {
96 | return calculateCachedCost(sequences, tempCachedCovering);
97 | }
98 |
99 | /** Calculate cached cost for structural EM-step */
100 | private double calculateCachedCost(final Table<Sequence, Integer, Double> sequences,
101 | final Multiset<Sequence> covering) {
102 | double totalCost = 0;
103 | int lenCovering = 0;
104 | for (final Sequence seq : cachedSequences.rowKeySet()) {
105 | if (sequences.containsRow(seq)) {
106 | if (covering.contains(seq)) {
107 | final int occur = covering.count(seq);
108 | totalCost += -Math.log(sequences.get(seq, occur));
109 | for (int m = 1; m <= occur; m++) {
110 | totalCost += sumLogRange(lenCovering + 1, lenCovering + seq.size());
111 | lenCovering += seq.size();
112 | }
113 | } else if (seq.size() == 1 && sum(cachedSequences.row(seq).values()) == 0.) {
114 | continue; // ignore seqs used to fill incomplete coverings
115 | } else {
116 | totalCost += -Math.log(sequences.get(seq, 0));
117 | }
118 | }
119 | }
120 | return totalCost;
121 | }
122 |
123 | private double sum(final Collection<Double> elems) {
124 | double sum = 0;
125 | for (final double elem : elems)
126 | sum += elem;
127 | return sum;
128 | }
129 |
130 | private double sumLogRange(final int a, final int b) {
131 | double sum = 0;
132 | for (int i = a; i <= b; i++)
133 | sum += Math.log(i);
134 | return sum;
135 | }
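// e.g. sumLogRange(3, 5) = log(3) + log(4) + log(5) = log(60)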
136 |
137 | public void setCachedCovering(final Multiset<Sequence> covering) {
138 | cachedCovering = covering;
139 | }
140 |
141 | public Multiset<Sequence> getCachedCovering() {
142 | return cachedCovering;
143 | }
144 |
145 | public void setTempCachedCovering(final Multiset<Sequence> covering) {
146 | tempCachedCovering = covering;
147 | }
148 |
149 | public Multiset<Sequence> getTempCachedCovering() {
150 | return tempCachedCovering;
151 | }
152 |
153 | /**
154 | * Constructor
155 | */
156 | public Transaction() {
157 | this.items = new ArrayList<>();
158 | }
159 |
160 | /**
161 | * Constructor
162 | *
163 | * @param items
164 | * an array of items that should be added to the new sequence
165 | */
166 | public Transaction(final Integer... items) {
167 | this.items = new ArrayList<>(Arrays.asList(items));
168 | }
169 |
170 | /**
171 | * Constructor
172 | *
173 | * @param items
174 | * a List of items that should be added to the new sequence
175 | */
176 | public Transaction(final List<Integer> items) {
177 | this.items = new ArrayList<>(items);
178 | }
179 |
180 | }
--------------------------------------------------------------------------------
/sequence-miner/src/main/java/sequencemining/transaction/TransactionDatabase.java:
--------------------------------------------------------------------------------
1 | package sequencemining.transaction;
2 |
3 | import java.util.List;
4 |
5 | /** Wrapper class for storing a database of transactions */
6 | public abstract class TransactionDatabase {
7 |
8 | /** Set to true if candidate generation iteration limit exceeded */
9 | private boolean iterationLimitExceeded = false;
10 |
11 | /** Average cost across the transactions */
12 | private double averageCost = Double.POSITIVE_INFINITY;
13 |
14 | /** Set the average cost */
15 | public void setAverageCost(final double averageCost) {
16 | this.averageCost = averageCost;
17 | }
18 |
19 | /** Get the average cost */
20 | public double getAverageCost() {
21 | return averageCost;
22 | }
23 |
24 | public void setIterationLimitExceeded() {
25 | iterationLimitExceeded = true;
26 | }
27 |
28 | public boolean getIterationLimitExceeded() {
29 | return iterationLimitExceeded;
30 | }
31 |
32 | /** Get a list of transactions */
33 | public abstract List<Transaction> getTransactionList();
34 |
35 | // /** Get a JavaRDD of transactions */
36 | // public abstract JavaRDD<Transaction> getTransactionRDD();
37 | //
38 | // /** Update the transaction cache */
39 | // public abstract void updateTransactionCache(
40 | // final JavaRDD<Transaction> updatedTransactions);
41 |
42 | /** Get the number of transactions in this database */
43 | public abstract int size();
44 |
45 | }
46 |
--------------------------------------------------------------------------------
/sequence-miner/src/main/java/sequencemining/transaction/TransactionGenerator.java:
--------------------------------------------------------------------------------
1 | package sequencemining.transaction;
2 |
3 | import java.io.File;
4 | import java.io.FileReader;
5 | import java.io.IOException;
6 | import java.io.PrintWriter;
7 | import java.util.ArrayList;
8 | import java.util.HashMap;
9 | import java.util.List;
10 | import java.util.Map;
11 | import java.util.Map.Entry;
12 | import java.util.Random;
13 |
14 | import org.apache.commons.io.LineIterator;
15 | import org.apache.commons.math3.distribution.EnumeratedIntegerDistribution;
16 | import org.apache.commons.math3.random.JDKRandomGenerator;
17 | import org.apache.commons.math3.random.RandomGenerator;
18 |
19 | import com.google.common.collect.HashMultiset;
20 | import com.google.common.collect.Multiset;
21 | import com.google.common.collect.Table;
22 | import com.google.common.primitives.Doubles;
23 | import com.google.common.primitives.Ints;
24 |
25 | import sequencemining.sequence.Sequence;
26 |
27 | public class TransactionGenerator {
28 |
29 | private static final boolean VERBOSE = false;
30 |
31 | /**
32 | * Generate a transaction database from a set of interesting sequences
33 | *
34 | * @return the set of sequences actually added to the transactions
35 | */
36 | public static HashMap<Sequence, Double> generateTransactionDatabase(final Map<Sequence, Double> sequences,
37 | final Table<Sequence, Integer, Double> probabilities, final int noTransactions, final File outFile)
38 | throws IOException {
39 |
40 | // Set random number seeds
41 | final Random random = new Random(1);
42 | final Random randomI = new Random(10);
43 | final RandomGenerator randomC = new JDKRandomGenerator();
44 | randomC.setSeed(100);
45 |
46 | // Storage for sequences actually added
47 | final HashMap<Sequence, Double> addedSequences = new HashMap<>();
48 |
49 | // Set output file
50 | final PrintWriter out = new PrintWriter(outFile, "UTF-8");
51 |
52 | // Add to distribution class for easy sampling
53 | final Map<Sequence, EnumeratedIntegerDistribution> dists = new HashMap<>();
54 | for (final Sequence seq : sequences.keySet()) {
55 | final List<Integer> singletons = new ArrayList<>();
56 | final List<Double> probs = new ArrayList<>();
57 | for (final Entry<Integer, Double> entry : probabilities.row(seq).entrySet()) {
58 | singletons.add(entry.getKey());
59 | probs.add(entry.getValue());
60 | }
61 | final EnumeratedIntegerDistribution dist = new EnumeratedIntegerDistribution(randomC,
62 | Ints.toArray(singletons), Doubles.toArray(probs));
63 | dists.put(seq, dist);
64 | }
65 |
66 | // Generate transaction database
67 | int count = 0;
68 | while (count < noTransactions) {
69 |
70 | // Generate transaction from distribution
71 | final Transaction transaction = sampleFromDistribution(random, sequences, dists, addedSequences, randomI);
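// Write the transaction in SPMF format: " -1 " delimits items, "-2" marks the
// end of the sequence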
72 | for (final int item : transaction) {
73 | out.print(item + " -1 ");
74 | }
75 | if (!transaction.isEmpty()) {
76 | out.print("-2");
77 | out.println();
78 | count++;
79 | }
80 |
81 | }
82 | out.close();
83 |
84 | // Print file to screen
85 | if (VERBOSE) {
86 | final FileReader reader = new FileReader(outFile);
87 | final LineIterator it = new LineIterator(reader);
88 | while (it.hasNext()) {
89 | System.out.println(it.nextLine());
90 | }
91 | LineIterator.closeQuietly(it);
92 | }
93 |
94 | return addedSequences;
95 | }
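// Illustrative usage (not part of the original source): generate 100 transactions
// where the sequence [1, 2] is included with probability 0.5 and, when included,
// occurs exactly once (the column key of `probabilities` is the occurrence count):
//
// final Map<Sequence, Double> seqs = new HashMap<>();
// seqs.put(new Sequence(1, 2), 0.5);
// final Table<Sequence, Integer, Double> probs = HashBasedTable.create();
// probs.put(new Sequence(1, 2), 1, 1.0);
// TransactionGenerator.generateTransactionDatabase(seqs, probs, 100, new File("toy.dat"));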
96 |
97 | /**
98 | * Randomly generate a transaction, randomly interleaving the sampled
99 | * interesting subsequences
100 | */
101 | public static Transaction sampleFromDistribution(final Random random, final Map<Sequence, Double> sequences,
102 | final Map<Sequence, EnumeratedIntegerDistribution> probabilities,
103 | final HashMap<Sequence, Double> addedSequences, final Random randomI) {
104 |
105 | // Sample counts for interesting sequences
106 | final Multiset<Sequence> seqsWithRep = HashMultiset.create();
107 | for (final Sequence seq : sequences.keySet()) {
108 | final int count = probabilities.get(seq).sample();
109 | seqsWithRep.add(seq, count);
110 | }
111 |
112 | final ArrayList<Integer> transaction = new ArrayList<>();
113 | for (final Sequence seq : seqsWithRep) {
114 | if (random.nextDouble() < sequences.get(seq)) {
115 | interleave(transaction, seq, randomI);
116 | addedSequences.put(seq, sequences.get(seq));
117 | }
118 | }
119 |
120 | return new Transaction(transaction);
121 | }
122 |
123 | /** Randomly interleave sequence into transaction */
124 | private static void interleave(final ArrayList<Integer> transaction, final Sequence seq, final Random randomI) {
125 | if (transaction.size() == 0) {
126 | transaction.addAll(seq);
127 | } else {
128 | int prev = 0;
129 | for (final Integer item : seq) {
130 | final int insertionPoint = randomI.nextInt((transaction.size() - prev) + 1) + prev;
131 | transaction.add(insertionPoint, item);
132 | prev = insertionPoint + 1;
133 | }
134 | }
135 | }
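// e.g. interleaving [7, 9] into [3, 4, 5]: 7 is inserted at a random position,
// say giving [3, 7, 4, 5], then 9 at a random position after 7, say giving
// [3, 7, 4, 9, 5], so the relative order within [7, 9] is always preserved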
136 |
137 | }
138 |
--------------------------------------------------------------------------------
/sequence-miner/src/main/java/sequencemining/transaction/TransactionList.java:
--------------------------------------------------------------------------------
1 | package sequencemining.transaction;
2 |
3 | import java.util.List;
4 |
5 | /** Wrapper class for storing transaction database as a list of transactions */
6 | public class TransactionList extends TransactionDatabase {
7 |
8 | private final List<Transaction> transactions;
9 |
10 | public TransactionList(final List<Transaction> transactions) {
11 | this.transactions = transactions;
12 | }
13 |
14 | @Override
15 | public List<Transaction> getTransactionList() {
16 | return transactions;
17 | }
18 |
19 | // @Override
20 | // public JavaRDD<Transaction> getTransactionRDD() {
21 | // throw new UnsupportedOperationException("This is a List, not an RDD!");
22 | // }
23 |
24 | @Override
25 | public int size() {
26 | return transactions.size();
27 | }
28 |
29 | // @Override
30 | // public void updateTransactionCache(
31 | // final JavaRDD<Transaction> updatedTransactions) {
32 | // throw new UnsupportedOperationException("This is a List, not an RDD!");
33 | // }
34 |
35 | }
36 |
--------------------------------------------------------------------------------
/sequence-miner/src/main/java/sequencemining/transaction/TransactionRDD.java:
--------------------------------------------------------------------------------
1 | package sequencemining.transaction;
2 |
3 | //import java.util.List;
4 | //
5 | //import org.apache.spark.api.java.JavaRDD;
6 | //
7 | ///** Wrapper class for storing transaction database as a Spark RDD */
8 | //public class TransactionRDD extends TransactionDatabase {
9 | //
10 | // private JavaRDD<Transaction> transactions;
11 | // private final long noTransactions;
12 | // private final String[] cachedDB;
13 | //
14 | // public TransactionRDD(final JavaRDD<Transaction> transactions,
15 | // final long noTransactions, final String[] cachedDB) {
16 | // this.transactions = transactions;
17 | // this.noTransactions = noTransactions;
18 | // this.cachedDB = cachedDB;
19 | // }
20 | //
21 | // @Override
22 | // public List<Transaction> getTransactionList() {
23 | // throw new UnsupportedOperationException("This is an RDD, not a List!");
24 | // }
25 | //
26 | // @Override
27 | // public JavaRDD<Transaction> getTransactionRDD() {
28 | // return transactions;
29 | // }
30 | //
31 | // @Override
32 | // public void updateTransactionCache(
33 | // final JavaRDD<Transaction> updatedTransactions) {
34 | // transactions = updatedTransactions;
35 | // }
36 | //
37 | // @Override
38 | // public long size() {
39 | // return noTransactions;
40 | // }
41 | //
42 | // @Override
43 | // public String[] getCachedDB() {
44 | // return cachedDB;
45 | // }
46 | //
47 | // }
48 |
--------------------------------------------------------------------------------
/sequence-miner/src/main/java/sequencemining/util/ClassRegistrator.java:
--------------------------------------------------------------------------------
1 | package sequencemining.util;
2 |
3 | //import itemsetmining.itemset.AbstractSequence;
4 | //import itemsetmining.itemset.Sequence;
5 | //import itemsetmining.main.InferenceAlgorithms.InferGreedy;
6 | //import itemsetmining.transaction.Transaction;
7 | //
8 | //import org.apache.spark.serializer.KryoRegistrator;
9 | //
10 | //import com.esotericsoftware.kryo.Kryo;
11 | //
12 | ///** Register custom classes for Spark Kryo serialization */
13 | //public class ClassRegistrator implements KryoRegistrator {
14 | //
15 | // @Override
16 | // public void registerClasses(final Kryo kryo) {
17 | // kryo.register(Transaction.class);
18 | // kryo.register(AbstractSequence.class);
19 | // kryo.register(Sequence.class);
20 | // kryo.register(InferGreedy.class);
21 | // }
22 | //
23 | // }
24 |
--------------------------------------------------------------------------------
/sequence-miner/src/main/java/sequencemining/util/Logging.java:
--------------------------------------------------------------------------------
1 | package sequencemining.util;
2 |
3 | import java.io.File;
4 | import java.io.FileInputStream;
5 | import java.io.FileOutputStream;
6 | import java.io.IOException;
7 | import java.io.ObjectInputStream;
8 | import java.io.ObjectOutputStream;
9 | import java.io.OutputStream;
10 | import java.text.SimpleDateFormat;
11 | import java.util.Date;
12 | import java.util.logging.ConsoleHandler;
13 | import java.util.logging.FileHandler;
14 | import java.util.logging.Formatter;
15 | import java.util.logging.Handler;
16 | import java.util.logging.Level;
17 | import java.util.logging.LogManager;
18 | import java.util.logging.LogRecord;
19 | import java.util.logging.Logger;
20 |
21 | import org.apache.commons.io.FilenameUtils;
22 |
23 | public class Logging {
24 |
25 | /** Set up logging to console */
26 | public static void setUpConsoleLogger(final Logger logger, final Level logLevel) {
27 | LogManager.getLogManager().reset();
28 | logger.setLevel(logLevel);
29 | final Handler handler = setUpConsoleHandler();
30 | logger.addHandler(handler);
31 | }
32 |
33 | /** Set up logging to file */
34 | public static void setUpFileLogger(final Logger logger, final Level logLevel, final File logFile) {
35 | LogManager.getLogManager().reset();
36 | logger.setLevel(logLevel);
37 | final Handler handler = setUpFileHandler(logFile.getAbsolutePath());
38 | logger.addHandler(handler);
39 | }
40 |
41 | /** Set up logging to console and file */
42 | public static void setUpConsoleAndFileLogger(final Logger logger, final Level logLevel, final File logFile) {
43 | LogManager.getLogManager().reset();
44 | logger.setLevel(logLevel);
45 | final Handler chandler = setUpConsoleHandler();
46 | final Handler fhandler = setUpFileHandler(logFile.getAbsolutePath());
47 | logger.addHandler(chandler);
48 | logger.addHandler(fhandler);
49 | }
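// Illustrative usage (not part of the original source):
// final Logger logger = Logger.getLogger("ISM");
// Logging.setUpConsoleAndFileLogger(logger, Level.FINE, new File("ism.log"));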
50 |
51 | /** Construct the log file name */
52 | public static File getLogFileName(final String algorithm, final boolean timeStampLog, final File logDir,
53 | final File dataset) {
54 | String timeStamp = "";
55 | if (timeStampLog)
56 | timeStamp = "-" + new SimpleDateFormat("dd.MM.yyyy-HH:mm:ss").format(new Date());
57 | return new File(logDir + File.separator + algorithm + "-" + FilenameUtils.getBaseName(dataset.getName())
58 | + timeStamp + ".log");
59 | }
60 |
61 | /** Set up console handler */
62 | public static Handler setUpConsoleHandler() {
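// ConsoleHandler writes to System.err by default; the subclass below redirects
// its output to System.out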
63 | final ConsoleHandler handler = new ConsoleHandler() {
64 | @Override
65 | protected void setOutputStream(final OutputStream out) throws SecurityException {
66 | super.setOutputStream(System.out);
67 | }
68 | };
69 | handler.setLevel(Level.ALL);
70 | final Formatter formatter = new Formatter() {
71 | @Override
72 | public String format(final LogRecord record) {
73 | return record.getMessage();
74 | }
75 | };
76 | handler.setFormatter(formatter);
77 | return handler;
78 | }
79 |
80 | /** Set up file handler */
81 | public static Handler setUpFileHandler(final String path) {
82 | FileHandler handler = null;
83 | try {
84 | handler = new FileHandler(path, 104857600, 1); // 100MB limit, single log file
85 | } catch (final SecurityException | IOException e) {
86 | throw new RuntimeException("Could not create log file handler: " + path, e);
87 | }
88 | handler.setLevel(Level.ALL);
89 | final Formatter formatter = new Formatter() {
90 | @Override
91 | public String format(final LogRecord record) {
92 | return record.getMessage();
93 | }
94 | };
95 | handler.setFormatter(formatter);
96 | return handler;
97 | }
98 |
99 | /** Serialize object to file */
100 | public static void serialize(final Object obj, final String filename) throws IOException {
101 | // try-with-resources ensures the stream is closed even if writing fails
102 | try (final ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(filename))) {
103 | oos.writeObject(obj);
104 | }
105 | }
106 |
107 | /** Deserialize object from file */
108 | public static Object deserializeFrom(final String filename) throws IOException, ClassNotFoundException {
109 | // try-with-resources ensures the stream is closed even if reading fails
110 | try (final ObjectInputStream ois = new ObjectInputStream(new FileInputStream(filename))) {
111 | final Object obj = ois.readObject();
112 | return obj;
113 | }
114 | }
115 |
116 | private Logging() {
117 | }
118 |
119 | }
120 |
--------------------------------------------------------------------------------
/sequence-miner/src/main/java/sequencemining/util/MemoryLogger.java:
--------------------------------------------------------------------------------
1 | package sequencemining.util;
2 |
3 | /*
4 | * Copyright (c) 2008-2012 Philippe Fournier-Viger
5 | *
6 | * This file is part of the SPMF DATA MINING SOFTWARE
7 | * (http://www.philippe-fournier-viger.com/spmf).
8 | *
9 | * SPMF is free software: you can redistribute it and/or modify
10 | * it under the terms of the GNU General Public License as published by
11 | * the Free Software Foundation, either version 3 of the License, or
12 | * (at your option) any later version.
13 | *
14 | * SPMF is distributed in the hope that it will be useful,
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 | * GNU General Public License for more details.
18 | *
19 | * You should have received a copy of the GNU General Public License
20 | * along with SPMF. If not, see <http://www.gnu.org/licenses/>.
21 | */
22 |
23 | /**
24 | * This class is used to record the maximum memory usage of an algorithm during
25 | * a given execution. It is implemented using the "singleton" design pattern.
26 | *
27 | */
28 | public class MemoryLogger {
29 |
30 | // the only instance of this class (this is the "singleton" design pattern)
31 | private static MemoryLogger instance = new MemoryLogger();
32 |
33 | // variable to store the maximum memory usage
34 | private double maxMemory = 0;
35 |
36 | /**
37 | * Method to obtain the only instance of this class
38 | *
39 | * @return instance of MemoryLogger
40 | */
41 | public static MemoryLogger getInstance() {
42 | return instance;
43 | }
44 |
45 | /**
46 | * To get the maximum amount of memory used until now
47 | *
48 | * @return a double value indicating memory as megabytes
49 | */
50 | public double getMaxMemory() {
51 | return maxMemory;
52 | }
53 |
54 | /**
55 | * Reset the maximum amount of memory recorded.
56 | */
57 | public void reset() {
58 | maxMemory = 0;
59 | }
60 |
61 | /**
62 | * Check the current memory usage and record it if it is higher than the
63 | * amount of memory previously recorded.
64 | */
65 | public void checkMemory() {
66 | final double currentMemory = (Runtime.getRuntime().totalMemory() - Runtime
67 | .getRuntime().freeMemory()) / 1024d / 1024d;
68 | if (currentMemory > maxMemory) {
69 | maxMemory = currentMemory;
70 | }
71 | }
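// Illustrative usage (not part of the original source): call
// MemoryLogger.getInstance().checkMemory() periodically inside an algorithm's
// main loop, then read the peak via getMaxMemory() once it terminates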
72 |
73 | }
74 |
--------------------------------------------------------------------------------
/sequence-miner/src/main/java/sequencemining/util/PartialLogFixer.java:
--------------------------------------------------------------------------------
1 | package sequencemining.util;
2 |
3 | import java.io.File;
4 | import java.io.FileWriter;
5 | import java.io.IOException;
6 | import java.util.HashMap;
7 | import java.util.Map;
8 | import java.util.Map.Entry;
9 | import java.util.regex.Matcher;
10 | import java.util.regex.Pattern;
11 |
12 | import org.apache.commons.io.input.ReversedLinesFileReader;
13 |
14 | import sequencemining.main.SequenceMining;
15 | import sequencemining.main.SequenceMiningCore;
16 | import sequencemining.sequence.Sequence;
17 | import sequencemining.transaction.TransactionList;
18 |
19 | /**
20 | * Read last EM step of partial sequence log and output interesting sequences
21 | * along with interestingness and probability and write to end of log file.
22 | */
23 | public class PartialLogFixer {
24 |
25 | public static void main(final String[] args) throws IOException {
26 | if (args.length != 2) {
27 | System.err.println("Usage: <transactionDB> <logFile>");
28 | System.exit(-1);
29 | }
30 |
31 | System.out.println("Reading sequences from last parameter EM step for " + args[1] + "...");
32 | final HashMap<Sequence, Double> itemsets = readLastEMStepSequences(new File(args[1]));
33 | System.out.println("done. Number of sequences: " + itemsets.size());
34 |
35 | System.out.println("\nWriting sorted sequences to " + args[1] + "...");
36 | sortSequencesInterestingness(itemsets, new File(args[0]), new File(args[1]));
37 | System.out.println("All done. Exiting.");
38 |
39 | }
40 |
41 | public static HashMap<Sequence, Double> readLastEMStepSequences(final File logFile) throws IOException {
42 | final HashMap<Sequence, Double> sequences = new HashMap<>();
43 |
44 | final ReversedLinesFileReader reader = new ReversedLinesFileReader(logFile);
45 | String line = reader.readLine();
46 | while (line != null) {
47 |
48 | if (line.contains("Parameter Optimal Sequences:")) {
49 | final Matcher m = Pattern
50 | .compile(
51 | "\\[((?:[0-9]|,| )+?)\\]=\\(((?:(?:[-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?)|,)+?)\\)")
52 | .matcher(line);
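// The regex captures entries of the form [1, 2, 3]=(0.6,0.3,0.1): group 1 holds
// the sequence's items and group 2 its occurrence-count probabilities, whose
// first entry is P(count == 0), hence prob = 1 - P(count == 0) below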
53 | while (m.find()) {
54 | final Sequence sequence = new Sequence();
55 | final String[] items = m.group(1).split(", ");
56 | for (final String item : items)
57 | sequence.add(Integer.parseInt(item));
58 | final double prob = 1 - Double.parseDouble(m.group(2).split(",")[0]);
59 | sequences.put(sequence, prob);
60 | }
61 | break;
62 | }
63 | line = reader.readLine();
64 |
65 | }
66 | reader.close();
67 |
68 | return sequences;
69 | }
70 |
71 | public static void sortSequencesInterestingness(final HashMap<Sequence, Double> sequences, final File transactionDB,
72 | final File logFile) throws IOException {
73 |
74 | // Read in transaction database
75 | final TransactionList transactions = SequenceMining.readTransactions(transactionDB);
76 |
77 | // Sort sequences by interestingness
78 | System.out.println("Sorting sequences by interestingness...");
79 | final HashMap<Sequence, Double> intMap = SequenceMiningCore.calculateInterestingness(sequences, transactions);
80 | final Map<Sequence, Double> sortedSequences = SequenceMiningCore.sortSequences(sequences, intMap);
81 |
82 | System.out.println("Writing out to file...");
83 | final FileWriter out = new FileWriter(logFile, true);
84 | out.write("\n============= INTERESTING SEQUENCES =============\n");
85 | for (final Entry<Sequence, Double> entry : sortedSequences.entrySet()) {
86 | out.write(String.format("%s\tprob: %1.5f \tint: %1.5f %n", entry.getKey(), entry.getValue(),
87 | intMap.get(entry.getKey())));
88 | }
89 | out.write("\n");
90 | out.close();
91 | System.out.println("done.");
92 |
93 | }
94 |
95 | }
96 |
--------------------------------------------------------------------------------
/sequence-miner/src/main/java/sequencemining/util/Tuple2.java:
--------------------------------------------------------------------------------
1 | package sequencemining.util;
2 |
3 | public class Tuple2<T1, T2> {
4 | public final T1 _1;
5 | public final T2 _2;
6 |
7 | public Tuple2(final T1 _1, final T2 _2) {
8 | this._1 = _1;
9 | this._2 = _2;
10 | }
11 |
12 | @Override
13 | public String toString() {
14 | return "(" + _1 + "," + _2 + ")";
15 | }
16 |
17 | @Override
18 | public int hashCode() {
19 | final int prime = 31;
20 | int result = 1;
21 | result = prime * result + ((_1 == null) ? 0 : _1.hashCode());
22 | result = prime * result + ((_2 == null) ? 0 : _2.hashCode());
23 | return result;
24 | }
25 |
26 | @Override
27 | public boolean equals(final Object obj) {
28 | if (this == obj)
29 | return true;
30 | if (!(obj instanceof Tuple2))
31 | return false;
32 | final Tuple2<?, ?> other = (Tuple2<?, ?>) obj;
33 | return (_1 == null ? other._1 == null : _1.equals(other._1))
34 | && (_2 == null ? other._2 == null : _2.equals(other._2));
35 | }
36 |
37 | }
38 |
--------------------------------------------------------------------------------
/sequence-miner/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Set everything to be logged to the console
2 | log4j.rootCategory=WARN, console
3 | log4j.appender.console=org.apache.log4j.ConsoleAppender
4 | log4j.appender.console.target=System.err
5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout
6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
7 |
8 | # Settings to quiet third party logs that are too verbose
9 | log4j.logger.org.eclipse.jetty=WARN
10 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
11 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
12 |
--------------------------------------------------------------------------------
/sequence-miner/src/main/resources/spark.properties:
--------------------------------------------------------------------------------
1 | # Main Spark Parameters
2 | SparkHome=/disk/data1/jfowkes/spark-1.1.0-bin-hadoop1
3 | SparkMaster=spark://cup04.inf.ed.ac.uk:7077
4 | MachinesInCluster=8
5 |
6 | # Main HDFS Parameters
7 | HDFSMaster=hdfs://cup04.inf.ed.ac.uk:54310/
8 | HDFSConfFile=/disk/data1/jfowkes/hadoop-1.0.4/conf/core-site.xml
9 |
--------------------------------------------------------------------------------
/sequence-miner/src/test/java/sequencemining/main/InitialProbabilitiesTest.java:
--------------------------------------------------------------------------------
1 | package sequencemining.main;
2 |
3 | import java.io.File;
4 | import java.io.IOException;
5 | import java.io.UnsupportedEncodingException;
6 | import java.net.URL;
7 |
8 | import org.junit.Test;
9 |
10 | import com.google.common.collect.Table;
11 |
12 | import sequencemining.sequence.Sequence;
13 |
14 | public class InitialProbabilitiesTest {
15 |
16 | @Test
17 | public void testScanDatabaseToDetermineInitialProbabilities() throws IOException {
18 |
19 | final File input = getTestFile("TOY.txt"); // database
20 | final Table<Sequence, Integer, Double> probs = SequenceMining
21 | .scanDatabaseToDetermineInitialProbabilities(input);
22 | System.out.println(SequenceMiningCore.probsToString(probs));
23 |
24 | }
25 |
26 | public File getTestFile(final String filename) throws UnsupportedEncodingException {
27 | final URL url = this.getClass().getClassLoader().getResource(filename);
28 | return new File(java.net.URLDecoder.decode(url.getPath(), "UTF-8"));
29 | }
30 |
31 | }
32 |
--------------------------------------------------------------------------------
/sequence-miner/src/test/java/sequencemining/main/SequenceMiningTest.java:
--------------------------------------------------------------------------------
1 | package sequencemining.main;
2 |
3 | import static org.junit.Assert.assertEquals;
4 |
5 | import java.util.HashMap;
6 | import java.util.Map;
7 |
8 | import org.junit.Test;
9 |
10 | import com.google.common.collect.HashBasedTable;
11 | import com.google.common.collect.HashMultiset;
12 | import com.google.common.collect.Multiset;
13 |
14 | import sequencemining.main.InferenceAlgorithms.InferGreedy;
15 | import sequencemining.main.InferenceAlgorithms.InferenceAlgorithm;
16 | import sequencemining.sequence.Sequence;
17 | import sequencemining.transaction.Transaction;
18 |
19 | public class SequenceMiningTest {
20 |
21 | @Test
22 | public void testDoInference() {
23 |
24 | // TODO better tests??
25 |
26 | // Subsequences
27 | final Sequence s1 = new Sequence(3, 4, 5, 8);
28 | final Map<Integer, Double> p1 = new HashMap<>();
29 | p1.put(0, 0.6);
30 | p1.put(1, 0.4);
31 | final Sequence s2 = new Sequence(7, 9);
32 | final Map<Integer, Double>