├── README.md
└── bide_alg.py
/README.md:
--------------------------------------------------------------------------------
1 |
Bide algorithm
2 | An implementation of the BIDE algorithm for mining frequent closed sequences.
3 |
4 | Example 1
5 | ---------------------------
6 | The sample data is taken from J. Wang and J. Han [1].
7 | ```
8 | from bide_alg import *
9 |
10 | db = [
11 | ['c', 'a', 'a', 'b', 'c'],
12 | ['a', 'b', 'c', 'b'],
13 | ['c', 'a', 'b', 'c'],
14 | ['a', 'b', 'b', 'c', 'a']
15 | ]
16 | ```
17 | Run the BIDE algorithm on db to find the frequent closed patterns whose support is at least 2 and whose length is between 0 and 5 (inclusive):
18 | ```
19 | bide_obj = bide_alg(db, 2 , 0, 5)
20 | bide_obj._mine()
21 | ```
22 | Show results:
23 | ```
24 | bide_obj._results
25 | ```
26 | ```
27 | [(['c', 'a'], 3),
28 | (['c', 'a', 'b', 'c'], 2),
29 | (['c', 'b'], 3),
30 | (['a', 'a'], 2),
31 | (['a', 'b', 'c'], 4),
32 | (['a', 'b', 'b'], 2)]
33 | ```
34 |
35 | Example 2
36 | ---------------------------
37 | The data is given as follows; each element of db is a sequence.
38 | ```
39 | from bide_alg import *
40 | db = [
41 | [0, 1, 2, 3, 4, 4],
42 | [1, 1, 1, 3, 4, 3],
43 | [2, 1, 2, 2, 0],
44 | [1, 1, 1, 2, 2, 4, 3],
45 | ]
46 | ```
47 | Run the BIDE algorithm on db to find the frequent closed patterns whose support is at least 2 and whose length is between 2 and 5 (inclusive):
48 | ```
49 | bide_obj = bide_alg(db, 2 , 2, 5)
50 | bide_obj._mine()
51 | ```
52 | Show result:
53 | ```
54 | bide_obj._results
55 | ```
56 | ```
57 | [([1, 2], 3),
58 | ([1, 2, 3], 2),
59 | ([1, 2, 4], 2),
60 | ([1, 2, 2], 2),
61 | ([1, 3], 3),
62 | ([1, 3, 4], 2),
63 | ([1, 4], 3),
64 | ([1, 1, 1, 4, 3], 2)]
65 | ```
66 |
67 |
68 | Reference
69 | ---------------------------
70 | 1. J. Wang and J. Han, "BIDE: efficient mining of frequent closed sequences," Proceedings. 20th International Conference on Data Engineering, Boston, MA, USA, 2004, pp. 79-90.
71 | doi: 10.1109/ICDE.2004.1319986
72 | keywords: {data mining;optimisation;search problems;pattern mining algorithm;BIDE;sequence closure checking;frequent closed sequence;bidirectional extension;search space;BackScan pruning method;Scan-Skip optimization technique;Data mining;Itemsets;Bidirectional control;Optimization methods;Pattern analysis;Computer science;Runtime;Databases;Proteins;XML},
73 | URL: http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=1319986&isnumber=29235
74 |
75 | 2. See the prefixspan package: https://pypi.org/project/prefixspan/
--------------------------------------------------------------------------------
/bide_alg.py:
--------------------------------------------------------------------------------
1 | from typing import *
2 | from collections import defaultdict
3 | from prefixspan import PrefixSpan
4 |
# A list of (sequence id, position) pairs.
Entries = List[Tuple[int, int]]


def invertedindex(seqs: Iterable[Sequence[Any]], entries: Entries = None) -> Mapping[Any, Entries]:
    """Map every item to the first position at which it occurs in each sequence.

    Args:
        seqs: the sequences to index, visited in order.
        entries: optional ``(sequence id, last position)`` pairs, one per
            sequence in ``seqs``. When given (and non-empty), the k-th
            sequence is reported under the id of ``entries[k]`` and its
            positions are numbered starting right after that last position.
            Otherwise the k-th sequence gets id ``k`` and positions start at 0.

    Returns:
        A ``defaultdict(list)`` whose keys are items and whose values are
        ``(sequence id, position)`` pairs, at most one per sequence, in the
        order the sequences were visited.
    """
    first_seen: Mapping[Any, Entries] = defaultdict(list)

    for order, seq in enumerate(seqs):
        if entries:
            seq_id, lastpos = entries[order]
        else:
            seq_id, lastpos = order, -1

        for pos, item in enumerate(seq, start=lastpos + 1):
            hits = first_seen[item]
            # Only the first occurrence inside a sequence is recorded; a later
            # occurrence finds this sequence's id already at the tail.
            already_recorded = bool(hits) and hits[-1][0] == seq_id
            if not already_recorded:
                hits.append((seq_id, pos))

    return first_seen
21 |
22 |
def nextentries(data: Sequence[Sequence[Any]], entries: Entries) -> Mapping[Any, Entries]:
    """Index the items that follow each entry's position in its sequence.

    For every ``(sequence id, last position)`` pair in ``entries`` the part of
    ``data[sequence id]`` after ``last position`` is handed to
    ``invertedindex`` together with ``entries``, so the reported positions
    refer to the full sequences in ``data``.
    """
    # Lazily sliced suffixes, one per entry, consumed by invertedindex.
    suffixes = (data[seq_id][lastpos + 1:] for seq_id, lastpos in entries)
    return invertedindex(suffixes, entries)
class bide_alg:
    """Mine frequent closed sequences from a sequence database (BIDE).

    Call ``_mine()`` and then read ``_results``, a list of
    ``(pattern, support)`` tuples where ``pattern`` is a list of items and
    ``support`` is the number of sequences that contain it.
    """

    def __init__(self, db, minsup, minlen, maxlen):
        """Store the database and the mining thresholds.

        Args:
            db: sequence database; ``db[i]`` is an indexable sequence of items.
            minsup: minimum number of supporting sequences for a pattern.
            minlen: minimum length of a reported pattern.
            maxlen: pattern length at which the search stops growing patterns.
        """
        self._db = db
        self.minsup = minsup
        self.minlen = minlen
        self.maxlen = maxlen
        self._results = []

    def __reversescan(self, db, patt, matches, check_type):
        """Scan backwards through the matched sequences looking for an item
        shared by every one of them.

        Args:
            db: complete database.
            patt: the pattern, padded by the caller with ``None`` sentinels.
                Its last element is skipped; the rest are visited in reverse.
            matches: list of ``(row index, end position)`` tuples. It is
                updated in place while scanning, so callers pass a list they
                own.
            check_type: ``'closed'`` to get ``True`` when no shared item is
                found; any other value to get ``True`` when one is found.

        Returns:
            The outcome of the check selected by ``check_type``.
        """

        def islocalclosed(previtem):
            # Items seen in every row between the current end position
            # (exclusive) and the nearest earlier occurrence of previtem.
            closeditems = set()

            for k, (i, endpos) in enumerate(matches):
                localitems = set()

                for startpos in range(endpos - 1, -1, -1):
                    item = db[i][startpos]

                    if item == previtem:
                        # The next scan of this row stops before this position.
                        matches[k] = (i, startpos)
                        break

                    localitems.add(item)

                # First row seeds the candidate set; later rows intersect it.
                if k == 0:
                    closeditems.update(localitems)
                else:
                    closeditems.intersection_update(localitems)

            return len(closeditems) > 0

        # any() stops at the first pattern position where a shared item exists.
        found = any(islocalclosed(previtem) for previtem in reversed(patt[:-1]))

        return (not found) if check_type == 'closed' else found

    def isclosed(self, db, patt, matches):
        """Return ``True`` when the reverse scan, started from the end of each
        matched sequence, finds no item shared by all of them.

        Only the row indices of ``matches`` are used.
        """
        from_ends = [(i, len(db[i])) for i, _ in matches]
        return self.__reversescan(db, [None, *patt, None], from_ends, 'closed')

    def canclosedprune(self, db, patt, matches):
        """Return ``True`` when the reverse scan, started from the given match
        positions, finds an item shared by all matched sequences.

        ``matches`` is copied first, so the caller's list is left untouched.
        """
        return self.__reversescan(db, [None, *patt], matches[:], 'prune')

    def bide_frequent_rec(self, patt, matches):
        """Grow ``patt`` depth-first and record the closed patterns found.

        Args:
            patt: the current pattern (list of items).
            matches: ``(row index, position of patt's last item)`` for every
                sequence containing ``patt``; its length is the support.
        """
        sup = len(matches)

        # Every extension is supported by a subset of these matches, so an
        # infrequent pattern can be abandoned whatever its length.
        if sup < self.minsup:
            return None

        # At the length limit no extension is generated or examined.
        at_maxlen = len(patt) == self.maxlen
        occurs = {} if at_maxlen else nextentries(self._db, matches)

        if len(patt) >= self.minlen:
            # An extension with the same support means patt is not closed.
            has_forward_extension = any(
                len(newmatches) == sup for newmatches in occurs.values()
            )
            if not has_forward_extension and self.isclosed(self._db, patt, matches):
                self._results.append((patt, sup))

        for newitem, newmatches in occurs.items():
            newpatt = patt + [newitem]

            # Skip extensions that the reverse scan marks as prunable.
            if self.canclosedprune(self._db, newpatt, newmatches):
                continue
            self.bide_frequent_rec(newpatt, newmatches)

        return None

    def _mine(self):
        """Clear ``_results`` and mine the whole database from the empty pattern."""
        self._results.clear()

        self.bide_frequent_rec([], [(i, -1) for i in range(len(self._db))])
122 |
--------------------------------------------------------------------------------