├── README.md └── bide_alg.py /README.md: -------------------------------------------------------------------------------- 1 |

Bide algorithm

2 | An implementation of the BIDE algorithm for mining closed frequent sequences. 3 | 4 | Example 1 5 | --------------------------- 6 | The sample database is taken from J. Wang and J. Han [1]. 7 | ``` 8 | from bide_alg import * 9 | 10 | db = [ 11 | ['c', 'a', 'a', 'b', 'c'], 12 | ['a', 'b', 'c', 'b'], 13 | ['c', 'a', 'b', 'c'], 14 | ['a', 'b', 'b', 'c', 'a'] 15 | ] 16 | ``` 17 | Run the BIDE algorithm on db to find the closed frequent patterns with support >= 2, minimum length >= 0, and maximum length <= 5: 18 | ``` 19 | bide_obj = bide_alg(db, 2 , 0, 5) 20 | bide_obj._mine() 21 | ``` 22 | Show the results: 23 | ``` 24 | bide_obj._results 25 | ``` 26 | ``` 27 | [(['c', 'a'], 3), 28 | (['c', 'a', 'b', 'c'], 2), 29 | (['c', 'b'], 3), 30 | (['a', 'a'], 2), 31 | (['a', 'b', 'c'], 4), 32 | (['a', 'b', 'b'], 2)] 33 | ``` 34 | 35 | Example 2 36 | --------------------------- 37 | Given the following data, where each element is a sequence: 38 | ``` 39 | from bide_alg import * 40 | db = [ 41 | [0, 1, 2, 3, 4, 4], 42 | [1, 1, 1, 3, 4, 3], 43 | [2, 1, 2, 2, 0], 44 | [1, 1, 1, 2, 2, 4, 3], 45 | ] 46 | ``` 47 | Run the BIDE algorithm on db to find the closed frequent patterns with support >= 2, minimum length >= 2, and maximum length <= 5: 48 | ``` 49 | bide_obj = bide_alg(db, 2 , 2, 5) 50 | bide_obj._mine() 51 | ``` 52 | Show the results: 53 | ``` 54 | bide_obj._results 55 | ``` 56 | ``` 57 | [([1, 2], 3), 58 | ([1, 2, 3], 2), 59 | ([1, 2, 4], 2), 60 | ([1, 2, 2], 2), 61 | ([1, 3], 3), 62 | ([1, 3, 4], 2), 63 | ([1, 4], 3), 64 | ([1, 1, 1, 4, 3], 2)] 65 | ``` 66 | 67 | 68 | Reference 69 | --------------------------- 70 | 1. J. Wang and J. Han, "BIDE: efficient mining of frequent closed sequences," Proceedings. 20th International Conference on Data Engineering, Boston, MA, USA, 2004, pp. 79-90. 
"""Mine closed frequent sequences with the BIDE algorithm.

References
----------
1. J. Wang and J. Han, "BIDE: efficient mining of frequent closed sequences,"
   Proceedings. 20th International Conference on Data Engineering, Boston,
   MA, USA, 2004, pp. 79-90.
   doi: 10.1109/ICDE.2004.1319986
   keywords: {data mining;optimisation;search problems;pattern mining
   algorithm;BIDE;sequence closure checking;frequent closed sequence;
   bidirectional extension;search space;BackScan pruning method;Scan-Skip
   optimization technique;Data mining;Itemsets;Bidirectional control;
   Optimization methods;Pattern analysis;Computer science;Runtime;Databases;
   Proteins;XML},
   URL: http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=1319986&isnumber=29235

2. Refer to package prefixspan, https://pypi.org/project/prefixspan/
"""
from typing import *
from collections import defaultdict

try:
    # Nothing below references PrefixSpan; the import is kept only so the name
    # is still exported to ``from bide_alg import *`` users when the package
    # is installed.  A missing package must not make this module unusable.
    from prefixspan import PrefixSpan
except ImportError:
    PrefixSpan = None

# One (row index, position) pair per matching sequence.
Entries = List[Tuple[int, int]]

# Marks the open ends of a pattern during backward scans.  A private object is
# used instead of ``None`` so that it can never compare equal to a real item
# (``None`` is a perfectly valid item inside a sequence).
_BOUNDARY = object()


def invertedindex(seqs: Iterable[Sequence[Any]], entries: Optional[Entries] = None) -> Mapping[Any, Entries]:
    """Map every item to the first place it occurs in each sequence.

    Args:
        seqs: the sequences (or sequence suffixes) to index.
        entries: optional ``(row, lastpos)`` pair for each element of *seqs*.
            When given, ``seqs[k]`` is treated as the part of row ``row`` that
            starts at ``lastpos + 1``, so reported positions are relative to
            the full row.  When omitted (or empty), ``seqs[k]`` is row ``k``
            and positions start at 0.

    Returns:
        A ``defaultdict`` mapping each item to a list of ``(row, position)``
        pairs, at most one pair per row (the earliest occurrence), in the
        order the rows were visited.
    """
    index: DefaultDict[Any, Entries] = defaultdict(list)

    for k, seq in enumerate(seqs):
        row, lastpos = entries[k] if entries else (k, -1)

        for pos, item in enumerate(seq, start=lastpos + 1):
            hits = index[item]
            # Rows are processed one after another, so a hit for this row can
            # only be the most recent entry; keep the first occurrence only.
            if hits and hits[-1][0] == row:
                continue
            hits.append((row, pos))

    return index


def nextentries(data: Sequence[Sequence[Any]], entries: Entries) -> Mapping[Any, Entries]:
    """Index the items that follow each match position.

    Args:
        data: the complete sequence database.
        entries: ``(row, lastpos)`` pairs; only ``data[row][lastpos + 1:]`` is
            searched for each pair.

    Returns:
        The mapping produced by :func:`invertedindex` for those suffixes, with
        positions expressed relative to the full rows.
    """
    suffixes = (data[row][lastpos + 1:] for row, lastpos in entries)
    return invertedindex(suffixes, entries)


class bide_alg:
    """Closed frequent sequence miner.

    Attributes set by the constructor:
        _db: the sequence database (a sequence of item sequences).
        minsup: minimum number of sequences a pattern must occur in.
        minlen: minimum length of a reported pattern.
        maxlen: maximum length of a pattern; the search does not extend
            patterns beyond it.
        _results: list of ``(pattern, support)`` tuples filled by
            :meth:`_mine`.

    Note: with ``minlen == 0`` the empty pattern is itself a candidate and may
    be reported (for example when no single item occurs in every sequence).
    """

    def __init__(self, db, minsup, minlen, maxlen):
        self._db = db
        self.minsup = minsup
        self.minlen = minlen
        self.maxlen = maxlen
        self._results = []

    def __reversescan(self, db, patt, matches, check_type):
        """Scan backwards through the matches looking for a shared extra item.

        Args:
            db: the complete database.
            patt: the pattern, padded by the caller with boundary markers.
                Every element except the last is visited, from right to left.
            matches: list of ``(row, endpos)`` tuples.  It is MODIFIED IN
                PLACE: ``endpos`` moves left to each located pattern item, so
                callers must pass a list they own.
            check_type: ``'closed'`` to answer "is the pattern closed?"; any
                other value answers "can the pattern be pruned?".

        Returns:
            For ``'closed'``: False as soon as some gap contains an item that
            is shared by every match, True otherwise.  For any other
            *check_type*: the opposite.
        """
        def has_common_gap_item(previtem):
            # For every match, collect the items lying between the current end
            # position and the nearest earlier occurrence of ``previtem``,
            # then intersect those collections across all matches.
            common = set()

            for k, (row, endpos) in enumerate(matches):
                gap_items = set()

                for pos in range(endpos - 1, -1, -1):
                    item = db[row][pos]

                    if item == previtem:
                        matches[k] = (row, pos)
                        break

                    gap_items.add(item)

                # No early exit even when ``common`` is already empty: every
                # match still has to be moved left for the next pattern item.
                if k == 0:
                    common.update(gap_items)
                else:
                    common.intersection_update(gap_items)

            return bool(common)

        # ``any`` stops at the first gap that yields a shared item.
        found = any(has_common_gap_item(previtem) for previtem in reversed(patt[:-1]))
        return (not found) if check_type == 'closed' else found

    def isclosed(self, db, patt, matches):
        """Return True when no backward extension of *patt* keeps its support.

        The scan starts from the end of every matching row, and the pattern is
        padded with a boundary marker on both sides.
        """
        padded = [_BOUNDARY, *patt, _BOUNDARY]
        row_ends = [(row, len(db[row])) for row, _ in matches]
        return self.__reversescan(db, padded, row_ends, 'closed')

    def canclosedprune(self, db, patt, matches):
        """Return True when the search below *patt* can be skipped.

        The scan starts from the given match positions; *matches* itself is
        left untouched because a copy is scanned.
        """
        return self.__reversescan(db, [_BOUNDARY, *patt], list(matches), 'prune')

    def bide_frequent_rec(self, patt, matches):
        """Grow *patt* depth-first, recording closed frequent patterns.

        Args:
            patt: the current pattern (a list of items).
            matches: ``(row, pos)`` for every row containing *patt*, where
                ``pos`` is the position at which the pattern's last item was
                matched in that row.

        Returns:
            None.  Results are appended to ``self._results``.
        """
        sup = len(matches)

        # An extension can never have more support than its prefix, so an
        # infrequent prefix is a dead end whatever its length.
        if sup < self.minsup:
            return None

        at_limit = len(patt) >= self.maxlen
        # Extensions are not generated once the length limit is reached, which
        # also means such a pattern has no forward extension to be checked.
        occurs = {} if at_limit else nextentries(self._db, matches)

        if len(patt) >= self.minlen:
            # Forward check: an extension with identical support makes the
            # current pattern non-closed.  Deciding this before recording
            # avoids appending and later searching the result list to remove.
            has_forward_ext = any(len(newmatches) == sup for newmatches in occurs.values())
            if not has_forward_ext and self.isclosed(self._db, patt, matches):
                self._results.append((patt, sup))

        if at_limit:
            return None

        for newitem, newmatches in occurs.items():
            newpatt = patt + [newitem]

            # Skip the whole branch when the backward scan allows pruning.
            if self.canclosedprune(self._db, newpatt, newmatches):
                continue
            self.bide_frequent_rec(newpatt, newmatches)

        return None

    def _mine(self):
        """Run the search over the whole database.

        Previous results are discarded first.

        Returns:
            ``self._results``: the list of ``(pattern, support)`` tuples.
        """
        self._results.clear()

        self.bide_frequent_rec([], [(row, -1) for row in range(len(self._db))])
        return self._results