├── testdataABCD.csv ├── testdata0.csv ├── README.md ├── testdata1.csv ├── bridges.csv ├── tane.py └── ctane.py /testdataABCD.csv: -------------------------------------------------------------------------------- 1 | A,B,C,D 2 | 1,1,5,5 3 | 1,1,1,3 4 | 5,1,2,3 5 | -------------------------------------------------------------------------------- /testdata0.csv: -------------------------------------------------------------------------------- 1 | A,B,C,D,E,F,G,H,I,J,K 2 | 128059,1,1,1,1,2,5,5,1,1,2 3 | 1285531,1,1,1,1,2,1,3,1,1,2 4 | 1287775,5,1,1,2,2,2,3,1,1,2 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FD_CFD_extraction 2 | 3 | This repository contains the implementation of two algorithms, TANE and CTANE, corresponding to the following publications: 4 | 5 | 1. "TANE: An Efficient Algorithm for Discovering Functional and Approximate Dependencies" (link: https://www.lri.fr/~pierres/donn%E9es/save/these/articles/lpr-queue/huhtala99tane.pdf) 6 | 7 | 2. "Discovering Conditional Functional Dependencies" (link: http://homepages.inf.ed.ac.uk/fgeerts/pdf/CFDdiscovery.pdf) 8 | 9 | We have also provided several CSV files as test data. 10 | 11 | This code was used in the following work: 12 | "Automatic Discovery of Functional Dependencies and Conditional Functional 13 | Dependencies: A Comparative Study" (link: https://cs.uwaterloo.ca/~nasghar/848.pdf) 14 | 15 | ## Running the code 16 | 17 | To run tane.py on a particular csv file (e.g. adult.csv), execute the following command in your terminal: 18 | ``` 19 | python tane.py adult.csv 20 | ``` 21 | To run ctane.py on the same data, execute: 22 | ``` 23 | python ctane.py adult.csv 24 | ``` 25 | To run ctane.py and obtain k-frequent CFDs, execute: 26 | ``` 27 | python ctane.py adult.csv k 28 | ``` 29 | where k is your integer of choice. 
30 | 31 | -------------------------------------------------------------------------------- /testdata1.csv: -------------------------------------------------------------------------------- 1 | A,B,C,D,E,F,G,H,I,J,K 2 | 128059,1,1,1,1,2,5,5,1,1,2 3 | 1285531,1,1,1,1,2,1,3,1,1,2 4 | 1287775,5,1,1,2,2,2,3,1,1,2 5 | 144888,8,10,10,8,5,10,7,8,1,4 6 | 145447,8,4,4,1,2,9,3,3,1,4 7 | 167528,4,1,1,1,2,1,3,6,1,2 8 | 169356,3,1,1,1,2,?,3,1,1,2 9 | 183913,1,2,2,1,2,1,1,1,1,2 10 | 191250,10,4,4,10,2,10,5,3,3,4 11 | 1017023,6,3,3,5,3,10,3,5,3,2 12 | 1100524,6,10,10,2,8,10,7,3,3,4 13 | 1116116,9,10,10,1,10,8,3,3,1,4 14 | 1168736,5,6,6,2,4,10,3,6,1,4 15 | 1182404,3,1,1,1,2,1,1,1,1,2 16 | 1182404,3,1,1,1,2,1,2,1,1,2 17 | 1198641,3,1,1,1,2,1,3,1,1,2 18 | 242970,5,7,7,1,5,8,3,4,1,2 19 | 1151734,10,8,7,4,3,10,7,9,1,4 20 | 1156017,3,1,1,1,2,1,2,1,1,2 21 | 1158247,1,1,1,1,1,1,1,1,1,2 22 | 677910,5,2,2,4,2,4,1,1,1,2 23 | 734111,1,1,1,3,2,3,1,1,1,2 24 | 734111,1,1,1,1,2,2,1,1,1,2 25 | 780555,5,1,1,6,3,1,2,1,1,2 26 | 827627,2,1,1,1,2,1,1,1,1,2 27 | 1049837,1,1,1,1,2,1,1,1,1,2 28 | 1058849,5,1,1,1,2,1,1,1,1,2 29 | 1182404,1,1,1,1,1,1,1,1,1,2 30 | 1193544,5,7,9,8,6,10,8,10,1,4 31 | 1201870,4,1,1,3,1,1,2,1,1,2 32 | 1202253,5,1,1,1,2,1,1,1,1,2 33 | 1227081,3,1,1,3,2,1,1,1,1,2 34 | 1230994,4,5,5,8,6,10,10,7,1,4 35 | 1238410,2,3,1,1,3,1,1,1,1,2 36 | 1246562,10,2,2,1,2,6,1,1,2,4 37 | 1257470,10,6,5,8,5,10,8,6,1,4 38 | 1259008,8,8,9,6,6,3,10,10,1,4 39 | 1266124,5,1,2,1,2,1,1,1,1,2 40 | 1267898,5,1,3,1,2,1,1,1,1,2 41 | 1268313,5,1,1,3,2,1,1,1,1,2 42 | 1298416,10,6,6,2,4,10,9,7,1,4 43 | 1299596,6,6,6,5,4,10,7,6,2,4 44 | 1105524,4,1,1,1,2,1,1,1,1,2 45 | 1253917,4,1,1,2,2,1,2,1,1,2 46 | 1265899,4,1,1,1,2,1,3,1,1,2 47 | 1268766,1,1,1,1,2,1,1,1,1,2 48 | 1277268,3,3,1,1,2,1,1,1,1,2 49 | 1286943,8,10,10,10,7,5,4,8,7,4 50 | 1295508,1,1,1,1,2,4,1,1,1,2 51 | 1297327,5,1,1,1,2,1,1,1,1,2 52 | 1297522,2,1,1,1,2,1,1,1,1,2 53 | 1298360,1,1,1,1,2,1,1,1,1,2 54 | 1299924,5,1,1,1,2,1,2,1,1,2 55 | 
1299994,5,1,1,1,2,1,1,1,1,2 56 | 1304595,3,1,1,1,1,1,2,1,1,2 57 | 749653,3,1,1,1,2,1,2,1,1,2 58 | 769612,3,1,1,2,2,1,1,1,1,2 59 | 769612,4,1,1,1,2,1,1,1,1,2 60 | 798429,4,1,1,1,2,1,3,1,1,2 61 | 807657,6,1,3,2,2,1,1,1,1,2 62 | 8233704,4,1,1,1,1,1,2,1,1,2 63 | 837480,7,4,4,3,4,10,6,9,1,4 64 | 867392,4,2,2,1,2,1,2,1,1,2 65 | 1240603,2,1,1,1,1,1,1,1,1,2 66 | 1240603,3,1,1,1,1,1,1,1,1,2 67 | 1241035,7,8,3,7,4,5,7,8,2,4 68 | 1287971,3,1,1,1,2,1,2,1,1,2 69 | 1289391,1,1,1,1,2,1,3,1,1,2 70 | 1299924,3,2,2,2,2,1,4,2,1,2 71 | 1306339,4,4,2,1,2,5,2,1,2,2 72 | 1321942,5,1,1,1,2,1,3,1,1,2 73 | 1328331,1,1,1,1,2,1,3,1,1,2 74 | 1328755,3,1,1,1,2,1,2,1,1,2 75 | 1331405,4,1,1,1,2,1,3,2,1,2 76 | 1331412,5,7,10,10,5,10,10,10,1,4 77 | 1333104,3,1,2,1,2,1,3,1,1,2 78 | 1334071,4,1,1,1,2,3,2,1,1,2 79 | 1343068,8,4,4,1,6,10,2,5,2,4 80 | 1343374,10,10,8,10,6,5,10,3,1,4 81 | 1344121,8,10,4,4,8,10,8,2,1,4 82 | 142932,7,6,10,5,3,10,9,10,2,4 83 | 183936,3,1,1,1,2,1,2,1,1,2 84 | 324382,1,1,1,1,2,1,2,1,1,2 85 | 378275,10,9,7,3,4,2,7,7,1,4 86 | 878358,5,7,10,6,5,10,7,5,1,4 87 | 1107684,6,10,5,5,4,10,6,10,1,4 88 | 1115762,3,1,1,1,2,1,1,1,1,2 89 | 1217717,5,1,1,6,3,1,1,1,1,2 90 | 1239420,1,1,1,1,2,1,1,1,1,2 91 | 1254538,8,10,10,10,6,10,10,10,1,4 92 | 1261751,5,1,1,1,2,1,2,2,1,2 93 | 1268275,9,8,8,9,6,3,4,1,1,4 94 | 1324572,5,1,1,1,2,1,2,2,1,2 95 | 1324681,4,1,1,1,2,1,2,1,1,2 96 | 1325159,3,1,1,1,2,1,3,1,1,2 97 | 1326892,3,1,1,1,2,1,2,1,1,2 98 | 1339781,4,1,1,1,2,1,3,1,1,2 99 | 13454352,1,1,3,1,2,1,2,1,1,2 100 | 1345452,1,1,3,1,2,1,2,1,1,2 101 | 1345593,3,1,1,3,2,1,2,1,1,2 102 | -------------------------------------------------------------------------------- /bridges.csv: -------------------------------------------------------------------------------- 1 | A,B,C,D,E,F,G,H,I,J,K,L,M 2 | E1,M,3,1818,HIGHWAY,?,2,N,THROUGH,WOOD,SHORT,S,WOOD 3 | E2,A,25,1819,HIGHWAY,1037,2,N,THROUGH,WOOD,SHORT,S,WOOD 4 | E3,A,39,1829,AQUEDUCT,?,1,N,THROUGH,WOOD,?,S,WOOD 5 | 
E5,A,29,1837,HIGHWAY,1000,2,N,THROUGH,WOOD,SHORT,S,WOOD 6 | E6,M,23,1838,HIGHWAY,?,2,N,THROUGH,WOOD,?,S,WOOD 7 | E7,A,27,1840,HIGHWAY,990,2,N,THROUGH,WOOD,MEDIUM,S,WOOD 8 | E8,A,28,1844,AQUEDUCT,1000,1,N,THROUGH,IRON,SHORT,S,SUSPEN 9 | E9,M,3,1846,HIGHWAY,1500,2,N,THROUGH,IRON,SHORT,S,SUSPEN 10 | E10,A,39,1848,AQUEDUCT,?,1,N,DECK,WOOD,?,S,WOOD 11 | E11,A,29,1851,HIGHWAY,1000,2,N,THROUGH,WOOD,MEDIUM,S,WOOD 12 | E12,A,39,1853,RR,?,2,N,DECK,WOOD,?,S,WOOD 13 | E14,M,6,1856,HIGHWAY,1200,2,N,THROUGH,WOOD,MEDIUM,S,WOOD 14 | E13,A,33,1856,HIGHWAY,?,2,N,THROUGH,WOOD,?,S,WOOD 15 | E15,A,28,1857,RR,?,2,N,THROUGH,WOOD,?,S,WOOD 16 | E16,A,25,1859,HIGHWAY,1030,2,N,THROUGH,IRON,MEDIUM,S-F,SUSPEN 17 | E17,M,4,1863,RR,1000,2,N,THROUGH,IRON,MEDIUM,?,SIMPLE-T 18 | E18,A,28,1864,RR,1200,2,N,THROUGH,IRON,SHORT,S,SIMPLE-T 19 | E19,A,29,1866,HIGHWAY,1000,2,N,THROUGH,WOOD,MEDIUM,S,WOOD 20 | E20,A,32,1870,HIGHWAY,1000,2,N,THROUGH,WOOD,MEDIUM,S,WOOD 21 | E21,M,16,1874,RR,?,2,?,THROUGH,IRON,?,?,SIMPLE-T 22 | E23,M,1,1876,HIGHWAY,1245,?,?,THROUGH,STEEL,LONG,F,SUSPEN 23 | E22,A,24,1876,HIGHWAY,1200,4,G,THROUGH,WOOD,SHORT,S,WOOD 24 | E24,O,45,1878,RR,?,2,G,?,STEEL,?,?,SIMPLE-T 25 | E25,M,10,1882,RR,?,2,G,?,STEEL,?,?,SIMPLE-T 26 | E27,A,39,1883,RR,?,2,G,THROUGH,STEEL,?,F,SIMPLE-T 27 | E26,M,12,1883,RR,1150,2,G,THROUGH,STEEL,MEDIUM,S,SIMPLE-T 28 | E30,A,31,1884,RR,?,2,G,THROUGH,STEEL,MEDIUM,F,SIMPLE-T 29 | E29,A,26,1884,HIGHWAY,1080,2,G,THROUGH,STEEL,MEDIUM,?,SUSPEN 30 | E28,M,3,1884,HIGHWAY,1000,2,G,THROUGH,STEEL,MEDIUM,S,ARCH 31 | E32,A,30,1887,HIGHWAY,?,2,G,THROUGH,IRON,MEDIUM,F,SIMPLE-T 32 | E31,M,8,1887,RR,1161,2,G,THROUGH,STEEL,MEDIUM,S,SIMPLE-T 33 | E34,O,41,1888,RR,4558,2,G,THROUGH,STEEL,LONG,F,SIMPLE-T 34 | E33,M,19,1889,HIGHWAY,1120,?,G,THROUGH,IRON,MEDIUM,F,SIMPLE-T 35 | E36,O,45,1890,HIGHWAY,?,2,G,THROUGH,IRON,SHORT,F,SIMPLE-T 36 | E35,A,27,1890,HIGHWAY,1000,2,G,THROUGH,STEEL,MEDIUM,F,SIMPLE-T 37 | E38,M,17,1891,HIGHWAY,?,2,G,THROUGH,IRON,MEDIUM,F,SIMPLE-T 38 | 
E37,M,18,1891,RR,1350,2,G,THROUGH,STEEL,MEDIUM,S,SIMPLE-T 39 | E39,A,25,1892,HIGHWAY,?,2,G,THROUGH,STEEL,MEDIUM,F,SIMPLE-T 40 | E4,A,27,1892,AQUEDUCT,1092,1,N,THROUGH,WOOD,SHORT,S,WOOD 41 | E40,M,22,1893,HIGHWAY,?,2,G,THROUGH,STEEL,MEDIUM,F,SIMPLE-T 42 | E41,M,11,1894,HIGHWAY,?,2,G,THROUGH,IRON,MEDIUM,F,SIMPLE-T 43 | E42,M,9,1895,HIGHWAY,2367,2,G,THROUGH,STEEL,LONG,F,SIMPLE-T 44 | E44,O,48,1896,HIGHWAY,?,2,G,THROUGH,STEEL,LONG,F,SUSPEN 45 | E43,M,7,1896,HIGHWAY,1040,2,G,THROUGH,STEEL,LONG,F,ARCH 46 | E46,A,37,1897,RR,4000,2,G,DECK,STEEL,LONG,F,SIMPLE-T 47 | E45,M,14,1897,RR,2264,?,G,THROUGH,STEEL,?,F,SIMPLE-T 48 | E47,M,15,1898,RR,2000,2,G,THROUGH,STEEL,MEDIUM,S,SIMPLE-T 49 | E58,A,33,1900,HIGHWAY,1200,2,G,THROUGH,STEEL,MEDIUM,F,SIMPLE-T 50 | E48,A,38,1900,HIGHWAY,2000,2,G,THROUGH,STEEL,MEDIUM,F,SIMPLE-T 51 | E94,M,13,1901,RR,?,2,G,THROUGH,STEEL,LONG,F,SIMPLE-T 52 | E49,A,34,1902,HIGHWAY,1850,2,G,THROUGH,STEEL,MEDIUM,F,CANTILEV 53 | E95,M,16,1903,RR,1300,2,G,THROUGH,STEEL,MEDIUM,S,SIMPLE-T 54 | E87,A,35,1903,RR,3000,2,G,THROUGH,STEEL,MEDIUM,S,SIMPLE-T 55 | E51,M,6,1903,RR,1417,2,G,THROUGH,STEEL,MEDIUM,F,SIMPLE-T 56 | E50,M,21,1903,RR,1154,?,G,THROUGH,STEEL,LONG,F,SIMPLE-T 57 | E89,M,4,1904,RR,1200,2,G,THROUGH,STEEL,MEDIUM,S-F,SIMPLE-T 58 | E53,A,28,1904,RR,965,4,G,THROUGH,STEEL,MEDIUM,S-F,SIMPLE-T 59 | E52,M,2,1904,RR,1504,?,G,THROUGH,STEEL,LONG,F,CANTILEV 60 | E54,Y,?,1908,HIGHWAY,1240,?,G,?,STEEL,MEDIUM,F,SIMPLE-T 61 | E56,M,23,1909,HIGHWAY,?,?,G,THROUGH,STEEL,MEDIUM,F,SIMPLE-T 62 | E55,A,36,1909,HIGHWAY,1730,2,G,THROUGH,STEEL,LONG,F,SIMPLE-T 63 | E57,O,49,1910,RR,1620,2,G,THROUGH,STEEL,LONG,F,CANTILEV 64 | E59,O,43,1911,HIGHWAY,1652,2,G,THROUGH,STEEL,LONG,F,CANTILEV 65 | E107,A,39,1914,RR,?,?,G,?,STEEL,?,F,NIL 66 | E92,M,10,1914,RR,2210,?,G,THROUGH,STEEL,MEDIUM,F,SIMPLE-T 67 | E61,O,41,1915,RR,2822,2,G,THROUGH,STEEL,LONG,F,SIMPLE-T 68 | E60,A,24,1915,HIGHWAY,1000,4,G,THROUGH,STEEL,LONG,F,SIMPLE-T 69 | E62,A,37,1918,RR,2300,2,N,DECK,STEEL,LONG,F,CONT-T 70 | 
E63,A,31,1920,RR,2122,2,G,THROUGH,STEEL,MEDIUM,F,SIMPLE-T 71 | E65,A,30,1921,WALK,?,?,G,THROUGH,STEEL,?,F,SUSPEN 72 | E64,A,29,1923,HIGHWAY,885,4,G,THROUGH,STEEL,MEDIUM,F,ARCH 73 | E66,A,32,1924,HIGHWAY,2365,4,G,THROUGH,STEEL,MEDIUM,S,ARCH 74 | E70,A,27,1926,HIGHWAY,860,4,G,THROUGH,STEEL,MEDIUM,S-F,SUSPEN 75 | E69,A,26,1926,HIGHWAY,884,4,G,THROUGH,STEEL,MEDIUM,S-F,SUSPEN 76 | E101,O,46,1927,HIGHWAY,1770,2,G,THROUGH,STEEL,LONG,S-F,CANTILEV 77 | E73,A,38,1927,HIGHWAY,1508,?,G,THROUGH,STEEL,MEDIUM,S,ARCH 78 | E72,M,5,1927,HIGHWAY,2663,4,N,DECK,STEEL,MEDIUM,S-F,CANTILEV 79 | E67,M,1,1927,HIGHWAY,1330,4,G,THROUGH,STEEL,LONG,F,CANTILEV 80 | E75,A,30,1928,HIGHWAY,2678,4,G,DECK,STEEL,MEDIUM,F,ARCH 81 | E74,M,20,1928,HIGHWAY,2220,2,G,DECK,STEEL,MEDIUM,S-F,CANTILEV 82 | E71,A,25,1928,HIGHWAY,860,4,G,THROUGH,STEEL,MEDIUM,S-F,SUSPEN 83 | E68,M,17,1928,HIGHWAY,2250,2,G,THROUGH,STEEL,MEDIUM,S,SIMPLE-T 84 | E78,O,40,1931,HIGHWAY,1365,4,G,THROUGH,STEEL,LONG,F,ARCH 85 | E77,O,42,1931,HIGHWAY,1450,4,N,THROUGH,STEEL,LONG,F,ARCH 86 | E76,M,6,1931,HIGHWAY,1500,4,G,THROUGH,STEEL,LONG,F,SUSPEN 87 | E93,M,11,1937,HIGHWAY,1690,4,N,DECK,STEEL,LONG,S-F,CONT-T 88 | E79,A,34,1939,HIGHWAY,1800,4,G,DECK,STEEL,MEDIUM,F,CANTILEV 89 | E108,A,39.5,1945,HIGHWAY,1060,4,G,DECK,STEEL,MEDIUM,S-F,CONT-T 90 | E107N,A,39.7,1945,RR,840,2,G,THROUGH,STEEL,MEDIUM,S-F,SIMPLE-T 91 | E105,A,38.5,1945,HIGHWAY,1710,2,N,DECK,STEEL,MEDIUM,S-F,CONT-T 92 | E103,O,48,1945,HIGHWAY,2160,2,G,THROUGH,STEEL,LONG,F,CANTILEV 93 | E97,Y,52,1945,HIGHWAY,?,?,G,THROUGH,STEEL,MEDIUM,S,ARCH 94 | E96,Y,51,1945,RR,?,?,G,THROUGH,STEEL,MEDIUM,F,SIMPLE-T 95 | E99,M,23,1950,HIGHWAY,1320,2,G,THROUGH,STEEL,MEDIUM,S-F,SIMPLE-T 96 | E98,M,22,1951,HIGHWAY,900,4,G,THROUGH,STEEL,MEDIUM,F,CONT-T 97 | E81,M,14,1951,HIGHWAY,2423,4,G,DECK,STEEL,LONG,F,CONT-T 98 | E80,M,19,1951,HIGHWAY,1031,4,G,THROUGH,STEEL,LONG,F,CANTILEV 99 | E88,A,37,1955,HIGHWAY,2300,4,N,DECK,STEEL,LONG,F,CONT-T 100 | E82,O,42,1955,HIGHWAY,804,?,G,THROUGH,STEEL,?,F,SIMPLE-T 101 | 
E102,O,47,1959,HIGHWAY,1700,2,G,THROUGH,STEEL,LONG,F,CONT-T 102 | E83,M,1,1959,HIGHWAY,1000,6,G,THROUGH,STEEL,LONG,F,ARCH 103 | E86,A,33,1961,HIGHWAY,980,4,G,DECK,STEEL,MEDIUM,S-F,CONT-T 104 | E85,M,9,1962,HIGHWAY,2213,4,G,DECK,STEEL,LONG,F,CONT-T 105 | E84,A,24,1969,HIGHWAY,870,6,G,THROUGH,STEEL,MEDIUM,F,ARCH 106 | E91,O,44,1975,HIGHWAY,3756,6,G,THROUGH,STEEL,LONG,F,ARCH 107 | E90,M,7,1978,HIGHWAY,950,6,G,THROUGH,STEEL,LONG,F,ARCH 108 | E100,O,43,1982,HIGHWAY,?,?,G,?,?,?,F,? 109 | E109,A,28,1986,HIGHWAY,?,?,G,?,?,?,F,? 110 | -------------------------------------------------------------------------------- /tane.py: -------------------------------------------------------------------------------- 1 | """------------------------------------------------------------------------------------------ 2 | TANE Algorithm for discovery of exact functional dependencies 3 | Author: Nabiha Asghar, nasghar@uwaterloo.ca 4 | February 2015 5 | Use for research purposes only. 6 | Please do not re-distribute without written permission from the author 7 | Any commercial uses strictly forbidden. 8 | Code is provided without any guarantees. 
9 | ----------------------------------------------------------------------------------------------""" 10 | from pandas import * 11 | from collections import defaultdict 12 | import numpy as NP 13 | import sys 14 | 15 | def list_duplicates(seq): 16 | tally = defaultdict(list) 17 | for i,item in enumerate(seq): 18 | tally[item].append(i) 19 | return ((key,locs) for key,locs in tally.items() 20 | if len(locs)>0) 21 | 22 | def findCplus(x): # this computes the Cplus of x as an intersection of smaller Cplus sets 23 | global dictCplus 24 | thesets=[] 25 | for a in x: 26 | if x.replace(a,'') in dictCplus.keys(): 27 | temp = dictCplus[x.replace(a,'')] 28 | else: 29 | temp=findCplus(x.replace(a,'')) # compute C+(X\{A}) for each A at a time 30 | #dictCplus[x.replace(a,'')] = temp 31 | thesets.insert(0, set(temp)) 32 | if list(set.intersection(*thesets)) == []: 33 | cplus = [] 34 | else: 35 | cplus = list(set.intersection(*thesets)) # compute the intersection in line 2 of pseudocode 36 | return cplus 37 | 38 | def compute_dependencies(level, listofcols): 39 | global dictCplus 40 | global finallistofFDs 41 | global listofcolumns 42 | for x in level: 43 | thesets=[] 44 | for a in x: 45 | if x.replace(a,'') in dictCplus.keys(): 46 | temp = dictCplus[x.replace(a,'')] 47 | else: 48 | temp=computeCplus(x.replace(a,'')) # compute C+(X\{A}) for each A at a time 49 | dictCplus[x.replace(a,'')] = temp 50 | thesets.insert(0, set(temp)) 51 | if list(set.intersection(*thesets)) == []: 52 | dictCplus[x] = [] 53 | else: 54 | dictCplus[x] = list(set.intersection(*thesets)) # compute the intersection in line 2 of pseudocode 55 | for x in level: 56 | for a in x: 57 | if a in dictCplus[x]: 58 | #if x=='BCJ': print "dictCplus['BCJ'] = ", dictCplus[x] 59 | if validfd(x.replace(a,''), a): # line 5 60 | finallistofFDs.append([x.replace(a,''), a]) # line 6 61 | dictCplus[x].remove(a) # line 7 62 | 63 | listofcols=listofcolumns[:] 64 | for j in x: # this loop computes R\X 65 | if j in listofcols: 
listofcols.remove(j) 66 | 67 | for b in listofcols: # this loop removes each b in R\X from C+(X) 68 | if b in dictCplus[x]: dictCplus[x].remove(b) 69 | 70 | def computeCplus(x): # this computes the Cplus from the first definition in section 3.2.2 of TANE paper. output should be a list of single attributes 71 | global listofcolumns 72 | listofcols = listofcolumns[:] 73 | if x=='': return listofcols # because C+{phi} = R 74 | cplus = [] 75 | for a in listofcols: 76 | for b in x: 77 | temp = x.replace(a,'') 78 | temp = temp.replace(b,'') 79 | if not validfd(temp, b): 80 | cplus.append(a) 81 | return cplus 82 | 83 | def validfd(y,z): 84 | if y=='' or z=='': return False 85 | ey = computeE(y) 86 | eyz = computeE(y+z) 87 | if ey == eyz : 88 | return True 89 | else: 90 | return False 91 | 92 | def computeE(x): 93 | global totaltuples 94 | global dictpartitions 95 | doublenorm = 0 96 | for i in dictpartitions[''.join(sorted(x))]: 97 | doublenorm = doublenorm + len(i) 98 | e = (doublenorm-len(dictpartitions[''.join(sorted(x))]))/float(totaltuples) 99 | return e 100 | 101 | def check_superkey(x): 102 | global dictpartitions 103 | if ((dictpartitions[x] == [[]]) or (dictpartitions[x] == [])): 104 | return True 105 | else: 106 | return False 107 | 108 | def prune(level): 109 | global dictCplus 110 | global finallistofFDs 111 | stufftobedeletedfromlevel = [] 112 | for x in level: # line 1 113 | if dictCplus[x]==[]: # line 2 114 | level.remove(x) # line 3 115 | if check_superkey(x): # line 4 ### should this check for a key, instead of super key??? Not sure. 
116 | temp = dictCplus[x][:] 117 | for i in x: # this loop computes C+(X) \ X 118 | if i in temp: temp.remove(i) 119 | for a in temp: # line 5 120 | thesets=[] 121 | for b in x: 122 | if not( ''.join(sorted((x+a).replace(b,''))) in dictCplus.keys()): 123 | dictCplus[''.join(sorted((x+a).replace(b,'')))] = findCplus(''.join(sorted((x+a).replace(b,'')))) 124 | thesets.insert(0,set(dictCplus[''.join(sorted((x+a).replace(b,'')))])) 125 | if a in list(set.intersection(*thesets)): # line 6 126 | finallistofFDs.append([x, a]) # line 7 127 | #print "adding key FD: ", [x,a] 128 | if x in level: stufftobedeletedfromlevel.append(x) # line 8 129 | for item in stufftobedeletedfromlevel: 130 | level.remove(item) 131 | 132 | def generate_next_level(level): 133 | nextlevel=[] 134 | for i in range(0,len(level)): # pick an element 135 | for j in range(i+1, len(level)): # compare it to every element that comes after it. 136 | if ((not level[i]==level[j]) and level[i][0:-1]==level[j][0:-1]): # i.e. line 2 and 3 137 | x = level[i]+level[j][-1] #line 4 138 | flag = True 139 | for a in x: # this entire for loop is for the 'for all' check in line 5 140 | if not(x.replace(a, '') in level): 141 | flag=False 142 | if flag==True: 143 | nextlevel.append(x) 144 | stripped_product(x, level[i] , level[j] ) # compute partition of x as pi_y * pi_z (where y is level[i] and z is level[j]) 145 | return nextlevel 146 | 147 | 148 | def stripped_product(x,y,z): 149 | global dictpartitions 150 | global tableT 151 | tableS = ['']*len(tableT) 152 | partitionY = dictpartitions[''.join(sorted(y))] # partitionY is a list of lists, each list is an equivalence class 153 | partitionZ = dictpartitions[''.join(sorted(z))] 154 | partitionofx = [] # line 1 155 | for i in range(len(partitionY)): # line 2 156 | for t in partitionY[i]: # line 3 157 | tableT[t] = i 158 | tableS[i]='' #line 4 159 | for i in range(len(partitionZ)): # line 5 160 | for t in partitionZ[i]: # line 6 161 | if ( not (tableT[t] == 'NULL')): # 
line 7 162 | tableS[tableT[t]] = sorted(list(set(tableS[tableT[t]]) | set([t]))) 163 | for t in partitionZ[i]: # line 8 164 | if (not (tableT[t] == 'NULL')) and len(tableS[tableT[t]])>= 2 : # line 9 165 | partitionofx.append(tableS[tableT[t]]) 166 | if not (tableT[t] == 'NULL'): tableS[tableT[t]]='' # line 10 167 | for i in range(len(partitionY)): # line 11 168 | for t in partitionY[i]: # line 12 169 | tableT[t]='NULL' 170 | dictpartitions[''.join(sorted(x))] = partitionofx 171 | 172 | def computeSingletonPartitions(listofcols): 173 | global data2D 174 | global dictpartitions 175 | for a in listofcols: 176 | dictpartitions[a]=[] 177 | for element in list_duplicates(data2D[a].tolist()): # list_duplicates returns 2-tuples, where 1st is a value, and 2nd is a list of indices where that value occurs 178 | if len(element[1])>1: # ignore singleton equivalence classes 179 | dictpartitions[a].append(element[1]) 180 | 181 | #------------------------------------------------------- START --------------------------------------------------- 182 | 183 | if len(sys.argv) > 1: 184 | infile=str(sys.argv[1]) # this would be e.g. "testdata.csv" 185 | 186 | data2D = read_csv(infile) 187 | 188 | totaltuples = len(data2D.index) 189 | listofcolumns = list(data2D.columns.values) # returns ['A', 'B', 'C', 'D', .....] 
190 | 191 | tableT = ['NULL']*totaltuples # this is for the table T used in the function stripped_product 192 | 193 | L0 = [] 194 | dictCplus = {'NULL': listofcolumns} 195 | dictpartitions = {} # maps 'stringslikethis' to a list of lists, each of which contains indices 196 | computeSingletonPartitions(listofcolumns) 197 | finallistofFDs=[] 198 | #print dictCplus['NULL'] 199 | L1=listofcolumns[:] # L1 is a copy of listofcolumns 200 | l=1 201 | 202 | L = [L0,L1] 203 | 204 | while (not (L[l] == [])): 205 | compute_dependencies(L[l],listofcolumns[:]) 206 | prune(L[l]) 207 | temp = generate_next_level(L[l]) 208 | L.append(temp) 209 | l=l+1 210 | 211 | print "List of all FDs: " , finallistofFDs 212 | print "Total number of FDs found: ", len(finallistofFDs) 213 | -------------------------------------------------------------------------------- /ctane.py: -------------------------------------------------------------------------------- 1 | """------------------------------------------------------------------------------------------ 2 | TANE Algorithm for discovery of exact conditional functional dependencies 3 | Author: Nabiha Asghar, nasghar@uwaterloo.ca 4 | March 2015 5 | Use for research purposes only. 6 | Please do not re-distribute without written permission from the author 7 | Any commerical uses strictly forbidden. 8 | Code is provided without any guarantees. 
9 | ----------------------------------------------------------------------------------------------""" 10 | from pandas import * 11 | from collections import defaultdict 12 | import numpy as NP 13 | import itertools 14 | import sys 15 | 16 | def replace_element_in_tuple(tup, elementindex, elementval): 17 | if type(elementval)==tuple: 18 | elementval = elementval[0] 19 | newtup = list(tup) 20 | newtup[elementindex] = elementval 21 | newtup = tuple(newtup) 22 | return newtup 23 | 24 | def add_element_in_tuple(spxminusa, ca): 25 | thelist = list(spxminusa) 26 | thelist.append(ca[0]) 27 | return tuple(thelist) 28 | 29 | def validcfd(xminusa, x, a, spxminusa, sp, ca): 30 | global dictpartitions 31 | if xminusa == '' or a == '': 32 | return False 33 | indexofa = x.index(a) 34 | newsp0 = add_element_in_tuple(spxminusa, ca) 35 | newsp1 = replace_element_in_tuple(sp, indexofa, ca) #this is sp, except that in place of value of a we put ca 36 | if (x, newsp1) in dictpartitions.keys(): 37 | if len(dictpartitions[(xminusa, spxminusa)]) == len(dictpartitions[(x, newsp1)]):# and twodlen(dictpartitions[(xminusa, spxminusa)]) == twodlen(dictpartitions[(x, newsp1)]): 38 | return True 39 | return False 40 | 41 | def twodlen(listoflists): 42 | summ = 0 43 | for item in listoflists: 44 | summ = summ + len(item) 45 | return summ 46 | 47 | def greaterthanorequalto(upxminusa, spxminusa): # this is actually greaterthan or equal to 48 | if upxminusa == spxminusa: 49 | return True 50 | flag = True 51 | for index in range(0, len(upxminusa)): 52 | if not (spxminusa[index]=='--'): 53 | if (not (upxminusa[index] == spxminusa[index])): 54 | flag = False 55 | return flag 56 | 57 | def doublegreaterthan(upxminusa, spxminusa): 58 | if upxminusa == spxminusa: 59 | return False 60 | flag = True 61 | for index in range(0, len(upxminusa)): 62 | if (not spxminusa[index]=='--'): 63 | if (not (upxminusa[index] == spxminusa[index])): 64 | flag = False 65 | return flag 66 | 67 | def 
compute_dependencies(level, listofcols): 68 | global dictCplus 69 | global finallistofCFDs 70 | global listofcolumns 71 | for (x,sp) in level: 72 | for a in x: 73 | for (att, ca) in dictCplus[(x, sp)]: 74 | if att == a: 75 | newtup = spXminusA(sp, x, a) ### tuple(y for y in sp if not sp.index(y)==x.index(a)) # this is sp[X\A] 76 | if validcfd( x.replace(a,''), x, a, newtup, sp, ca) and not ([x.replace(a,''), a, [newtup, ca]] in finallistofCFDs): 77 | finallistofCFDs.append([x.replace(a,''), a, [newtup, ca]]) 78 | for (xx, up) in level: 79 | if xx==x: 80 | newtup0 = spXminusA(up, x, a) ### tuple(y for y in up if not up.index(y)==x.index(a)) # this is up[X\A] 81 | if up[x.index(a)]==ca[0] and greaterthanorequalto(newtup0, newtup) : 82 | if (a, ca) in dictCplus[(x,up)]: dictCplus[(x,up)].remove((a,ca)) 83 | listofcolscopy = listofcols[:] 84 | for j in x: # this loop computes R\X 85 | if j in listofcolscopy: listofcolscopy.remove(j) 86 | for b_att in listofcolscopy: # this loop removes each b in R\X from C+(X,up) 87 | stufftobedeleted = [] 88 | for (bbval, sometup) in dictCplus[(x,up)]: 89 | if b_att == bbval: 90 | stufftobedeleted.append((bbval,sometup)) 91 | for item in stufftobedeleted: 92 | dictCplus[(x,up)].remove(item) 93 | 94 | def prune(level): 95 | global dictCplus 96 | stufftobedeleted=[] 97 | for (x,sp) in level: 98 | if len(dictCplus[(x,sp)])==0: 99 | stufftobedeleted.append((x,sp)) 100 | for item in stufftobedeleted: 101 | level.remove(item) 102 | 103 | def computeCplus(level): # for each tuple (x,sp) in the list level, it computes C+(x,sp), which is a list of (attribute, value) tuples) 104 | global listofcolumns 105 | global dictCplus 106 | listofcols = listofcolumns[:] 107 | for (x,sp) in level: #sp is a tuple of strings like this: ('aa', 'bb', 'cc') or ('aa', ) 108 | thesets=[] 109 | for b in x: 110 | indx = x.index(b) # the index where b is located in x 111 | spcopy = spXminusA(sp, x, b) ### tuple(y for y in sp if not sp.index(y)==indx) 112 | spcopy2 = 
sp[:] 113 | if (x.replace(b,''), spcopy ) in dictCplus.keys(): 114 | temp = dictCplus[(x.replace(b,''), spcopy)] 115 | else: temp = [] # is this correct???? should I put [] here? 116 | thesets.insert(0, set(temp)) 117 | if list(set.intersection(*thesets)) == []: 118 | dictCplus[(x,sp)] = [] 119 | else: 120 | dictCplus[(x,sp)] = list(set.intersection(*thesets)) 121 | 122 | def initial_Cplus(level): 123 | global listofcolumns 124 | global dictCplus 125 | computeCplus(level) 126 | for (a,ca) in level: 127 | stufftobedeleted = [] 128 | for (att, val) in dictCplus[(a,ca)]: 129 | if att==a and not val==ca: 130 | stufftobedeleted.append((att,val)) 131 | for item in stufftobedeleted: 132 | dictCplus[(a,ca)].remove(item) 133 | 134 | def populateL1(listofcols): 135 | global k_suppthreshold 136 | l1 = [] 137 | attributepartitions = computeAttributePartitions(listofcols) 138 | for a in listofcols: 139 | l1.append((a, ('--',))) 140 | for eqclass in attributepartitions[a]: 141 | if len(eqclass)>= k_suppthreshold: 142 | l1.append( (a, (str(data2D.iloc[eqclass[0]][a]) , ) ) ) 143 | computeInitialPartitions(l1, attributepartitions) # populates the dictpartitions with the initial partitions (X,sp) where X is a single attribute 144 | return l1 145 | 146 | def computeInitialPartitions(level1, attributepartitions): 147 | global data2D 148 | global dictpartitions # dictpartitions[(x,sp)] is of the form [[0,1,2]]. So simply a list of lists of indices 149 | for (a,sp) in level1: 150 | dictpartitions[(a,sp)]=[] 151 | dictpartitions[(a,sp)] = attributepartitions[a] 152 | 153 | def old_computeInitialPartitions(level1, attributepartitions): 154 | global data2D 155 | global dictpartitions # dictpartitions[(x,sp)] is of the form [[0,1,2]]. 
So simply a list of lists of indices 156 | for (a,sp) in level1: 157 | dictpartitions[(a,sp)]=[] 158 | if sp[0]=='--': 159 | dictpartitions[(a,sp)] = attributepartitions[a] 160 | else: 161 | for eqclass in attributepartitions[a]: 162 | if str(data2D.iloc[eqclass[0]][a])==sp[0]: 163 | dictpartitions[(a,sp)].append(eqclass) 164 | 165 | def computeAttributePartitions(listofcols): # compute partitions for every attribute 166 | global data2D 167 | attributepartitions = {} 168 | for a in listofcols: 169 | attributepartitions[a]=[] 170 | for element in list_duplicates(data2D[a].tolist()): # list_duplicates returns 2-tuples, where 1st is a value, and 2nd is a list of indices where that value occurs 171 | if len(element[1])>0: # if >1, then ignore singleton equivalence classes 172 | attributepartitions[a].append(element[1]) 173 | return attributepartitions 174 | 175 | def list_duplicates(seq): 176 | tally = defaultdict(list) 177 | for i,item in enumerate(seq): 178 | tally[item].append(i) 179 | return ((key,locs) for key,locs in tally.items() 180 | if len(locs)>0) 181 | 182 | def sometuplematchesZUP(z,up): 183 | global dictpartitions 184 | global k_suppthreshold 185 | sumofmatches = 0 186 | for eqclass in dictpartitions[(z, up)]: 187 | sumofmatches = sumofmatches + len(eqclass) 188 | if sumofmatches >= k_suppthreshold: 189 | return True 190 | else: 191 | return False 192 | 193 | def generate_next_level(level): 194 | nextlevel=[] 195 | for i in range(0,len(level)): # pick an element 196 | for j in range(i+1, len(level)): # compare it to every element that comes after it. 
197 | if ((not level[i][0]==level[j][0]) and level[i][0][0:-1]==level[j][0][0:-1] and level[i][1][0:-1]==level[j][1][0:-1]): 198 | z = level[i][0] + level[j][0][-1] 199 | up = tuple(list(level[i][1]) + [level[j][1][-1]]) 200 | (z, up) = sortspbasedonx(z, up) 201 | partition_product((z,up), level[i], level[j]) 202 | if sometuplematchesZUP(z,up): 203 | flag = True 204 | for att in z: 205 | indexofatt = z.index(att) # where is att located in z 206 | up_zminusa = spXminusA(up, z, att) 207 | zminusa = z.replace(att,'') 208 | if not ((zminusa, up_zminusa) in level): 209 | flag = False 210 | if flag: 211 | nextlevel.append((z, up)) 212 | return nextlevel 213 | 214 | def spXminusA(sp, x, a): 215 | indexofa = x.index(a) 216 | mylist=[] 217 | for i in range(0, len(sp)): 218 | if not i==indexofa: 219 | mylist.append(sp[i]) 220 | return tuple(mylist) 221 | 222 | def partition_product(zup, xsp, ytp): 223 | global dictpartitions 224 | global tableT 225 | tableS = ['']*len(tableT) 226 | partitionXSP = dictpartitions[xsp] 227 | partitionYTP = dictpartitions[ytp] 228 | partitionZUP = [] 229 | for i in range(len(partitionXSP)): 230 | for t in partitionXSP[i]: 231 | tableT[t] = i 232 | tableS[i]='' 233 | for i in range(len(partitionYTP)): 234 | for t in partitionYTP[i]: 235 | if ( not (tableT[t] == 'NULL')): 236 | tableS[tableT[t]] = sorted(list(set(tableS[tableT[t]]) | set([t]))) 237 | for t in partitionYTP[i]: 238 | if (not (tableT[t] == 'NULL')) and len(tableS[tableT[t]])>= 1 : 239 | partitionZUP.append(tableS[tableT[t]]) 240 | if not (tableT[t] == 'NULL'): tableS[tableT[t]]='' 241 | for i in range(len(partitionXSP)): 242 | for t in partitionXSP[i]: 243 | tableT[t]='NULL' 244 | dictpartitions[zup] = partitionZUP 245 | dictpartitions[zup] = partitionZUP 246 | 247 | def sortspbasedonx(x,sp): 248 | x = list(x) 249 | points = zip(x,sp) 250 | sorted_points = sorted(points) 251 | new_x = [point[0] for point in sorted_points] 252 | new_sp = [point[1] for point in sorted_points] 253 | 
return (''.join(new_x), tuple(new_sp)) 254 | 255 | #------------------------------------------------------- START --------------------------------------------------- 256 | if len(sys.argv) > 1: 257 | infile=str(sys.argv[1]) 258 | if len(sys.argv) > 2: 259 | k=int(sys.argv[2]) 260 | 261 | data2D = read_csv(infile) 262 | 263 | totaltuples = len(data2D.index) 264 | listofcolumns = list(data2D.columns.values) # returns ['A', 'B', 'C', 'D', .....] 265 | tableT = ['NULL']*totaltuples # this is for the table T used in the function partition_product 266 | k_suppthreshold = k 267 | L0 = [] 268 | 269 | dictpartitions = {} # maps 'stringslikethis' to a list of lists, each of which contains indices 270 | finallistofCFDs=[] 271 | L1=populateL1(listofcolumns[:]) # L1 is a list of tuples of the form [ ('A', ('val1') ), ('A', ('val2') ), ..., ('B', ('val3') ), ......] 272 | dictCplus = {('',()): L1[:]} 273 | l=1 274 | L = [L0,L1] 275 | 276 | while (not (L[l] == [])): 277 | if l==1: 278 | initial_Cplus(L[l]) 279 | else: 280 | computeCplus(L[l]) 281 | compute_dependencies(L[l],listofcolumns[:]) 282 | prune(L[l]) 283 | temp = generate_next_level(L[l]) 284 | L.append(temp) 285 | l=l+1 286 | #print "List of all CFDs: " , finallistofCFDs 287 | #print "CFDs found: ", len(finallistofCFDs), ", level = ", l-1 288 | 289 | print "List of all CFDs: " , finallistofCFDs 290 | print "Total number of CFDs found: ", len(finallistofCFDs) 291 | --------------------------------------------------------------------------------