├── testdataABCD.csv
├── testdata0.csv
├── README.md
├── testdata1.csv
├── bridges.csv
├── tane.py
└── ctane.py


/testdataABCD.csv:
--------------------------------------------------------------------------------
1 | A,B,C,D
2 | 1,1,5,5
3 | 1,1,1,3
4 | 5,1,2,3
5 | 


--------------------------------------------------------------------------------
/testdata0.csv:
--------------------------------------------------------------------------------
1 | A,B,C,D,E,F,G,H,I,J,K
2 | 128059,1,1,1,1,2,5,5,1,1,2
3 | 1285531,1,1,1,1,2,1,3,1,1,2
4 | 1287775,5,1,1,2,2,2,3,1,1,2
5 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # FD_CFD_extraction
 2 | 
 3 | This repository contains the implementation of two algorithms, TANE and CTANE, corresponding to the following publications:
 4 | 
 5 | 1. "TANE: An Efficient Algorithm for Discovering Functional and Approximate Dependencies" (link: https://www.lri.fr/~pierres/donn%E9es/save/these/articles/lpr-queue/huhtala99tane.pdf)
 6 | 
 7 | 2. "Discovering Conditional Functional Dependencies" (link: http://homepages.inf.ed.ac.uk/fgeerts/pdf/CFDdiscovery.pdf)
 8 | 
 9 | We have also provided several CSV files as test data.
10 | 
11 | This code was used in the following work:
12 | "Automatic Discovery of Functional Dependencies and Conditional Functional
13 | Dependencies: A Comparative Study" (link: https://cs.uwaterloo.ca/~nasghar/848.pdf)
14 | 
 15 | ## Running the code
16 | 
17 | To run tane.py on a particular csv file (e.g. adult.csv), execute the following command in your terminal:
18 | ```
19 | python tane.py adult.csv
20 | ```
21 | To run ctane.py on the same data, execute:
22 | ```
23 | python ctane.py adult.csv
24 | ```
25 | To run ctane.py and obtain k-frequent CFDs, execute:
26 | ```
27 | python ctane.py adult.csv k
28 | ```
29 | where k is your integer of choice.
30 | 
31 | 


--------------------------------------------------------------------------------
/testdata1.csv:
--------------------------------------------------------------------------------
  1 | A,B,C,D,E,F,G,H,I,J,K
  2 | 128059,1,1,1,1,2,5,5,1,1,2
  3 | 1285531,1,1,1,1,2,1,3,1,1,2
  4 | 1287775,5,1,1,2,2,2,3,1,1,2
  5 | 144888,8,10,10,8,5,10,7,8,1,4
  6 | 145447,8,4,4,1,2,9,3,3,1,4
  7 | 167528,4,1,1,1,2,1,3,6,1,2
  8 | 169356,3,1,1,1,2,?,3,1,1,2
  9 | 183913,1,2,2,1,2,1,1,1,1,2
 10 | 191250,10,4,4,10,2,10,5,3,3,4
 11 | 1017023,6,3,3,5,3,10,3,5,3,2
 12 | 1100524,6,10,10,2,8,10,7,3,3,4
 13 | 1116116,9,10,10,1,10,8,3,3,1,4
 14 | 1168736,5,6,6,2,4,10,3,6,1,4
 15 | 1182404,3,1,1,1,2,1,1,1,1,2
 16 | 1182404,3,1,1,1,2,1,2,1,1,2
 17 | 1198641,3,1,1,1,2,1,3,1,1,2
 18 | 242970,5,7,7,1,5,8,3,4,1,2
 19 | 1151734,10,8,7,4,3,10,7,9,1,4
 20 | 1156017,3,1,1,1,2,1,2,1,1,2
 21 | 1158247,1,1,1,1,1,1,1,1,1,2
 22 | 677910,5,2,2,4,2,4,1,1,1,2
 23 | 734111,1,1,1,3,2,3,1,1,1,2
 24 | 734111,1,1,1,1,2,2,1,1,1,2
 25 | 780555,5,1,1,6,3,1,2,1,1,2
 26 | 827627,2,1,1,1,2,1,1,1,1,2
 27 | 1049837,1,1,1,1,2,1,1,1,1,2
 28 | 1058849,5,1,1,1,2,1,1,1,1,2
 29 | 1182404,1,1,1,1,1,1,1,1,1,2
 30 | 1193544,5,7,9,8,6,10,8,10,1,4
 31 | 1201870,4,1,1,3,1,1,2,1,1,2
 32 | 1202253,5,1,1,1,2,1,1,1,1,2
 33 | 1227081,3,1,1,3,2,1,1,1,1,2
 34 | 1230994,4,5,5,8,6,10,10,7,1,4
 35 | 1238410,2,3,1,1,3,1,1,1,1,2
 36 | 1246562,10,2,2,1,2,6,1,1,2,4
 37 | 1257470,10,6,5,8,5,10,8,6,1,4
 38 | 1259008,8,8,9,6,6,3,10,10,1,4
 39 | 1266124,5,1,2,1,2,1,1,1,1,2
 40 | 1267898,5,1,3,1,2,1,1,1,1,2
 41 | 1268313,5,1,1,3,2,1,1,1,1,2
 42 | 1298416,10,6,6,2,4,10,9,7,1,4
 43 | 1299596,6,6,6,5,4,10,7,6,2,4
 44 | 1105524,4,1,1,1,2,1,1,1,1,2
 45 | 1253917,4,1,1,2,2,1,2,1,1,2
 46 | 1265899,4,1,1,1,2,1,3,1,1,2
 47 | 1268766,1,1,1,1,2,1,1,1,1,2
 48 | 1277268,3,3,1,1,2,1,1,1,1,2
 49 | 1286943,8,10,10,10,7,5,4,8,7,4
 50 | 1295508,1,1,1,1,2,4,1,1,1,2
 51 | 1297327,5,1,1,1,2,1,1,1,1,2
 52 | 1297522,2,1,1,1,2,1,1,1,1,2
 53 | 1298360,1,1,1,1,2,1,1,1,1,2
 54 | 1299924,5,1,1,1,2,1,2,1,1,2
 55 | 1299994,5,1,1,1,2,1,1,1,1,2
 56 | 1304595,3,1,1,1,1,1,2,1,1,2
 57 | 749653,3,1,1,1,2,1,2,1,1,2
 58 | 769612,3,1,1,2,2,1,1,1,1,2
 59 | 769612,4,1,1,1,2,1,1,1,1,2
 60 | 798429,4,1,1,1,2,1,3,1,1,2
 61 | 807657,6,1,3,2,2,1,1,1,1,2
 62 | 8233704,4,1,1,1,1,1,2,1,1,2
 63 | 837480,7,4,4,3,4,10,6,9,1,4
 64 | 867392,4,2,2,1,2,1,2,1,1,2
 65 | 1240603,2,1,1,1,1,1,1,1,1,2
 66 | 1240603,3,1,1,1,1,1,1,1,1,2
 67 | 1241035,7,8,3,7,4,5,7,8,2,4
 68 | 1287971,3,1,1,1,2,1,2,1,1,2
 69 | 1289391,1,1,1,1,2,1,3,1,1,2
 70 | 1299924,3,2,2,2,2,1,4,2,1,2
 71 | 1306339,4,4,2,1,2,5,2,1,2,2
 72 | 1321942,5,1,1,1,2,1,3,1,1,2
 73 | 1328331,1,1,1,1,2,1,3,1,1,2
 74 | 1328755,3,1,1,1,2,1,2,1,1,2
 75 | 1331405,4,1,1,1,2,1,3,2,1,2
 76 | 1331412,5,7,10,10,5,10,10,10,1,4
 77 | 1333104,3,1,2,1,2,1,3,1,1,2
 78 | 1334071,4,1,1,1,2,3,2,1,1,2
 79 | 1343068,8,4,4,1,6,10,2,5,2,4
 80 | 1343374,10,10,8,10,6,5,10,3,1,4
 81 | 1344121,8,10,4,4,8,10,8,2,1,4
 82 | 142932,7,6,10,5,3,10,9,10,2,4
 83 | 183936,3,1,1,1,2,1,2,1,1,2
 84 | 324382,1,1,1,1,2,1,2,1,1,2
 85 | 378275,10,9,7,3,4,2,7,7,1,4
 86 | 878358,5,7,10,6,5,10,7,5,1,4
 87 | 1107684,6,10,5,5,4,10,6,10,1,4
 88 | 1115762,3,1,1,1,2,1,1,1,1,2
 89 | 1217717,5,1,1,6,3,1,1,1,1,2
 90 | 1239420,1,1,1,1,2,1,1,1,1,2
 91 | 1254538,8,10,10,10,6,10,10,10,1,4
 92 | 1261751,5,1,1,1,2,1,2,2,1,2
 93 | 1268275,9,8,8,9,6,3,4,1,1,4
 94 | 1324572,5,1,1,1,2,1,2,2,1,2
 95 | 1324681,4,1,1,1,2,1,2,1,1,2
 96 | 1325159,3,1,1,1,2,1,3,1,1,2
 97 | 1326892,3,1,1,1,2,1,2,1,1,2
 98 | 1339781,4,1,1,1,2,1,3,1,1,2
 99 | 13454352,1,1,3,1,2,1,2,1,1,2
100 | 1345452,1,1,3,1,2,1,2,1,1,2
101 | 1345593,3,1,1,3,2,1,2,1,1,2
102 | 


--------------------------------------------------------------------------------
/bridges.csv:
--------------------------------------------------------------------------------
  1 | A,B,C,D,E,F,G,H,I,J,K,L,M
  2 | E1,M,3,1818,HIGHWAY,?,2,N,THROUGH,WOOD,SHORT,S,WOOD
  3 | E2,A,25,1819,HIGHWAY,1037,2,N,THROUGH,WOOD,SHORT,S,WOOD
  4 | E3,A,39,1829,AQUEDUCT,?,1,N,THROUGH,WOOD,?,S,WOOD
  5 | E5,A,29,1837,HIGHWAY,1000,2,N,THROUGH,WOOD,SHORT,S,WOOD
  6 | E6,M,23,1838,HIGHWAY,?,2,N,THROUGH,WOOD,?,S,WOOD
  7 | E7,A,27,1840,HIGHWAY,990,2,N,THROUGH,WOOD,MEDIUM,S,WOOD
  8 | E8,A,28,1844,AQUEDUCT,1000,1,N,THROUGH,IRON,SHORT,S,SUSPEN
  9 | E9,M,3,1846,HIGHWAY,1500,2,N,THROUGH,IRON,SHORT,S,SUSPEN
 10 | E10,A,39,1848,AQUEDUCT,?,1,N,DECK,WOOD,?,S,WOOD
 11 | E11,A,29,1851,HIGHWAY,1000,2,N,THROUGH,WOOD,MEDIUM,S,WOOD
 12 | E12,A,39,1853,RR,?,2,N,DECK,WOOD,?,S,WOOD
 13 | E14,M,6,1856,HIGHWAY,1200,2,N,THROUGH,WOOD,MEDIUM,S,WOOD
 14 | E13,A,33,1856,HIGHWAY,?,2,N,THROUGH,WOOD,?,S,WOOD
 15 | E15,A,28,1857,RR,?,2,N,THROUGH,WOOD,?,S,WOOD
 16 | E16,A,25,1859,HIGHWAY,1030,2,N,THROUGH,IRON,MEDIUM,S-F,SUSPEN
 17 | E17,M,4,1863,RR,1000,2,N,THROUGH,IRON,MEDIUM,?,SIMPLE-T
 18 | E18,A,28,1864,RR,1200,2,N,THROUGH,IRON,SHORT,S,SIMPLE-T
 19 | E19,A,29,1866,HIGHWAY,1000,2,N,THROUGH,WOOD,MEDIUM,S,WOOD
 20 | E20,A,32,1870,HIGHWAY,1000,2,N,THROUGH,WOOD,MEDIUM,S,WOOD
 21 | E21,M,16,1874,RR,?,2,?,THROUGH,IRON,?,?,SIMPLE-T
 22 | E23,M,1,1876,HIGHWAY,1245,?,?,THROUGH,STEEL,LONG,F,SUSPEN
 23 | E22,A,24,1876,HIGHWAY,1200,4,G,THROUGH,WOOD,SHORT,S,WOOD
 24 | E24,O,45,1878,RR,?,2,G,?,STEEL,?,?,SIMPLE-T
 25 | E25,M,10,1882,RR,?,2,G,?,STEEL,?,?,SIMPLE-T
 26 | E27,A,39,1883,RR,?,2,G,THROUGH,STEEL,?,F,SIMPLE-T
 27 | E26,M,12,1883,RR,1150,2,G,THROUGH,STEEL,MEDIUM,S,SIMPLE-T
 28 | E30,A,31,1884,RR,?,2,G,THROUGH,STEEL,MEDIUM,F,SIMPLE-T
 29 | E29,A,26,1884,HIGHWAY,1080,2,G,THROUGH,STEEL,MEDIUM,?,SUSPEN
 30 | E28,M,3,1884,HIGHWAY,1000,2,G,THROUGH,STEEL,MEDIUM,S,ARCH
 31 | E32,A,30,1887,HIGHWAY,?,2,G,THROUGH,IRON,MEDIUM,F,SIMPLE-T
 32 | E31,M,8,1887,RR,1161,2,G,THROUGH,STEEL,MEDIUM,S,SIMPLE-T
 33 | E34,O,41,1888,RR,4558,2,G,THROUGH,STEEL,LONG,F,SIMPLE-T
 34 | E33,M,19,1889,HIGHWAY,1120,?,G,THROUGH,IRON,MEDIUM,F,SIMPLE-T
 35 | E36,O,45,1890,HIGHWAY,?,2,G,THROUGH,IRON,SHORT,F,SIMPLE-T
 36 | E35,A,27,1890,HIGHWAY,1000,2,G,THROUGH,STEEL,MEDIUM,F,SIMPLE-T
 37 | E38,M,17,1891,HIGHWAY,?,2,G,THROUGH,IRON,MEDIUM,F,SIMPLE-T
 38 | E37,M,18,1891,RR,1350,2,G,THROUGH,STEEL,MEDIUM,S,SIMPLE-T
 39 | E39,A,25,1892,HIGHWAY,?,2,G,THROUGH,STEEL,MEDIUM,F,SIMPLE-T
 40 | E4,A,27,1892,AQUEDUCT,1092,1,N,THROUGH,WOOD,SHORT,S,WOOD
 41 | E40,M,22,1893,HIGHWAY,?,2,G,THROUGH,STEEL,MEDIUM,F,SIMPLE-T
 42 | E41,M,11,1894,HIGHWAY,?,2,G,THROUGH,IRON,MEDIUM,F,SIMPLE-T
 43 | E42,M,9,1895,HIGHWAY,2367,2,G,THROUGH,STEEL,LONG,F,SIMPLE-T
 44 | E44,O,48,1896,HIGHWAY,?,2,G,THROUGH,STEEL,LONG,F,SUSPEN
 45 | E43,M,7,1896,HIGHWAY,1040,2,G,THROUGH,STEEL,LONG,F,ARCH
 46 | E46,A,37,1897,RR,4000,2,G,DECK,STEEL,LONG,F,SIMPLE-T
 47 | E45,M,14,1897,RR,2264,?,G,THROUGH,STEEL,?,F,SIMPLE-T
 48 | E47,M,15,1898,RR,2000,2,G,THROUGH,STEEL,MEDIUM,S,SIMPLE-T
 49 | E58,A,33,1900,HIGHWAY,1200,2,G,THROUGH,STEEL,MEDIUM,F,SIMPLE-T
 50 | E48,A,38,1900,HIGHWAY,2000,2,G,THROUGH,STEEL,MEDIUM,F,SIMPLE-T
 51 | E94,M,13,1901,RR,?,2,G,THROUGH,STEEL,LONG,F,SIMPLE-T
 52 | E49,A,34,1902,HIGHWAY,1850,2,G,THROUGH,STEEL,MEDIUM,F,CANTILEV
 53 | E95,M,16,1903,RR,1300,2,G,THROUGH,STEEL,MEDIUM,S,SIMPLE-T
 54 | E87,A,35,1903,RR,3000,2,G,THROUGH,STEEL,MEDIUM,S,SIMPLE-T
 55 | E51,M,6,1903,RR,1417,2,G,THROUGH,STEEL,MEDIUM,F,SIMPLE-T
 56 | E50,M,21,1903,RR,1154,?,G,THROUGH,STEEL,LONG,F,SIMPLE-T
 57 | E89,M,4,1904,RR,1200,2,G,THROUGH,STEEL,MEDIUM,S-F,SIMPLE-T
 58 | E53,A,28,1904,RR,965,4,G,THROUGH,STEEL,MEDIUM,S-F,SIMPLE-T
 59 | E52,M,2,1904,RR,1504,?,G,THROUGH,STEEL,LONG,F,CANTILEV
 60 | E54,Y,?,1908,HIGHWAY,1240,?,G,?,STEEL,MEDIUM,F,SIMPLE-T
 61 | E56,M,23,1909,HIGHWAY,?,?,G,THROUGH,STEEL,MEDIUM,F,SIMPLE-T
 62 | E55,A,36,1909,HIGHWAY,1730,2,G,THROUGH,STEEL,LONG,F,SIMPLE-T
 63 | E57,O,49,1910,RR,1620,2,G,THROUGH,STEEL,LONG,F,CANTILEV
 64 | E59,O,43,1911,HIGHWAY,1652,2,G,THROUGH,STEEL,LONG,F,CANTILEV
 65 | E107,A,39,1914,RR,?,?,G,?,STEEL,?,F,NIL
 66 | E92,M,10,1914,RR,2210,?,G,THROUGH,STEEL,MEDIUM,F,SIMPLE-T
 67 | E61,O,41,1915,RR,2822,2,G,THROUGH,STEEL,LONG,F,SIMPLE-T
 68 | E60,A,24,1915,HIGHWAY,1000,4,G,THROUGH,STEEL,LONG,F,SIMPLE-T
 69 | E62,A,37,1918,RR,2300,2,N,DECK,STEEL,LONG,F,CONT-T
 70 | E63,A,31,1920,RR,2122,2,G,THROUGH,STEEL,MEDIUM,F,SIMPLE-T
 71 | E65,A,30,1921,WALK,?,?,G,THROUGH,STEEL,?,F,SUSPEN
 72 | E64,A,29,1923,HIGHWAY,885,4,G,THROUGH,STEEL,MEDIUM,F,ARCH
 73 | E66,A,32,1924,HIGHWAY,2365,4,G,THROUGH,STEEL,MEDIUM,S,ARCH
 74 | E70,A,27,1926,HIGHWAY,860,4,G,THROUGH,STEEL,MEDIUM,S-F,SUSPEN
 75 | E69,A,26,1926,HIGHWAY,884,4,G,THROUGH,STEEL,MEDIUM,S-F,SUSPEN
 76 | E101,O,46,1927,HIGHWAY,1770,2,G,THROUGH,STEEL,LONG,S-F,CANTILEV
 77 | E73,A,38,1927,HIGHWAY,1508,?,G,THROUGH,STEEL,MEDIUM,S,ARCH
 78 | E72,M,5,1927,HIGHWAY,2663,4,N,DECK,STEEL,MEDIUM,S-F,CANTILEV
 79 | E67,M,1,1927,HIGHWAY,1330,4,G,THROUGH,STEEL,LONG,F,CANTILEV
 80 | E75,A,30,1928,HIGHWAY,2678,4,G,DECK,STEEL,MEDIUM,F,ARCH
 81 | E74,M,20,1928,HIGHWAY,2220,2,G,DECK,STEEL,MEDIUM,S-F,CANTILEV
 82 | E71,A,25,1928,HIGHWAY,860,4,G,THROUGH,STEEL,MEDIUM,S-F,SUSPEN
 83 | E68,M,17,1928,HIGHWAY,2250,2,G,THROUGH,STEEL,MEDIUM,S,SIMPLE-T
 84 | E78,O,40,1931,HIGHWAY,1365,4,G,THROUGH,STEEL,LONG,F,ARCH
 85 | E77,O,42,1931,HIGHWAY,1450,4,N,THROUGH,STEEL,LONG,F,ARCH
 86 | E76,M,6,1931,HIGHWAY,1500,4,G,THROUGH,STEEL,LONG,F,SUSPEN
 87 | E93,M,11,1937,HIGHWAY,1690,4,N,DECK,STEEL,LONG,S-F,CONT-T
 88 | E79,A,34,1939,HIGHWAY,1800,4,G,DECK,STEEL,MEDIUM,F,CANTILEV
 89 | E108,A,39.5,1945,HIGHWAY,1060,4,G,DECK,STEEL,MEDIUM,S-F,CONT-T
 90 | E107N,A,39.7,1945,RR,840,2,G,THROUGH,STEEL,MEDIUM,S-F,SIMPLE-T
 91 | E105,A,38.5,1945,HIGHWAY,1710,2,N,DECK,STEEL,MEDIUM,S-F,CONT-T
 92 | E103,O,48,1945,HIGHWAY,2160,2,G,THROUGH,STEEL,LONG,F,CANTILEV
 93 | E97,Y,52,1945,HIGHWAY,?,?,G,THROUGH,STEEL,MEDIUM,S,ARCH
 94 | E96,Y,51,1945,RR,?,?,G,THROUGH,STEEL,MEDIUM,F,SIMPLE-T
 95 | E99,M,23,1950,HIGHWAY,1320,2,G,THROUGH,STEEL,MEDIUM,S-F,SIMPLE-T
 96 | E98,M,22,1951,HIGHWAY,900,4,G,THROUGH,STEEL,MEDIUM,F,CONT-T
 97 | E81,M,14,1951,HIGHWAY,2423,4,G,DECK,STEEL,LONG,F,CONT-T
 98 | E80,M,19,1951,HIGHWAY,1031,4,G,THROUGH,STEEL,LONG,F,CANTILEV
 99 | E88,A,37,1955,HIGHWAY,2300,4,N,DECK,STEEL,LONG,F,CONT-T
100 | E82,O,42,1955,HIGHWAY,804,?,G,THROUGH,STEEL,?,F,SIMPLE-T
101 | E102,O,47,1959,HIGHWAY,1700,2,G,THROUGH,STEEL,LONG,F,CONT-T
102 | E83,M,1,1959,HIGHWAY,1000,6,G,THROUGH,STEEL,LONG,F,ARCH
103 | E86,A,33,1961,HIGHWAY,980,4,G,DECK,STEEL,MEDIUM,S-F,CONT-T
104 | E85,M,9,1962,HIGHWAY,2213,4,G,DECK,STEEL,LONG,F,CONT-T
105 | E84,A,24,1969,HIGHWAY,870,6,G,THROUGH,STEEL,MEDIUM,F,ARCH
106 | E91,O,44,1975,HIGHWAY,3756,6,G,THROUGH,STEEL,LONG,F,ARCH
107 | E90,M,7,1978,HIGHWAY,950,6,G,THROUGH,STEEL,LONG,F,ARCH
108 | E100,O,43,1982,HIGHWAY,?,?,G,?,?,?,F,?
109 | E109,A,28,1986,HIGHWAY,?,?,G,?,?,?,F,?
110 | 


--------------------------------------------------------------------------------
/tane.py:
--------------------------------------------------------------------------------
  1 | """------------------------------------------------------------------------------------------
  2 | TANE Algorithm for discovery of exact functional dependencies
  3 | Author: Nabiha Asghar, nasghar@uwaterloo.ca
  4 | February 2015
  5 | Use for research purposes only.
  6 | Please do not re-distribute without written permission from the author
  7 | Any commercial uses strictly forbidden.
  8 | Code is provided without any guarantees.
  9 | ----------------------------------------------------------------------------------------------"""
 10 | from pandas import *
 11 | from collections import defaultdict
 12 | import numpy as NP
 13 | import sys
 14 | 
 15 | def list_duplicates(seq):
 16 |     tally = defaultdict(list)
 17 |     for i,item in enumerate(seq):
 18 |         tally[item].append(i)
 19 |     return ((key,locs) for key,locs in tally.items() 
 20 |                             if len(locs)>0)
 21 | 
 22 | def findCplus(x): # this computes the Cplus of x as an intersection of smaller Cplus sets
 23 | 	global dictCplus
 24 | 	thesets=[]
 25 | 	for a in x:
 26 | 		if x.replace(a,'') in dictCplus.keys():
 27 | 			temp = dictCplus[x.replace(a,'')]
 28 | 		else:
 29 | 			temp=findCplus(x.replace(a,'')) # compute C+(X\{A}) for each A at a time
 30 | 			#dictCplus[x.replace(a,'')] = temp
 31 | 		thesets.insert(0, set(temp))
 32 | 	if list(set.intersection(*thesets)) == []:
 33 | 		cplus = []
 34 | 	else:
 35 | 		cplus = list(set.intersection(*thesets))  # compute the intersection in line 2 of pseudocode
 36 | 	return cplus
 37 | 
 38 | def compute_dependencies(level, listofcols):
 39 |     global dictCplus
 40 |     global finallistofFDs
 41 |     global listofcolumns
 42 |     for x in level:
 43 |     	thesets=[]
 44 |     	for a in x:
 45 |     		if x.replace(a,'') in dictCplus.keys():
 46 |     			temp = dictCplus[x.replace(a,'')]
 47 |     		else:
 48 |     			temp=computeCplus(x.replace(a,'')) # compute C+(X\{A}) for each A at a time
 49 |     			dictCplus[x.replace(a,'')] = temp
 50 |     		thesets.insert(0, set(temp))
 51 |     	if list(set.intersection(*thesets)) == []:
 52 |     		dictCplus[x] = []
 53 |     	else:
 54 |     		dictCplus[x] = list(set.intersection(*thesets))  # compute the intersection in line 2 of pseudocode
 55 |     for x in level:
 56 |     	for a in x:
 57 |     		if a in dictCplus[x]:
 58 |     			#if x=='BCJ': print "dictCplus['BCJ'] = ", dictCplus[x]
 59 | 	    		if validfd(x.replace(a,''), a): # line 5
 60 |     				finallistofFDs.append([x.replace(a,''), a]) # line 6
 61 |     				dictCplus[x].remove(a)  # line 7
 62 | 
 63 |     				listofcols=listofcolumns[:]
 64 |     				for j in x: # this loop computes R\X
 65 |     					if j in listofcols: listofcols.remove(j)
 66 | 
 67 |     				for b in listofcols: # this loop removes each b in R\X from C+(X)
 68 |     					if b in dictCplus[x]: dictCplus[x].remove(b)
 69 | 
 70 | def computeCplus(x): # this computes the Cplus from the first definition in section 3.2.2 of TANE paper. output should be a list of single attributes
 71 | 	global listofcolumns
 72 | 	listofcols = listofcolumns[:]
 73 | 	if x=='': return listofcols # because C+{phi} = R
 74 | 	cplus = []
 75 | 	for a in listofcols:
 76 | 		for b in x:
 77 | 			temp = x.replace(a,'')
 78 | 			temp = temp.replace(b,'')
 79 | 			if not validfd(temp, b):
 80 | 				cplus.append(a)
 81 | 	return cplus
 82 | 
 83 | def validfd(y,z):
 84 | 	if y=='' or z=='': return False
 85 | 	ey = computeE(y)
 86 | 	eyz = computeE(y+z)
 87 | 	if ey == eyz :
 88 | 		return True
 89 | 	else:
 90 | 		return False
 91 | 
 92 | def computeE(x):
 93 | 	global totaltuples
 94 | 	global dictpartitions
 95 | 	doublenorm = 0
 96 | 	for i in dictpartitions[''.join(sorted(x))]:
 97 | 		doublenorm = doublenorm + len(i)
 98 | 	e = (doublenorm-len(dictpartitions[''.join(sorted(x))]))/float(totaltuples)
 99 | 	return e
100 | 
def check_superkey(x):
    """Return True when the stored partition for ``x`` is empty.

    "Empty" means the entry in the global ``dictpartitions`` equals
    ``[]`` or ``[[]]``, i.e. no equivalence class with rows was kept
    for ``x``.
    """
    global dictpartitions
    partition = dictpartitions[x]
    return partition == [[]] or partition == []
107 | 
def prune(level):
    """Prune one lattice level in place.

    * Candidates whose C+ list is empty are removed from ``level``.
    * Candidates for which ``check_superkey`` is true may yield extra
      dependencies ``[x, a]`` (appended to ``finallistofFDs``) and are
      then removed from ``level`` as well.

    The loop runs over a snapshot of ``level``.  The previous version
    called ``level.remove(x)`` while iterating ``level`` itself, which
    makes the iterator skip the element following each removal, so some
    candidates were never examined.

    ``dictCplus`` may gain entries for attribute sets whose C+ had not
    been cached yet.
    """
    global dictCplus
    global finallistofFDs
    stufftobedeletedfromlevel = []
    for x in list(level):  # line 1 (snapshot: level is mutated below)
        if dictCplus[x] == []:  # line 2
            level.remove(x)  # line 3
        # line 4 -- original author's open question: should this test for
        # a key rather than a superkey?
        if check_superkey(x):
            temp = dictCplus[x][:]
            for i in x:  # this loop computes C+(X) \ X
                if i in temp:
                    temp.remove(i)
            for a in temp:  # line 5
                thesets = []
                for b in x:
                    key = ''.join(sorted((x + a).replace(b, '')))
                    if key not in dictCplus:
                        dictCplus[key] = findCplus(key)
                    thesets.insert(0, set(dictCplus[key]))
                if a in set.intersection(*thesets):  # line 6
                    finallistofFDs.append([x, a])  # line 7
            if x in level:
                stufftobedeletedfromlevel.append(x)  # line 8
    for item in stufftobedeletedfromlevel:
        level.remove(item)
131 |     	
def generate_next_level(level):
    """Build the candidates of the next lattice level.

    Two distinct candidates that share everything but their last
    attribute are joined; the join is kept only when every sub-set
    obtained by dropping one attribute is itself in ``level``.  For each
    kept candidate ``stripped_product`` is called so that its partition
    is stored.

    Returns the list of new attribute-set strings.
    """
    nextlevel = []
    for i, first in enumerate(level):
        for second in level[i + 1:]:
            # pseudocode lines 2 and 3: same prefix, different element
            if first == second or first[:-1] != second[:-1]:
                continue
            x = first + second[-1]  # pseudocode line 4
            # pseudocode line 5: every (|x|-1)-subset must be in the level
            if all(x.replace(a, '') in level for a in x):
                nextlevel.append(x)
                # partition of x is pi_first * pi_second
                stripped_product(x, first, second)
    return nextlevel
146 | 
147 | 
def stripped_product(x, y, z):
    """Store the partition of ``x`` as the product of those of ``y`` and ``z``.

    Partitions are read from and written to the global ``dictpartitions``
    under sorted attribute strings; each partition is a list of
    equivalence classes (lists of row indices).  Only resulting classes
    with at least two rows are kept.  The global scratch table ``tableT``
    is used during the computation and reset to ``'NULL'`` for every row
    touched before returning.
    """
    global dictpartitions
    global tableT
    scratch = [''] * len(tableT)
    classes_y = dictpartitions[''.join(sorted(y))]
    classes_z = dictpartitions[''.join(sorted(z))]
    product = []  # line 1

    # lines 2-4: label every row with the index of its class in y
    for class_id, members in enumerate(classes_y):
        for row in members:
            tableT[row] = class_id
        scratch[class_id] = ''

    for members in classes_z:  # line 5
        # lines 6-7: collect the rows of this z-class per y-class
        for row in members:
            owner = tableT[row]
            if owner != 'NULL':
                scratch[owner] = sorted(list(set(scratch[owner]) | set([row])))
        # lines 8-10: emit the collected groups, then clear them
        for row in members:
            owner = tableT[row]
            if owner == 'NULL':
                continue
            if len(scratch[owner]) >= 2:
                product.append(scratch[owner])
            scratch[owner] = ''

    # lines 11-12: reset the scratch table for the next call
    for members in classes_y:
        for row in members:
            tableT[row] = 'NULL'

    dictpartitions[''.join(sorted(x))] = product
171 | 
def computeSingletonPartitions(listofcols):
    """Store the stripped partition of every single column.

    For each column name in ``listofcols`` the rows of the global
    ``data2D`` are grouped by value; groups with more than one row are
    stored, as lists of row indices, in ``dictpartitions[column]``.
    """
    global data2D
    global dictpartitions
    for column in listofcols:
        classes = dictpartitions[column] = []
        # list_duplicates yields (value, row_indices) pairs
        for _value, rows in list_duplicates(data2D[column].tolist()):
            if len(rows) > 1:  # ignore singleton equivalence classes
                classes.append(rows)
180 |     
181 | #------------------------------------------------------- START ---------------------------------------------------
182 | 
# The input CSV path is mandatory.  Previously a missing argument only
# surfaced later as a NameError on ``infile``.
if len(sys.argv) < 2:
    sys.exit("usage: python tane.py <csvfile>")
infile = str(sys.argv[1])  # this would be e.g. "testdata.csv"

data2D = read_csv(infile)

totaltuples = len(data2D.index)
listofcolumns = list(data2D.columns.values)  # returns ['A', 'B', 'C', 'D', .....]

tableT = ['NULL'] * totaltuples  # this is for the table T used in the function stripped_product

L0 = []
dictCplus = {'NULL': listofcolumns}
dictpartitions = {}  # maps 'stringslikethis' to a list of lists, each of which contains indices
computeSingletonPartitions(listofcolumns)
finallistofFDs = []
L1 = listofcolumns[:]  # L1 is a copy of listofcolumns
l = 1

L = [L0, L1]

# Level-wise search: stop as soon as a level comes out empty.
while not (L[l] == []):
    compute_dependencies(L[l], listofcolumns[:])
    prune(L[l])
    temp = generate_next_level(L[l])
    L.append(temp)
    l = l + 1

# %-formatting keeps the output identical to the former Python 2 print
# statements while also being valid Python 3.
print("List of all FDs:  %s" % (finallistofFDs,))
print("Total number of FDs found:  %s" % (len(finallistofFDs),))
213 | 


--------------------------------------------------------------------------------
/ctane.py:
--------------------------------------------------------------------------------
  1 | """------------------------------------------------------------------------------------------
  2 | CTANE Algorithm for discovery of exact conditional functional dependencies
  3 | Author: Nabiha Asghar, nasghar@uwaterloo.ca
  4 | March 2015
  5 | Use for research purposes only.
  6 | Please do not re-distribute without written permission from the author
  7 | Any commercial uses strictly forbidden.
  8 | Code is provided without any guarantees.
  9 | ----------------------------------------------------------------------------------------------"""
 10 | from pandas import *
 11 | from collections import defaultdict
 12 | import numpy as NP
 13 | import itertools
 14 | import sys
 15 | 
 16 | def replace_element_in_tuple(tup, elementindex, elementval):
 17 |     if type(elementval)==tuple:
 18 |         elementval = elementval[0]
 19 |     newtup = list(tup)
 20 |     newtup[elementindex] = elementval
 21 |     newtup = tuple(newtup)
 22 |     return newtup
 23 | 
 24 | def add_element_in_tuple(spxminusa, ca):
 25 |     thelist = list(spxminusa)
 26 |     thelist.append(ca[0])
 27 |     return tuple(thelist)
 28 | 
 29 | def validcfd(xminusa, x, a, spxminusa, sp, ca):
 30 |     global dictpartitions
 31 |     if xminusa == '' or a == '': 
 32 |         return False
 33 |     indexofa = x.index(a)
 34 |     newsp0 = add_element_in_tuple(spxminusa, ca)
 35 |     newsp1 = replace_element_in_tuple(sp, indexofa, ca)   #this is sp, except that in place of value of a we put ca
 36 |     if (x, newsp1) in dictpartitions.keys():
 37 |         if  len(dictpartitions[(xminusa, spxminusa)]) == len(dictpartitions[(x, newsp1)]):# and twodlen(dictpartitions[(xminusa, spxminusa)]) == twodlen(dictpartitions[(x, newsp1)]):
 38 |             return True    
 39 |     return False
 40 | 
 41 | def twodlen(listoflists):
 42 | 	summ = 0
 43 | 	for item in listoflists:
 44 | 		summ = summ + len(item)
 45 | 	return summ
 46 | 
 47 | def greaterthanorequalto(upxminusa, spxminusa): # this is actually greaterthan or equal to
 48 |     if upxminusa == spxminusa: 
 49 |         return True
 50 |     flag = True
 51 |     for index in range(0, len(upxminusa)):
 52 |         if not (spxminusa[index]=='--'):
 53 |             if (not (upxminusa[index] == spxminusa[index])):
 54 |                 flag = False
 55 |     return flag
 56 | 
 57 | def doublegreaterthan(upxminusa, spxminusa): 
 58 |     if upxminusa == spxminusa: 
 59 |         return False
 60 |     flag = True
 61 |     for index in range(0, len(upxminusa)):
 62 |         if (not spxminusa[index]=='--'):
 63 |             if (not (upxminusa[index] == spxminusa[index])):
 64 |                 flag = False
 65 |     return flag
 66 |     
 67 | def compute_dependencies(level, listofcols):
 68 |     global dictCplus
 69 |     global finallistofCFDs
 70 |     global listofcolumns
 71 |     for (x,sp) in level:
 72 |         for a in x:
 73 |             for (att, ca) in dictCplus[(x, sp)]:
 74 |                 if att == a:
 75 |                     newtup =  spXminusA(sp, x, a)      ### tuple(y for y in sp if not sp.index(y)==x.index(a)) # this is sp[X\A]                             
 76 |                     if validcfd( x.replace(a,''), x, a, newtup, sp, ca) and not ([x.replace(a,''), a, [newtup, ca]] in finallistofCFDs):
 77 |                         finallistofCFDs.append([x.replace(a,''), a, [newtup, ca]])
 78 |                         for (xx, up) in level:
 79 |                             if xx==x:
 80 |                                 newtup0 =  spXminusA(up, x, a)          ### tuple(y for y in up if not up.index(y)==x.index(a)) # this is up[X\A]
 81 |                                 if up[x.index(a)]==ca[0] and greaterthanorequalto(newtup0, newtup) :
 82 |                                     if (a, ca) in dictCplus[(x,up)]: dictCplus[(x,up)].remove((a,ca))
 83 |                                     listofcolscopy = listofcols[:]
 84 |                                     for j in x: # this loop computes R\X
 85 |                                         if j in listofcolscopy: listofcolscopy.remove(j)
 86 |                                     for b_att in listofcolscopy: # this loop removes each b in R\X from C+(X,up)
 87 |                                         stufftobedeleted = []
 88 |                                         for (bbval, sometup) in dictCplus[(x,up)]:
 89 |                                             if b_att == bbval:
 90 |                                                 stufftobedeleted.append((bbval,sometup))                        
 91 |                                         for item in stufftobedeleted:
 92 |                                             dictCplus[(x,up)].remove(item)
 93 | 
 94 | def prune(level):
 95 |     global dictCplus
 96 |     stufftobedeleted=[]
 97 |     for (x,sp) in level:
 98 |         if len(dictCplus[(x,sp)])==0:
 99 |             stufftobedeleted.append((x,sp))
100 |     for item in stufftobedeleted:
101 |         level.remove(item)
102 | 
def computeCplus(level):
    """Fill ``dictCplus[(x, sp)]`` for every ``(x, sp)`` in ``level``.

    C+(x, sp) is the intersection, over every attribute ``b`` of ``x``, of
    the cached C+ of ``(x without b, sp without b's value)``.  The result
    is stored as a list of ``(attribute, value)`` tuples.

    ``sp`` is a tuple of strings such as ``('aa', 'bb', 'cc')`` or
    ``('aa',)``.

    This version repairs the inconsistent indentation of the loop body
    (which made the module fail to parse) and drops the unused locals.
    """
    global dictCplus
    for (x, sp) in level:
        thesets = []
        for b in x:
            sub_key = (x.replace(b, ''), spXminusA(sp, x, b))
            # NOTE(review): a sub-pattern with no cached C+ contributes an
            # empty set, which empties the whole intersection.  The
            # original author was unsure whether [] is the right default.
            thesets.insert(0, set(dictCplus.get(sub_key, [])))
        dictCplus[(x, sp)] = list(set.intersection(*thesets))
121 | 
def initial_Cplus(level):
    """Compute C+ for the first level and drop conflicting constants.

    After ``computeCplus(level)`` has filled ``dictCplus``, every entry
    ``(a, ca)`` loses the candidates that name the same attribute ``a``
    with a value different from ``ca``.
    """
    global listofcolumns
    global dictCplus
    computeCplus(level)
    for (a, ca) in level:
        candidates = dictCplus[(a, ca)]
        conflicting = [
            (att, val) for (att, val) in candidates
            if att == a and not val == ca
        ]
        for entry in conflicting:
            candidates.remove(entry)
133 | 
def populateL1(listofcols):
    """Build the first lattice level and its partitions.

    For every column the level contains the wildcard entry
    ``(column, ('--',))`` followed by one entry ``(column, (value,))`` for
    each equivalence class with at least ``k_suppthreshold`` rows, where
    ``value`` is the string form of the column value in the first row of
    that class.  The initial partitions are then registered through
    ``computeInitialPartitions``.

    Returns the list of ``(column, pattern)`` entries.
    """
    global k_suppthreshold
    first_level = []
    partitions = computeAttributePartitions(listofcols)
    for column in listofcols:
        first_level.append((column, ('--',)))
        for rows in partitions[column]:
            if len(rows) >= k_suppthreshold:
                constant = str(data2D.iloc[rows[0]][column])
                first_level.append((column, (constant,)))
    # populates dictpartitions with the partitions (X, sp) where X is a single attribute
    computeInitialPartitions(first_level, partitions)
    return first_level
145 | 
def computeInitialPartitions(level1, attributepartitions):
    """Register the partition of every first-level entry.

    Each ``(a, sp)`` in ``level1`` is mapped, in the global
    ``dictpartitions``, to ``attributepartitions[a]``: the full list of
    equivalence classes of attribute ``a`` (a list of lists of row
    indices), regardless of the pattern ``sp``.  All patterns of one
    attribute therefore share the same list object.

    The former throw-away ``[]`` assignment and the unused ``data2D``
    global declaration have been removed.
    """
    global dictpartitions
    for (a, sp) in level1:
        dictpartitions[(a, sp)] = attributepartitions[a]
152 | 
def old_computeInitialPartitions(level1, attributepartitions):
    """Earlier variant of ``computeInitialPartitions`` (value-filtered).

    A wildcard entry ``(a, ('--',))`` receives all equivalence classes of
    ``a``; a constant entry receives only the classes whose first row
    holds, in string form, the constant of the pattern.  Results go to
    the global ``dictpartitions`` as lists of lists of row indices.
    """
    global data2D
    global dictpartitions
    for (a, sp) in level1:
        key = (a, sp)
        dictpartitions[key] = []
        if sp[0] == '--':
            dictpartitions[key] = attributepartitions[a]
            continue
        for eqclass in attributepartitions[a]:
            if str(data2D.iloc[eqclass[0]][a]) == sp[0]:
                dictpartitions[key].append(eqclass)
164 | 
def computeAttributePartitions(listofcols):
    """Return the (unstripped) partition of every single column.

    The rows of the global ``data2D`` are grouped by value for each
    column name in ``listofcols``.  Unlike a stripped partition, classes
    with a single row are kept (the filter is ``> 0``).

    Returns a dict mapping column name to a list of lists of row indices.
    """
    global data2D
    attributepartitions = {}
    for column in listofcols:
        classes = attributepartitions[column] = []
        # list_duplicates yields (value, row_indices) pairs
        for _value, rows in list_duplicates(data2D[column].tolist()):
            if len(rows) > 0:  # "> 1" would ignore singleton equivalence classes
                classes.append(rows)
    return attributepartitions
174 | 
def list_duplicates(seq):
    """Group the positions of every distinct value found in ``seq``.

    Despite the name, values that occur only once are reported as well:
    the ``len(...) > 0`` filter never rejects a group.

    Returns a generator of ``(value, positions)`` pairs, where
    ``positions`` is the list of indices at which ``value`` occurs, in
    increasing order.
    """
    positions = defaultdict(list)
    for index, value in enumerate(seq):
        positions[value].append(index)
    return (pair for pair in positions.items() if len(pair[1]) > 0)
181 | 
def sometuplematchesZUP(z, up):
    """Return True when ``(z, up)`` reaches the support threshold.

    The support is the total number of rows over all equivalence classes
    stored in ``dictpartitions[(z, up)]``; it is compared against the
    global ``k_suppthreshold``.
    """
    global dictpartitions
    global k_suppthreshold
    support = sum(len(eqclass) for eqclass in dictpartitions[(z, up)])
    return support >= k_suppthreshold
192 | 
def generate_next_level(level):
    """Build the ``(attributes, pattern)`` candidates of the next level.

    Two entries are joined when their attribute strings differ but share
    everything except the last attribute, and their patterns share
    everything except the last value.  The joined pair is normalised with
    ``sortspbasedonx`` and its partition is stored via
    ``partition_product`` (this happens before any filtering).  The
    candidate is kept when it reaches the support threshold and every
    sub-candidate obtained by dropping one attribute is in ``level``.
    """
    nextlevel = []
    for i in range(0, len(level)):  # pick an element
        first = level[i]
        for second in level[i + 1:]:  # compare it to every later element
            if (first[0] == second[0]
                    or first[0][0:-1] != second[0][0:-1]
                    or first[1][0:-1] != second[1][0:-1]):
                continue
            joined_attrs = first[0] + second[0][-1]
            joined_pattern = tuple(list(first[1]) + [second[1][-1]])
            (z, up) = sortspbasedonx(joined_attrs, joined_pattern)
            partition_product((z, up), first, second)
            if not sometuplematchesZUP(z, up):
                continue
            subsets_present = all(
                (z.replace(att, ''), spXminusA(up, z, att)) in level
                for att in z
            )
            if subsets_present:
                nextlevel.append((z, up))
    return nextlevel
213 | 
def spXminusA(sp, x, a):
    """Return pattern ``sp`` without the value at ``a``'s position in ``x``."""
    skip = x.index(a)
    return tuple(value for position, value in enumerate(sp) if position != skip)
221 | 
def partition_product(zup, xsp, ytp):
    """Store the partition of ``zup`` as the product of ``xsp`` and ``ytp``.

    All three arguments are keys of the global ``dictpartitions``; each
    partition is a list of equivalence classes (lists of row indices).
    Every non-empty intersection class is kept (the size test is ``>= 1``).
    The global scratch table ``tableT`` is used during the computation and
    reset to ``'NULL'`` for every row touched before returning.
    """
    global dictpartitions
    global tableT
    scratch = [''] * len(tableT)
    classes_x = dictpartitions[xsp]
    classes_y = dictpartitions[ytp]
    product = []

    # label every row with the index of its class in the first partition
    for class_id, members in enumerate(classes_x):
        for row in members:
            tableT[row] = class_id
        scratch[class_id] = ''

    for members in classes_y:
        # collect the rows of this class per class of the first partition
        for row in members:
            owner = tableT[row]
            if owner != 'NULL':
                scratch[owner] = sorted(list(set(scratch[owner]) | set([row])))
        # emit the collected groups, then clear them
        for row in members:
            owner = tableT[row]
            if owner == 'NULL':
                continue
            if len(scratch[owner]) >= 1:
                product.append(scratch[owner])
            scratch[owner] = ''

    # reset the scratch table for the next call
    for members in classes_x:
        for row in members:
            tableT[row] = 'NULL'

    dictpartitions[zup] = product
246 | 
def sortspbasedonx(x, sp):
    """Sort the attributes of ``x`` and keep ``sp`` aligned with them.

    The ``(attribute, value)`` pairs are sorted as tuples, so ties on the
    attribute are broken by the value.

    Returns ``(sorted_attribute_string, aligned_pattern_tuple)``.
    """
    pairs = sorted(zip(list(x), sp))
    attributes = ''.join(pair[0] for pair in pairs)
    pattern = tuple(pair[1] for pair in pairs)
    return (attributes, pattern)
254 | 
255 | #------------------------------------------------------- START ---------------------------------------------------
# The input CSV path is mandatory.  Previously a missing argument only
# surfaced later as a NameError on ``infile``.
if len(sys.argv) < 2:
    sys.exit("usage: python ctane.py <csvfile> [k]")
infile = str(sys.argv[1])

# Optional support threshold k.  The README documents running without it,
# but ``k`` used to be undefined in that case (NameError).  A default of 1
# means no pattern is rejected for low support.
# NOTE(review): confirm that 1 is the intended default.
k = int(sys.argv[2]) if len(sys.argv) > 2 else 1

data2D = read_csv(infile)

totaltuples = len(data2D.index)
listofcolumns = list(data2D.columns.values)  # returns ['A', 'B', 'C', 'D', .....]
tableT = ['NULL'] * totaltuples  # this is for the table T used in the function partition_product
k_suppthreshold = k
L0 = []

dictpartitions = {}  # maps (attribute string, pattern tuple) to a list of lists of row indices
finallistofCFDs = []
# L1 is a list of tuples of the form [('A', ('--',)), ('A', ('val1',)), ..., ('B', ('val3',)), ...]
L1 = populateL1(listofcolumns[:])
dictCplus = {('', ()): L1[:]}
l = 1
L = [L0, L1]

# Level-wise search: stop as soon as a level comes out empty.
while not (L[l] == []):
    if l == 1:
        initial_Cplus(L[l])
    else:
        computeCplus(L[l])
    compute_dependencies(L[l], listofcolumns[:])
    prune(L[l])
    temp = generate_next_level(L[l])
    L.append(temp)
    l = l + 1

# %-formatting keeps the output identical to the former Python 2 print
# statements while also being valid Python 3.
print("List of all CFDs:  %s" % (finallistofCFDs,))
print("Total number of CFDs found:  %s" % (len(finallistofCFDs),))
291 | 


--------------------------------------------------------------------------------