├── tests ├── __init__.py └── test_autofj.py ├── src ├── autofj │ ├── blocker │ │ ├── __init__.py │ │ └── blocker.py │ ├── optimizer │ │ └── __init__.py │ ├── join_function_space │ │ ├── __init__.py │ │ ├── join_function │ │ │ ├── __init__.py │ │ │ ├── tokenizer.py │ │ │ ├── join_function.py │ │ │ ├── preprocessor.py │ │ │ ├── token_weight.py │ │ │ └── distance_function.py │ │ └── options.py │ ├── __init__.py │ ├── benchmark │ │ ├── Galaxy │ │ │ ├── right.csv │ │ │ └── gt.csv │ │ ├── NationalFootballLeagueSeason │ │ │ ├── right.csv │ │ │ └── gt.csv │ │ ├── TennisTournament │ │ │ ├── right.csv │ │ │ └── gt.csv │ │ ├── NCAATeamSeason │ │ │ ├── right.csv │ │ │ └── gt.csv │ │ ├── ArtificialSatellite │ │ │ ├── right.csv │ │ │ └── gt.csv │ │ ├── NaturalEvent │ │ │ ├── right.csv │ │ │ └── gt.csv │ │ ├── Enzyme │ │ │ ├── right.csv │ │ │ └── gt.csv │ │ ├── RugbyLeague │ │ │ ├── right.csv │ │ │ └── gt.csv │ │ ├── SoccerClubSeason │ │ │ ├── right.csv │ │ │ └── gt.csv │ │ ├── FootballMatch │ │ │ ├── right.csv │ │ │ └── gt.csv │ │ ├── GivenName │ │ │ ├── right.csv │ │ │ └── gt.csv │ │ ├── Drug │ │ │ ├── right.csv │ │ │ └── gt.csv │ │ ├── BasketballTeam │ │ │ ├── right.csv │ │ │ └── gt.csv │ │ ├── Race │ │ │ └── right.csv │ │ ├── ShoppingMall │ │ │ ├── left.csv │ │ │ ├── right.csv │ │ │ └── gt.csv │ │ ├── Monarch │ │ │ └── right.csv │ │ ├── Magazine │ │ │ └── right.csv │ │ ├── SoccerLeague │ │ │ └── right.csv │ │ ├── Legislature │ │ │ └── right.csv │ │ ├── Country │ │ │ └── right.csv │ │ ├── ClericalAdministrativeRegion │ │ │ └── right.csv │ │ └── Artwork │ │ │ └── right.csv │ ├── utils.py │ ├── datasets.py │ ├── 50-single-column-datasets.md │ ├── negative_rule.py │ └── autofj.py └── autofj.egg-info │ ├── dependency_links.txt │ ├── top_level.txt │ ├── requires.txt │ ├── SOURCES.txt │ └── PKG-INFO ├── .gitignore ├── MANIFEST.in ├── dist ├── autofj-0.0.6.tar.gz └── autofj-0.0.6-py3-none-any.whl ├── pyproject.toml ├── setup.py ├── LISENCE └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/autofj/blocker/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/autofj/optimizer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.pyc 3 | -------------------------------------------------------------------------------- /src/autofj/join_function_space/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/autofj.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/autofj.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | autofj 2 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include src/autofj/benchmark/ * -------------------------------------------------------------------------------- /src/autofj/__init__.py: -------------------------------------------------------------------------------- 1 | from .autofj import AutoFJ 2 | -------------------------------------------------------------------------------- /src/autofj/join_function_space/join_function/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/autofj.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pandas 3 | nltk 4 | ngram 5 | editdistance 6 | jellyfish 7 | spacy 8 | -------------------------------------------------------------------------------- /dist/autofj-0.0.6.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chu-data-lab/AutomaticFuzzyJoin/HEAD/dist/autofj-0.0.6.tar.gz -------------------------------------------------------------------------------- /dist/autofj-0.0.6-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chu-data-lab/AutomaticFuzzyJoin/HEAD/dist/autofj-0.0.6-py3-none-any.whl -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=42", 4 | "wheel" 5 | ] 6 | build-backend = "setuptools.build_meta" -------------------------------------------------------------------------------- /src/autofj/benchmark/Galaxy/right.csv: -------------------------------------------------------------------------------- 1 | id,title 2 | 0,Sagittarius Dwarf Spheroidal Galaxy 3 | 1,RX J1242-11 4 | 2,Canis Major Overdensity 5 | 3,Carina Dwarf Spheroidal Galaxy 6 | 4,NGC 34 7 | 5,NGC 6872 8 | 6,GR 8 9 | 7,NGC 1265 10 | 8,3C 433 11 | 9,MS 1512-cB58 12 | 10,NGC 935/IC 1801 13 | 11,Arp 302 14 | 12,DDO 169 15 | 13,Segue 2 16 | 14,NGC 5562 17 | 15,DDO 190 18 | 16,Carina Dwarf Spheroidal galaxy 19 | -------------------------------------------------------------------------------- /src/autofj/benchmark/NationalFootballLeagueSeason/right.csv: -------------------------------------------------------------------------------- 1 | id,title 2 | 0,1900 Homestead Library & Athletic Club football season 3 | 1,2003 FC Barcelona Dragons season 4 | 2,1905 Canton Athletic Club season 5 | 3,1911 Canton Professionals season 6 | 4,1912 Canton Professionals season 7 | 5,1914 Canton Professionals season 8 | 6,1913 Canton Professionals season 9 | 7,2002 FC Barcelona Dragons season 10 | 8,1921 Detroit Tigers season (NFL) 11 | 9,1996 Minnesota Fighting Pike Season 12 | -------------------------------------------------------------------------------- /src/autofj/utils.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import os 3 | 4 | def print_log(message): 5 | print("{}:{}".format(datetime.now().strftime('%H:%M:%S'), message)) 6 | 7 | def makedir(dir_list, file=None, remove_old_dir=False): 8 | save_dir = os.path.join(*dir_list) 9 | 10 | if remove_old_dir and os.path.exists(save_dir) and file is None: 11 | shutil.rmtree(save_dir) 12 | 13 | if not os.path.exists(save_dir): 14 | os.makedirs(save_dir) 15 | if file is not None: 16 | save_dir = os.path.join(save_dir, file) 17 | return save_dir -------------------------------------------------------------------------------- /src/autofj/datasets.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | from os.path import dirname 4 | 5 | def load_data(name): 6 | module_path = dirname(__file__) 7 | if os.path.exists(os.path.join(module_path, "benchmark", name)): 8 | left_table = pd.read_csv(os.path.join(module_path, "benchmark", name, "left.csv")) 9 | right_table = pd.read_csv(os.path.join(module_path, "benchmark", name, "right.csv")) 10 | gt_table = pd.read_csv(os.path.join(module_path, "benchmark", name, "gt.csv")) 11 | return left_table, right_table, gt_table 12 | else: 13 | raise Exception("Dataset {} does not exist.".format(name)) 14 | -------------------------------------------------------------------------------- /src/autofj/benchmark/Galaxy/gt.csv: -------------------------------------------------------------------------------- 1 | id_l,title_l,id_r,title_r 2 | 10,Sagittarius Dwarf Elliptical Galaxy,0,Sagittarius Dwarf Spheroidal Galaxy 3 | 14,RXJ1242-11,1,RX J1242-11 4 | 65,Canis Major Dwarf Galaxy,2,Canis Major Overdensity 5 | 94,Carina Dwarf,3,Carina Dwarf Spheroidal Galaxy 6 | 94,Carina Dwarf,16,Carina Dwarf Spheroidal galaxy 7 | 104,NGC 17,4,NGC 34 8 | 190,NGC 6872 and IC 4970,5,NGC 6872 9 | 213,UGC 8091,6,GR 8 10 | 317,3C 83.1B,7,NGC 1265 11 | 324,QSO B2121+248,8,3C 433 12 | 330,MS 1512 +36-cB58,9,MS 1512-cB58 13 | 354,Arp 276,10,NGC 935/IC 1801 14 | 370,UGC 9618,11,Arp 302 15 | 423,UGC 8331,12,DDO 169 16 | 446,Segue 2 (dwarf galaxy),13,Segue 2 17 | 509,NGC 5662,14,NGC 5562 18 | 528,UGC 9240,15,DDO 190 19 | -------------------------------------------------------------------------------- /src/autofj/benchmark/NationalFootballLeagueSeason/gt.csv: -------------------------------------------------------------------------------- 1 | id_l,title_l,id_r,title_r 2 | 1976,1921 Detroit Tigers (NFL) season,8,1921 Detroit Tigers season (NFL) 3 | 2057,1900 Homestead Library & Athletic Club football team,0,1900 Homestead Library & Athletic Club football season 4 | 2339,2003 Barcelona Dragons season,1,2003 FC Barcelona Dragons season 5 | 2469,1905 Canton Bulldogs season,2,1905 Canton Athletic Club season 6 | 2471,1911 Canton Bulldogs season,3,1911 Canton Professionals season 7 | 2477,1912 Canton Bulldogs season,4,1912 Canton Professionals season 8 | 2478,1914 Canton Bulldogs season,5,1914 Canton Professionals season 9 | 2479,1913 Canton Bulldogs season,6,1913 Canton Professionals season 10 | 2634,2002 Barcelona Dragons season,7,2002 FC Barcelona Dragons season 11 | 2647,1996 Minnesota Fighting Pike season,9,1996 Minnesota Fighting Pike Season 12 | -------------------------------------------------------------------------------- /src/autofj/blocker/blocker.py: -------------------------------------------------------------------------------- 1 | class Blocker(object): 2 | """The customized blocker must have a block method as below. The constructor 3 | can be overwritten""" 4 | def __init__(self): 5 | pass 6 | 7 | def block(self, left_table, right_table, id_column): 8 | """ Perform blocking on two tables 9 | 10 | Parameters 11 | ---------- 12 | left_table: pd.DataFrame 13 | Reference table. The left table is assumed to be almost 14 | duplicate-free, which means it has no or only few duplicates. 15 | 16 | right_table: pd.DataFrame 17 | Another input table. 18 | 19 | id_column: string 20 | The name of id column in two tables. 21 | 22 | Returns: 23 | -------- 24 | result: pd.DataFrame 25 | A table of records pairs survived blocking. Column names 26 | id_column + "_l" and id_column + "_r" 27 | """ 28 | result = None 29 | return result -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r", encoding="utf-8") as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="autofj", 8 | version="0.0.6", 9 | author="Peng Li", 10 | author_email="lipengpublic@gmail.com", 11 | description="Auto-Program Fuzzy Similarity Joins Without Labeled Examples", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/chu-data-lab/AutomaticFuzzyJoin", 15 | classifiers=[ 16 | "Programming Language :: Python :: 3", 17 | "License :: OSI Approved :: MIT License", 18 | "Operating System :: OS Independent", 19 | ], 20 | package_dir={"": "src"}, 21 | packages=setuptools.find_packages(where="src"), 22 | python_requires=">=3.7", 23 | install_requires=[ 24 | 'numpy', 25 | 'pandas', 26 | 'nltk', 27 | 'ngram', 28 | 'editdistance', 29 | 'jellyfish', 30 | 'spacy', 31 | ], 32 | include_package_data=True 33 | ) -------------------------------------------------------------------------------- /src/autofj/benchmark/TennisTournament/right.csv: -------------------------------------------------------------------------------- 1 | id,title 2 | 0,2000 Tennis Masters Cup and ATP Tour World Championships 3 | 1,2008 Challenge Bell 4 | 2,2010 Aegon Championships 5 | 3,2009 Challenge Bell 6 | 4,1996 Challenge Bell 7 | 5,1997 Challenge Bell 8 | 6,1998 Challenge Bell 9 | 7,2010 Challenger Banque Nationale de Rimouski 10 | 8,2009 San Benedetto Tennis Cup 11 | 9,2010 Challenge Bell 12 | 10,2010 Blu-express.com Tennis Cup 13 | 11,2011 Challenger Banque Nationale de Rimouski 14 | 12,2011 Aegon Championships 15 | 13,2011 Aegon Classic 16 | 14,2011 Challenge Bell 17 | 15,2011 Challenger Banque Nationale de Saguenay 18 | 16,2012 Challenger Banque Nationale de Rimouski 19 | 17,2012 Aegon Championships 20 | 18,2012 Challenge Bell 21 | 19,2012 Blu-express.com Tennis Cup 22 | 20,2012 Korea Open 23 | 21,2012 Arimex Challenger Trophy 24 | 22,2012 Challenger Banque Nationale de Saguenay 25 | 23,2012 Aegon Pro-Series Loughborough 26 | 24,2013 Garanti Koza WTA Tournament of Champions 27 | 25,2013 Challenger Banque Nationale de Rimouski 28 | 26,2012 Commonwealth Bank Tournament of Champions 29 | -------------------------------------------------------------------------------- /LISENCE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018 The Python Packaging Authority 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. -------------------------------------------------------------------------------- /tests/test_autofj.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import time 3 | import os 4 | from autofj import AutoFJ 5 | from autofj.datasets import load_data 6 | 7 | def evaluate(pred_joins, gt_joins): 8 | """ Evaluate the performance of fuzzy joins 9 | 10 | Parameters 11 | ---------- 12 | pred_joins: list 13 | A list of tuple pairs (id_l, id_r) that are predicted to be matches 14 | 15 | gt_joins: 16 | The ground truth matches 17 | 18 | Returns 19 | ------- 20 | precision: float 21 | Precision score 22 | 23 | recall: float 24 | Recall score 25 | 26 | f1: float 27 | F1 score 28 | """ 29 | pred = {(l, r) for l, r in pred_joins} 30 | gt = {(l, r) for l, r in gt_joins} 31 | tp = pred.intersection(gt) 32 | 33 | precision = len(tp) / len(pred) 34 | recall = len(tp) / len(gt) 35 | f1 = 2 * precision * recall / (precision + recall) 36 | return precision, recall, f1 37 | 38 | def test_autofj(dataset): 39 | left, right, gt = load_data(dataset) 40 | autofj = AutoFJ(verbose=True) 41 | LR_joins = autofj.join(left, right, id_column="id") 42 | 43 | print(LR_joins) 44 | gt_joins = gt[["id_l", "id_r"]].values 45 | LR_joins = LR_joins[["id_l", "id_r"]].values 46 | p, r, f1 = evaluate(LR_joins, gt_joins) 47 | print("Precision:", p, "Recall:", r, "F1:", f1) 48 | 49 | if __name__ == '__main__': 50 | test_autofj("TennisTournament") 51 | 52 | -------------------------------------------------------------------------------- /src/autofj/benchmark/NCAATeamSeason/right.csv: -------------------------------------------------------------------------------- 1 | id,title 2 | 0,1893 LSU football team 3 | 1,1894 LSU football team 4 | 2,1895 LSU football team 5 | 3,1892 Alabama Cadets football team 6 | 4,1894 Alabama Crimson White football team 7 | 5,1893 Alabama Crimson White football team 8 | 6,1895 Alabama Crimson White football team 9 | 7,1898 Alabama Crimson White football team 10 | 8,1897 Alabama Crimson White football team 11 | 9,1896 Alabama Crimson White football team 12 | 10,1899 Alabama Crimson White football team 13 | 11,1900 Alabama Crimson White football team 14 | 12,1901 Alabama Crimson White football team 15 | 13,1902 Alabama Crimson White football team 16 | 14,1903 Alabama Crimson White football team 17 | 15,1904 Alabama Crimson White football team 18 | 16,1906 Alabama Crimson White football team 19 | 17,1905 Alabama Crimson White football team 20 | 18,1906 Arkansas Cardinals football team 21 | 19,2011 Austin Peay Governors football team 22 | 20,1907 Notre Dame football team 23 | 21,2012 Austin Peay Governors football team 24 | 22,1919 Washington Sun Dodgers football team 25 | 23,2013 Hawaii Rainbow Warriors football team 26 | 24,1887 Notre Dame football team 27 | 25,2002 Florida Atlantic Owls baseball team 28 | 26,1974 Oregon Ducks football team 29 | 27,1964 Oregon Ducks football team 30 | 28,1963 Oregon Ducks football team 31 | 29,2010 FIU Golden Panthers football team 32 | 30,1975 Oregon Ducks football team 33 | 31,2011 FIU Golden Panthers football team 34 | 32,1979 UCF Golden Knights football team 35 | 33,2012 FIU Golden Panthers football team 36 | -------------------------------------------------------------------------------- /src/autofj/benchmark/ArtificialSatellite/right.csv: -------------------------------------------------------------------------------- 1 | id,title 2 | 0,Soyuz 7K-T No.39 3 | 1,Pad Abort Test 1 4 | 2,AS-101 (spacecraft) 5 | 3,AS-102 (spacecraft) 6 | 4,Spirit rover 7 | 5,Opportunity rover 8 | 6,UO-11 9 | 7,AS-104 (spacecraft) 10 | 8,AS-103 (spacecraft) 11 | 9,AS-105 (spacecraft) 12 | 10,Pad Abort Test 2 13 | 11,Soyuz 7K-ST No. 16L 14 | 12,SpaceShipOne Flight 17P 15 | 13,Nuclear Spectroscopic Telescope Array 16 | 14,Eutelsat 33C 17 | 15,Seasat 18 | 16,Foton-M No.2 19 | 17,CHAMP (satellite) 20 | 18,Kosmos 605 21 | 19,Kosmos 1667 22 | 20,CUTE-1.7 + APD 23 | 21,Eutelsat 31A 24 | 22,Eutelsat 16B 25 | 23,GEOTAIL 26 | 24,Fengyun 2D 27 | 25,ACRIMSAT 28 | 26,Resurs-DK No.1 29 | 27,HYLAS 30 | 28,Landsat 8 31 | 29,PROBA2 32 | 30,Solwind 33 | 31,UoSAT-1 34 | 32,UoSat-OSCAR 9 35 | 33,ABRIXAS 36 | 34,ABS-3 37 | 35,Sentinel-3 38 | 36,Soil Moisture and Ocean Salinity 39 | 37,MightySat-2.1 40 | 38,Aditya (satellite) 41 | 39,Eutelsat 48D 42 | 40,Eutelsat 48B 43 | 41,Afghansat 1 44 | 42,Hot Bird 13C 45 | 43,AMC-1 46 | 44,Prisma (satellite project) 47 | 45,Eutelsat 4A 48 | 46,CP6 (satellite) 49 | 47,Eutelsat 113 West A 50 | 48,Orion 3 51 | 49,Azerspace 52 | 50,LightSail 2 53 | 51,Eutelsat 33B 54 | 52,Eutelsat 25C 55 | 53,COTS Demo Flight 1 56 | 54,COTS Demo Flight 2 57 | 55,Dragon C3 58 | 56,NEE-01 Pegaso 59 | 57,SES-7 60 | 58,Laplace-P 61 | 59,Intelsat 28 62 | 60,Ziyuan III-01 63 | 61,Venera 3MV-1 No.2 64 | 62,Cygnus Orb-D1 65 | 63,Ekspress AM4 66 | 64,IRNSS-1A 67 | 65,Dragon C4 68 | 66,Telstar 14 69 | 67,USA-242 70 | 68,Mars Orbiter Mission 71 | 69,AIDA (mission) 72 | 70,TDRS-11 73 | 71,Eutelsat 117 West A 74 | -------------------------------------------------------------------------------- /src/autofj/join_function_space/join_function/tokenizer.py: -------------------------------------------------------------------------------- 1 | import ngram 2 | import pandas as pd 3 | import numpy as np 4 | import time 5 | 6 | three_gramer = ngram.NGram(N=3) 7 | 8 | def splitBySpace(x): 9 | return x.split() 10 | 11 | def threeGram(x): 12 | # Replace whitespace more than one by a single blank 13 | return list(three_gramer.split(" ".join(x.split()))) 14 | 15 | class Tokenizer: 16 | """Tokenize data 17 | 18 | Parameters 19 | ---------- 20 | method: string 21 | Tokenization method. The available methods are listed as follows. 22 | - splitBySpace 23 | - threeGram 24 | - None (no tokenization) 25 | """ 26 | def __init__(self, method): 27 | self.method = method 28 | if method is None: 29 | self.func = None 30 | elif method == "splitBySpace": 31 | self.func = splitBySpace 32 | elif method == "threeGram": 33 | self.func = threeGram 34 | else: 35 | raise Exception("{} is an invalid tokenization method" 36 | .format(method)) 37 | 38 | def tokenize(self, X): 39 | """ Tokenize input data 40 | 41 | Parameters 42 | ---------- 43 | X: pd.Series 44 | Input data 45 | """ 46 | if self.func is not None: 47 | X = X.apply(self.func) 48 | return X 49 | 50 | # data = pd.read_csv("../../data/left.csv")["title"] 51 | # X = np.concatenate([data.values for _ in range(20)]) 52 | # X = pd.Series(X) 53 | # 54 | # tokenizer = Tokenizer("threeGram") 55 | # tic = time.time() 56 | # tokenizer.tokenize(X) 57 | # print(time.time() - tic) -------------------------------------------------------------------------------- /src/autofj/benchmark/NaturalEvent/right.csv: -------------------------------------------------------------------------------- 1 | id,title 2 | 0,1988 Armenian earthquake 3 | 1,1935 Quetta earthquake 4 | 2,2006 Yogyakarta earthquake 5 | 3,2006 Pangandaran earthquake and tsunami 6 | 4,2006 Hengchun earthquakes 7 | 5,January 2001 El Salvador earthquake 8 | 6,1509 Constantinople earthquake 9 | 7,1959 Hebgen Lake earthquake 10 | 8,February 1998 Afghanistan earthquake 11 | 9,May 1998 Afghanistan earthquake 12 | 10,2004 Al Hoceima earthquake 13 | 11,1968 Dasht-e Bayaz and Ferdows earthquakes 14 | 12,2000 Enggano earthquake 15 | 13,1980 Oaxaca earthquake 16 | 14,1894 Tokyo earthquake 17 | 15,1953 Ionian earthquake 18 | 16,2009 Papua earthquakes 19 | 17,1929 Kopet Dag earthquake 20 | 18,2008 Qeshm earthquake 21 | 19,1755 Cape Ann earthquake 22 | 20,749 Galilee earthquake 23 | 21,2009 Samoa earthquake and tsunami 24 | 22,1896 Sanriku earthquake 25 | 23,1854 Nankai earthquake 26 | 24,1940 New Hampshire earthquakes 27 | 25,2010 Solomon Islands earthquake 28 | 26,1996 Duvall earthquake 29 | 27,1903 Manzikert earthquake 30 | 28,1653 East Smyrna earthquake 31 | 29,1688 Smyrna earthquake 32 | 30,1855 Edo earthquake 33 | 31,1927 Jericho earthquake 34 | 32,1909 Provence earthquake 35 | 33,869 Sanriku earthquake 36 | 34,2011 Myanmar earthquake 37 | 35,1911 Guerrero earthquake 38 | 36,1611 Sanriku earthquake 39 | 37,1932 Jalisco earthquakes 40 | 38,2012 Afghanistan earthquakes 41 | 39,2012 Yangzhou earthquake 42 | 40,2008 Bandar Abbas earthquake 43 | 41,March 2013 Nantou earthquake 44 | 42,1985 Santiago earthquake 45 | 43,1962 Bou'in-Zahra earthquake 46 | 44,Near East earthquake of 1759 47 | 45,2010 Kaohsiung earthquake 48 | 46,1995 Egypt earthquake 49 | 47,1914 Afyon-Bolvadin earthquake 50 | 48,Great Adelaide Earthquake 51 | 49,847 Antioch earthquake 52 | 50,2012 Indian Ocean earthquake 53 | -------------------------------------------------------------------------------- /src/autofj/benchmark/Enzyme/right.csv: -------------------------------------------------------------------------------- 1 | id,title 2 | 0,Adenylyl cyclase 3 | 1,NADH:ubiquinone reductase (H+-translocating) 4 | 2,Ribonuclease H 5 | 3,Guanylyl cyclase 6 | 4,Bovine pancreatic ribonuclease 7 | 5,Gelatinase A 8 | 6,Carboxypeptidase U 9 | 7,3beta-Hydroxysteroid dehydrogenase 10 | 8,Aspartate kinase 11 | 9,Aralkylamine N-acetyltransferase 12 | 10,CTP synthase 13 | 11,Alpha-Amylase 14 | 12,Carboxypeptidase C 15 | 13,Diamine oxidase 16 | 14,Alpha-N-acetylgalactosaminidase 17 | 15,"1,3-beta-glucan synthase" 18 | 16,4-Hydroxybutyrate dehydrogenase 19 | 17,Sn-glycerol-1-phosphate dehydrogenase 20 | 18,3-Ketosteroid reductase 21 | 19,(iso)eugenol O-methyltransferase 22 | 20,(myelin basic protein)-arginine N-methyltransferase 23 | 21,(ribulose-bisphosphate carboxylase)-lysine N-methyltransferase 24 | 22,(formate-C-acetyltransferase)-activating enzyme 25 | 23,Sulfhydrogenase 26 | 24,4-hydroxyphenylacetate 3-monooxygenase 27 | 25,Desacetoxyvindoline 4-hydroxylase 28 | 26,L-2-hydroxyglutarate dehydrogenase 29 | 27,4-cresol dehydrogenase (hydroxylating) 30 | 28,(methionine synthase) reductase 31 | 29,"5,10-methenyltetrahydromethanopterin hydrogenase" 32 | 30,Chlorite dismutase 33 | 31,NAD(P)+ transhydrogenase (Re/Si-specific) 34 | 32,NAD(P)+ transhydrogenase (Si-specific) 35 | 33,Myosin-light-chain phosphatase 36 | 34,2-hydroxyacylsphingosine 1-beta-galactosyltransferase 37 | 35,(isocitrate dehydrogenase (NADP+)) kinase 38 | 36,TRNA cytidylyltransferase 39 | 37,4-hydroxy-3-methylbut-2-enyl diphosphate reductase 40 | 38,Cyanase 41 | 39,Formylglycine-generating enzyme 42 | 40,Nucleotide pyrophosphatase/phosphodiesterase 43 | 41,IgA specific serine endopeptidase 44 | 42,5-beta-reductase 45 | 43,ALG10 (enzyme class) 46 | 44,ALG8 (enzyme class) 47 | 45,ALG6 (enzyme class) 48 | 46,Methionine transaminase 49 | 47,4-Sulfomuconolactone hydrolase 50 | -------------------------------------------------------------------------------- /src/autofj/benchmark/TennisTournament/gt.csv: -------------------------------------------------------------------------------- 1 | id_l,title_l,id_r,title_r 2 | 4,2000 Tennis Masters Cup,0,2000 Tennis Masters Cup and ATP Tour World Championships 3 | 25,2008 Bell Challenge,1,2008 Challenge Bell 4 | 64,2010 Queen's Club Championships,2,2010 Aegon Championships 5 | 70,2009 Bell Challenge,3,2009 Challenge Bell 6 | 73,1996 Bell Challenge,4,1996 Challenge Bell 7 | 75,1997 Bell Challenge,5,1997 Challenge Bell 8 | 83,1998 Bell Challenge,6,1998 Challenge Bell 9 | 84,2010 Challenger Banque Nationale,7,2010 Challenger Banque Nationale de Rimouski 10 | 115,2009 Carisap Tennis Cup,8,2009 San Benedetto Tennis Cup 11 | 151,2010 Bell Challenge,9,2010 Challenge Bell 12 | 165,2010 Internazionali di Tennis dell'Umbria,10,2010 Blu-express.com Tennis Cup 13 | 183,2011 Men's Rimouski Challenger,11,2011 Challenger Banque Nationale de Rimouski 14 | 187,2011 AEGON Championships,12,2011 Aegon Championships 15 | 188,2011 AEGON Classic,13,2011 Aegon Classic 16 | 200,2011 Bell Challenge,14,2011 Challenge Bell 17 | 202,2011 National Bank Challenger Saguenay,15,2011 Challenger Banque Nationale de Saguenay 18 | 219,2012 Qatar Airways Tournament of Champions,26,2012 Commonwealth Bank Tournament of Champions 19 | 224,2012 Men's Rimouski Challenger,16,2012 Challenger Banque Nationale de Rimouski 20 | 229,2012 AEGON Championships,17,2012 Aegon Championships 21 | 261,2012 Bell Challenge,18,2012 Challenge Bell 22 | 266,2012 Internazionali di Tennis dell'Umbria,19,2012 Blu-express.com Tennis Cup 23 | 267,2012 Hansol Korea Open,20,2012 Korea Open 24 | 269,2012 ATP Challenger Trophy,21,2012 Arimex Challenger Trophy 25 | 284,2012 National Bank Challenger Saguenay,22,2012 Challenger Banque Nationale de Saguenay 26 | 296,2012 AEGON Pro-Series Loughborough,23,2012 Aegon Pro-Series Loughborough 27 | 308,2013 Qatar Airways Tournament of Champions,24,2013 Garanti Koza WTA Tournament of Champions 28 | 319,2013 Challenger Banque Nationale,25,2013 Challenger Banque Nationale de Rimouski 29 | -------------------------------------------------------------------------------- /src/autofj/join_function_space/options.py: -------------------------------------------------------------------------------- 1 | """Options of join functions""" 2 | 3 | autofj_lg = { 4 | "preprocess_methods":["lower", "lowerStem", "lowerRemovePunctuation", 5 | "lowerRemovePunctuationStem"], 6 | "tokenize_methods": ["threeGram", "splitBySpace"], 7 | "token_weights": ["uniformWeight", "idfWeight"], 8 | "char_distance_functions": ["editDistance", "jaroDistance"], 9 | "set_distance_functions": ["containJaccardDistance", 10 | "containCosineDistance", 11 | "containDiceDistance", 12 | "intersectDistance", 13 | "jaccardDistance", 14 | "cosineDistance", 15 | "diceDistance", 16 | "maxincDistance"] 17 | } 18 | 19 | autofj_md = { 20 | "preprocess_methods":["lower", "lowerRemovePunctuationStem"], 21 | "tokenize_methods": ["threeGram", "splitBySpace"], 22 | "token_weights": ["uniformWeight", "idfWeight"], 23 | "char_distance_functions": ["editDistance", "jaroDistance"], 24 | "set_distance_functions": ["containJaccardDistance", 25 | "containCosineDistance", 26 | "containDiceDistance", 27 | "intersectDistance", 28 | "jaccardDistance", 29 | "cosineDistance", 30 | "diceDistance", 31 | "maxincDistance"] 32 | } 33 | 34 | autofj_sm = { 35 | "preprocess_methods":["lower", "lowerRemovePunctuationStem"], 36 | "tokenize_methods": ["threeGram", "splitBySpace"], 37 | "token_weights": ["idfWeight"], 38 | "char_distance_functions": ["jaroDistance"], 39 | "set_distance_functions": ["containCosineDistance", 40 | "jaccardDistance", 41 | "maxincDistance"] 42 | } -------------------------------------------------------------------------------- /src/autofj/benchmark/RugbyLeague/right.csv: -------------------------------------------------------------------------------- 1 | id,title 2 | 0,World Rugby Sevens Series 3 | 1,World Club Series 4 | 2,Mitre 10 Cup 5 | 3,Pro12 6 | 4,League Cup (rugby league) 7 | 5,Top League Champions Cup 8 | 6,World Rugby Pacific Nations Cup 9 | 7,Intrust Super Premiership NSW 10 | 8,World Rugby Pacific Challenge 11 | 9,Tom Richards Cup 12 | 10,World Rugby Nations Cup 13 | 11,France Sevens 14 | 12,Charity Shield (NRL) 15 | 13,Caledonia Regional League 16 | 14,Asia Rugby Women's Championship 17 | 15,List of Rugby World Cup finals 18 | 16,Scottish Premiership (rugby) 19 | 17,Asia Rugby Championship 20 | 18,African Development Trophy 21 | 19,Rugby League European Championship B 22 | 20,World Rugby Under 20 Championship 23 | 21,World Rugby Under 20 Trophy 24 | 22,Sri Lanka Sevens 25 | 23,ANZAC Day Cup 26 | 24,Women's Nations Cup (rugby union) 27 | 25,Rugby League European Championship C 28 | 26,Oceania Rugby Cup 29 | 27,GIO Schoolboy Cup 30 | 28,South Premier 31 | 29,National Women's Rugby Championship 32 | 30,2. Rugby-Bundesliga 33 | 31,All Stars match 34 | 32,Asian Women's Sevens Championship 35 | 33,African Women's Sevens Championship 36 | 34,North America and Caribbean Women's Sevens Championship 37 | 35,Oceania Women's Sevens Championship 38 | 36,Women's rugby sevens in South America 39 | 37,Ekstraliga (rugby) 40 | 38,North East Rugby League Premier Division 41 | 39,Division 1-A Rugby 42 | 40,SARU Gold Cup 43 | 41,World Rugby Women's Sevens Series 44 | 42,National Championship of Excellence (Italian premiership) 45 | 43,All Japan University Rugby Championship 46 | 44,Colonial Cup 47 | 45,Pacific Nations Cup 48 | 46,Bundaberg Red Cup 49 | 47,"London, South and East Merit League" 50 | 48,Tonga National Rugby League 51 | 49,Rugby League Conference South Premier 52 | 50,St. Patrick's Day Test 53 | 51,Rugby League Conference North East Division 54 | 52,Rugby League Conference London & South Division 55 | 53,International Origin Match 56 | 54,The Ron Coote Cup 57 | 55,South American Women's Sevens 58 | 56,Asian Rugby Championship 59 | 57,Sevens World Series 60 | -------------------------------------------------------------------------------- /src/autofj/join_function_space/join_function/join_function.py: -------------------------------------------------------------------------------- 1 | class JoinFunction(object): 2 | """Customized join function must have an unique name attribute and a method named 3 | compute_distance as below. The constructor can be overwritten""" 4 | def __init__(self): 5 | self.name = "jf_example" 6 | pass 7 | 8 | def compute_distance(self, left, right, LL_blocked, LR_blocked, 9 | cache_dir=None): 10 | """Compute the distance of each tuple pair in the LL and LR blocked table. 11 | 12 | Parameters 13 | ---------- 14 | left: pd.DataFrame 15 | A subset of the left table that contains the id column and the 16 | column to be processed. The id column is named as autofj_id. 17 | The column to be processed is named as value. 18 | 19 | right: pd.DataFrame 20 | A subset of the right table that contains the id column and the 21 | column to be processed. The id column is named as autofj_id. 22 | The column to be processed is named as value. 23 | 24 | LL_blocked: pd.DataFrame 25 | The LL blocked table that consists of the id columns and 26 | the columns to be processed. The id columns are named as 27 | autofj_id_l and autofj_id_r. The column to be processed is named as 28 | value_l and value_r. 29 | 30 | LR_blocked: pd.DataFrame 31 | The LR blocked table that consists of the id columns and 32 | the columns to be processed. The id columns are named as 33 | autofj_id_l and autofj_id_r. The column to be processed is named as 34 | value_l and value_r. 35 | 36 | Returns 37 | ------- 38 | LL_distance: pd.Series 39 | Distance of each tuple pair in the LL blocked table. 40 | 41 | LR_distance: pd.Series 42 | Distance of each tuple pair in the LR blocked table. 43 | """ 44 | LL_distance = None 45 | LR_distance = None 46 | return LL_distance, LR_distance 47 | -------------------------------------------------------------------------------- /src/autofj/benchmark/SoccerClubSeason/right.csv: -------------------------------------------------------------------------------- 1 | id,title 2 | 0,2006 Australia national soccer team season 3 | 1,2007 Australia national soccer team season 4 | 2,2008 Australia national soccer team season 5 | 3,2005 Australia national soccer team season 6 | 4,2009 Australia national soccer team season 7 | 5,2010 Australia national soccer team season 8 | 6,2003 Cienciano season 9 | 7,2009 Helsingin Jalkapalloklubi season 10 | 8,2010 Jeonbuk Hyundai Motors FC season 11 | 9,2010 Down football season 12 | 10,2011 Down football season 13 | 11,2011 Daejeon Citizen FC season 14 | 12,2011 Australia national soccer team season 15 | 13,2011 Orlando City SC season 16 | 14,2011 Jeonbuk Hyundai Motors FC season 17 | 15,2011 Jeonnam Dragons season 18 | 16,2004 Australia national soccer team season 19 | 17,2011 Incheon United FC season 20 | 18,2003 Australia national soccer team season 21 | 19,2002 Australia national soccer team season 22 | 20,1999 Australia national soccer team season 23 | 21,1998 Australia national soccer team season 24 | 22,2001 Australia national soccer team season 25 | 23,2000 Australia national soccer team season 26 | 24,2012 Down football season 27 | 25,1997 Australia national soccer team season 28 | 26,1996 Australia national soccer team season 29 | 27,2012 Kuala Lumpur FA season 30 | 28,2012 Daejeon Citizen FC season 31 | 29,2012 Orlando City SC season 32 | 30,2012 Australia national soccer team season 33 | 31,2012 Jeonbuk Hyundai Motors FC season 34 | 32,2012 Jeonnam Dragons season 35 | 33,2012 Incheon United FC season 36 | 34,2012 Jeju United FC season 37 | 35,2012 Woodlands Wellington FC season 38 | 36,2011 Woodlands Wellington FC season 39 | 37,2010 Woodlands Wellington FC season 40 | 38,2013 Down football season 41 | 39,2013 Orlando City SC season 42 | 40,2013 Woodlands Wellington FC season 43 | 41,2013 Negeri Sembilan FA season 44 | 42,2009 Down football season 45 | 43,2013 Australia national soccer team season 46 | 44,2013 Carolina RailHawks season 47 | 45,2013 Incheon United FC season 48 | 46,Derry football season 2008 49 | 47,Derry football season 2009 50 | 48,Derry football season 2010 51 | 49,2010 Down GAA Senior Football 52 | 50,2011 Down GAA Senior Football 53 | -------------------------------------------------------------------------------- /src/autofj/benchmark/FootballMatch/right.csv: -------------------------------------------------------------------------------- 1 | id,title 2 | 0,Battle of Santiago (1962 FIFA World Cup) 3 | 1,Battle of Berne (1954 FIFA World Cup) 4 | 2,2005 international rules series 5 | 3,Austria v Switzerland (1954 FIFA World Cup) 6 | 4,2001 Germany v England football match 7 | 5,Shamrock Rovers XI v Brazil 8 | 6,2006 International Rules series 9 | 7,2006 international rules series 10 | 8,Poland v Brazil (1938 FIFA World Cup) 11 | 9,2001 international rules series 12 | 10,1998 international rules series 13 | 11,Austria v West Germany (1978 FIFA World Cup) 14 | 12,2000 England v Germany football match 15 | 13,Battle of Nuremberg (2006 FIFA World Cup) 16 | 14,2008 Conference Premier play-off Final 17 | 15,2008 international rules series 18 | 16,1968 DFB-Pokal Final 19 | 17,1988 DFB-Pokal Final 20 | 18,1999 international rules series 21 | 19,1989 MISL All-Star Game 22 | 20,Argentina 2–1 England (1986 FIFA World Cup) 23 | 21,2009 African Championship of Nations Final 24 | 22,Hungary v El Salvador (1982 FIFA World Cup) 25 | 23,Hungary 10–1 El Salvador (1982) 26 | 24,2009 Conference Premier play-off Final 27 | 25,2009 WPS All-Star Game 28 | 26,1871 Scotland versus England rugby union match 29 | 27,1870–71 Home Nations rugby union matches 30 | 28,2009 Republic of Ireland v France football matches 31 | 29,France 1–1 Ireland (18 November 2009) 32 | 30,1985 China v Hong Kong football match 33 | 31,1876 Scotland v Wales football match 34 | 32,2010 international rules series 35 | 33,1993 Dutch Supercup 36 | 34,2010 Conference Premier play-off Final 37 | 35,2010 WPS All-Star Game 38 | 36,West Germany v France (1982 FIFA World Cup) 39 | 37,2002 international rules series 40 | 38,2010–11 W-League Grand Final 41 | 39,2006 Copa Indonesia final 42 | 40,Brazil v Italy (1982 FIFA World Cup) 43 | 41,1992 Dutch Supercup 44 | 42,1991 Dutch Supercup 45 | 43,2011 Conference Premier play-off Final 46 | 44,2011 All-Ireland Minor Hurling Championship 47 | 45,2011 international rules series 48 | 46,2011 J.League Cup Final 49 | 47,2013 Kenyan Super Cup (pre-season) 50 | 48,2012 Conference Premier play-off Final 51 | 49,2011 UEFA European Under-21 Championship Final 52 | 50,2012 Albanian Supercup 53 | 51,1985 Wales v Scotland football match 54 | 52,Soccer Bowl 2013 55 | -------------------------------------------------------------------------------- /src/autofj/join_function_space/join_function/preprocessor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from nltk.stem.porter import PorterStemmer 3 | from nltk.stem import SnowballStemmer 4 | import time 5 | import pandas as pd 6 | import re 7 | 8 | ps = PorterStemmer() 9 | # ps = SnowballStemmer("english") 10 | 11 | def lower(x): 12 | return str(x).lower() 13 | 14 | def removePunctuation(x): 15 | return re.sub(r'[^\w\s]', '', x) 16 | 17 | def stem(x): 18 | return " ".join([ps.stem(w) for w in x.split()]) 19 | 20 | def lowerStem(x): 21 | x = lower(x) 22 | x = stem(x) 23 | return x 24 | 25 | def lowerRemovePunctuation(x): 26 | x = lower(x) 27 | x = removePunctuation(x) 28 | return x 29 | 30 | def lowerRemovePunctuationStem(x): 31 | x = lower(x) 32 | x = removePunctuation(x) 33 | x = stem(x) 34 | return x 35 | 36 | class Preprocessor: 37 | """Preprocess data 38 | 39 | Parameters 40 | ---------- 41 | method: string 42 | Preprocessing method. The available methods are listed as follows. 43 | - lower: lowercase 44 | - lowerStem: lowercase and stem 45 | - lowerRemovePunctuation: lowercase and remove punctuation 46 | - lowerRemovePunctuationStem: lowercase, remove punctuation and stem 47 | """ 48 | def __init__(self, method): 49 | self.method = method 50 | if method == "lower": 51 | self.func = lower 52 | elif method == "lowerStem": 53 | self.func = lowerStem 54 | elif method == "lowerRemovePunctuation": 55 | self.func = lowerRemovePunctuation 56 | elif method == "lowerRemovePunctuationStem": 57 | self.func = lowerRemovePunctuationStem 58 | else: 59 | raise Exception("{} is an invalid preprocessing method" 60 | .format(method)) 61 | def preprocess(self, X): 62 | """ Preprocess the given data 63 | 64 | Parameters 65 | ---------- 66 | X: pd.Series 67 | Input data 68 | """ 69 | X = X.apply(self.func) 70 | return X 71 | 72 | # data = pd.read_csv("../../data/left.csv")["title"] 73 | # X = np.concatenate([data.values for _ in range(20)]) 74 | # X = pd.Series(X) 75 | 76 | # pre1 = Preprocessor("lowerRemovePunctuationStem") 77 | # tic = time.time() 78 | # pre1.preprocess(X) 79 | # print(time.time() - tic) 80 | 81 | # pre2 = OldPreprocess(X) 82 | # tic = time.time() 83 | # pre2.process(("lower", "remove_punctuation", "stem")) 84 | # print(time.time() - tic) 85 | 86 | -------------------------------------------------------------------------------- /src/autofj/50-single-column-datasets.md: -------------------------------------------------------------------------------- 1 | |Dataset |Left|Right|Matches| 2 | |----------------------------|----|-----|-------| 3 | |Amphibian |3663|1161 |1161 | 4 | |ArtificialSatellite |1801|72 |72 | 5 | |Artwork |3112|245 |245 | 6 | |Award |3380|384 |384 | 7 | |BasketballTeam |928 |166 |166 | 8 | |Case |2474|380 |380 | 9 | |ChristianBishop |5363|494 |494 | 10 | |ClericalAdministrativeRegion|2547|190 |190 | 11 | |Country |2791|291 |291 | 12 | |Device |6933|658 |658 | 13 | |Drug |5356|157 |157 | 14 | |Election |6565|727 |727 | 15 | |Enzyme |3917|48 |48 | 16 | |EthnicGroup |4317|946 |946 | 17 | |FootballLeagueSeason |4457|280 |280 | 18 | |FootballMatch |1999|53 |53 | 19 | |Galaxy |555 |17 |17 | 20 | |GivenName |3021|154 |154 | 21 | |GovernmentAgency |3977|571 |571 | 22 | |HistoricBuilding |5064|512 |512 | 23 | |Hospital |2424|257 |257 | 24 | |Legislature |1314|216 |216 | 25 | |Magazine |4005|274 |274 | 26 | |MemberOfParliament |5774|503 |503 | 27 | |Monarch |2033|242 |242 | 28 | |MotorsportSeason |1465|388 |388 | 29 | |Museum |3982|305 |305 | 30 | |NCAATeamSeason |5619|34 |34 | 31 | |NationalFootballLeagueSeason|3003|10 |10 | 32 | |NaturalEvent |970 |51 |51 | 33 | |Noble |3609|364 |364 | 34 | |PoliticalParty |5254|495 |495 | 35 | |Race |2382|175 |175 | 36 | |RailwayLine |2189|298 |298 | 37 | |Reptile |666 |819 |562 | 38 | |RugbyLeague |418 |58 |58 | 39 | |ShoppingMall |201 |227 |159 | 40 | |SoccerClubSeason |1197|51 |51 | 41 | |SoccerLeague |1315|238 |238 | 42 | |SoccerTournament |2714|290 |290 | 43 | |Song |5726|440 |440 | 44 | |SportFacility |6392|672 |672 | 45 | |SportsLeague |3106|481 |481 | 46 | |Stadium |5105|619 |619 | 47 | |TelevisionStation |6752|1152 |1152 | 48 | |TennisTournament |324 |27 |27 | 49 | |Tournament |4858|459 |459 | 50 | |UnitOfWork |2483|380 |380 | 51 | |Venue |4079|384 |384 | 52 | |Wrestler |3150|464 |464 | -------------------------------------------------------------------------------- /src/autofj/join_function_space/join_function/token_weight.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import math 3 | import pandas as pd 4 | import time 5 | import numpy as np 6 | 7 | class defaultdict(dict): 8 | def set_default_value(self, default_value): 9 | self.default_value = default_value 10 | 11 | def __missing__(self, key): 12 | self[key] = self.default_value 13 | return self.default_value 14 | 15 | def uniformWeight(document): 16 | """Uniform weight""" 17 | weight = defaultdict() 18 | weight.set_default_value(1) 19 | return weight 20 | 21 | def idfWeight(document): 22 | """Compute idf weight for tokens 23 | 24 | Parameters: 25 | ----------- 26 | document: list of sets 27 | A list of token sets, which is the document on which the idf is 28 | computed. 29 | 30 | Return: 31 | ------- 32 | weight: dict 33 | idf weight of tokens 34 | """ 35 | token_count = collections.defaultdict(set) 36 | 37 | for i, row in enumerate(document): 38 | for token in row: 39 | token_count[token].add(i) 40 | 41 | # calculate idf value 42 | weight = defaultdict() 43 | weight.set_default_value(math.log(len(document))) 44 | 45 | for k, v in token_count.items(): 46 | weight[k] = math.log(len(document) / (len(v) + 1)) 47 | return weight 48 | 49 | class TokenWeight(object): 50 | """Token weight 51 | 52 | Parameters 53 | ---------- 54 | method: string 55 | Token weighting schema. The available methods are listed as follows. 56 | - uniformWight 57 | - idfWeight 58 | - None (no weights) 59 | """ 60 | def __init__(self, method): 61 | self.method = method 62 | if method is None: 63 | self.func = None 64 | elif method == "uniformWeight": 65 | self.func = uniformWeight 66 | elif method == "idfWeight": 67 | self.func = idfWeight 68 | else: 69 | raise Exception("{} is an invalid weighting schema" 70 | .format(method)) 71 | 72 | def weight(self, X): 73 | """ Weight tokens 74 | 75 | Parameters 76 | ---------- 77 | X: pd.Series 78 | Input data 79 | 80 | Return 81 | ------ 82 | weight: dict 83 | weight of tokens 84 | """ 85 | if self.func is not None: 86 | weight = self.func(X) 87 | return weight 88 | else: 89 | return None 90 | # 91 | # data = pd.read_csv("../../data/left.csv")["title"] 92 | # X = np.concatenate([data.values for _ in range(20)]) 93 | # X = pd.Series(X) 94 | # 95 | # weight = TokenWeight("idfWeight") 96 | # tic = time.time() 97 | # weight.weight(X) 98 | # print(time.time() - tic) 99 | -------------------------------------------------------------------------------- /src/autofj/benchmark/ArtificialSatellite/gt.csv: -------------------------------------------------------------------------------- 1 | id_l,title_l,id_r,title_r 2 | 119,Soyuz 18a,0,Soyuz 7K-T No.39 3 | 200,Pad Abort Test-1 (Apollo),1,Pad Abort Test 1 4 | 201,A-101 (spacecraft),2,AS-101 (spacecraft) 5 | 202,A-102 (spacecraft),3,AS-102 (spacecraft) 6 | 219,Spirit (rover),4,Spirit rover 7 | 220,Opportunity (rover),5,Opportunity rover 8 | 302,UoSAT-2,6,UO-11 9 | 374,A-104 (spacecraft),7,AS-104 (spacecraft) 10 | 375,A-103 (spacecraft),8,AS-103 (spacecraft) 11 | 376,A-105 (spacecraft),9,AS-105 (spacecraft) 12 | 378,Pad Abort Test-2 (Apollo),10,Pad Abort Test 2 13 | 410,Soyuz T-10-1,11,Soyuz 7K-ST No. 16L 14 | 450,SpaceShipOne flight 17P,12,SpaceShipOne Flight 17P 15 | 527,NuSTAR,13,Nuclear Spectroscopic Telescope Array 16 | 530,Eutelsat 28A,14,Eutelsat 33C 17 | 542,SEASAT,15,Seasat 18 | 544,Foton-M2,16,Foton-M No.2 19 | 584,CHAMP,17,CHAMP (satellite) 20 | 607,Bion 1,18,Kosmos 605 21 | 608,Bion 7,19,Kosmos 1667 22 | 619,CUTE-1.7,20,CUTE-1.7 + APD 23 | 622,Eutelsat 33A,21,Eutelsat 31A 24 | 640,Nilesat 103,22,Eutelsat 16B 25 | 669,Geotail,23,GEOTAIL 26 | 688,Fengyun 2-05,24,Fengyun 2D 27 | 716,Active Cavity Radiometer Irradiance Monitor Satellite,25,ACRIMSAT 28 | 733,Resurs-DK1,26,Resurs-DK No.1 29 | 759,HYLAS-1,27,HYLAS 30 | 783,Landsat Data Continuity Mission,28,Landsat 8 31 | 786,Proba-2,29,PROBA2 32 | 796,P78-1,30,Solwind 33 | 798,UoSat-1,31,UoSAT-1 34 | 798,UoSat-1,32,UoSat-OSCAR 9 35 | 804,A Broadband Imaging X-ray All-sky Survey,33,ABRIXAS 36 | 831,Agila 2,34,ABS-3 37 | 832,Sentinel 3,35,Sentinel-3 38 | 840,Soil Moisture and Ocean Salinity satellite,36,Soil Moisture and Ocean Salinity 39 | 854,MightySat-2,37,MightySat-2.1 40 | 886,Aditya (spacecraft),38,Aditya (satellite) 41 | 898,Eutelsat 28B,39,Eutelsat 48D 42 | 898,Eutelsat 28B,40,Eutelsat 48B 43 | 898,Eutelsat 28B,41,Afghansat 1 44 | 899,Hot Bird 9,42,Hot Bird 13C 45 | 911,AMC 1,43,AMC-1 46 | 924,Prisma,44,Prisma (satellite project) 47 | 953,Eurobird 4A,45,Eutelsat 4A 48 | 959,CP-6,46,CP6 (satellite) 49 | 1089,Satmex 6,47,Eutelsat 113 West A 50 | 1093,Orion 3 (satellite),48,Orion 3 51 | 1219,Azerspace-1/Africasat-1a,49,Azerspace 52 | 1274,LightSail-1,50,LightSail 2 53 | 1277,Eutelsat 70A,51,Eutelsat 33B 54 | 1277,Eutelsat 70A,52,Eutelsat 25C 55 | 1337,SpaceX COTS Demo Flight 1,53,COTS Demo Flight 1 56 | 1370,Dragon C2+,54,COTS Demo Flight 2 57 | 1371,SpaceX CRS-1,55,Dragon C3 58 | 1415,NEE-01 Pegasus,56,NEE-01 Pegaso 59 | 1421,SES7,57,SES-7 60 | 1423,Europa Lander,58,Laplace-P 61 | 1460,New Dawn (satellite),59,Intelsat 28 62 | 1470,Ziyuan 3,60,Ziyuan III-01 63 | 1472,Zond 3MV-1 No.2,61,Venera 3MV-1 No.2 64 | 1490,Cygnus 1,62,Cygnus Orb-D1 65 | 1491,Ekspress-AM4,63,Ekspress AM4 66 | 1495,IRNSS-1,64,IRNSS-1A 67 | 1575,SpaceX CRS-2,65,Dragon C4 68 | 1577,Estrela do Sul 1,66,Telstar 14 69 | 1646,GPS IIF-4,67,USA-242 70 | 1699,Mangalyaan,68,Mars Orbiter Mission 71 | 1703,AIDA (spacecraft),69,AIDA (mission) 72 | 1774,TDRS-K,70,TDRS-11 73 | 1798,Satmex 8,71,Eutelsat 117 West A 74 | -------------------------------------------------------------------------------- /src/autofj/benchmark/NCAATeamSeason/gt.csv: -------------------------------------------------------------------------------- 1 | id_l,title_l,id_r,title_r 2 | 24,1893 LSU Tigers football team,0,1893 LSU football team 3 | 25,1894 LSU Tigers football team,1,1894 LSU football team 4 | 26,1895 LSU Tigers football team,2,1895 LSU football team 5 | 380,2002 Florida Atlantic Blue Wave baseball team,25,2002 Florida Atlantic Owls baseball team 6 | 450,1892 Alabama Crimson Tide football team,3,1892 Alabama Cadets football team 7 | 451,1894 Alabama Crimson Tide football team,4,1894 Alabama Crimson White football team 8 | 452,1893 Alabama Crimson Tide football team,5,1893 Alabama Crimson White football team 9 | 453,1895 Alabama Crimson Tide football team,6,1895 Alabama Crimson White football team 10 | 454,1898 Alabama Crimson Tide football team,7,1898 Alabama Crimson White football team 11 | 455,1897 Alabama Crimson Tide football team,8,1897 Alabama Crimson White football team 12 | 456,1896 Alabama Crimson Tide football team,9,1896 Alabama Crimson White football team 13 | 457,1899 Alabama Crimson Tide football team,10,1899 Alabama Crimson White football team 14 | 458,1900 Alabama Crimson Tide football team,11,1900 Alabama Crimson White football team 15 | 459,1901 Alabama Crimson Tide football team,12,1901 Alabama Crimson White football team 16 | 460,1902 Alabama Crimson Tide football team,13,1902 Alabama Crimson White football team 17 | 461,1903 Alabama Crimson Tide football team,14,1903 Alabama Crimson White football team 18 | 462,1904 Alabama Crimson Tide football team,15,1904 Alabama Crimson White football team 19 | 463,1906 Alabama Crimson Tide football team,16,1906 Alabama Crimson White football team 20 | 464,1905 Alabama Crimson Tide football team,17,1905 Alabama Crimson White football team 21 | 482,1974 Oregon Webfoots football team,26,1974 Oregon Ducks football team 22 | 496,1964 Oregon Webfoots football team,27,1964 Oregon Ducks football team 23 | 964,1906 Arkansas Razorbacks football team,18,1906 Arkansas Cardinals football team 24 | 1712,1963 Oregon Webfoots football team,28,1963 Oregon Ducks football team 25 | 2880,2010 FIU Panthers football team,29,2010 FIU Golden Panthers football team 26 | 2978,1975 Oregon Webfoots football team,30,1975 Oregon Ducks football team 27 | 3227,2011 FIU Panthers football team,31,2011 FIU Golden Panthers football team 28 | 3335,2011 Austin Peay State Governors football team,19,2011 Austin Peay Governors football team 29 | 3389,1907 Notre Dame Fighting Irish football team,20,1907 Notre Dame football team 30 | 3843,1979 UCF Knights football team,32,1979 UCF Golden Knights football team 31 | 4222,2012 FIU Panthers football team,33,2012 FIU Golden Panthers football team 32 | 4301,2012 Austin Peay State Governors football team,21,2012 Austin Peay Governors football team 33 | 5402,1919 Washington football team,22,1919 Washington Sun Dodgers football team 34 | 5480,2013 Hawaii Warriors football team,23,2013 Hawaii Rainbow Warriors football team 35 | 5496,1887 Notre Dame Fighting Irish football team,24,1887 Notre Dame football team 36 | -------------------------------------------------------------------------------- /src/autofj/benchmark/NaturalEvent/gt.csv: -------------------------------------------------------------------------------- 1 | id_l,title_l,id_r,title_r 2 | 21,1988 Spitak earthquake,0,1988 Armenian earthquake 3 | 44,1935 Balochistan earthquake,1,1935 Quetta earthquake 4 | 54,May 2006 Java earthquake,2,2006 Yogyakarta earthquake 5 | 59,July 2006 Java earthquake,3,2006 Pangandaran earthquake and tsunami 6 | 79,2006 Hengchun earthquake,4,2006 Hengchun earthquakes 7 | 91,2001 El Salvador earthquakes,5,January 2001 El Salvador earthquake 8 | 103,1509 Istanbul earthquake,6,1509 Constantinople earthquake 9 | 108,1959 Yellowstone earthquake,7,1959 Hebgen Lake earthquake 10 | 116,1985 Algarrobo earthquake,42,1985 Santiago earthquake 11 | 134,"February 4, 1998 Afghanistan earthquake",8,February 1998 Afghanistan earthquake 12 | 135,"May 30, 1998 Afghanistan earthquake",9,May 1998 Afghanistan earthquake 13 | 142,2004 Morocco earthquake,10,2004 Al Hoceima earthquake 14 | 153,1968 Dasht-e Bayaz and Ferdows earthquake,11,1968 Dasht-e Bayaz and Ferdows earthquakes 15 | 173,2000 Sumatra earthquake,12,2000 Enggano earthquake 16 | 194,1980 Central Mexico earthquake,13,1980 Oaxaca earthquake 17 | 197,1894 Meiji Tokyo earthquake,14,1894 Tokyo earthquake 18 | 207,1953 Ionian Earthquake,15,1953 Ionian earthquake 19 | 215,2009 Papua earthquake,16,2009 Papua earthquakes 20 | 219,1929 Koppeh Dagh earthquake,17,1929 Kopet Dag earthquake 21 | 225,1962 Buin Zahra earthquake,43,1962 Bou'in-Zahra earthquake 22 | 240,2008 Bandar Abbas earthquake,18,2008 Qeshm earthquake 23 | 250,Near East earthquakes of 1759,44,Near East earthquake of 1759 24 | 256,1755 Cape Ann Earthquake,19,1755 Cape Ann earthquake 25 | 282,The Seventh Earthquake,20,749 Galilee earthquake 26 | 284,2009 Samoa earthquake,21,2009 Samoa earthquake and tsunami 27 | 294,1896 Meiji-Sanriku earthquake,22,1896 Sanriku earthquake 28 | 368,1854 Ansei-Nankai earthquake,23,1854 Nankai earthquake 29 | 539,1940 New Hampshire earthquake,24,1940 New Hampshire earthquakes 30 | 677,January 2010 Solomon Islands earthquake,25,2010 Solomon Islands earthquake 31 | 699,Duvall earthquake,26,1996 Duvall earthquake 32 | 705,2010 Kaohsiung earthquakes,45,2010 Kaohsiung earthquake 33 | 719,1903 Malazgirt earthquake,27,1903 Manzikert earthquake 34 | 735,1995 Gulf of Aqaba earthquake,46,1995 Egypt earthquake 35 | 748,1914 Burdur earthquake,47,1914 Afyon-Bolvadin earthquake 36 | 750,1653 East Symirna earthquake,28,1653 East Smyrna earthquake 37 | 751,1688 Izmir earthquake,29,1688 Smyrna earthquake 38 | 759,1855 Ansei Edo earthquake,30,1855 Edo earthquake 39 | 778,1927 earthquake in Palestine,31,1927 Jericho earthquake 40 | 810,1909 Lambesc earthquake,32,1909 Provence earthquake 41 | 829,869 Jogan Sanriku earthquake,33,869 Sanriku earthquake 42 | 832,2011 Burma earthquake,34,2011 Myanmar earthquake 43 | 844,1954 Adelaide earthquake,48,Great Adelaide Earthquake 44 | 867,December 1911 Guerrero earthquake,35,1911 Guerrero earthquake 45 | 887,1611 Keicho Sanriku earthquake,36,1611 Sanriku earthquake 46 | 894,1932 Jalisco earthquake,37,1932 Jalisco earthquakes 47 | 899,847 Damascus earthquake,49,847 Antioch earthquake 48 | 915,2012 Indian Ocean earthquakes,50,2012 Indian Ocean earthquake 49 | 924,June 2012 Afghanistan earthquakes,38,2012 Afghanistan earthquakes 50 | 932,2012 Yangzhou Earthquake,39,2012 Yangzhou earthquake 51 | 962,2008 Qeshm earthquake,40,2008 Bandar Abbas earthquake 52 | 969,2013 Nantou earthquake,41,March 2013 Nantou earthquake 53 | -------------------------------------------------------------------------------- /src/autofj/benchmark/GivenName/right.csv: -------------------------------------------------------------------------------- 1 | id,title 2 | 0,Luke (name) 3 | 1,Julia 4 | 2,Fatima (name) 5 | 3,Fanny (name) 6 | 4,Lyfing 7 | 5,Adele 8 | 6,Jeffrey (name) 9 | 7,Kathryn 10 | 8,Cadwallader 11 | 9,Gita (given name) 12 | 10,Banu 13 | 11,Casper 14 | 12,Hana (name) 15 | 13,Abd al-Aziz 16 | 14,Guillaume 17 | 15,Dustin (name) 18 | 16,Shahrokh (mythical bird) 19 | 17,William (given name) 20 | 18,Arif (given name) 21 | 19,Corinne 22 | 20,Haruna (given name) 23 | 21,Brianna 24 | 22,Takeru (name) 25 | 23,Yasmin (given name) 26 | 24,Calvin (name) 27 | 25,Alexis 28 | 26,Hjalmar (disambiguation) 29 | 27,Rei (given name) 30 | 28,Asad (name) 31 | 29,Mervin 32 | 30,Margaret 33 | 31,Aurora (given name) 34 | 32,Michelle (name) 35 | 33,Jahsh 36 | 34,Mark (name) 37 | 35,Sujata (name) 38 | 36,Anupama (given name) 39 | 37,Paulina (given name) 40 | 38,Padraic 41 | 39,Bojan 42 | 40,Chetan (name) 43 | 41,Juanfran (disambiguation) 44 | 42,Gun (Swedish name) 45 | 43,Dalia (given name) 46 | 44,Travis 47 | 45,Merwin 48 | 46,Daniel 49 | 47,Arun (given name) 50 | 48,Michele 51 | 49,Zakiah 52 | 50,Parvati (given name) 53 | 51,Medad 54 | 52,Joseph 55 | 53,Joseph (name) 56 | 54,Aida (name) 57 | 55,Edwina 58 | 56,Zvonimir 59 | 57,Annetta (given name) 60 | 58,Asa (name) 61 | 59,Hannu (disambiguation) 62 | 60,Hayley (given name) 63 | 61,Bauyrzhan 64 | 62,Abd al-Rahman 65 | 63,Bram (given name) 66 | 64,Hannah (name) 67 | 65,Katherine 68 | 66,Hideyoshi (disambiguation) 69 | 67,Alexandru 70 | 68,Lee (English given name) 71 | 69,Leonard 72 | 70,Vivian (given name) 73 | 71,Marvin (given name) 74 | 72,Faiz (disambiguation) 75 | 73,Anthony (name) 76 | 74,Stanley (name) 77 | 75,Cory 78 | 76,Gerard 79 | 77,Lindita (given name) 80 | 78,Sophie (given name) 81 | 79,Leonie 82 | 80,Raizo 83 | 81,Raffaello (disambiguation) 84 | 82,Sorin (given name) 85 | 83,Ljubica 86 | 84,Ludovica 87 | 85,Alessia 88 | 86,Coralie 89 | 87,Navneet 90 | 88,Aisling (name) 91 | 89,Nelofar 92 | 90,Miley (given name) 93 | 91,Mary (name) 94 | 92,Madeleine (name) 95 | 93,Schuyler (name) 96 | 94,Milica 97 | 95,Laimonis 98 | 96,Uldis 99 | 97,Dzintars 100 | 98,Modris 101 | 99,Gatis 102 | 100,Indulis 103 | 101,Priya (given name) 104 | 102,Maki (name) 105 | 103,Junichi 106 | 104,Alison (given name) 107 | 105,Kanye (name) 108 | 106,Kaj 109 | 107,Tawfik 110 | 108,Kayo (name) 111 | 109,Pia (given name) 112 | 110,Hristo 113 | 111,Heather (given name) 114 | 112,Lubomir 115 | 113,Ctirad 116 | 114,Tom (given name) 117 | 115,Aileen 118 | 116,Bita (Persian) 119 | 117,Christopher 120 | 118,Abid (name) 121 | 119,Brooke (name) 122 | 120,Mira (given name) 123 | 121,Parisa (disambiguation) 124 | 122,Tuukka 125 | 123,Soo-young (name) 126 | 124,Roosevelt (name) 127 | 125,Hamad 128 | 126,Ziemowit 129 | 127,Graciela (disambiguation) 130 | 128,Dobroslav 131 | 129,Tess (given name) 132 | 130,Ellen 133 | 131,Kalina (name) 134 | 132,Kaede 135 | 133,Ranald 136 | 134,Sandy (given name) 137 | 135,Maytham 138 | 136,Damayanti (disambiguation) 139 | 137,Faith (name) 140 | 138,Llewellyn (name) 141 | 139,Veronica (name) 142 | 140,Amos (name) 143 | 141,Bora (Turkish name) 144 | 142,Oladapo 145 | 143,Jerry (given name) 146 | 144,Sania (disambiguation) 147 | 145,Domagoj (given name) 148 | 146,Okonma 149 | 147,Keon 150 | 148,Enn 151 | 149,Haruchika (given name) 152 | 150,Farah (name) 153 | 151,Harutyun 154 | 152,Sora (Japanese given name) 155 | 153,Verity 156 | -------------------------------------------------------------------------------- /src/autofj/benchmark/Enzyme/gt.csv: -------------------------------------------------------------------------------- 1 | id_l,title_l,id_r,title_r 2 | 0,Adenylate cyclase,0,Adenylyl cyclase 3 | 15,NADH dehydrogenase (ubiquinone),1,NADH:ubiquinone reductase (H+-translocating) 4 | 42,RNase H,2,Ribonuclease H 5 | 49,Guanylate cyclase,3,Guanylyl cyclase 6 | 151,Ribonuclease A,4,Bovine pancreatic ribonuclease 7 | 153,Gelatinase a,5,Gelatinase A 8 | 189,Carboxypeptidase u,6,Carboxypeptidase U 9 | 234,3-beta-HSD,7,3beta-Hydroxysteroid dehydrogenase 10 | 236,Aspartokinase,8,Aspartate kinase 11 | 261,Serotonin N-acetyltransferase,9,Aralkylamine N-acetyltransferase 12 | 262,CTP synthetase,10,CTP synthase 13 | 328,Alpha-amylase,11,Alpha-Amylase 14 | 333,Carboxypeptidase c,12,Carboxypeptidase C 15 | 350,Amine oxidase,13,Diamine oxidase 16 | 365,A-N-acetylgalactosaminidase,14,Alpha-N-acetylgalactosaminidase 17 | 376,"1,3-Beta-glucan synthase",15,"1,3-beta-glucan synthase" 18 | 380,4-hydroxybutyrate dehydrogenase,16,4-Hydroxybutyrate dehydrogenase 19 | 469,Glycerol-1-phosphate dehydrogenase (NAD(P)+),17,Sn-glycerol-1-phosphate dehydrogenase 20 | 610,3-keto-steroid reductase,18,3-Ketosteroid reductase 21 | 684,(Iso)eugenol O-methyltransferase,19,(iso)eugenol O-methyltransferase 22 | 702,(Myelin basic protein)-arginine N-methyltransferase,20,(myelin basic protein)-arginine N-methyltransferase 23 | 723,(Ribulose-bisphosphate carboxylase)-lysine N-methyltransferase,21,(ribulose-bisphosphate carboxylase)-lysine N-methyltransferase 24 | 855,(Formate-C-acetyltransferase)-activating enzyme,22,(formate-C-acetyltransferase)-activating enzyme 25 | 858,Sulfur reductase,23,Sulfhydrogenase 26 | 898,4-Hydroxyphenylacetate 3-monooxygenase,24,4-hydroxyphenylacetate 3-monooxygenase 27 | 945,Deacetoxyvindoline 4-hydroxylase,25,Desacetoxyvindoline 4-hydroxylase 28 | 1043,2-hydroxyglutarate dehydrogenase,26,L-2-hydroxyglutarate dehydrogenase 29 | 1185,4-Cresol dehydrogenase (hydroxylating),27,4-cresol dehydrogenase (hydroxylating) 30 | 1199,(Methionine synthase) reductase,28,(methionine synthase) reductase 31 | 1220,"5,10-Methenyltetrahydromethanopterin hydrogenase",29,"5,10-methenyltetrahydromethanopterin hydrogenase" 32 | 1233,Chlorite O2-lyase,30,Chlorite dismutase 33 | 1363,NAD(P)+ transhydrogenase (AB-specific),31,NAD(P)+ transhydrogenase (Re/Si-specific) 34 | 1364,NAD(P)+ transhydrogenase (B-specific),32,NAD(P)+ transhydrogenase (Si-specific) 35 | 2192,(myosin-light-chain) phosphatase,33,Myosin-light-chain phosphatase 36 | 2728,2-Hydroxyacylsphingosine 1-beta-galactosyltransferase,34,2-hydroxyacylsphingosine 1-beta-galactosyltransferase 37 | 3031,Isocitrate dehydrogenase (NADP+) kinase,35,(isocitrate dehydrogenase (NADP+)) kinase 38 | 3118,CCA tRNA nucleotidyltransferase,36,TRNA cytidylyltransferase 39 | 3151,4-Hydroxy-3-methylbut-2-enyl diphosphate reductase,37,4-hydroxy-3-methylbut-2-enyl diphosphate reductase 40 | 3179,Cyanate hydratase,38,Cyanase 41 | 3180,Formylglycine-generating sulfatase enzyme,39,Formylglycine-generating enzyme 42 | 3185,Nucleotide Pyrophosphatase/Phosphodiesterase (NPP),40,Nucleotide pyrophosphatase/phosphodiesterase 43 | 3188,IgA protease,41,IgA specific serine endopeptidase 44 | 3286,Cortisone b-reductase,42,5-beta-reductase 45 | 3736,"Dolichyl-P-Glc:Glc2Man9GlcNAc2-PP-dolichol alpha-1,2-glucosyltransferase",43,ALG10 (enzyme class) 46 | 3745,"Dolichyl-P-Glc:Glc1Man9GlcNAc2-PP-dolichol alpha-1,3-glucosyltransferase",44,ALG8 (enzyme class) 47 | 3747,"Dolichyl-P-Glc:Man9GlcNAc2-PP-dolichol alpha-1,3-glucosyltransferase",45,ALG6 (enzyme class) 48 | 3828,Metionin transaminase,46,Methionine transaminase 49 | 3915,4-sulfomuconolactone hydrolase,47,4-Sulfomuconolactone hydrolase 50 | -------------------------------------------------------------------------------- /src/autofj/benchmark/RugbyLeague/gt.csv: -------------------------------------------------------------------------------- 1 | id_l,title_l,id_r,title_r 2 | 5,IRB Sevens World Series,0,World Rugby Sevens Series 3 | 5,IRB Sevens World Series,57,Sevens World Series 4 | 13,World Club Challenge,1,World Club Series 5 | 19,ITM Cup,2,Mitre 10 Cup 6 | 21,Pro 12,3,Pro12 7 | 25,National Championship of Excellence,42,National Championship of Excellence (Italian premiership) 8 | 29,Regal Trophy,4,League Cup (rugby league) 9 | 34,Microsoft Cup,5,Top League Champions Cup 10 | 35,All-Japan University Rugby Championship,43,All Japan University Rugby Championship 11 | 48,Colonial Cup (rugby union),44,Colonial Cup 12 | 59,IRB Pacific Nations Cup,6,World Rugby Pacific Nations Cup 13 | 59,IRB Pacific Nations Cup,45,Pacific Nations Cup 14 | 60,Ron Massey Cup,46,Bundaberg Red Cup 15 | 62,New South Wales Cup,7,Intrust Super Premiership NSW 16 | 66,London and South East Merit League,47,"London, South and East Merit League" 17 | 79,Pacific Rugby Cup,8,World Rugby Pacific Challenge 18 | 85,Tom Richards Trophy,9,Tom Richards Cup 19 | 88,IRB Nations Cup,10,World Rugby Nations Cup 20 | 98,Paris Sevens,11,France Sevens 21 | 119,Rugby League Charity Shield (Australia),12,Charity Shield (NRL) 22 | 139,Caledonia Regional League (rugby union),13,Caledonia Regional League 23 | 143,Tongan National Rugby League,48,Tonga National Rugby League 24 | 150,ARFU Women's Rugby Championship,14,Asia Rugby Women's Championship 25 | 152,Rugby World Cup Final,15,List of Rugby World Cup finals 26 | 153,Scottish Premiership,16,Scottish Premiership (rugby) 27 | 163,Asian Five Nations,17,Asia Rugby Championship 28 | 163,Asian Five Nations,56,Asian Rugby Championship 29 | 166,CAR Development Trophy,18,African Development Trophy 30 | 177,Rugby League European Shield,19,Rugby League European Championship B 31 | 181,IRB Junior World Championship,20,World Rugby Under 20 Championship 32 | 183,IRB Junior World Rugby Trophy,21,World Rugby Under 20 Trophy 33 | 188,Singer Sri Lankan Airlines Rugby 7's,22,Sri Lanka Sevens 34 | 190,Club ANZAC Game,23,ANZAC Day Cup 35 | 193,Nations Cup (women's rugby union),24,Women's Nations Cup (rugby union) 36 | 203,Rugby League European Bowl,25,Rugby League European Championship C 37 | 206,FORU Oceania Cup,26,Oceania Rugby Cup 38 | 212,ARL Schoolboy Cup,27,GIO Schoolboy Cup 39 | 213,South Premier (rugby league),28,South Premier 40 | 213,South Premier (rugby league),49,Rugby League Conference South Premier 41 | 233,National Women's Championship,29,National Women's Rugby Championship 42 | 235,2nd Rugby-Bundesliga,30,2. Rugby-Bundesliga 43 | 267,All Stars Match,31,All Stars match 44 | 308,Asian Women's Sevens,32,Asian Women's Sevens Championship 45 | 309,African Women's Sevens,33,African Women's Sevens Championship 46 | 311,Caribbean Women's Sevens Championship,34,North America and Caribbean Women's Sevens Championship 47 | 312,Pacific Women's Sevens Championship,35,Oceania Women's Sevens Championship 48 | 313,South American Women's Sevens Championship,36,Women's rugby sevens in South America 49 | 313,South American Women's Sevens Championship,55,South American Women's Sevens 50 | 315,Rugby Ekstraliga,37,Ekstraliga (rugby) 51 | 319,Saint Patrick's Day Test,50,St. Patrick's Day Test 52 | 327,North East Rugby League,38,North East Rugby League Premier Division 53 | 327,North East Rugby League,51,Rugby League Conference North East Division 54 | 329,London & South East Men's League,52,Rugby League Conference London & South Division 55 | 332,International Origin,53,International Origin Match 56 | 336,College Premier Division,39,Division 1-A Rugby 57 | 358,Ron Coote Cup,54,The Ron Coote Cup 58 | 411,SARU Community Cup,40,SARU Gold Cup 59 | 413,IRB Women's Sevens World Series,41,World Rugby Women's Sevens Series 60 | -------------------------------------------------------------------------------- /src/autofj/benchmark/Drug/right.csv: -------------------------------------------------------------------------------- 1 | id,title 2 | 0,Bacillus Calmette-Guérin 3 | 1,"N,N-Dimethyltryptamine" 4 | 2,Hyoscine hydrobromide 5 | 3,Valproate 6 | 4,Interferon beta 1a 7 | 5,4-Androstenedione 8 | 6,Percodan 9 | 7,5-MeO-aMT 10 | 8,FluMist 11 | 9,Tenofovir disoproxil 12 | 10,Ursodeoxycholic acid 13 | 11,Quinacrine 14 | 12,Glycopyrrolate 15 | 13,Neomycin/polymyxin B/bacitracin 16 | 14,Enoxolone 17 | 15,Hyoscine butylbromide 18 | 16,Paromomycin sulfate 19 | 17,Bacitracin/polymyxin B 20 | 18,Ethinylestradiol 21 | 19,Dactinomycin 22 | 20,Interferon beta 1b 23 | 21,Metandienone 24 | 22,Caffeine/ergotamine 25 | 23,Roxatidine acetate 26 | 24,HPV vaccines 27 | 25,Tenoretic 28 | 26,Xyrem 29 | 27,Crotalidae polyvalent immune fab 30 | 28,4-Hydroxyamphetamine 31 | 29,Chlordiazepoxide/clidinium bromide 32 | 30,Umifenovir 33 | 31,Efavirenz/emtricitabine/tenofovir 34 | 32,N1-Methyl-lysergic acid diethylamide 35 | 33,Dimethyllysergamide 36 | 34,Gestonorone caproate 37 | 35,Zoster vaccine 38 | 36,Politor 39 | 37,"1,8-Dihydroxyanthraquinone" 40 | 38,Corbadrine 41 | 39,Hydroxyprogesterone caproate 42 | 40,Mestranol/noretynodrel 43 | 41,Ethylamphetamine 44 | 42,GHRP-6 45 | 43,Abiraterone acetate 46 | 44,Phenyramidol 47 | 45,Digoxin immune fab 48 | 46,Senna glycosides 49 | 47,Norbolethone 50 | 48,ASAQ 51 | 49,Methopholine 52 | 50,Betamethylfentanyl 53 | 51,MN-18 54 | 52,4-Fluoro-N-methylamphetamine 55 | 53,Interferon alfa 2b 56 | 54,GW501516 57 | 55,W-18 58 | 56,Regular insulin 59 | 57,Thiobromadol 60 | 58,Droxidopa 61 | 59,Thiambutene 62 | 60,MDV3100 63 | 61,Periciazine 64 | 62,SB-242084 65 | 63,Carbidopa/levodopa/entacapone 66 | 64,Carbaldrate 67 | 65,Docusate 68 | 66,Bradanicline 69 | 67,EMD-386088 70 | 68,SB-258585 71 | 69,SB-399885 72 | 70,SB-357134 73 | 71,Obinutuzumab 74 | 72,4-Bromomethcathinone 75 | 73,SB-271046 76 | 74,Entolimod 77 | 75,A-372159 78 | 76,SB-699551 79 | 77,RS-127445 80 | 78,SB-204741 81 | 79,RS-102221 82 | 80,CP-94253 83 | 81,Suloctidil 84 | 82,Insulin (medication) 85 | 83,Ro 04-6790 86 | 84,SB-216641 87 | 85,SB-269970 88 | 86,Tedizolid 89 | 87,Dasotraline 90 | 88,Radium-223 91 | 89,Radium-223 chloride 92 | 90,MDMAI 93 | 91,Pivhydrazine 94 | 92,MMDMA (drug) 95 | 93,2-Methoxymethyl salvinorin B 96 | 94,Afegostat 97 | 95,SB-215505 98 | 96,Meclinertant 99 | 97,GR-127935 100 | 98,Homarylamine 101 | 99,BDPC 102 | 100,Norethisterone enanthate 103 | 101,CP-809101 104 | 102,5-IT 105 | 103,Chromium(III) nicotinate 106 | 104,Clazakizumab 107 | 105,Fabomotizole 108 | 106,Oxomemazine/guaifenesin 109 | 107,Surfaxin 110 | 108,Idalopirdine 111 | 109,Acetylsalicylic acid/dipyridamole 112 | 110,Lavoltidine 113 | 111,Phencyclamine 114 | 112,Erbuzole 115 | 113,6-Methyl-2-ethyl-3-hydroxypyridine 116 | 114,LY-293284 117 | 115,Triflunordazepam 118 | 116,S-14671 119 | 117,Zoptarelin doxorubicin 120 | 118,Meclizine 121 | 119,CP-93129 122 | 120,25TFM-NBOMe 123 | 121,25C-NBOMe 124 | 122,1-(Thiophen-2-yl)-2-aminopropane 125 | 123,TM38837 126 | 124,Ethinyl estradiol/drospirenone/levomefolic acid 127 | 125,Carphenazine 128 | 126,Perfosfamide 129 | 127,Org 12962 130 | 128,Censavudine 131 | 129,Omecamtiv Mecarbil (CK-1827452) 132 | 130,SB-258719 133 | 131,GR-113808 134 | 132,LY-310762 135 | 133,SB-204070 136 | 134,CJ-033466 137 | 135,SB-206553 138 | 136,CP-135807 139 | 137,VX-809 140 | 138,Tenofovir alafenamide 141 | 139,GS 7340 142 | 140,PSI-7977 143 | 141,Abaloparatide 144 | 142,RG7795 145 | 143,ANA773 146 | 144,C16 (drug) 147 | 145,Valsartan/sacubitril 148 | 146,APINACA 149 | 147,Trifluridine/tipiracil 150 | 148,Rapastinel 151 | 149,APICA (synthetic cannabinoid drug) 152 | 150,Andexanet alfa 153 | 151,Doravirine 154 | 152,Ledipasvir 155 | 153,Deleobuvir 156 | 154,QUPIC 157 | 155,QUCHIC 158 | 156,Elafibranor 159 | -------------------------------------------------------------------------------- /src/autofj/benchmark/FootballMatch/gt.csv: -------------------------------------------------------------------------------- 1 | id_l,title_l,id_r,title_r 2 | 1,Battle of Santiago,0,Battle of Santiago (1962 FIFA World Cup) 3 | 2,Battle of Berne,1,Battle of Berne (1954 FIFA World Cup) 4 | 9,2005 International Rules Series,2,2005 international rules series 5 | 13,Austria v Switzerland (1954),3,Austria v Switzerland (1954 FIFA World Cup) 6 | 41,2001 Germany vs England football match,4,2001 Germany v England football match 7 | 67,Shamrock Rovers XI vs Brazil,5,Shamrock Rovers XI v Brazil 8 | 94,2006 International Rules Series,6,2006 International Rules series 9 | 94,2006 International Rules Series,7,2006 international rules series 10 | 101,Poland v Brazil (1938),8,Poland v Brazil (1938 FIFA World Cup) 11 | 287,2001 International Rules Series,9,2001 international rules series 12 | 294,1998 International Rules Series,10,1998 international rules series 13 | 358,Miracle of Cordoba,11,Austria v West Germany (1978 FIFA World Cup) 14 | 363,2000 England vs Germany football match,12,2000 England v Germany football match 15 | 394,Battle of Nuremberg (association football),13,Battle of Nuremberg (2006 FIFA World Cup) 16 | 442,2008 Conference National play-off Final,14,2008 Conference Premier play-off Final 17 | 466,2008 International Rules Series,15,2008 international rules series 18 | 545,1968 DFB Cup Final,16,1968 DFB-Pokal Final 19 | 546,1988 DFB Cup Final,17,1988 DFB-Pokal Final 20 | 568,1999 International Rules Series,18,1999 international rules series 21 | 597,1989 Major Indoor Soccer League All-Star Game,19,1989 MISL All-Star Game 22 | 607,Argentina v England (1986 FIFA World Cup),20,Argentina 2–1 England (1986 FIFA World Cup) 23 | 702,2009 African Nations Championship Final,21,2009 African Championship of Nations Final 24 | 704,Hungary vs El Salvador (1982),22,Hungary v El Salvador (1982 FIFA World Cup) 25 | 704,Hungary vs El Salvador (1982),23,Hungary 10–1 El Salvador (1982) 26 | 706,2009 Conference National play-off Final,24,2009 Conference Premier play-off Final 27 | 777,WPS All-Star 2009,25,2009 WPS All-Star Game 28 | 811,1871 England versus Scotland rugby union match,26,1871 Scotland versus England rugby union match 29 | 811,1871 England versus Scotland rugby union match,27,1870–71 Home Nations rugby union matches 30 | 846,2009 Republic of Ireland vs France football matches,28,2009 Republic of Ireland v France football matches 31 | 846,2009 Republic of Ireland vs France football matches,29,France 1–1 Ireland (18 November 2009) 32 | 991,19 May incident,30,1985 China v Hong Kong football match 33 | 1005,1876 Scotland vs Wales football match,31,1876 Scotland v Wales football match 34 | 1036,2010 International Rules Series,32,2010 international rules series 35 | 1051,1993 PTT Telecom Cup,33,1993 Dutch Supercup 36 | 1052,2010 Conference National play-off Final,34,2010 Conference Premier play-off Final 37 | 1091,WPS All-Star 2010,35,2010 WPS All-Star Game 38 | 1141,West Germany vs France (1982),36,West Germany v France (1982 FIFA World Cup) 39 | 1159,2002 International Rules Series,37,2002 international rules series 40 | 1244,2011 W-League Grand Final,38,2010–11 W-League Grand Final 41 | 1250,2006 Copa Indonesia Final,39,2006 Copa Indonesia final 42 | 1273,Brazil vs Italy (1982),40,Brazil v Italy (1982 FIFA World Cup) 43 | 1314,1992 PTT Telecom Cup,41,1992 Dutch Supercup 44 | 1315,1991 PTT Telecom Cup,42,1991 Dutch Supercup 45 | 1319,2011 Conference National play-off Final,43,2011 Conference Premier play-off Final 46 | 1347,All-Ireland Minor Hurling Championship 2011,44,2011 All-Ireland Minor Hurling Championship 47 | 1362,2011 International Rules Series,45,2011 international rules series 48 | 1828,2011 J. League Cup Final,46,2011 J.League Cup Final 49 | 1843,2013 Kenyan Super Cup,47,2013 Kenyan Super Cup (pre-season) 50 | 1859,2012 Conference National play-off Final,48,2012 Conference Premier play-off Final 51 | 1879,2011 UEFA European Under-21 Football Championship Final,49,2011 UEFA European Under-21 Championship Final 52 | 1901,Albanian Supercup 2012,50,2012 Albanian Supercup 53 | 1925,1985 Wales vs Scotland football match,51,1985 Wales v Scotland football match 54 | 1972,2013 Soccer Bowl,52,Soccer Bowl 2013 55 | -------------------------------------------------------------------------------- /src/autofj/benchmark/SoccerClubSeason/gt.csv: -------------------------------------------------------------------------------- 1 | id_l,title_l,id_r,title_r 2 | 1,2006 Australia national football team season,0,2006 Australia national soccer team season 3 | 2,2007 Australia national football team season,1,2007 Australia national soccer team season 4 | 9,2008 Australia national football team season,2,2008 Australia national soccer team season 5 | 14,2008 Derry Gaelic football season,46,Derry football season 2008 6 | 18,2005 Australia national football team season,3,2005 Australia national soccer team season 7 | 26,2009 Australia national football team season,4,2009 Australia national soccer team season 8 | 40,2009 Derry Gaelic football season,47,Derry football season 2009 9 | 41,2010 Australia national football team season,5,2010 Australia national soccer team season 10 | 93,Cienciano season 2003,6,2003 Cienciano season 11 | 106,HJK Helsinki season 2009,7,2009 Helsingin Jalkapalloklubi season 12 | 141,2010 Derry Gaelic football season,48,Derry football season 2010 13 | 172,2010 Jeonbuk Hyundai Motors season,8,2010 Jeonbuk Hyundai Motors FC season 14 | 211,2010 Down Senior Football season,9,2010 Down football season 15 | 211,2010 Down Senior Football season,49,2010 Down GAA Senior Football 16 | 212,2011 Down Senior Football season,10,2011 Down football season 17 | 212,2011 Down Senior Football season,50,2011 Down GAA Senior Football 18 | 247,2011 Daejeon Citizen season,11,2011 Daejeon Citizen FC season 19 | 258,2011 Australia national football team season,12,2011 Australia national soccer team season 20 | 277,2011 Orlando City S.C. season,13,2011 Orlando City SC season 21 | 281,2011 Jeonbuk Hyundai Motors season,14,2011 Jeonbuk Hyundai Motors FC season 22 | 300,2011 Chunnam Dragons season,15,2011 Jeonnam Dragons season 23 | 306,2004 Australia national football team season,16,2004 Australia national soccer team season 24 | 395,2011 Incheon United season,17,2011 Incheon United FC season 25 | 496,2003 Australia national football team season,18,2003 Australia national soccer team season 26 | 571,2002 Australia national football team season,19,2002 Australia national soccer team season 27 | 572,1999 Australia national football team season,20,1999 Australia national soccer team season 28 | 575,1998 Australia national football team season,21,1998 Australia national soccer team season 29 | 576,2001 Australia national football team season,22,2001 Australia national soccer team season 30 | 577,2000 Australia national football team season,23,2000 Australia national soccer team season 31 | 795,2012 Down GAA Senior Football,24,2012 Down football season 32 | 796,1997 Australia national football team season,25,1997 Australia national soccer team season 33 | 797,1996 Australia national football team season,26,1996 Australia national soccer team season 34 | 874,Kuala Lumpur FA season 2012,27,2012 Kuala Lumpur FA season 35 | 881,2012 Daejeon Citizen season,28,2012 Daejeon Citizen FC season 36 | 887,2012 Orlando City S.C. season,29,2012 Orlando City SC season 37 | 899,2012 Australia national football team season,30,2012 Australia national soccer team season 38 | 934,2012 Jeonbuk Hyundai Motors season,31,2012 Jeonbuk Hyundai Motors FC season 39 | 936,2012 Chunnam Dragons season,32,2012 Jeonnam Dragons season 40 | 978,2012 Incheon United season,33,2012 Incheon United FC season 41 | 979,2012 Jeju United season,34,2012 Jeju United FC season 42 | 1015,2012 Woodlands Wellington Season,35,2012 Woodlands Wellington FC season 43 | 1016,2011 Woodlands Wellington Season,36,2011 Woodlands Wellington FC season 44 | 1020,2010 Woodlands Wellington Season,37,2010 Woodlands Wellington FC season 45 | 1044,2013 Down Senior Football season,38,2013 Down football season 46 | 1073,2013 Orlando City S.C. season,39,2013 Orlando City SC season 47 | 1075,2013 Woodlands Wellington Season,40,2013 Woodlands Wellington FC season 48 | 1080,Negeri Sembilan FA Season 2013,41,2013 Negeri Sembilan FA season 49 | 1151,2009 Down Senior Football season,42,2009 Down football season 50 | 1158,2013 Australia national football team season,43,2013 Australia national soccer team season 51 | 1161,2013 Carolina RailHawks FC season,44,2013 Carolina RailHawks season 52 | 1187,2013 Incheon United season,45,2013 Incheon United FC season 53 | -------------------------------------------------------------------------------- /src/autofj/negative_rule.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from nltk.stem.porter import PorterStemmer 3 | import re 4 | 5 | 6 | class NegativeRule(object): 7 | """Negative rules""" 8 | 9 | def __init__(self, left, right, id_column): 10 | self.left = self._preprocess(left, id_column) 11 | self.right = self._preprocess(right, id_column) 12 | self.id_column = id_column 13 | self.negative_rules = set() 14 | 15 | def get_tokens_diff(self, l_tokens, r_tokens): 16 | # get difference of l_token set and r_token set 17 | l_diff = [l - r for l, r in zip(l_tokens, r_tokens)] 18 | r_diff = [r - l for l, r in zip(l_tokens, r_tokens)] 19 | return l_diff, r_diff 20 | 21 | def _preprocess(self, df, id_column): 22 | """ Preprocess the records: (1) concatenate all columns. (2) lowercase, 23 | remove punctuation and do stemming (3) split by space 24 | 25 | Parameters 26 | ---------- 27 | df: pd.DataFrame 28 | Original table 29 | 30 | id_column: string 31 | The name of id column in two tables. 32 | 33 | Reutrn 34 | ------ 35 | result: pd.DataFrame 36 | Preprocessed table that has two columns, am id column named as "id" 37 | and a column for preprocessed record named "value" 38 | """ 39 | # get column names except id 40 | columns = [c for c in df.columns if c != id_column] 41 | ps = PorterStemmer() 42 | 43 | # concat all columns, lowercase, remove punctuation, split by space, 44 | # and do stemming 45 | new_value = [] 46 | for x in df[columns].values: 47 | concat_x = " ".join([str(i) for i in x]) 48 | lower_x = re.sub('[^\w\s]', " ", concat_x.lower()) 49 | stem_x = [ps.stem(w) for w in lower_x.split()] 50 | new_x = set(stem_x) 51 | new_value.append(new_x) 52 | 53 | id_df = df[id_column].values 54 | result = pd.DataFrame({id_column: id_df, "value": new_value}) 55 | return result 56 | 57 | def learn(self, LL_blocked): 58 | """Learn opposite rules from LL""" 59 | # merge LL with left 60 | LL = self._merge(self.left, self.left, LL_blocked) 61 | 62 | # get token difference 63 | l_diff, r_diff = self.get_tokens_diff(LL["value_l"].values, 64 | LL["value_r"].values) 65 | 66 | # get rules: (l_token, r_token) that have one different token from each other 67 | for l, r, l_set in zip(l_diff, r_diff, LL["value_l"]): 68 | if len(l) == 1 and len(r) == 1 and len(l_set) != 1: 69 | self.negative_rules.add((list(l)[0], list(r)[0])) 70 | self.negative_rules.add((list(r)[0], list(l)[0])) 71 | 72 | # print(self.negative_rules) 73 | # raise 74 | 75 | def _merge(self, left, right, LR_blocked): 76 | id_column = self.id_column 77 | LR = LR_blocked[[id_column + "_l", id_column + "_r"]] 78 | LR = LR.merge(left, left_on=id_column + "_l", right_on=id_column)\ 79 | .drop(columns=id_column) \ 80 | .merge(right, left_on=id_column + "_r", right_on=id_column, 81 | suffixes=("_l", "_r"))\ 82 | .drop(columns=id_column) 83 | return LR 84 | 85 | def apply(self, LR_blocked): 86 | """Apply opposite rule on LR blocked""" 87 | # merge LR with left, right 88 | LR = self._merge(self.left, self.right, LR_blocked) 89 | 90 | # get token difference 91 | l_diff, r_diff = self.get_tokens_diff(LR["value_l"].values, 92 | LR["value_r"].values) 93 | 94 | # apply rule 95 | mask = [] 96 | for lid, rid, l_d, r_d in zip(LR["autofj_id_l"].values, 97 | LR["autofj_id_r"].values, 98 | l_diff, 99 | r_diff): 100 | pairs = [(l, r) for l in l_d for r in r_d] 101 | meet_rule = any([p in self.negative_rules for p in pairs]) 102 | mask.append(not meet_rule) 103 | 104 | LR_blocked = LR[mask][["autofj_id_l", "autofj_id_r"]] 105 | return LR_blocked 106 | -------------------------------------------------------------------------------- /src/autofj/benchmark/BasketballTeam/right.csv: -------------------------------------------------------------------------------- 1 | id,title 2 | 0,New Orleans Pelicans 3 | 1,Charlotte Hornets 4 | 2,Polonia Warszawa (basketball) 5 | 3,Asseco Gdynia 6 | 4,Melbourne United 7 | 5,Liaoning Flying Leopards 8 | 6,Shandong Golden Stars 9 | 7,Fujian Sturgeons 10 | 8,P.A.O.K. BC 11 | 9,Ulriken Elite 12 | 10,Mens Sana 1871 Basket 13 | 11,Oklahoma City Blue 14 | 12,Austin Spurs 15 | 13,KK Olimpija 16 | 14,Surrey Scorchers 17 | 15,Guildford Heat 18 | 16,Surrey United (basketball) 19 | 17,Shenzhen Leopards 20 | 18,Treviso Basket 21 | 19,Universo Treviso Basket 22 | 20,UNES FC Barcelona 23 | 21,University of Canberra Capitals 24 | 22,Otago Nuggets 25 | 23,Helsinki Seagulls 26 | 24,BC Torpan Pojat 27 | 25,Milton Keynes Lions 28 | 26,Brooklyn Kings (basketball) 29 | 27,CB Bilbao Berri 30 | 28,BC Cherno More Port Varna 31 | 29,KK Vojvodina Srbijagas 32 | 30,Dumbarton Dodgers 33 | 31,FC Porto (basketball) 34 | 32,U.D. Oliveirense (basketball) 35 | 33,CA Queluz 36 | 34,MHP Riesen Ludwigsburg 37 | 35,EnBW Ludwigsburg 38 | 36,Gladiators Trier 39 | 37,Pallacanestro Don Bosco Livorno 40 | 38,PBC Ural Great 41 | 39,Ayrshire Tornadoes 42 | 40,Bree B.B.C. 43 | 41,Delaware 87ers 44 | 42,Fubon Braves 45 | 43,Fubon Braves Basketball Team 46 | 44,Omaha Racers 47 | 45,Edinburgh Tigers 48 | 46,PBC Lokomotiv-Kuban 49 | 47,Bristol Flyers 50 | 48,PAWS London Capital 51 | 49,Trabzonspor B.K. 52 | 50,Olympiada Patras BC 53 | 51,Sporting BC 54 | 52,Galatasaray S.K. (men's basketball) 55 | 53,Cheshire Jets 56 | 54,BC Cherkasy 57 | 55,KK Mega Basket 58 | 56,KK Mega Leks 59 | 57,Nuova AMG Sebastiani Basket Rieti 60 | 58,BC Yenisey Krasnoyarsk 61 | 59,KK Krka 62 | 60,Roseto Basket 63 | 61,Veroli Basket 64 | 62,Gruppo Triboldi Basket 65 | 63,AEL 1964 B.C. 66 | 64,Bnei HaSharon 67 | 65,Belfast Star 68 | 66,Melbourne Boomers 69 | 67,B.C. Zenit Saint Petersburg 70 | 68,Swans Gmunden 71 | 69,Donar (basketball club) 72 | 70,Rethymno Cretan Kings B.C. 73 | 71,Egaleo BC 74 | 72,CB Tizona 75 | 73,Bunbury Slammers 76 | 74,Ilissiakos B.C. 77 | 75,MENT B.C. 78 | 76,Dafni BC 79 | 77,Xanthi B.C. 80 | 78,Athlitikos Omilos Paleou Falirou BC 81 | 79,AGEH Gymnastikos B.C. 82 | 80,Union Kavala B.C. 83 | 81,Peramatos Ermis B.C. 84 | 82,ICBS B.C. 85 | 83,Ionikos Lamias BC 86 | 84,AO Pagrati BC 87 | 85,Toros de Nuevo Laredo 88 | 86,Trikala 2000 B.C. 89 | 87,A.S. Trikala 2000 BC 90 | 88,Iraklio BC 91 | 89,KK Millenium Strumica 92 | 90,Incheon Electroland Elephants 93 | 91,Goyang Orion Orions 94 | 92,Gigantes de Carolina (men's basketball) 95 | 93,South China AA (basketball) 96 | 94,Galatasaray S.K. (women's basketball) 97 | 95,Galatasaray Medical Park (women's basketball) 98 | 96,Elitzur Givat Shmuel 99 | 97,Kent Crusaders (basketball) 100 | 98,AS Ionikos Neas Filadelfeias BC 101 | 99,Ionikos Nikaias BC 102 | 100,Polytekhnika-Halychyna Lviv 103 | 101,Galatasaray S.K. (wheelchair basketball) 104 | 102,Galatasaray Wheelchair Basketball Team 105 | 103,SK Valmiera 106 | 104,Rakvere Tarvas 107 | 105,Al-Ahli Benghazi (basketball club) 108 | 106,BC Partizani Tirana 109 | 107,Gymnastikos S. Larissas B.C. 110 | 108,BC Budivelnik 111 | 109,Bintulu Eagles B.C. 112 | 110,Bintulu Rainbow B.C. 113 | 111,Perak Farmcochem B.C. 114 | 112,CS Otopeni (Basketball) 115 | 113,CS Energia 116 | 114,Hapoel Afula B.C. 117 | 115,Satria Muda Pertamina Jakarta 118 | 116,Hi-Tech Bangkok City 119 | 117,Sports Rev Thailand Slammers 120 | 118,Maccabi Ra'anana 121 | 119,Logan Thunder (WNBL) 122 | 120,Al Riyadi Amman 123 | 121,Ezzahra Sports 124 | 122,Barak Netanya B.C. 125 | 123,Athinaikos women's basketball 126 | 124,Ikaros Chalkidas B.C. 127 | 125,Medi Bayreuth 128 | 126,RosaSport Radom 129 | 127,Gent Hawks 130 | 128,Halcones de Xalapa 131 | 129,Aspac Jakarta 132 | 130,Leeds Force 133 | 131,Pelita Jaya Energi Mega Persada 134 | 132,CLS Knights Surabaya 135 | 133,Yongin Samsung Blueminx 136 | 134,Yongin Samsung Life Bichumi 137 | 135,Piimameister Otto/Rapla 138 | 136,TYCO Rapla 139 | 137,Muba Hangtuah Sumatera Selatan 140 | 138,Blackwater Sports 141 | 139,Black Water Sports 142 | 140,BC Juventus 143 | 141,BC RÅ«dupis 144 | 142,BC Palanga 145 | 143,BC Naglis 146 | 144,Cuxhaven BasCats 147 | 145,Mississauga Power 148 | 146,AB Pas 149 | 147,BC Minsk-2006 150 | 148,Jalaa SC (men's basketball) 151 | 149,Porta XI Ensino CBF 152 | 150,UNIQA Sopron 153 | 151,Al Rayan SC Basketball Team 154 | 152,Stella Artois Leuven Bears 155 | 153,BC Barsy Atyrau 156 | 154,El Ittihad Alexandria (basketball) 157 | 155,Al Ittihad Alexandria (basketball) 158 | 156,Zamalek (basketball) 159 | 157,Tanduay Light Rhum Masters 160 | 158,Al Kuwait SC (basketball) 161 | 159,SOMB Boulogne-sur-Mer 162 | 160,KK Slavonski Brod 163 | 161,CB Ciudad de Algeciras 164 | 162,Pacific Caesar Surabaya 165 | 163,C.D. Primeiro de Agosto (basketball) 166 | 164,G.S. FIAT 167 | 165,Orangeville A's 168 | -------------------------------------------------------------------------------- /src/autofj/benchmark/Race/right.csv: -------------------------------------------------------------------------------- 1 | id,title 2 | 0,2000 Guineas Stakes 3 | 1,St Leger Stakes 4 | 2,W.S. Cox Plate 5 | 3,1000 Guineas Stakes 6 | 4,Duchess of Cambridge Stakes 7 | 5,Champagne Stakes (ATC) 8 | 6,Breeders' Cup Distaff 9 | 7,Vincent O'Brien National Stakes 10 | 8,National Stakes 11 | 9,Champions Cup (horse race) 12 | 10,The Metropolitan (ATC) 13 | 11,Darley Classic 14 | 12,Futurity Stakes (MRC) 15 | 13,The BMW 16 | 14,Queen Elizabeth Stakes (VRC) 17 | 15,Australian Oaks (ATC) 18 | 16,Queen Elizabeth Stakes (ATC) 19 | 17,Sires' Produce Stakes (ATC) 20 | 18,Sires Produce Stakes (ATC) 21 | 19,Schweppes Oaks 22 | 20,Myer Classic 23 | 21,Lightning Stakes 24 | 22,Vinery Stud Stakes 25 | 23,TJ Smith Stakes 26 | 24,The J. J. Atkins 27 | 25,Bing Crosby Stakes 28 | 26,Los Alamitos Futurity 29 | 27,Starlet Stakes 30 | 28,Oak Leaf Stakes 31 | 29,Rodeo Drive Stakes 32 | 30,Belmont Derby 33 | 31,Baring Bingham Novices' Hurdle 34 | 32,Wise Dan Handicap 35 | 33,Red Carpet Handicap 36 | 34,Summertime Oaks 37 | 35,Los Alamitos Derby 38 | 36,Goodwood Stakes 39 | 37,Norfolk Stakes (United States) 40 | 38,Lady's Secret Stakes 41 | 39,Oak Tree Mile Stakes 42 | 40,Santa Ana Stakes 43 | 41,Linlithgow Stakes 44 | 42,A.V. Kewney Stakes 45 | 43,Makybe Diva Stakes 46 | 44,Sires' Produce Stakes (VRC) 47 | 45,Sires' Produce Stakes (BRC) 48 | 46,Cape Town Cycle Tour 49 | 47,Cape Argus Cycle Race 50 | 48,The Goodwood 51 | 49,South Australian Derby 52 | 50,Robert Sangster Stakes 53 | 51,The Galaxy (ATC) 54 | 52,Miracle Mile (harness race) 55 | 53,Grosser Preis von Bayern 56 | 54,Dubai Turf 57 | 55,Dato' Tan Chin Nam Stakes 58 | 56,Dato Tan Chin Nam Stakes 59 | 57,Welsh Grand National 60 | 58,Peter Young Stakes 61 | 59,Caulfield Classic 62 | 60,Caulfield Guineas Prelude 63 | 61,Gazet van Antwerpen Trophy 64 | 62,Moonee Valley Vase 65 | 63,British Champions Fillies & Mares Stakes 66 | 64,King Richard III Stakes 67 | 65,People's Choice Classic 68 | 66,Tour of Wellington 69 | 67,Yallambee Classic 70 | 68,SA Fillies Classic 71 | 69,Arrowfield 3YO Sprint 72 | 70,UCI Track Cycling World Cup 73 | 71,Tour de Filipinas 74 | 72,Breeders' Stakes (SAJC) 75 | 73,Lord Reims Stakes 76 | 74,National Stakes (SAJC) 77 | 75,Sires' Produce Stakes (SAJC) 78 | 76,Spring Stakes (SAJC) 79 | 77,Prix Bertrand du Breuil 80 | 78,Emakumeen Bira 81 | 79,Grand Prix of Aargau Canton 82 | 80,Settimana internazionale di Coppi e Bartali 83 | 81,Rund um die Hainleite 84 | 82,Munster Oaks 85 | 83,Denny Cordell Lavarack Fillies Stakes 86 | 84,Caulfield Sprint 87 | 85,Strade Bianche - Eroica Pro 88 | 86,Crystal Mile 89 | 87,A J Moir Stakes 90 | 88,Moonee Valley Gold Cup 91 | 89,Tesio Stakes 92 | 90,Eliza Park International Stakes 93 | 91,Matriarch Stakes (VRC) 94 | 92,Matriarch Stakes (Australia) 95 | 93,Moonee Valley Fillies Classic 96 | 94,Australia Stakes 97 | 95,The Marathon (horse race) 98 | 96,Las Vegas Marathon (horse race) 99 | 97,Champagne Stakes (MVRC) 100 | 98,Telstra Phonewords Stakes 101 | 99,Walther J. Jacobs-Stutenpreis 102 | 100,Hamburger Stutenpreis 103 | 101,Frankfurter Stutenpreis 104 | 102,Grafenberger Meilen-Trophy 105 | 103,Excelsior Stakes 106 | 104,Play the King Stakes 107 | 105,T.S. Carlyon Cup 108 | 106,Caulfield Autumn Classic 109 | 107,Pol Roger Stakes 110 | 108,BRC Sprint 111 | 109,Eagle Farm Cup 112 | 110,Grand Prix Stakes 113 | 111,Dane Ripper Stakes 114 | 112,Tattersall's Tiara 115 | 113,Bucks County Classic 116 | 114,Herald Champion Novice Hurdle 117 | 115,Tattersalls Ireland Champion Novice Hurdle 118 | 116,Tour of Iran (Azerbaijan) 119 | 117,Ryanair Gold Cup 120 | 118,December Gold Cup 121 | 119,Spinal Research The Atlantic 4 Gold Cup 122 | 120,Turf Sprint Stakes 123 | 121,Mathis Brothers Mile 124 | 122,1965 Chase 125 | 123,Dance Design Stakes 126 | 124,Mildmay of Flete Challenge Cup 127 | 125,Brown Advisory and Merriebelle Stable Plate 128 | 126,Appleton Handicap 129 | 127,Tour des Fjords 130 | 128,The Run to the Rose 131 | 129,The Run to the Roses 132 | 130,Bobbie Lewis Quality 133 | 131,Tramway Stakes 134 | 132,Golden Pendant 135 | 133,Spring Stakes (NJC) 136 | 134,Blazer Stakes 137 | 135,Rose Of Kingston Stakes 138 | 136,The Shorts (ATC) 139 | 137,Carbine Club Stakes (VRC) 140 | 138,Glenfarclas Cross Country Chase 141 | 139,Chairman's Handicap (ATC) 142 | 140,Chairmans Handicap (ATC) 143 | 141,Sapphire Stakes (ATC) 144 | 142,Sires' Produce Stakes (WA) 145 | 143,Sires Produce Stakes (WA) 146 | 144,Gunsynd Classic 147 | 145,Victory Stakes 148 | 146,Chairman's Handicap (BRC) 149 | 147,Champagne Classic (BRC) 150 | 148,Air Force Association Cycling Classic 151 | 149,Prince Of Wales Stakes (Australia) 152 | 150,Eclipse Stakes (MRC) 153 | 151,Summer Cup (ATC) 154 | 152,Summer Cup (horse racing) 155 | 153,Autumn Stakes (MRC) 156 | 154,Breeders Classic 157 | 155,Geoffrey Belmaine Stakes 158 | 156,Schweppervescence Trophy 159 | 157,Challenge Stakes (ATC) 160 | 158,AJC Challenge Stakes 161 | 159,Fort Lauderdale Stakes 162 | 160,MTB Himalaya 163 | 161,Velothon Berlin 164 | 162,Scandinavian Race Uppsala 165 | 163,Tour Series 166 | 164,Halfords Tour Series 167 | 165,Okolo Slovenska 168 | 166,World Ports Cycling Classic 169 | 167,Tour of Faroe Islands 170 | 168,Irish St Leger Trial Stakes 171 | 169,Tour of Norway 172 | 170,Teio Sho 173 | 171,Sodexo Gold Cup 174 | 172,Murphy Group Handicap Chase 175 | 173,Bow Mistress Trophy 176 | 174,South African National Road Race Championships 177 | -------------------------------------------------------------------------------- /src/autofj/benchmark/ShoppingMall/left.csv: -------------------------------------------------------------------------------- 1 | id,title 2 | 0,The Galleria (Houston) 3 | 1,Shops at Prudential Center 4 | 2,The Fashion Centre at Pentagon City 5 | 3,Westfield Sydney Central Plaza 6 | 4,Mantri Square 7 | 6,Downtown Disney (Walt Disney World) 8 | 7,Bluewater (shopping centre) 9 | 8,Lancaster Mall 10 | 9,"The Summit (Birmingham, Alabama)" 11 | 10,Centro Box Hill 12 | 11,Victoria Square Shopping Centre 13 | 12,Irvine Spectrum Center 14 | 13,Liffey Valley Shopping Centre 15 | 14,ISQUARE 16 | 15,Asheville Mall 17 | 16,Metropolis at Metrotown 18 | 17,St. Louis Outlet Mall 19 | 18,Northlake Mall (Charlotte) 20 | 19,Grand Central Mall 21 | 20,Philadelphia Premium Outlets 22 | 21,Touchwood 23 | 22,Atlantic Terminal (shopping mall) 24 | 23,Vintage Faire Mall 25 | 24,RioCan Centre Kingston 26 | 25,City Central 27 | 26,Hamilton Place (shopping mall) 28 | 27,Langham Place (Hong Kong) 29 | 28,Centrale (Croydon) 30 | 29,Centrio 31 | 30,Broadmarsh (shopping centre) 32 | 31,Rhodes Shopping Centre 33 | 32,The CentrePoint 34 | 33,Wilton Mall 35 | 34,Palm Beach Mall 36 | 35,Great Northern Mall 37 | 36,Centro Bankstown 38 | 37,Sunway Carnival Mall 39 | 38,HarbourFront Centre 40 | 39,CityPlace 41 | 40,Galleria Shopping Centre 42 | 41,"Westgate Shopping Centre, Oxford" 43 | 42,Westfield Annapolis 44 | 43,Brentwood Town Centre 45 | 44,St. David's (Cardiff) 46 | 45,Centro Toombul 47 | 46,Grand Indonesia Shopping Town 48 | 47,Rockville Mall 49 | 48,Westfield Connecticut Post 50 | 49,Fox Run Mall 51 | 50,Downtown Shopping Centre 52 | 51,Downtown Plaza (Sacramento) 53 | 52,Westfield MainPlace 54 | 53,Capital City Mall 55 | 54,Westfield Fox Valley 56 | 55,Westfield Chicago Ridge 57 | 56,Louis Joliet Mall 58 | 57,Indian Mall 59 | 58,Westfield Franklin Park 60 | 59,Westfield Southland 61 | 61,Westfield Belden Village 62 | 62,Solano Town Center 63 | 63,Eastland Center 64 | 64,Westfield West Covina 65 | 65,Parkway Mall 66 | 66,Sahara Mall (Riyadh) 67 | 67,Crossroads Center 68 | 68,St. Laurent Centre 69 | 69,Market Place Mall 70 | 70,"Zona Rosa (Kansas City, Missouri)" 71 | 71,Centro Lutwyche 72 | 72,Les Promenades de l'Outaouais 73 | 73,Westdale Mall 74 | 74,Victoria Mall 75 | 75,Woodbury Common Premium Outlets 76 | 76,El Con Mall 77 | 77,Epping Plaza 78 | 78,The Paragon 79 | 79,Auburn Mall 80 | 81,Cataraqui Town Centre 81 | 82,Royal Victoria Place 82 | 83,Pier Park (Florida) 83 | 84,"Conestoga Mall (Waterloo, Ontario)" 84 | 85,Maple Hill Pavilion 85 | 86,Centro Roselands 86 | 87,Mall at The Source 87 | 88,The Oaks Mall 88 | 89,Santa Rosa Mall (Florida) 89 | 90,"Crossroads Center (Waterloo, IA)" 90 | 91,Spires Shopping Centre 91 | 92,Change Alley (Singapore) 92 | 93,Seacon Square 93 | 94,Northfield Square 94 | 96,Paradise Park (Mall) 95 | 97,Centro Colonnades 96 | 98,Centro The Glen 97 | 99,CentrO 98 | 100,Eastridge 99 | 101,The Florida Mall 100 | 102,Rivergate Mall 101 | 103,Jantzen Beach SuperCenter 102 | 104,St. Charles Towne Center 103 | 106,Galeria Kazimierz 104 | 107,Northwest Plaza 105 | 108,Acadiana Mall 106 | 109,Arsenal Mall 107 | 110,Antioch Center 108 | 111,Omni Park Shopping Centre 109 | 113,West Mall 110 | 114,Mail Champlain 111 | 115,Splendid China Tower 112 | 116,Brunswick shopping centre 113 | 117,Mid Rivers Mall 114 | 118,Centro Mayor 115 | 119,Direct Factory Outlets 116 | 120,Westfield Warrawong 117 | 121,Westfield Figtree 118 | 122,Westfield Pakuranga 119 | 124,Knollwood Mall 120 | 125,Shangri-la Plaza Mall 121 | 127,Lakeshore Mall (Florida) 122 | 128,Menara Great Eastern 123 | 129,Hickory Ridge Mall 124 | 130,Avenue Carriage Crossing 125 | 132,The Mall at Shelter Cove 126 | 133,Bishops Corner (West Hartford) 127 | 135,The Mall in Columbia 128 | 137,Tallahassee Mall 129 | 138,La Encantada 130 | 139,Woodfield Mall 131 | 140,The Shoppes at Eastchase 132 | 141,Forest Lake Shopping Centre 133 | 142,MegaBox (shopping mall) 134 | 143,Westfield CastleCourt 135 | 145,Deira City Centre 136 | 146,Paddock Mall 137 | 147,The Promenade Shopping Centre 138 | 148,The Mall at Cortana 139 | 149,Merry Hill Shopping Centre 140 | 150,Bentley Bridge Retail Park 141 | 152,City Centre Plaza 142 | 155,Hillside Shopping Centre 143 | 156,NewPark Mall 144 | 157,Springfield Mall (Pennsylvania) 145 | 158,"Regency Square Mall (Florence, Alabama)" 146 | 159,Pace Shopping Mall 147 | 160,Centro Taigum 148 | 161,Sta. Lucia East Grand Mall 149 | 162,Florence Mall 150 | 163,SM City 151 | 164,Centro Karingal 152 | 165,Regency Square Mall (Jacksonville) 153 | 166,Broadway Mall 154 | 168,Dembel City Center 155 | 169,Colton Plaza 156 | 171,Winrock Shopping Center 157 | 172,Nex 158 | 174,Chesapeake Square Mall 159 | 175,Lulu Mall 160 | 177,Mirdif City Centre 161 | 178,"Shaktan Thampuran Private Bus Stand, Thrissur" 162 | 179,"Star City, Seoul" 163 | 182,Exchange Ilford 164 | 183,My Mall Limassol 165 | 184,Castletown Shoppingworld 166 | 185,The Market Common Myrtle Beach 167 | 186,Kenwood Towne Centre 168 | 187,The Mall Pavilions 169 | 188,Settlers' Green Outlet Village 170 | 189,"The Summit (Reno, Nevada)" 171 | 190,"The Summit (Wheatfield, New York)" 172 | 191,Domain Central 173 | 192,Granada Center 174 | 193,The Base (mall) 175 | 195,Tulsa Promenade Mall 176 | 196,West Manchester Mall 177 | 197,Lakeview Square 178 | 198,Palladium Square 179 | 199,Centro Lavington 180 | 200,Bahrain City Centre 181 | 201,Spinderiet (Copenhagen) 182 | 202,Central Park (shopping complex) 183 | 203,Kukui Grove Shopping Center 184 | 205,Albany Mall 185 | 207,"University Mall (Chapel Hill, North Carolina)" 186 | 208,The Outlets at Sands Bethlehem 187 | 209,Enfield Square 188 | 210,"Harbor Point, Subic" 189 | 211,Crystal Palace Complex (Dieppe) 190 | 212,Centre at Glen Burnie 191 | 213,South City (shopping mall) 192 | 214,The Gallery at Military Circle 193 | 215,West 12 Shepherds Bush 194 | 217,The Brentwood Country Mart 195 | 218,South Point (shopping mall) 196 | 219,Cross County Plaza 197 | 220,Werribee Plaza 198 | 221,Monroe Crossing Mall 199 | 222,"Northwoods Mall (Peoria, Illinois)" 200 | 223,Square 2 (Shopping Mall) 201 | 225,Westshore Mall 202 | 226,Florin Mall 203 | -------------------------------------------------------------------------------- /src/autofj/benchmark/GivenName/gt.csv: -------------------------------------------------------------------------------- 1 | id_l,title_l,id_r,title_r 2 | 7,Luke,0,Luke (name) 3 | 12,Julia (given name),1,Julia 4 | 16,Fatima (given name),2,Fatima (name) 5 | 20,Fanny (given name),3,Fanny (name) 6 | 22,Lyfing (given name),4,Lyfing 7 | 70,Adele (given name),5,Adele 8 | 72,Jeffrey (given name),6,Jeffrey (name) 9 | 73,Kathryn (name),7,Kathryn 10 | 85,Cadwaladr (name),8,Cadwallader 11 | 86,Geeta,9,Gita (given name) 12 | 102,Banu (name),10,Banu 13 | 104,Casper (name),11,Casper 14 | 140,Hana (given name),12,Hana (name) 15 | 170,Abdul Aziz,13,Abd al-Aziz 16 | 178,Guillaume (given name),14,Guillaume 17 | 190,Dustin (given name),15,Dustin (name) 18 | 200,Shahrokh,16,Shahrokh (mythical bird) 19 | 233,William (name),17,William (given name) 20 | 238,Arif,18,Arif (given name) 21 | 254,Corinne (name),19,Corinne 22 | 334,Haruna,20,Haruna (given name) 23 | 342,Brianna (given name),21,Brianna 24 | 360,Takeru,22,Takeru (name) 25 | 389,Yasmin,23,Yasmin (given name) 26 | 431,Calvin (given name),24,Calvin (name) 27 | 440,Alexis (disambiguation),25,Alexis 28 | 449,Hjalmar (given name),26,Hjalmar (disambiguation) 29 | 464,Rei,27,Rei (given name) 30 | 473,Asad,28,Asad (name) 31 | 486,Mervin (given name),29,Mervin 32 | 495,Margaret (name),30,Margaret 33 | 514,Aurora (name),31,Aurora (given name) 34 | 521,Michelle (given name),32,Michelle (name) 35 | 524,Jahsh (name),33,Jahsh 36 | 530,Mark (given name),34,Mark (name) 37 | 582,Sujata,35,Sujata (name) 38 | 605,Anupama,36,Anupama (given name) 39 | 610,Paulina (name),37,Paulina (given name) 40 | 619,Padraig,38,Padraic 41 | 625,Bojan (name),39,Bojan 42 | 629,Chetan,40,Chetan (name) 43 | 640,Juanfran,41,Juanfran (disambiguation) 44 | 652,Gun (name),42,Gun (Swedish name) 45 | 669,Dalia (name),43,Dalia (given name) 46 | 672,Travis (given name),44,Travis 47 | 773,Merwin (name),45,Merwin 48 | 786,Daniel (name),46,Daniel 49 | 798,Arun (name),47,Arun (given name) 50 | 835,Michele (given name),48,Michele 51 | 861,Zakiah (female name),49,Zakiah 52 | 880,Parvathy,50,Parvati (given name) 53 | 882,Medad (name),51,Medad 54 | 886,Joseph (given name),52,Joseph 55 | 886,Joseph (given name),53,Joseph (name) 56 | 889,Aida (given name),54,Aida (name) 57 | 892,Edwina (given name),55,Edwina 58 | 909,Zvonimir (name),56,Zvonimir 59 | 915,Annetta (name),57,Annetta (given name) 60 | 936,Asa (given name),58,Asa (name) 61 | 944,Hannu (given name),59,Hannu (disambiguation) 62 | 948,Hayley,60,Hayley (given name) 63 | 957,Baurzhan,61,Bauyrzhan 64 | 961,Abdur Rahman,62,Abd al-Rahman 65 | 964,Bram (name),63,Bram (given name) 66 | 1005,Hannah (given name),64,Hannah (name) 67 | 1019,Katherine (given name),65,Katherine 68 | 1023,Hideyoshi (given name),66,Hideyoshi (disambiguation) 69 | 1035,Alexandru (name),67,Alexandru 70 | 1042,Lee (given name),68,Lee (English given name) 71 | 1044,Leonard (name),69,Leonard 72 | 1049,Vivienne,70,Vivian (given name) 73 | 1057,Marvin (name),71,Marvin (given name) 74 | 1127,Faiz,72,Faiz (disambiguation) 75 | 1145,Anthony (given name),73,Anthony (name) 76 | 1150,Stanley (surname),74,Stanley (name) 77 | 1165,Cory (name),75,Cory 78 | 1172,Gerhard,76,Gerard 79 | 1253,Lindita,77,Lindita (given name) 80 | 1268,Sophie,78,Sophie (given name) 81 | 1281,Leonie (given name),79,Leonie 82 | 1282,Raizo (given name),80,Raizo 83 | 1283,Raffaello,81,Raffaello (disambiguation) 84 | 1291,Sorin (first name),82,Sorin (given name) 85 | 1317,Ljubica (name),83,Ljubica 86 | 1336,Ludovica (given name),84,Ludovica 87 | 1337,Alessia (given name),85,Alessia 88 | 1366,Coralie (given name),86,Coralie 89 | 1372,Navneet (given name),87,Navneet 90 | 1385,Aisling (given name),88,Aisling (name) 91 | 1395,Niloufar,89,Nelofar 92 | 1404,Miley (name),90,Miley (given name) 93 | 1430,Mary (given name),91,Mary (name) 94 | 1432,Madeleine (given name),92,Madeleine (name) 95 | 1443,Schuyler (given name),93,Schuyler (name) 96 | 1471,Milica (given name),94,Milica 97 | 1482,Laimonis (name),95,Laimonis 98 | 1483,Uldis (name),96,Uldis 99 | 1484,Dzintars (name),97,Dzintars 100 | 1485,Modris (name),98,Modris 101 | 1487,Gatis (name),99,Gatis 102 | 1488,Indulis (name),100,Indulis 103 | 1497,Priya,101,Priya (given name) 104 | 1544,Maki (given name),102,Maki (name) 105 | 1547,Jun'ichi,103,Junichi 106 | 1588,Alison (name),104,Alison (given name) 107 | 1589,Kanye (Igbo name),105,Kanye (name) 108 | 1596,Kaj (name),106,Kaj 109 | 1618,Tawfik (given name),107,Tawfik 110 | 1625,Kayo (Nigerian name),108,Kayo (name) 111 | 1656,Pia (name),109,Pia (given name) 112 | 1669,Khristo,110,Hristo 113 | 1708,Heather (name),111,Heather (given name) 114 | 1709,Lubomir (given name),112,Lubomir 115 | 1712,Ctirad (name),113,Ctirad 116 | 1716,Tom (name),114,Tom (given name) 117 | 1719,Aileen (given name),115,Aileen 118 | 1729,Bita,116,Bita (Persian) 119 | 1730,Christopher (given name),117,Christopher 120 | 1734,Abid,118,Abid (name) 121 | 1760,Brooke (given name),119,Brooke (name) 122 | 1761,Mira (name),120,Mira (given name) 123 | 1796,Parisa (given name),121,Parisa (disambiguation) 124 | 1806,Tuukka (given name),122,Tuukka 125 | 1834,Soo-young,123,Soo-young (name) 126 | 1842,Roosevelt (surname),124,Roosevelt (name) 127 | 1862,Hamad (name),125,Hamad 128 | 1877,Ziemowit (given name),126,Ziemowit 129 | 1913,Graciela (given name),127,Graciela (disambiguation) 130 | 1922,Dobroslaw (name),128,Dobroslav 131 | 1955,Tess,129,Tess (given name) 132 | 1976,Ellen (given name),130,Ellen 133 | 1993,Kalina (given name),131,Kalina (name) 134 | 2027,Kaede (disambiguation),132,Kaede 135 | 2039,Ranald (given name),133,Ranald 136 | 2046,Sandy (name),134,Sandy (given name) 137 | 2052,Maytham (name),135,Maytham 138 | 2055,Damayanti (given name),136,Damayanti (disambiguation) 139 | 2091,Faith (given name),137,Faith (name) 140 | 2094,Llywelyn (name),138,Llewellyn (name) 141 | 2110,Veronica (given name),139,Veronica (name) 142 | 2121,Amos,140,Amos (name) 143 | 2162,Bora (given name),141,Bora (Turkish name) 144 | 2173,Oladapo (name),142,Oladapo 145 | 2198,Jerry (name),143,Jerry (given name) 146 | 2199,Sania,144,Sania (disambiguation) 147 | 2207,Domagoj,145,Domagoj (given name) 148 | 2210,Okonma (surname),146,Okonma 149 | 2247,Keon (given name),147,Keon 150 | 2336,Enn (given name),148,Enn 151 | 2506,Haruchika,149,Haruchika (given name) 152 | 2968,Farah (given name),150,Farah (name) 153 | 2999,Harutyun (given name),151,Harutyun 154 | 3007,Sora (given name),152,Sora (Japanese given name) 155 | 3008,Verity (given name),153,Verity 156 | -------------------------------------------------------------------------------- /src/autofj/benchmark/Drug/gt.csv: -------------------------------------------------------------------------------- 1 | id_l,title_l,id_r,title_r 2 | 4,BCG vaccine,0,Bacillus Calmette-Guérin 3 | 10,Dimethyltryptamine,1,"N,N-Dimethyltryptamine" 4 | 30,Scopolamine,2,Hyoscine hydrobromide 5 | 46,Valproic acid,3,Valproate 6 | 161,Interferon beta-1a,4,Interferon beta 1a 7 | 180,Androstenedione,5,4-Androstenedione 8 | 224,Oxycodone/aspirin,6,Percodan 9 | 342,5-MeO-AMT,7,5-MeO-aMT 10 | 437,Live attenuated influenza vaccine,8,FluMist 11 | 530,Tenofovir,9,Tenofovir disoproxil 12 | 563,Ursodiol,10,Ursodeoxycholic acid 13 | 565,Mepacrine,11,Quinacrine 14 | 628,Glycopyrronium bromide,12,Glycopyrrolate 15 | 664,Neosporin,13,Neomycin/polymyxin B/bacitracin 16 | 699,Glycyrrhetinic acid,14,Enoxolone 17 | 752,Butylscopolamine,15,Hyoscine butylbromide 18 | 761,Paromomycin,16,Paromomycin sulfate 19 | 809,Polysporin,17,Bacitracin/polymyxin B 20 | 831,Ethinyl estradiol,18,Ethinylestradiol 21 | 889,Actinomycin,19,Dactinomycin 22 | 904,Interferon beta-1b,20,Interferon beta 1b 23 | 912,Methandrostenolone,21,Metandienone 24 | 994,Cafergot,22,Caffeine/ergotamine 25 | 1009,Roxatidine,23,Roxatidine acetate 26 | 1016,HPV vaccine,24,HPV vaccines 27 | 1050,Atenolol/chlorthalidone,25,Tenoretic 28 | 1051,Sodium oxybate,26,Xyrem 29 | 1114,CroFab,27,Crotalidae polyvalent immune fab 30 | 1274,Hydroxyamfetamine,28,4-Hydroxyamphetamine 31 | 1377,Librax,29,Chlordiazepoxide/clidinium bromide 32 | 1386,Arbidol,30,Umifenovir 33 | 1428,Emtricitabine/tenofovir/efavirenz,31,Efavirenz/emtricitabine/tenofovir 34 | 1435,MLD-41,32,N1-Methyl-lysergic acid diethylamide 35 | 1439,DAM-57,33,Dimethyllysergamide 36 | 1580,Gestonorone,34,Gestonorone caproate 37 | 1735,Zostavax,35,Zoster vaccine 38 | 1786,Pioglitazone/metformin,36,Politor 39 | 1904,Dantron,37,"1,8-Dihydroxyanthraquinone" 40 | 1915,Levonordefrin,38,Corbadrine 41 | 1999,17-Hydroxyprogesterone caproate,39,Hydroxyprogesterone caproate 42 | 2027,Mestranol/norethynodrel,40,Mestranol/noretynodrel 43 | 2051,Etilamfetamine,41,Ethylamphetamine 44 | 2114,Growth hormone releasing hexapeptide,42,GHRP-6 45 | 2151,Abiraterone,43,Abiraterone acetate 46 | 2252,Fenyramidol,44,Phenyramidol 47 | 2260,Digoxin Immune Fab,45,Digoxin immune fab 48 | 2536,Senna glycoside,46,Senna glycosides 49 | 2586,Norboletone,47,Norbolethone 50 | 2622,Artesunate/amodiaquine,48,ASAQ 51 | 2649,Metofoline,49,Methopholine 52 | 2676,Beta-Methylfentanyl,50,Betamethylfentanyl 53 | 2718,MN 18,51,MN-18 54 | 2753,4-Fluoromethamphetamine,52,4-Fluoro-N-methylamphetamine 55 | 2763,Interferon alfa-2b,53,Interferon alfa 2b 56 | 2863,GW 501516,54,GW501516 57 | 2940,1-(4-Nitrophenylethyl)piperidylidene-2-(4-chlorophenyl)sulfonamide,55,W-18 58 | 2977,Humulin,56,Regular insulin 59 | 3006,C-8813,57,Thiobromadol 60 | 3038,L-DOPS,58,Droxidopa 61 | 3113,Thiambutenes,59,Thiambutene 62 | 3122,Enzalutamide,60,MDV3100 63 | 3185,Pericyazine,61,Periciazine 64 | 3277,"SB-242,084",62,SB-242084 65 | 3359,Stalevo,63,Carbidopa/levodopa/entacapone 66 | 3392,Dihydroxialumini sodium carbonate,64,Carbaldrate 67 | 3403,Dioctyl sodium sulfosuccinate,65,Docusate 68 | 3454,TC-5619,66,Bradanicline 69 | 3481,"EMD-386,088",67,EMD-386088 70 | 3482,"SB-258,585",68,SB-258585 71 | 3483,"SB-399,885",69,SB-399885 72 | 3484,"SB-357,134",70,SB-357134 73 | 3492,Afutuzumab,71,Obinutuzumab 74 | 3541,4-Bromo-N-methylcathinone,72,4-Bromomethcathinone 75 | 3552,"SB-271,046",73,SB-271046 76 | 3595,CBLB502,74,Entolimod 77 | 3633,"A-372,159",75,A-372159 78 | 3635,"SB-699,551",76,SB-699551 79 | 3692,"RS-127,445",77,RS-127445 80 | 3693,"SB-204,741",78,SB-204741 81 | 3694,"RS-102,221",79,RS-102221 82 | 3713,"CP-94,253",80,CP-94253 83 | 3720,Sulcotidil,81,Suloctidil 84 | 3773,Insulin therapy,82,Insulin (medication) 85 | 3821,Ro04-6790,83,Ro 04-6790 86 | 3830,"SB-216,641",84,SB-216641 87 | 3835,"SB-269,970",85,SB-269970 88 | 3889,Torezolid,86,Tedizolid 89 | 3913,SEP-225289,87,Dasotraline 90 | 3918,Alpharadin,88,Radium-223 91 | 3918,Alpharadin,89,Radium-223 chloride 92 | 3921,"5,6-Methylenedioxy-N-methyl-2-aminoindane",90,MDMAI 93 | 3960,Pivalylbenzhydrazine,91,Pivhydrazine 94 | 3974,MMDMA,92,MMDMA (drug) 95 | 4013,Salvinorin B methoxymethyl ether,93,2-Methoxymethyl salvinorin B 96 | 4014,Isofagomine tartrate,94,Afegostat 97 | 4044,"SB-215,505",95,SB-215505 98 | 4045,SR-48692,96,Meclinertant 99 | 4047,"GR-127,935",97,GR-127935 100 | 4081,Methylenedioxymethylphenethylamine,98,Homarylamine 101 | 4096,Bromadol,99,BDPC 102 | 4103,Norethindrone enanthate,100,Norethisterone enanthate 103 | 4114,"CP-809,101",101,CP-809101 104 | 4152,5-(2-Aminopropyl)indole,102,5-IT 105 | 4154,Chromium polynicotinate,103,Chromium(III) nicotinate 106 | 4215,BMS-945429,104,Clazakizumab 107 | 4223,Afobazole,105,Fabomotizole 108 | 4264,Toplexil,106,Oxomemazine/guaifenesin 109 | 4288,Lucinactant,107,Surfaxin 110 | 4347,Lu AE58054,108,Idalopirdine 111 | 4368,Asasantin,109,Acetylsalicylic acid/dipyridamole 112 | 4451,Loxtidine,110,Lavoltidine 113 | 4475,PCPr,111,Phencyclamine 114 | 4484,Erbulozole,112,Erbuzole 115 | 4507,Emoxypine,113,6-Methyl-2-ethyl-3-hydroxypyridine 116 | 4551,"LY-293,284",114,LY-293284 117 | 4588,Ro5-2904,115,Triflunordazepam 118 | 4597,"S-14,671",116,S-14671 119 | 4611,AEZS-108,117,Zoptarelin doxorubicin 120 | 4658,Meclozine,118,Meclizine 121 | 4662,"CP-93,129",119,CP-93129 122 | 4687,2C-TFM-NBOMe,120,25TFM-NBOMe 123 | 4688,2C-C-NBOMe,121,25C-NBOMe 124 | 4703,Thiopropamine,122,1-(Thiophen-2-yl)-2-aminopropane 125 | 4713,TM-38837,123,TM38837 126 | 4727,Beyaz (drug),124,Ethinyl estradiol/drospirenone/levomefolic acid 127 | 4735,Carfenazine,125,Carphenazine 128 | 4736,4-Hydroxycyclophosphamide,126,Perfosfamide 129 | 4746,"Org 12,962",127,Org 12962 130 | 4748,Festinavir,128,Censavudine 131 | 4780,Omecamtiv mecarbil,129,Omecamtiv Mecarbil (CK-1827452) 132 | 4926,"SB-258,719",130,SB-258719 133 | 4927,"GR-113,808",131,GR-113808 134 | 4931,"LY-310,762",132,LY-310762 135 | 4932,"SB-204,070",133,SB-204070 136 | 4933,"CJ-033,466",134,CJ-033466 137 | 4934,"SB-206,553",135,SB-206553 138 | 4935,"CP-135,807",136,CP-135807 139 | 4984,Lumacaftor,137,VX-809 140 | 4990,Tenofovir alafenamide fumarate,138,Tenofovir alafenamide 141 | 4990,Tenofovir alafenamide fumarate,139,GS 7340 142 | 4997,Sofosbuvir,140,PSI-7977 143 | 4998,BA058,141,Abaloparatide 144 | 5017,Ana773,142,RG7795 145 | 5017,Ana773,143,ANA773 146 | 5031,C16 (PKR inhibitor),144,C16 (drug) 147 | 5052,LCZ696,145,Valsartan/sacubitril 148 | 5224,AKB48 (drug),146,APINACA 149 | 5249,TAS-102,147,Trifluridine/tipiracil 150 | 5262,GLYX-13,148,Rapastinel 151 | 5273,SDB-001,149,APICA (synthetic cannabinoid drug) 152 | 5332,PRT064445,150,Andexanet alfa 153 | 5333,Mk-1439,151,Doravirine 154 | 5334,GS-5885,152,Ledipasvir 155 | 5336,Bi 207127,153,Deleobuvir 156 | 5344,PB-22,154,QUPIC 157 | 5345,BB-22 (drug),155,QUCHIC 158 | 5348,GFT505,156,Elafibranor 159 | -------------------------------------------------------------------------------- /src/autofj/benchmark/Monarch/right.csv: -------------------------------------------------------------------------------- 1 | id,title 2 | 0,Emperor Kinmei 3 | 1,Emperor Tenmu 4 | 2,Emperor Monmu 5 | 3,Anastasius I Dicorus 6 | 4,Theodoric the Great 7 | 5,Xerxes I 8 | 6,Herod Agrippa 9 | 7,Theodosius III 10 | 8,Michael V 11 | 9,Michael VI Stratiotikos 12 | 10,Berenice II of Egypt 13 | 11,Oswiu 14 | 12,Antiochus V Eupator 15 | 13,Hiram I 16 | 14,Khosrow I 17 | 15,Khosrow II 18 | 16,Phetsarath Ratanavongsa 19 | 17,Stefan Radoslav 20 | 18,"Nur ad-Din, atabeg of Aleppo" 21 | 19,Dost Mohammad Khan (Emir of Afghanistan) 22 | 20,Dost Mohammad Barakzai 23 | 21,Yax Nuun Ahiin I 24 | 22,Cyaxares 25 | 23,Bardylis 26 | 24,Ajen Yohl Mat 27 | 25,K'uk' Bahlam I 28 | 26,Ahkal Mo' Nahb I 29 | 27,Kan Bahlam I 30 | 28,K'inich Ahkal Mo' Nahb III 31 | 29,Idris I 32 | 30,Nicias (Indo-Greek king) 33 | 31,Avittom Thirunal Balarama Varma 34 | 32,Dutthagamani of Anuradhapura 35 | 33,Fat′h-Ali Shah Qajar 36 | 34,Batbayan 37 | 35,Shuja Shah Durrani 38 | 36,Artemidoros Aniketos 39 | 37,Qutb al-Din Aibak 40 | 38,Qutbu l-Din Aibak 41 | 39,An-Nasir Muhammad 42 | 40,Demetrius III Aniketos 43 | 41,Ahmad ibn Ibrahim al-Ghazi 44 | 42,Zhanibek Khan 45 | 43,Phraates II 46 | 44,Phraates I 47 | 45,Artabanus V of Parthia 48 | 46,Artabanus IV of Parthia 49 | 47,Artabanus III of Parthia 50 | 48,Artabanus II of Parthia 51 | 49,Yaqub al-Mansur 52 | 50,Phriapatius 53 | 51,Gotarzes I 54 | 52,Yasovarman 55 | 53,Gelawdewos of Ethiopia 56 | 54,Yaqob of Ethiopia 57 | 55,Susenyos of Ethiopia 58 | 56,Fasilides of Ethiopia 59 | 57,Yohannes I of Ethiopia 60 | 58,Iyasu I of Ethiopia 61 | 59,Amda Seyon I of Ethiopia 62 | 60,Dawit I of Ethiopia 63 | 61,Telephos Euergetes 64 | 62,Baeda Maryam of Ethiopia 65 | 63,Ya'qub-i Laith Saffari 66 | 64,Iyoas I of Ethiopia 67 | 65,Susenyos II of Ethiopia 68 | 66,Tekle Giyorgis I of Ethiopia 69 | 67,Salomon II of Ethiopia 70 | 68,Hezqeyas of Ethiopia 71 | 69,Baeda Maryam III of Ethiopia 72 | 70,Wazir Akbar Khan 73 | 71,Asaf ad-Dawlah Mir Ali Salabat Jang 74 | 72,Ayub Khan (Emir of Afghanistan) 75 | 73,Ghazi Ayub Khan 76 | 74,Ssuuna II of Buganda 77 | 75,Caranus (king) 78 | 76,Coenus (king) 79 | 77,Empress Ma (Han dynasty) 80 | 78,Asaf-ud-Daula 81 | 79,Izz al-Dawla 82 | 80,'Izz al-Daula 83 | 81,"Yusuf II, Almohad caliph" 84 | 82,Abd al-Wahid I 85 | 83,Mu'izz al-Daula 86 | 84,Imad al-Dawla 87 | 85,'Imad al-Daula 88 | 86,Majd al-Dawla 89 | 87,Ljudevit 90 | 88,Constantine Doukas 91 | 89,Idris II 92 | 90,Ismail Samani 93 | 91,Rajasinha II of Kandy 94 | 92,Drest X 95 | 93,Bridei VII 96 | 94,Ciniod II 97 | 95,Bridei VI 98 | 96,Ciniod I 99 | 97,Raja Dahir 100 | 98,Man Singh II 101 | 99,Shao Kang 102 | 100,Ram Singh I 103 | 101,Nasir ad-Din Qabacha 104 | 102,Ilyas ibn Asad 105 | 103,Baraq (Golden Horde) 106 | 104,Olafr Godredsson 107 | 105,Amr Saffari 108 | 106,Ahmad I bin Mohammed 109 | 107,Khalaf I 110 | 108,Mohammad Khodabanda 111 | 109,Muhammad of Ghazni 112 | 110,Mawdud of Ghazni 113 | 111,Maw'dud of Ghazni 114 | 112,Jaswant Singh of Bharatpur 115 | 113,Keshri Singh 116 | 114,Mohammadu Maccido 117 | 115,Agrasen 118 | 116,Abdullah ibn Tahir al-Khurasani 119 | 117,Parakramabahu I of Polonnaruwa 120 | 118,Yahya ibn al-Qasim 121 | 119,Al-Hajjam al-Hasan ibn Muhammad ibn al-Qasim 122 | 120,Al-Qasim Guennoun 123 | 121,Abul-Aish Ahmad 124 | 122,Fadl ibn Muhammad 125 | 123,Fadl I 126 | 124,Muhammad ibn Abi'l-Saj 127 | 125,Yusuf Ibn Abi'l-Saj 128 | 126,Mirwais Hotak 129 | 127,Pandara Vanniyan 130 | 128,Saadatullah Khan I 131 | 129,Tidal (king) 132 | 130,Alexander II Mircea 133 | 131,Mahmud Hotak 134 | 132,Beorna of East Anglia 135 | 133,Ashraf Hotak 136 | 134,Chashtana 137 | 135,Tia (princess) 138 | 136,Abd al-Latif ibn Muhammad Taraghay Ulughbek 139 | 137,Shivaji of Thanjavur 140 | 138,Amar Singh of Thanjavur 141 | 139,Muhammad ibn Shaddad 142 | 140,Lashkari ibn Muhammad 143 | 141,Marzuban ibn Muhammad ibn Shaddad 144 | 142,Marzuban ibn Muhammad (Shaddadid) 145 | 143,Abu'l-Fath Musa 146 | 144,Lashkari ibn Musa 147 | 145,Lashkari ibn Fadl 148 | 146,Anushirvan ibn Lashkari 149 | 147,Fadl ibn Shavur 150 | 148,Fadl II 151 | 149,Abu'l-Aswar Shavur ibn Fadl 152 | 150,Abu'l-Asvar Shavur I 153 | 151,Ashot ibn Shavur 154 | 152,"Vikramabahu, Prince of Ruhuna" 155 | 153,Tode Mongke 156 | 154,Vicar-ul-Umra 157 | 155,Chaiyasiri 158 | 156,Najmuddin Ali Khan 159 | 157,Najabat Ali Khan 160 | 158,Mansur Ali Khan of Bengal 161 | 159,Lutf Allah (Sarbadar) 162 | 160,Uthram Thirunal Marthanda Varma 163 | 161,Monunius I 164 | 162,Monunius 165 | 163,Bato (Dardanian chieftain) 166 | 164,Cleitus (Dardania) 167 | 165,Mytilus (Dardania) 168 | 166,Mytilus 169 | 167,Yax Nuun Ahiin II 170 | 168,Kapeliele Faupala 171 | 169,Mahathammaracha II 172 | 170,Kaloyan and Desislava 173 | 171,Vimaladharmasuriya II of Kandy 174 | 172,Maravarman Rajasimha III 175 | 173,"As-Salih Ismail, Emir of Damascus" 176 | 174,Akhsitan I 177 | 175,"Kavan Tissa, Prince of Ruhuna" 178 | 176,"Gothabhaya, Prince of Ruhuna" 179 | 177,"Mahanaga, Prince of Ruhuna" 180 | 178,"Yatala Tissa, Prince of Ruhuna" 181 | 179,Muhammad ibn Suri 182 | 180,Ghiyath al-Din Muhammad 183 | 181,Hussain Hotak 184 | 182,Vistahm 185 | 183,Parameswara (king) 186 | 184,George I (Miskito) 187 | 185,Monunius II 188 | 186,Monunius of Dardania 189 | 187,Indradyumna (Mythological King) 190 | 188,"Idris I, Almohad Caliph" 191 | 189,"Idris II, Almohad Caliph" 192 | 190,Bhuvanaikabahu VII of Kotte 193 | 191,Parakramabahu IX of Kotte 194 | 192,Sihyaj Chan K'awiil II 195 | 193,"Al-Ashraf Musa, Emir of Homs" 196 | 194,Moggallana II 197 | 195,Khosrow IV 198 | 196,Khosrow III 199 | 197,Haytham b. Khalid 200 | 198,Muhammad I of Shirvan 201 | 199,Muhammad II Shirvanshah 202 | 200,Haytham II of Shirvan 203 | 201,Ali I of Shirvan 204 | 202,Muhammad II of Shirvan 205 | 203,Muhammad III Shirvanshah 206 | 204,Ahmad of Shirvan 207 | 205,Muhammad IV of Shirvan 208 | 206,Muhammad V Shirvanshah 209 | 207,Yazid II of Shirvan 210 | 208,Manuchihr I of Shirvan 211 | 209,Ali II of Shirvan 212 | 210,Qubad of Shirvan 213 | 211,Ali III of Shirvan 214 | 212,Sallar of Shirvan 215 | 213,Fariburz I 216 | 214,Manuchihr II of Shirvan 217 | 215,Afridun I 218 | 216,Manuchihr III of Shirvan 219 | 217,Afridun II 220 | 218,Shahanshah (Shirvanshah) 221 | 219,Fariburz II 222 | 220,Farrukhzad I 223 | 221,Gushtasb I 224 | 222,Fariburz III 225 | 223,Akhsitan II 226 | 224,Farrukhzad II 227 | 225,Akhsitan III 228 | 226,Keykavus I (Shirvanshah) 229 | 227,Kayqubad I of Shirvan 230 | 228,Kavus I 231 | 229,Hushang of Shirvan 232 | 230,Daniyal (Mughal prince) 233 | 231,Lu'lu' al-Kabir 234 | 232,Senekerim-Hovhannes Artsruni 235 | 233,Senekerim-Hovhannes 236 | 234,Parakramabahu Epa of Gampola 237 | 235,Parakramabahu V of Gampola 238 | 236,Varaz-Tiridates I 239 | 237,Varaz Trdat I 240 | 238,Gaumata 241 | 239,Parakramabahu II of Dambadeniya 242 | 240,Parakramabahu III of Dambadeniya 243 | 241,Parakramabahu IV of Dambadeniya 244 | -------------------------------------------------------------------------------- /src/autofj/benchmark/ShoppingMall/right.csv: -------------------------------------------------------------------------------- 1 | id,title 2 | 0,The Galleria 3 | 1,The Shops at Prudential Center 4 | 2,Fashion Centre at Pentagon City 5 | 3,Sydney Central Plaza 6 | 4,Mahatma Jyotiba Phule Mandai 7 | 5,Disney Springs 8 | 6,Downtown Disney (Walt Disney World Resort) 9 | 7,Bluewater 10 | 8,"The Triangle, Manchester" 11 | 9,The Summit (Birmingham) 12 | 10,Box Hill Central Shopping Centre 13 | 11,Victoria Centre 14 | 12,Irvine Spectrum 15 | 13,Liffey Valley 16 | 14,The Square Tallaght 17 | 15,Asheville Outlets 18 | 16,Metrotown 19 | 17,St. Louis Mills 20 | 18,"Northlake Mall (Charlotte, North Carolina)" 21 | 19,"Grand Central, Birmingham" 22 | 20,Philadelphia Mills 23 | 21,"Touchwood, Solihull" 24 | 22,Atlantic Terminal (Shopping Mall) 25 | 23,Forest Fair Village 26 | 24,Kingston Collection 27 | 25,Knox City Shopping Centre 28 | 26,Hamilton Place 29 | 27,"Langham Place, Hong Kong" 30 | 28,Centrale 31 | 29,Strathpine Centre 32 | 30,Broadmarsh 33 | 31,Rhodes Waterside 34 | 32,The Centrepoint 35 | 33,Wilton Mall at Saratoga 36 | 34,Palm Beach Outlets 37 | 35,Great Northern Mall (New York) 38 | 36,Bankstown Central Shopping Centre 39 | 37,Sunway Putra Mall 40 | 38,HarbourFront Centre (Singapore) 41 | 39,CityPlace (West Palm Beach) 42 | 40,Galleria Shopping Centre (Toronto) 43 | 41,"Westgate, Oxford" 44 | 42,Annapolis Mall 45 | 43,Brentwood Town Centre (mall) 46 | 44,"St David's, Cardiff" 47 | 45,Toombul Shopping Centre 48 | 46,Grand Indonesia 49 | 47,Yorkville Village 50 | 48,Connecticut Post Mall 51 | 49,The Mall at Fox Run 52 | 50,Downtown Commons 53 | 51,Westfield Downtown Plaza 54 | 52,MainPlace Mall 55 | 53,Capital Mall 56 | 54,Fox Valley Mall 57 | 55,Chicago Ridge Mall 58 | 56,Westfield Louis Joliet 59 | 57,Southlake Mall (Indiana) 60 | 58,Franklin Park Mall 61 | 59,Westfield SouthPark 62 | 60,Great Northern Mall (Ohio) 63 | 61,Belden Village Mall 64 | 62,Westfield Solano 65 | 63,Westfield Eastland 66 | 64,Plaza West Covina 67 | 65,Parkway Plaza 68 | 66,Riyadh Sahara Mall 69 | 67,"Crossroads Center (St. Cloud, Minnesota)" 70 | 68,St. Laurent Shopping Centre 71 | 69,Artegon Marketplace 72 | 70,Zona Rosa (Kansas City) 73 | 71,Lutwyche City Shopping Centre 74 | 72,Les Promenades Gatineau 75 | 73,Westgate Entertainment District 76 | 74,Victoria Gardens (Rancho Cucamonga) 77 | 75,Woodburn Premium Outlets 78 | 76,El Con Center 79 | 77,Pacific Epping 80 | 78,Parmatown Mall 81 | 79,Auburn Mall (Massachusetts) 82 | 80,"Auburn Mall (Auburn, Massachusetts)" 83 | 81,Cataraqui Centre 84 | 82,Westfield Royal Victoria Place 85 | 83,Playground Pier 86 | 84,Conestoga Mall 87 | 85,Maple Hill Mall 88 | 86,Roselands Shopping Centre 89 | 87,The Mall at the Source 90 | 88,The Oaks Shopping Center 91 | 89,Santa Rosa Mall 92 | 90,"Crossroads Center (Waterloo, Iowa)" 93 | 91,The Spires Shopping Centre 94 | 92,"Change Alley, Singapore" 95 | 93,Seacon Square Srinakarin 96 | 94,Northfield Square Mall 97 | 95,GreenStreet 98 | 96,Paradise Park (mall) 99 | 97,Colonnades Shopping Centre 100 | 98,The Glen Shopping Centre 101 | 99,North Rocks Shopping Centre 102 | 100,Eastridge Mall (Gastonia) 103 | 101,The Oaks Mall (Florida) 104 | 102,The Outlet Collection at Riverwalk 105 | 103,Jantzen Beach Center 106 | 104,The Quad St. Charles 107 | 105,Galleria Shopping Centre (Perth) 108 | 106,Galleria 220 109 | 107,The Crossings at Northwest 110 | 108,Mall of Acadiana 111 | 109,The Arsenal Project of Watertown 112 | 110,Antioch Crossing 113 | 111,Omni Park 114 | 112,"The Paragon, Singapore" 115 | 113,The Legends at Village West 116 | 114,Champlain Mall 117 | 115,Splendid China Mall 118 | 116,Brunswick Shopping Centre 119 | 117,River Drive Mall 120 | 118,Plaza Mayor (Oklahoma) 121 | 119,Uni Hill Factory Outlets 122 | 120,Warrawong Plaza 123 | 121,Figtree Grove 124 | 122,Pakuranga Plaza 125 | 123,Westfield Downtown 126 | 124,Shoppes at Knollwood 127 | 125,Shangri-La Plaza 128 | 126,Shangri-La Plaza (shopping mall) 129 | 127,Lakeshore Mall 130 | 128,Great Eastern Tower 131 | 129,Hickory Hollow Mall 132 | 130,Carriage Crossing 133 | 131,Victoria Gate 134 | 132,Shelter Cove Towne Centre 135 | 133,"Bishops Corner, West Hartford" 136 | 134,Harbor Square 137 | 135,The Columbia Mall 138 | 136,The Centre on Barton 139 | 137,Centre of Tallahassee 140 | 138,La Encantada (shopping center) 141 | 139,Westfield Gateway 142 | 140,The Shoppes at Gateway 143 | 141,Forest Lake Village Shopping Centre 144 | 142,Megabox (shopping mall) 145 | 143,CastleCourt 146 | 144,Plaza Central (Texas) 147 | 145,City Centre Deira 148 | 146,Paddock Shops 149 | 147,Promenade (shopping centre) 150 | 148,Cortana Mall 151 | 149,Mercury Shopping Centre 152 | 150,Bentley Bridge 153 | 151,Iluma 154 | 152,"City Centre Plaza, Rockhampton" 155 | 153,SuperMall of the Great Northwest 156 | 154,Downtown Summerlin (shopping center) 157 | 155,Hillside Village 158 | 156,Newpark Mall 159 | 157,Springfield Mall (Virginia) 160 | 158,Florence Mall (Alabama) 161 | 159,Ellsworth Place 162 | 160,Taigum Square Shopping Centre 163 | 161,Sta. Lucia East Grandmall 164 | 162,Florence Mall (Kentucky) 165 | 163,SM City Taguig 166 | 164,Karingal Hub Shopping Centre 167 | 165,"Regency Square Mall (Jacksonville, Florida)" 168 | 166,"The Broadway, Bradford" 169 | 167,Motherwell Shopping Centre 170 | 168,Bole Dembel Shopping Center 171 | 169,Worcester Common Outlets 172 | 170,City Mall (Amman) 173 | 171,Winrock Center 174 | 172,"Northway Mall (Colonie, New York)" 175 | 173,"Saratoga Mall (Wilton, New York)" 176 | 174,Chesapeake Square 177 | 175,LuLu International Shopping Mall 178 | 176,Lulu Cochin Mall 179 | 177,City Centre Mirdif 180 | 178,Shaktan Thampuran Private Bus Stand 181 | 179,Star City (shopping mall) 182 | 180,"Auburn Mall (Auburn, Alabama)" 183 | 181,"Village Mall (Auburn, Alabama)" 184 | 182,"The Exchange, Ilford" 185 | 183,MY MALL Limassol 186 | 184,CastleTown Shoppingworld 187 | 185,The Market Common 188 | 186,The Kenwood Collection 189 | 187,The Pavilions 190 | 188,Settlers Green 191 | 189,The Summit (Reno) 192 | 190,The Summit (Wheatfield) 193 | 191,"Domain Central, Townsville" 194 | 192,Granada Centre 195 | 193,The Base (shopping centre) 196 | 194,The Base (Shopping Centre) 197 | 195,Tulsa Promenade 198 | 196,West Manchester Town Center 199 | 197,Lakeview Square Mall 200 | 198,Palladium World 201 | 199,Lavington Square Shopping Centre 202 | 200,City Centre Bahrain 203 | 201,Spinderiet 204 | 202,Fars Shopping Complex 205 | 203,Kukui Grove Center 206 | 204,Riverdale Village 207 | 205,Festival Alabang 208 | 206,Toa Payoh Entertainment Centre 209 | 207,"University Place (Chapel Hill, North Carolina)" 210 | 208,The Shoppes at Sands 211 | 209,Enfield Square Mall 212 | 210,Harbor Point (Subic) 213 | 211,Bass Pro Complex (Dieppe) 214 | 212,Glen Burnie Mall 215 | 213,South City Mall 216 | 214,Military Circle Mall 217 | 215,West 12 218 | 216,West 12 Shepherd's Bush 219 | 217,Brentwood Country Mart 220 | 218,South Point Mall 221 | 219,Cross County Mall 222 | 220,Pacific Werribee 223 | 221,Monroe Crossing 224 | 222,Northwoods Mall (Illinois) 225 | 223,Square 2 226 | 224,Square 2 (shopping mall) 227 | 225,The Shops at Westshore 228 | 226,ViaPort Florida 229 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AutoFJ 2 | 3 | The official code for our SIGMOD 2021 paper: [Auto-FuzzyJoin: Auto-Program Fuzzy Similarity Joins Without Labeled Examples](https://arxiv.org/abs/2103.04489). To reproduce the main results in our paper, switch to `reproduce` branch. 4 | 5 | AutoFJ automatically produces record pairs that approximately match in two input 6 | tables without requiring explicit human input such as labeled training data. Using AutoFJ, 7 | users only need to provide two input tables, and a desired precision target (say 0.9). 8 | AutoFJ leverages the fact that one of the input is a reference table to 9 | automatically program fuzzy-joins that meet the precision target in expectation, 10 | while maximizing fuzzy-join recall (defined as the number of correctly joined records). 11 | 12 | In AutoFJ, the left table refers to a reference table, which is assumed to be almost "duplicate-free". AutoFJ attempts to solve many-to-one join problems, where each record in the right table will be joined with at most one record in the left table, but each record in left table can be joined with multiple records in the right table. 13 | 14 | AutoFJ also provides a benchmark that contains [50 diverse datasets](https://github.com/chu-data-lab/AutomaticFuzzyJoin/blob/master/src/autofj/50-single-column-datasets.md) for single-column fuzzy-join tasks constructed from [DBPedia](https://www.dbpedia.org). 15 | 16 | ## Installation 17 | 18 | Install the package using pip 19 | 20 | ``` 21 | pip install autofj 22 | ``` 23 | 24 | ## Usage 25 | 26 | Let `left_table` be the reference table and `right_table` be another input table. The two tables are assumed to have the same schema and have an id column named `id_column`. To join `left_table` and `right_table` with 27 | precision target 0.9, run the following code. The result will be a joined table of record pairs that are identified as matches from two input tables. 28 | ```python 29 | from autofj import AutoFJ 30 | fj = AutoFJ(precision_target=0.9) 31 | result = fj.join(left_table, right_table, id_column) 32 | ``` 33 | 34 | To load a benchmark dataset named as `dataset_name`, run the following code. Each dataset contains a left table (reference table), a right table and a ground-truth table of matched record pairs. The id column of each dataset is named as "id" and the column to be joined is named as "title". The names of all benchmark datasets are listed [here](https://github.com/chu-data-lab/AutomaticFuzzyJoin/blob/master/src/autofj/50-single-column-datasets.md). 35 | ```python 36 | from autofj.datasets import load_data 37 | left_table, right_table, gt_table = load_data(dataset_name) 38 | ``` 39 | ## Example 40 | Run the following code to join the left and right table of TennisTournament dataset. 41 | ```python 42 | from autofj.datasets import load_data 43 | from autofj import AutoFJ 44 | left_table, right_table, gt_table = load_data("TennisTournament") 45 | fj = AutoFJ(precision_target=0.9) 46 | result = fj.join(left_table, right_table, "id") 47 | ``` 48 | 49 | ## Documentation 50 | ```python 51 | class AutoFJ(object): 52 | def __init__(self, 53 | precision_target=0.9, 54 | join_function_space="autofj_sm", 55 | distance_threshold_space=50, 56 | column_weight_space=10, 57 | blocker=None, 58 | n_jobs=-1, 59 | verbose=False): 60 | ``` 61 | 62 | ### Parameters 63 | * **precision_target: *float*, default=0.9**
64 | Precision target. The value is taken from 0-1. The default value is 0.9. 65 | 66 | * **join_function_space: *string, dict or list of objects*, default="autofj_sm"**
67 | Space of join functions. There are three ways to define the space of join functions: 68 | 1. Use the name (string) of built-in join function space. There are three 69 | options, including "autofj_lg", "autofj_md" and "autofj_sm" that use 70 | 136, 68 and 14 join functions, respectively. Using less join functions 71 | can improve efficiency but may worsen performance. 72 | 2. Use a dict specifying the options for preprocessing methods, 73 | tokenization methods, token weighting methods and distance functions. 74 | The space will be the cartesian product of all options in the dict. 75 | See [options.py](https://github.com/chu-data-lab/AutomaticFuzzyJoin/blob/master/src/autofj/join_function_space/options.py) for defining join functions using 76 | a dict. 77 | 3. Use a list of customized JoinFunction objects. Define JoinFunction class using prototype in [join_function.py](https://github.com/chu-data-lab/AutomaticFuzzyJoin/blob/master/src/autofj/join_function_space/join_function/join_function.py). 78 | 79 | * **distance_threshold_space: *int or list of floats*, default=50**
80 | The number of candidate distance thresholds or a list of candidate 81 | distance thresholds in the space. If the number of distance thresholds 82 | (integer) is given, distance thresholds are spaced evenly from 0 to 1. 83 | Otherwise, it should be a list of floats from 0 to 1. Using fewer candidates 84 | can improve efficiency but may worsen performance. 85 | 86 | * **column_weight_space: *int or list of floats*, default=10**
87 | The number of candidate column weights or a list of candidate 88 | column weights in the space. If the number of column weights 89 | (integer) is given, column weights are spaced evenly from 0 to 1. 90 | Otherwise, it should be a list of floats from 0 to 1. Using fewer candidates 91 | can improve efficiency but may worsen performance. 92 | 93 | 94 | * **blocker: *None or a Blocker object*, default None**
95 | A Blocker object that performs blocking on two tables. If None, use 96 | the built-in blocker. For using customized blocker, define Blocker class using prototype in [blocker.py](https://github.com/chu-data-lab/AutomaticFuzzyJoin/blob/master/src/autofj/blocker/blocker.py). 97 | 98 | * **n_jobs : *int*, default=-1**
99 | Number of CPU cores used. -1 means using all processors. 100 | 101 | * **verbose: *bool*, default=False**
102 | Whether to print logging 103 | 104 | ### Attributes 105 | * **selected_column_weights: *dict***
106 | The columns and column weights selected by the algorithm. The key is the 107 | column name, the value is the weight selected for the column. 108 | 109 | * **selected_join_configs: *list of tuples***
110 | The union of join configurations selected by the algorithm. Each tuple 111 | (join_function, threshold) in the list is a join configuration that 112 | consists of the name of the join function and its distance threshold. 113 | 114 | ### Methods 115 | ```python 116 | join(left_table, right_table, id_column, on=None) 117 | ``` 118 | 119 | Join left table and right table. 120 | 121 | #### Parameters 122 | * **left_table: *pandas.DataFrame***
123 | Reference table. The left table is assumed to be almost duplicate-free, which means it has no or only few duplicates. 124 | 125 | * **right_table: *pandas.DataFrame***
126 | Another input table. 127 | 128 | * **id_column: *string***
129 | The name of id column in the two tables. This column will not be 130 | used to join two tables. 131 | 132 | * **on: *list or None*, default=None**
133 | A list of column names (multi-column fuzzy join) that the two tables 134 | will be joined on. If None, two tables will be joined on all columns 135 | that exist in both tables, excluding the id column. 136 | 137 | #### Return 138 | * ***pandas.DataFrame***
139 | A table of joining pairs. The columns of left table are 140 | suffixed with "_l" and the columns of right table are suffixed 141 | with "_r". -------------------------------------------------------------------------------- /src/autofj/benchmark/Magazine/right.csv: -------------------------------------------------------------------------------- 1 | id,title 2 | 0,The Raven: Anarchist Quarterly 3 | 1,Next Magazine (Chinese magazine) 4 | 2,Men's Health 5 | 3,Seventeen (American magazine) 6 | 4,Liberty (libertarian magazine) 7 | 5,Q (magazine) 8 | 6,Run (magazine) 9 | 7,Net (magazine) 10 | 8,Foreign Policy (magazine) 11 | 9,This England (magazine) 12 | 10,City Journal (New York City) 13 | 11,L'Obs 14 | 12,Cat Fancy 15 | 13,Answer Me! (magazine) 16 | 14,MikroBitti 17 | 15,Painkiller Magazine 18 | 16,The Australian Women's Weekly 19 | 17,PCWorld (magazine) 20 | 18,The Wipers Times 21 | 19,Chart Attack 22 | 20,Panorama (magazine) 23 | 21,Now (magazine) 24 | 22,Automobile (magazine) 25 | 23,MIT Technology Review 26 | 24,L'Express 27 | 25,Hotdog (magazine) 28 | 26,Storm Track 29 | 27,Next Generation Magazine 30 | 28,Playboy magazine (Brazilian issue) 31 | 29,Perfect 10 32 | 30,O: The Oprah Magazine 33 | 31,Monthly Afternoon 34 | 32,CPC Attack! 35 | 33,Comics Scene (magazine) 36 | 34,X-One 37 | 35,Compute!'s Gazette 38 | 36,Slam (magazine) 39 | 37,Custom PC (magazine) 40 | 38,Owl (magazine) 41 | 39,Back Issue! 42 | 40,Atari Age (magazine) 43 | 41,Sugarscape.com 44 | 42,Sugar Magazine 45 | 43,Railroad Magazine 46 | 44,Relevant (magazine) 47 | 45,Weekly Young Magazine 48 | 46,Dilema Veche 49 | 47,PC User 50 | 48,The New Atlantis 51 | 49,Maayan (magazine) 52 | 50,Armchair General (magazine) 53 | 51,Chip-India 54 | 52,Chronicle of Current Events 55 | 53,Animation Magazine 56 | 54,Gempak Starz 57 | 55,Hero (magazine) 58 | 56,Hero (gay magazine) 59 | 57,ANALOG Computing 60 | 58,Linux For You 61 | 59,NW (magazine) 62 | 60,Ray Li (magazine) 63 | 61,Dazed 64 | 62,AirForces Monthly 65 | 63,True Detective (magazine) 66 | 64,SKY Magazine 67 | 65,PlayStation Magazine (Italy) 68 | 66,Amtix 69 | 67,In-Training (magazine) 70 | 68,America (Jesuit magazine) 71 | 69,"Brick, A Literary Journal" 72 | 70,Monthly Magazine Z 73 | 71,Swimming World 74 | 72,Humanism Ireland 75 | 73,Wholphin (DVD magazine) 76 | 74,Blitz (magazine) 77 | 75,North & South (New Zealand magazine) 78 | 76,Leading Edge (magazine) 79 | 77,Leading Edge (fiction magazine) 80 | 78,Executive Travel 81 | 79,CHIP (magazine) 82 | 80,Bowlers Journal 83 | 81,Out Front (newspaper) 84 | 82,Sporting Life (magazine) 85 | 83,BlackBook (magazine) 86 | 84,Washingtonian (magazine) 87 | 85,CKM (adult magazine) 88 | 86,Your Family Tree Magazine 89 | 87,The New-York Magazine 90 | 88,Backstage (magazine) 91 | 89,Borderline Comics Magazine 92 | 90,CHARGE! 93 | 91,Now (UK magazine) 94 | 92,NOW (British magazine) 95 | 93,Flux Magazine 96 | 94,Flux Magazine (US) 97 | 95,16 Magazine 98 | 96,Inquiry (magazine) 99 | 97,Xbox World 360 100 | 98,YOU (South African magazine) 101 | 99,FACTS (magazine) 102 | 100,Bluff (magazine) 103 | 101,Kvant (magazine) 104 | 102,Samakalika Malayalam Vaarika 105 | 103,Wink (manhwa) 106 | 104,Resurgence 107 | 105,Essentials (magazine) 108 | 106,Storyworks 109 | 107,Tilllate Magazine 110 | 108,The Messenger Magazine 111 | 109,Guernica (magazine) 112 | 110,The Fed (newspaper) 113 | 111,Familia (magazine) 114 | 112,Commodore Power/Play 115 | 113,Renditions (magazine) 116 | 114,House and Home 117 | 115,The Blast (magazine) 118 | 116,Epoch (magazine) 119 | 117,Sheeko magazine 120 | 118,CURSOR 121 | 119,Computing (Urdu magazine) 122 | 120,PlayStation Official Magazine - UK 123 | 121,The Boulevard Magazine 124 | 122,OffBeat 125 | 123,Urban Realm 126 | 124,Thirteen Minutes Magazine 127 | 125,Comic BomBom 128 | 126,The New Times (magazine) 129 | 127,Down East (magazine) 130 | 128,Monthly Asuka Fantasy DX 131 | 129,Due South Magazine 132 | 130,SPORT (magazine) 133 | 131,Safari (magazine) 134 | 132,Abitare 135 | 133,Touchstone (magazine) 136 | 134,Nash Country Weekly 137 | 135,Next City 138 | 136,T: The New York Times Style Magazine 139 | 137,Morbid Curiosity (magazine) 140 | 138,BeE Woman 141 | 139,Hinduism Today Magazine 142 | 140,Fortnight (magazine) 143 | 141,The Caterer 144 | 142,The Caterer (formerly Caterer and Hotelkeeper) 145 | 143,Australian 4WD Monthly 146 | 144,PlayStation Official Magazine - Australia 147 | 145,Car (magazine) 148 | 146,DIY (magazine) 149 | 147,Gulf Coast (magazine) 150 | 148,Lighting & Sound International 151 | 149,Veckorevyn 152 | 150,Light (journal) 153 | 151,Sport (magazine) 154 | 152,Venture Magazine 155 | 153,OC Metro magazine 156 | 154,Revolution magazine 157 | 155,Faithworks Magazine 158 | 156,Sidewalk magazine 159 | 157,Popstar! Magazine 160 | 158,Louisville (magazine) 161 | 159,Rattle (magazine) 162 | 160,ShortList 163 | 161,Icon (lifestyle magazine) 164 | 162,Icon Magazine 165 | 163,Modern Dog (magazine) 166 | 164,OPEN (North Dakota magazine) 167 | 165,Business Matters 168 | 166,Muziekkrant OOR 169 | 167,Boutique Design 170 | 168,EMS World 171 | 169,'47 (magazine) 172 | 170,The Hispanic Outlook in Higher Education 173 | 171,Booking (manhwa) 174 | 172,Rouleur Magazine 175 | 173,PRISM international 176 | 174,Noggin Magazine 177 | 175,Blanco y Negro (magazine) 178 | 176,Ukrainskyi Tyzhden 179 | 177,Avotaynu (magazine) 180 | 178,Success (magazine) 181 | 179,Wave Magazine 182 | 180,FACT (United Kingdom magazine) 183 | 181,Caravan magazine 184 | 182,Kiss (Japanese magazine) 185 | 183,The European (1953 magazine) 186 | 184,Maxim India (magazine) 187 | 185,The Zamboni 188 | 186,Max (German magazine) 189 | 187,Oxonian Review 190 | 188,Huntin' Fool Magazine 191 | 189,FHM India 192 | 190,The Fountain (magazine) 193 | 191,Full Circle Magazine 194 | 192,The Middle East in London 195 | 193,WildTomato 196 | 194,Classical Music (magazine) 197 | 195,Environment and Rights 198 | 196,Junk Jet 199 | 197,Online (magazine) 200 | 198,Ciak 201 | 199,Cincinnati (magazine) 202 | 200,Forza Milan! 203 | 201,REM (magazine) 204 | 202,Gariyoshi 205 | 203,Uralsky Sledopyt 206 | 204,Frank Leslie's Weekly 207 | 205,Chorus (magazine) 208 | 206,You (Japanese magazine) 209 | 207,Professional Pilot 210 | 208,Tank Magazine 211 | 209,National Wildlife 212 | 210,Suspense Magazine 213 | 211,Top Gear (Indian magazine) 214 | 212,Disco 45 215 | 213,Grafik Magazine 216 | 214,Irish America (magazine) 217 | 215,OPEN (magazine) 218 | 216,Proto (magazine) 219 | 217,Huck (magazine) 220 | 218,Minerva (archaeology magazine) 221 | 219,Port Folio (magazine) 222 | 220,Square Mile (magazine) 223 | 221,Aeroplane (magazine) 224 | 222,Canadian Immigrant 225 | 223,Art Collector (magazine) 226 | 224,Fenuxe Magazine 227 | 225,Sisterhood Magazine 228 | 226,Black Sea Security 229 | 227,Delayed Gratification magazine 230 | 228,"Studia theologica, Czech Republic" 231 | 229,Bass Musician 232 | 230,Nightshift (Oxford Music Magazine) 233 | 231,Chief Investment Officer Magazine 234 | 232,SoGlos 235 | 233,Manga Action 236 | 234,The Family Friend (magazine) 237 | 235,Antiques info magazine 238 | 236,In Out 239 | 237,WORD Magazine 240 | 238,Women in Music (periodical) 241 | 239,Illustrated Rhodesia Life 242 | 240,Royal flush magazine 243 | 241,The Trades 244 | 242,Science Reporter 245 | 243,Exame 246 | 244,Glass Mountain (magazine) 247 | 245,Musica e dischi 248 | 246,She Kicks 249 | 247,Climbing Magazine 250 | 248,Weekly Shonen Jump Alpha 251 | 249,Natural Home & Garden 252 | 250,Sportsnet Magazine 253 | 251,The Connoisseur (magazine) 254 | 252,Women with Vision! 255 | 253,Explore (magazine) 256 | 254,Zest (magazine) 257 | 255,4-Wheel & Off-Road (magazine) 258 | 256,FACTA (magazine) 259 | 257,Teenage Survival Handbook 260 | 258,Stone Soup Magazine 261 | 259,Literary Club bulgarian 262 | 260,Orange Coast Magazine 263 | 261,Respect. (magazine) 264 | 262,Shokun! 265 | 263,Today's Trucking 266 | 264,SPUR (Australian newspaper) 267 | 265,Das Gedicht 268 | 266,Film Magazine (magazine) 269 | 267,Chalachithram 270 | 268,Rock Australia Magazine 271 | 269,Ideas and Discoveries 272 | 270,Web Techniques 273 | 271,C California Style Magazine 274 | 272,Contemporary Review (Chinese magazine) 275 | 273,Dhanam (business magazine) 276 | -------------------------------------------------------------------------------- /src/autofj/join_function_space/join_function/distance_function.py: -------------------------------------------------------------------------------- 1 | """Compute distance""" 2 | import editdistance 3 | import jellyfish 4 | import collections 5 | from collections import Counter 6 | import time 7 | import numpy as np 8 | import pandas as pd 9 | import spacy 10 | 11 | 12 | """Distance Functions""" 13 | def jaccardDistance(x, y, w=None): 14 | inter = set(x).intersection(set(y)) 15 | union = set(x).union(set(y)) 16 | if w is None: 17 | sum_inter = len(inter) 18 | sum_union = len(union) 19 | else: 20 | sum_inter = sum([w[s] for s in inter]) 21 | sum_union = sum([w[s] for s in union]) 22 | d = 1 - sum_inter / (sum_union + 1e-9) 23 | return d 24 | 25 | def cosineDistance(x, y, w=None): 26 | c1 = Counter(x) 27 | c2 = Counter(y) 28 | inter = set(x).intersection(set(y)) 29 | 30 | if w is None: 31 | uv = sum([c1[s]*c2[s] for s in inter]) 32 | u = np.sqrt(sum([c1[s]**2 for s in set(x)])) 33 | v = np.sqrt(sum([c2[s]**2 for s in set(y)])) 34 | else: 35 | uv = sum([w[s]*c1[s]*w[s]*c2[s] for s in inter]) 36 | u = np.sqrt(sum([(w[s]*c1[s])**2 for s in set(x)])) 37 | v = np.sqrt(sum([(w[s]*c2[s])**2 for s in set(y)])) 38 | 39 | d = 1 - uv / (u * v + 1e-9) 40 | return d 41 | 42 | def diceDistance(x, y, w=None): 43 | inter = set(x).intersection(set(y)) 44 | union = set(x).union(set(y)) 45 | if w is None: 46 | sum_inter = len(inter) 47 | sum_union = len(union) 48 | else: 49 | sum_inter = sum([w[s] for s in inter]) 50 | sum_union = sum([w[s] for s in union]) 51 | d = 1 - (2 * sum_inter / (sum_inter + sum_union + 1e-9)) 52 | return d 53 | 54 | def maxincDistance(x, y, w=None): 55 | inter = set(x).intersection(set(y)) 56 | if w is None: 57 | sum_inter = len(inter) 58 | else: 59 | sum_inter = sum([w[s] for s in inter]) 60 | 61 | if w is None: 62 | sum_x = len(set(x)) 63 | sum_y = len(set(y)) 64 | else: 65 | sum_x = sum([w[s] for s in set(x)]) 66 | sum_y = sum([w[s] for s in set(y)]) 67 | min_sum = min(sum_x, sum_y) 68 | d = 1 - (sum_inter / (min_sum + 1e-9)) 69 | return d 70 | 71 | def intersectDistance(x, y, w=None): 72 | inter = set(x).intersection(set(y)) 73 | union = set(x).union(set(y)) 74 | if w is None: 75 | sum_inter = len(inter) 76 | sum_union = len(union) 77 | else: 78 | sum_inter = sum([w[s] for s in inter]) 79 | sum_union = sum([w[s] for s in union]) 80 | d = 1 - sum_inter / (sum_inter + sum_union + 1e-9) 81 | return d 82 | 83 | def isContain(x, y): 84 | set_x = set(x) 85 | set_y = set(y) 86 | 87 | if len(set_x) > len(set_y): 88 | return set_y.issubset(set_x) 89 | else: 90 | return set_x.issubset(set_y) 91 | 92 | def containCosineDistance(x, y, w=None): 93 | if isContain(x, y): 94 | return cosineDistance(x, y, w) 95 | else: 96 | return 1 97 | 98 | def containJaccardDistance(x, y, w=None): 99 | if isContain(x, y): 100 | return jaccardDistance(x, y, w) 101 | else: 102 | return 1 103 | 104 | def containDiceDistance(x, y, w=None): 105 | if isContain(x, y): 106 | return diceDistance(x, y, w) 107 | else: 108 | return 1 109 | 110 | def editDistance(x, y): 111 | d = editdistance.eval(x, y) 112 | return d 113 | 114 | def jaroDistance(x, y): 115 | d = 1 - jellyfish.jaro_winkler_similarity(x, y) 116 | return d 117 | 118 | def embedDistance(x, y, embedding): 119 | x = embedding(x) 120 | y = embedding(y) 121 | d = 1 - x.similarity(y) 122 | return d 123 | 124 | class DistanceFunction(object): 125 | """Distance function 126 | 127 | Parameters 128 | ---------- 129 | method: string 130 | Method of computing distance. The available methods are listed as 131 | follows. 132 | Set-based distance 133 | - jaccardDistance 134 | - cosineDistance 135 | - diceDistance 136 | - maxincDistance 137 | - intersectDistance 138 | - containCosineDistance 139 | - containJaccardDistance 140 | - containDiceDistance 141 | Char-based distance 142 | - editDistance 143 | - jaroDistance 144 | 145 | """ 146 | def __init__(self, method): 147 | self.method = method 148 | if method == "jaccardDistance": 149 | self.func = jaccardDistance 150 | elif method == "cosineDistance": 151 | self.func = cosineDistance 152 | elif method == "diceDistance": 153 | self.func = diceDistance 154 | elif method == "maxincDistance": 155 | self.func = maxincDistance 156 | elif method == "intersectDistance": 157 | self.func = intersectDistance 158 | elif method == "editDistance": 159 | self.func = editDistance 160 | elif method == "jaroDistance": 161 | self.func = jaroDistance 162 | elif method == "containCosineDistance": 163 | self.func = containCosineDistance 164 | elif method == "containJaccardDistance": 165 | self.func = containJaccardDistance 166 | elif method == "containDiceDistance": 167 | self.func = containDiceDistance 168 | elif method == "embedDistance": 169 | self.func = embedDistance 170 | self.embedding = spacy.load("en_core_web_lg") 171 | else: 172 | raise Exception("{} is an invalid distance function" 173 | .format(method)) 174 | 175 | def compute_distance(self, LR, weight=None): 176 | """"Compute distance score between tuple pairs 177 | 178 | Parameters: 179 | ---------- 180 | LR: pd.DataFrame 181 | A table of tuple pairs. The columns of left and right values are 182 | named as "value_l" and "value_r". For char-based distance the type 183 | of values are string. For set-based distance the type of values are 184 | token set. 185 | 186 | weight: dict, default=None 187 | Weighting schema. If none, uniform weight or no weight is used. 188 | 189 | Return: 190 | ------- 191 | distance: pd.Series 192 | distance between tuple pairs for each row 193 | """ 194 | if weight is None: 195 | if self.method != "embedDistance": 196 | distance = LR.apply(lambda x: self.func(x.value_l, x.value_r), axis=1) 197 | else: 198 | distance = LR.apply(lambda x: self.func(x.value_l, x.value_r, self.embedding), axis=1) 199 | else: 200 | distance = LR.apply(lambda x: self.func(x.value_l, x.value_r, weight), axis=1) 201 | return distance 202 | 203 | # data = pd.read_csv("../../data/left.csv")["title"] 204 | # X = np.concatenate([data.values for _ in range(20)]) 205 | # X = pd.Series(X) 206 | # 207 | # L = X 208 | # R = X.sample(frac=1) 209 | # 210 | # from tokenizer import Tokenizer 211 | # tokenizer = Tokenizer("splitBySpace") 212 | # L = tokenizer.tokenize(L) 213 | # R = tokenizer.tokenize(R) 214 | # LR = pd.DataFrame({"value_l":L, "value_r":R}) 215 | # 216 | # tic = time.time() 217 | # methods = ["jaccardDistance", "maxincDistance", "containCosineDistance"] 218 | # distance_function = DistanceFunction("jaccardDistance") 219 | # distance_function.compute_distance(LR) 220 | # distance_function = DistanceFunction("maxincDistance") 221 | # distance_function.compute_distance(LR) 222 | # distance_function = DistanceFunction("containCosineDistance") 223 | # distance_function.compute_distance(LR) 224 | # print(time.time() - tic) 225 | -------------------------------------------------------------------------------- /src/autofj.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | MANIFEST.in 2 | README.md 3 | pyproject.toml 4 | setup.py 5 | src/autofj/__init__.py 6 | src/autofj/autofj.py 7 | src/autofj/datasets.py 8 | src/autofj/negative_rule.py 9 | src/autofj/utils.py 10 | src/autofj.egg-info/PKG-INFO 11 | src/autofj.egg-info/SOURCES.txt 12 | src/autofj.egg-info/dependency_links.txt 13 | src/autofj.egg-info/requires.txt 14 | src/autofj.egg-info/top_level.txt 15 | src/autofj/benchmark/.DS_Store 16 | src/autofj/benchmark/Amphibian/gt.csv 17 | src/autofj/benchmark/Amphibian/left.csv 18 | src/autofj/benchmark/Amphibian/right.csv 19 | src/autofj/benchmark/ArtificialSatellite/gt.csv 20 | src/autofj/benchmark/ArtificialSatellite/left.csv 21 | src/autofj/benchmark/ArtificialSatellite/right.csv 22 | src/autofj/benchmark/Artwork/gt.csv 23 | src/autofj/benchmark/Artwork/left.csv 24 | src/autofj/benchmark/Artwork/right.csv 25 | src/autofj/benchmark/Award/gt.csv 26 | src/autofj/benchmark/Award/left.csv 27 | src/autofj/benchmark/Award/right.csv 28 | src/autofj/benchmark/BasketballTeam/gt.csv 29 | src/autofj/benchmark/BasketballTeam/left.csv 30 | src/autofj/benchmark/BasketballTeam/right.csv 31 | src/autofj/benchmark/Case/gt.csv 32 | src/autofj/benchmark/Case/left.csv 33 | src/autofj/benchmark/Case/right.csv 34 | src/autofj/benchmark/ChristianBishop/gt.csv 35 | src/autofj/benchmark/ChristianBishop/left.csv 36 | src/autofj/benchmark/ChristianBishop/right.csv 37 | src/autofj/benchmark/ClericalAdministrativeRegion/gt.csv 38 | src/autofj/benchmark/ClericalAdministrativeRegion/left.csv 39 | src/autofj/benchmark/ClericalAdministrativeRegion/right.csv 40 | src/autofj/benchmark/Country/gt.csv 41 | src/autofj/benchmark/Country/left.csv 42 | src/autofj/benchmark/Country/right.csv 43 | src/autofj/benchmark/Device/gt.csv 44 | src/autofj/benchmark/Device/left.csv 45 | src/autofj/benchmark/Device/right.csv 46 | src/autofj/benchmark/Drug/gt.csv 47 | src/autofj/benchmark/Drug/left.csv 48 | src/autofj/benchmark/Drug/right.csv 49 | src/autofj/benchmark/Election/gt.csv 50 | src/autofj/benchmark/Election/left.csv 51 | src/autofj/benchmark/Election/right.csv 52 | src/autofj/benchmark/Enzyme/gt.csv 53 | src/autofj/benchmark/Enzyme/left.csv 54 | src/autofj/benchmark/Enzyme/right.csv 55 | src/autofj/benchmark/EthnicGroup/gt.csv 56 | src/autofj/benchmark/EthnicGroup/left.csv 57 | src/autofj/benchmark/EthnicGroup/right.csv 58 | src/autofj/benchmark/FootballLeagueSeason/gt.csv 59 | src/autofj/benchmark/FootballLeagueSeason/left.csv 60 | src/autofj/benchmark/FootballLeagueSeason/right.csv 61 | src/autofj/benchmark/FootballMatch/gt.csv 62 | src/autofj/benchmark/FootballMatch/left.csv 63 | src/autofj/benchmark/FootballMatch/right.csv 64 | src/autofj/benchmark/Galaxy/gt.csv 65 | src/autofj/benchmark/Galaxy/left.csv 66 | src/autofj/benchmark/Galaxy/right.csv 67 | src/autofj/benchmark/GivenName/gt.csv 68 | src/autofj/benchmark/GivenName/left.csv 69 | src/autofj/benchmark/GivenName/right.csv 70 | src/autofj/benchmark/GovernmentAgency/gt.csv 71 | src/autofj/benchmark/GovernmentAgency/left.csv 72 | src/autofj/benchmark/GovernmentAgency/right.csv 73 | src/autofj/benchmark/HistoricBuilding/gt.csv 74 | src/autofj/benchmark/HistoricBuilding/left.csv 75 | src/autofj/benchmark/HistoricBuilding/right.csv 76 | src/autofj/benchmark/Hospital/gt.csv 77 | src/autofj/benchmark/Hospital/left.csv 78 | src/autofj/benchmark/Hospital/right.csv 79 | src/autofj/benchmark/Legislature/gt.csv 80 | src/autofj/benchmark/Legislature/left.csv 81 | src/autofj/benchmark/Legislature/right.csv 82 | src/autofj/benchmark/Magazine/gt.csv 83 | src/autofj/benchmark/Magazine/left.csv 84 | src/autofj/benchmark/Magazine/right.csv 85 | src/autofj/benchmark/MemberOfParliament/gt.csv 86 | src/autofj/benchmark/MemberOfParliament/left.csv 87 | src/autofj/benchmark/MemberOfParliament/right.csv 88 | src/autofj/benchmark/Monarch/gt.csv 89 | src/autofj/benchmark/Monarch/left.csv 90 | src/autofj/benchmark/Monarch/right.csv 91 | src/autofj/benchmark/MotorsportSeason/gt.csv 92 | src/autofj/benchmark/MotorsportSeason/left.csv 93 | src/autofj/benchmark/MotorsportSeason/right.csv 94 | src/autofj/benchmark/Museum/gt.csv 95 | src/autofj/benchmark/Museum/left.csv 96 | src/autofj/benchmark/Museum/right.csv 97 | src/autofj/benchmark/NCAATeamSeason/gt.csv 98 | src/autofj/benchmark/NCAATeamSeason/left.csv 99 | src/autofj/benchmark/NCAATeamSeason/right.csv 100 | src/autofj/benchmark/NationalFootballLeagueSeason/gt.csv 101 | src/autofj/benchmark/NationalFootballLeagueSeason/left.csv 102 | src/autofj/benchmark/NationalFootballLeagueSeason/right.csv 103 | src/autofj/benchmark/NaturalEvent/gt.csv 104 | src/autofj/benchmark/NaturalEvent/left.csv 105 | src/autofj/benchmark/NaturalEvent/right.csv 106 | src/autofj/benchmark/Noble/gt.csv 107 | src/autofj/benchmark/Noble/left.csv 108 | src/autofj/benchmark/Noble/right.csv 109 | src/autofj/benchmark/PoliticalParty/gt.csv 110 | src/autofj/benchmark/PoliticalParty/left.csv 111 | src/autofj/benchmark/PoliticalParty/right.csv 112 | src/autofj/benchmark/Race/gt.csv 113 | src/autofj/benchmark/Race/left.csv 114 | src/autofj/benchmark/Race/right.csv 115 | src/autofj/benchmark/RailwayLine/gt.csv 116 | src/autofj/benchmark/RailwayLine/left.csv 117 | src/autofj/benchmark/RailwayLine/right.csv 118 | src/autofj/benchmark/Reptile/gt.csv 119 | src/autofj/benchmark/Reptile/left.csv 120 | src/autofj/benchmark/Reptile/right.csv 121 | src/autofj/benchmark/RugbyLeague/gt.csv 122 | src/autofj/benchmark/RugbyLeague/left.csv 123 | src/autofj/benchmark/RugbyLeague/right.csv 124 | src/autofj/benchmark/ShoppingMall/gt.csv 125 | src/autofj/benchmark/ShoppingMall/left.csv 126 | src/autofj/benchmark/ShoppingMall/right.csv 127 | src/autofj/benchmark/SoccerClubSeason/gt.csv 128 | src/autofj/benchmark/SoccerClubSeason/left.csv 129 | src/autofj/benchmark/SoccerClubSeason/right.csv 130 | src/autofj/benchmark/SoccerLeague/gt.csv 131 | src/autofj/benchmark/SoccerLeague/left.csv 132 | src/autofj/benchmark/SoccerLeague/right.csv 133 | src/autofj/benchmark/SoccerTournament/gt.csv 134 | src/autofj/benchmark/SoccerTournament/left.csv 135 | src/autofj/benchmark/SoccerTournament/right.csv 136 | src/autofj/benchmark/Song/gt.csv 137 | src/autofj/benchmark/Song/left.csv 138 | src/autofj/benchmark/Song/right.csv 139 | src/autofj/benchmark/SportFacility/gt.csv 140 | src/autofj/benchmark/SportFacility/left.csv 141 | src/autofj/benchmark/SportFacility/right.csv 142 | src/autofj/benchmark/SportsLeague/gt.csv 143 | src/autofj/benchmark/SportsLeague/left.csv 144 | src/autofj/benchmark/SportsLeague/right.csv 145 | src/autofj/benchmark/Stadium/gt.csv 146 | src/autofj/benchmark/Stadium/left.csv 147 | src/autofj/benchmark/Stadium/right.csv 148 | src/autofj/benchmark/TelevisionStation/gt.csv 149 | src/autofj/benchmark/TelevisionStation/left.csv 150 | src/autofj/benchmark/TelevisionStation/right.csv 151 | src/autofj/benchmark/TennisTournament/gt.csv 152 | src/autofj/benchmark/TennisTournament/left.csv 153 | src/autofj/benchmark/TennisTournament/right.csv 154 | src/autofj/benchmark/Tournament/gt.csv 155 | src/autofj/benchmark/Tournament/left.csv 156 | src/autofj/benchmark/Tournament/right.csv 157 | src/autofj/benchmark/UnitOfWork/gt.csv 158 | src/autofj/benchmark/UnitOfWork/left.csv 159 | src/autofj/benchmark/UnitOfWork/right.csv 160 | src/autofj/benchmark/Venue/gt.csv 161 | src/autofj/benchmark/Venue/left.csv 162 | src/autofj/benchmark/Venue/right.csv 163 | src/autofj/benchmark/Wrestler/gt.csv 164 | src/autofj/benchmark/Wrestler/left.csv 165 | src/autofj/benchmark/Wrestler/right.csv 166 | src/autofj/blocker/__init__.py 167 | src/autofj/blocker/autofj_blocker.py 168 | src/autofj/blocker/blocker.py 169 | src/autofj/join_function_space/__init__.py 170 | src/autofj/join_function_space/autofj_join_function_space.py 171 | src/autofj/join_function_space/options.py 172 | src/autofj/join_function_space/join_function/__init__.py 173 | src/autofj/join_function_space/join_function/autofj_join_function.py 174 | src/autofj/join_function_space/join_function/distance_function.py 175 | src/autofj/join_function_space/join_function/join_function.py 176 | src/autofj/join_function_space/join_function/preprocessor.py 177 | src/autofj/join_function_space/join_function/token_weight.py 178 | src/autofj/join_function_space/join_function/tokenizer.py 179 | src/autofj/optimizer/__init__.py 180 | src/autofj/optimizer/autofj_multi_column_greedy_algorithm.py 181 | src/autofj/optimizer/autofj_single_column_greedy_algorithm.py -------------------------------------------------------------------------------- /src/autofj.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.1 2 | Name: autofj 3 | Version: 0.0.6 4 | Summary: Auto-Program Fuzzy Similarity Joins Without Labeled Examples 5 | Home-page: https://github.com/chu-data-lab/AutomaticFuzzyJoin 6 | Author: Peng Li 7 | Author-email: lipengpublic@gmail.com 8 | License: UNKNOWN 9 | Platform: UNKNOWN 10 | Classifier: Programming Language :: Python :: 3 11 | Classifier: License :: OSI Approved :: MIT License 12 | Classifier: Operating System :: OS Independent 13 | Requires-Python: >=3.7 14 | Description-Content-Type: text/markdown 15 | 16 | # AutoFJ 17 | 18 | The official code for our SIGMOD 2021 paper: [Auto-FuzzyJoin: Auto-Program Fuzzy Similarity Joins Without Labeled Examples](https://arxiv.org/abs/2103.04489). To reproduce the main results in our paper, switch to `reproduce` branch. 19 | 20 | AutoFJ automatically produces record pairs that approximately match in two input 21 | tables without requiring explicit human input such as labeled training data. Using AutoFJ, 22 | users only need to provide two input tables, and a desired precision target (say 0.9). 23 | AutoFJ leverages the fact that one of the input is a reference table to 24 | automatically program fuzzy-joins that meet the precision target in expectation, 25 | while maximizing fuzzy-join recall (defined as the number of correctly joined records). 26 | 27 | In AutoFJ, the left table refers to a reference table, which is assumed to be almost "duplicate-free". AutoFJ attempts to solve many-to-one join problems, where each record in the right table will be joined with at most one record in the left table, but each record in left table can be joined with multiple records in the right table. 28 | 29 | AutoFJ also provides a benchmark that contains [50 diverse datasets](https://github.com/chu-data-lab/AutomaticFuzzyJoin/blob/master/src/autofj/50-single-column-datasets.md) for single-column fuzzy-join tasks constructed from [DBPedia](https://www.dbpedia.org). 30 | 31 | ## Installation 32 | 33 | Install the package using pip 34 | 35 | ``` 36 | pip install autofj 37 | ``` 38 | 39 | ## Usage 40 | 41 | Let `left_table` be the reference table and `right_table` be another input table. The two tables are assumed to have the same schema and have an id column named `id_column`. To join `left_table` and `right_table` with 42 | precision target 0.9, run the following code. The result will be a joined table of record pairs that are identified as matches from two input tables. 43 | ```python 44 | from autofj import AutoFJ 45 | fj = AutoFJ(precision_target=0.9) 46 | result = fj.join(left_table, right_table, id_column) 47 | ``` 48 | 49 | To load a benchmark dataset named as `dataset_name`, run the following code. Each dataset contains a left table (reference table), a right table and a ground-truth table of matched record pairs. The id column of each dataset is named as "id" and the column to be joined is named as "title". The names of all benchmark datasets are listed [here](https://github.com/chu-data-lab/AutomaticFuzzyJoin/blob/master/src/autofj/50-single-column-datasets.md). 50 | ```python 51 | from autofj.datasets import load_data 52 | left_table, right_table, gt_table = load_data(dataset_name) 53 | ``` 54 | ## Example 55 | Run the following code to join the left and right table of TennisTournament dataset. 56 | ```python 57 | from autofj.datasets import load_data 58 | from autofj import AutoFJ 59 | left_table, right_table, gt_table = load_data("TennisTournament") 60 | fj = AutoFJ(precision_target=0.9) 61 | result = fj.join(left_table, right_table, "id") 62 | ``` 63 | 64 | ## Documentation 65 | ```python 66 | class AutoFJ(object): 67 | def __init__(self, 68 | precision_target=0.9, 69 | join_function_space="autofj_sm", 70 | distance_threshold_space=50, 71 | column_weight_space=10, 72 | blocker=None, 73 | n_jobs=-1, 74 | verbose=False): 75 | ``` 76 | 77 | ### Parameters 78 | * **precision_target: *float*, default=0.9**
79 | Precision target. The value is taken from 0-1. The default value is 0.9. 80 | 81 | * **join_function_space: *string, dict or list of objects*, default="autofj_sm"**
82 | Space of join functions. There are three ways to define the space of join functions: 83 | 1. Use the name (string) of built-in join function space. There are three 84 | options, including "autofj_lg", "autofj_md" and "autofj_sm" that use 85 | 136, 68 and 14 join functions, respectively. Using less join functions 86 | can improve efficiency but may worsen performance. 87 | 2. Use a dict specifying the options for preprocessing methods, 88 | tokenization methods, token weighting methods and distance functions. 89 | The space will be the cartesian product of all options in the dict. 90 | See [options.py](https://github.com/chu-data-lab/AutomaticFuzzyJoin/blob/master/src/autofj/join_function_space/options.py) for defining join functions using 91 | a dict. 92 | 3. Use a list of customized JoinFunction objects. Define JoinFunction class using prototype in [join_function.py](https://github.com/chu-data-lab/AutomaticFuzzyJoin/blob/master/src/autofj/join_function_space/join_function/join_function.py). 93 | 94 | * **distance_threshold_space: *int or list of floats*, default=50**
95 | The number of candidate distance thresholds or a list of candidate 96 | distance thresholds in the space. If the number of distance thresholds 97 | (integer) is given, distance thresholds are spaced evenly from 0 to 1. 98 | Otherwise, it should be a list of floats from 0 to 1. Using fewer candidates 99 | can improve efficiency but may worsen performance. 100 | 101 | * **column_weight_space: *int or list of floats*, default=10**
102 | The number of candidate column weights or a list of candidate 103 | column weights in the space. If the number of column weights 104 | (integer) is given, column weights are spaced evenly from 0 to 1. 105 | Otherwise, it should be a list of floats from 0 to 1. Using fewer candidates 106 | can improve efficiency but may worsen performance. 107 | 108 | 109 | * **blocker: *None or a Blocker object*, default None**
110 | A Blocker object that performs blocking on two tables. If None, use 111 | the built-in blocker. For using customized blocker, define Blocker class using prototype in [blocker.py](https://github.com/chu-data-lab/AutomaticFuzzyJoin/blob/master/src/autofj/blocker/blocker.py). 112 | 113 | * **n_jobs : *int*, default=-1**
114 | Number of CPU cores used. -1 means using all processors. 115 | 116 | * **verbose: *bool*, default=False**
117 | Whether to print logging 118 | 119 | ### Attributes 120 | * **selected_column_weights: *dict***
121 | The columns and column weights selected by the algorithm. The key is the 122 | column name, the value is the weight selected for the column. 123 | 124 | * **selected_join_configs: *list of tuples***
125 | The union of join configurations selected by the algorithm. Each tuple 126 | (join_function, threshold) in the list is a join configuration that 127 | consists of the name of the join function and its distance threshold. 128 | 129 | ### Methods 130 | ```python 131 | join(left_table, right_table, id_column, on=None) 132 | ``` 133 | 134 | Join left table and right table. 135 | 136 | #### Parameters 137 | * **left_table: *pandas.DataFrame***
138 | Reference table. The left table is assumed to be almost duplicate-free, which means it has no or only few duplicates. 139 | 140 | * **right_table: *pandas.DataFrame***
141 | Another input table. 142 | 143 | * **id_column: *string***
144 | The name of id column in the two tables. This column will not be 145 | used to join two tables. 146 | 147 | * **on: *list or None*, default=None**
148 | A list of column names (multi-column fuzzy join) that the two tables 149 | will be joined on. If None, two tables will be joined on all columns 150 | that exist in both tables, excluding the id column. 151 | 152 | #### Return 153 | * ***pandas.DataFrame***
154 | A table of joining pairs. The columns of left table are 155 | suffixed with "_l" and the columns of right table are suffixed 156 | with "_r". 157 | 158 | -------------------------------------------------------------------------------- /src/autofj/benchmark/SoccerLeague/right.csv: -------------------------------------------------------------------------------- 1 | id,title 2 | 0,National League (English football) 3 | 1,USL W-League 4 | 2,J1 League 5 | 3,Gibraltar Premier Division 6 | 4,Persian Gulf Pro League 7 | 5,Southern Counties East Football League 8 | 6,Kent League 9 | 7,National League North 10 | 8,National League South 11 | 9,Wessex League 12 | 10,National League (division) 13 | 11,Conference Premier 14 | 12,United Counties Football League 15 | 13,Southern Combination Football League 16 | 14,Professional Indoor Football League (1998) 17 | 15,Premier Development League 18 | 16,LigaPro 19 | 17,Liga de Honra 20 | 18,West Cheshire Amateur Football League 21 | 19,K-League 22 | 20,Croatian First Football League 23 | 21,Belgian Fourth Division 24 | 22,Croatian Second Football League 25 | 23,Swedish football Division 2 26 | 24,Norwegian First Division 27 | 25,Japan Women's Football League 28 | 26,Division 1 (Swedish football) 29 | 27,Swedish football Division 1 30 | 28,National Premier Leagues NSW 31 | 29,National Premier Leagues Victoria 32 | 30,Championnat de France Amateur 33 | 31,Russian Football National League 34 | 32,Football Championship of the National League 35 | 33,Russian Professional Football League 36 | 34,Czech First League 37 | 35,Gambrinus Liga 38 | 36,Croatian Prva HMNL 39 | 37,Championnat de France Amateur 2 40 | 38,"Brighton, Worthing & District Football League" 41 | 39,Cheshire Football League 42 | 40,Gloucestershire County League 43 | 41,Somerset County Football League 44 | 42,Northamptonshire Football Combination 45 | 43,Thames Valley Premier Football League 46 | 44,West Yorkshire League 47 | 45,First Capital Plus Premier League 48 | 46,Midland Football League (Scotland) 49 | 47,West of Scotland Super League Premier Division 50 | 48,West of Scotland Super League First Division 51 | 49,Capital League 1 52 | 50,Capital 1 League 53 | 51,NIFL Premiership 54 | 52,NAIA Men's Soccer Championship 55 | 53,Macedonian First Football League 56 | 54,Umaglesi Liga 57 | 55,Georgian Premier League 58 | 56,National Premier Leagues South Australia 59 | 57,FFSA Premier League 60 | 58,Bristol Downs Football League 61 | 59,Subroto Cup Football Tournament 62 | 60,Bristol and District League 63 | 61,NAIA Women's Soccer Championship 64 | 62,Ligat Nashim 65 | 63,Dorset Senior League 66 | 64,Plymouth and West Devon Football League 67 | 65,Plymouth and West Devon Combination 68 | 66,Doncaster & District Senior League 69 | 67,Bristol and Avon League 70 | 68,South Yorkshire Amateur League 71 | 69,I Liga (Slovakia) 72 | 70,Bath and North Somerset District Football League 73 | 71,Bath and District League 74 | 72,Andover and District Saturday Football League 75 | 73,Cheltenham League 76 | 74,Craven and District League 77 | 75,North Gloucestershire League 78 | 76,Stroud and District League 79 | 77,Taunton & District Saturday League 80 | 78,Weston super Mare and District League 81 | 79,Slovak Third League 82 | 80,Calcutta Football League 83 | 81,Kingdom Caledonian Amateur Football Association 84 | 82,Liga Portuguesa de Futebol Profissional 85 | 83,Portuguese Handball Super Cup 86 | 84,Czech National Football League 87 | 85,Cape Verdean Football Championship 88 | 86,V.League 1 89 | 87,V-League (Vietnam) 90 | 88,Oman Professional League 91 | 89,Rwanda National Football League 92 | 90,Rwandan Premier League 93 | 91,United Indoor Football League 94 | 92,SVB Hoofdklasse 95 | 93,Ligue 1 Mauritania 96 | 94,Thimphu League 97 | 95,Bangladesh Football Premier League 98 | 96,Bangladesh League 99 | 97,National Premier Leagues Northern NSW 100 | 98,Somali First Division 101 | 99,Somali League 102 | 100,Provo Premier League 103 | 101,MFL League 104 | 102,Saint Kitts Premier Division 105 | 103,SKNFA Super League 106 | 104,AFA Senior Male League 107 | 105,Anguillian League 108 | 106,Cayman Islands Premier League 109 | 107,Guam Soccer League 110 | 108,FFAS Senior League 111 | 109,Port Vila Football League 112 | 110,TVL League 113 | 111,Tahiti Ligue 1 114 | 112,International rules series 115 | 113,St Helens Combination 116 | 114,Swedish football Division 1 Norra 117 | 115,Southern Championship 118 | 116,Northern Championship 119 | 117,Vale of Clwyd and Conwy Football League 120 | 118,British Virgin Islands Championship 121 | 119,Norwegian Second Division 122 | 120,National Premier Leagues NSW 2 123 | 121,National Premier Leagues NSW 3 124 | 122,NSW State League Division 1 125 | 123,Northern Mariana Championship 126 | 124,Northern NSW State League Division 1 127 | 125,Vodacom League 128 | 126,K3 League 129 | 127,K3 Challengers League 130 | 128,National Premier Leagues Capital Football 131 | 129,National Premier Leagues ACT 132 | 130,Wilson Cup 133 | 131,Hampshire Premier Football League 134 | 132,II liiga 135 | 133,F.League 136 | 134,III liiga 137 | 135,Alberton Football Netball League 138 | 136,IV liiga 139 | 137,Swiss 1. Liga (football) 140 | 138,The Football League 141 | 139,Macedonian Second Football League 142 | 140,2. Oberliga West 143 | 141,K league 144 | 142,National Youth League (Australia) 145 | 143,Football West State League Division 1 146 | 144,Surrey Elite Intermediate League 147 | 145,Macedonian Third Football League 148 | 146,Mexican Primera División 149 | 147,Major Arena Soccer League 150 | 148,Yarra Valley Mountain District Football and Netball League 151 | 149,Oman Super Cup 152 | 150,Ballymena & Provincial Football League 153 | 151,Scottish Women's First Division 154 | 152,R League 155 | 153,Rugby League Conference Welsh Premier 156 | 154,Scottish National League (rugby league) 157 | 155,Rugby League Conference Scotland Division 158 | 156,NSW State League 159 | 157,NSW State League Division 2 160 | 158,WK League 161 | 159,National Premier Leagues Western Australia 162 | 160,SPFL Development League 163 | 161,Scottish Premier under-19 League 164 | 162,SPFL U20 League 165 | 163,Zone League One 166 | 164,Zone League Two 167 | 165,Zone League Three 168 | 166,League 1 (rugby league) 169 | 167,Tehran Province League 170 | 168,Tehran Province league 171 | 169,Iraq Division One 172 | 170,Liga de Ascenso 173 | 171,Belgian Futsal Division 1 174 | 172,FAM League 175 | 173,Azerbaijan Futsal Premier League 176 | 174,V.League 2 177 | 175,Israeli Noar Leumit League 178 | 176,Czech Futsal First League 179 | 177,Georgian Futsal Super League 180 | 178,Rugby League Conference South West Division 181 | 179,Russian women's football championship 182 | 180,Vietnamese National Football Second League 183 | 181,Afghan National League 184 | 182,FA WSL 1 185 | 183,J2 League 186 | 184,North Wales Championship 187 | 185,Ekstraliga (women's football) 188 | 186,Romanian Superliga (women's football) 189 | 187,Welsh Premier League (women) 190 | 188,Bosnian women's football championship 191 | 189,OK League 192 | 190,United Soccer League 193 | 191,Campeonato Nacional de Futebol de Praia 194 | 192,Slovenian Regional League 195 | 193,SVB Eerste Klasse 196 | 194,Cypriot Futsal First Division 197 | 195,Serbian Prva Futsal Liga 198 | 196,Pertiwi Cup 199 | 197,Indonesian Womens Football Tournament 200 | 198,West Cork League Premier Division 201 | 199,Slovenian under-19 League 202 | 200,Norwegian Third Division 203 | 201,UAE Arabian Gulf League 204 | 202,UAE Pro-League 205 | 203,Rugby League Conference East Division 206 | 204,Kyrgyzstan Women's Championship 207 | 205,I-League U18 208 | 206,I-League U19 209 | 207,Serbian Zone League 210 | 208,Albanian Women's National Championship 211 | 209,5. Liga (Slovakia) 212 | 210,Slovak Fifth League 213 | 211,Great Yarmouth and District League 214 | 212,Norwich and District Business Houses League 215 | 213,Delhi Senior Division 216 | 214,Korea Semi-Professional Football League 217 | 215,Bromley and District League 218 | 216,AFL Europe Championship 219 | 217,Professional Indoor Football League 220 | 218,Qatargas League 221 | 219,Qatari 2nd Division 222 | 220,Professional Football League (Algeria) 223 | 221,WPSL Elite League 224 | 222,East Entry League 225 | 223,Kenyan Regional Leagues 226 | 224,Kenyan Provincial Leagues 227 | 225,Kenyan County Leagues 228 | 226,Kenyan District Leagues 229 | 227,Phthiotis Football Clubs Association 230 | 228,Corinthia Football Clubs Association 231 | 229,Maldivian Second Division Football Tournament 232 | 230,Oberliga Mittelrhein 233 | 231,National Premier Leagues Tasmania 234 | 232,Oman Professional League Cup 235 | 233,FAO Football League 236 | 234,A1 Ethniki Women's Water Polo 237 | 235,National Premier Leagues Queensland 238 | 236,Maldivian Third Division Football Tournament 239 | 237,J3 League 240 | -------------------------------------------------------------------------------- /src/autofj/benchmark/Legislature/right.csv: -------------------------------------------------------------------------------- 1 | id,title 2 | 0,Sejm 3 | 1,House of Representatives of the Netherlands 4 | 2,Parliament of Sweden 5 | 3,Diet of Japan 6 | 4,Storting 7 | 5,States General of the Netherlands 8 | 6,National Assembly of the Republic of China 9 | 7,House of Councillors (Japan) 10 | 8,House of Representatives of Japan 11 | 9,National Council of Austria 12 | 10,Federal Council of Austria 13 | 11,Secretariat of the Communist Party of China 14 | 12,National Assembly of France 15 | 13,National Council of Switzerland 16 | 14,Council of States of Switzerland 17 | 15,New Zealand House of Representatives 18 | 16,Croatian Parliament 19 | 17,Indian Parliament 20 | 18,Congress of Mexico 21 | 19,Chamber of Deputies of Mexico 22 | 20,National Assembly of Kuwait 23 | 21,The Folketing 24 | 22,Senate of Mexico 25 | 23,Senate of Poland 26 | 24,National Council of the Slovak Republic 27 | 25,National Assembly of Venezuela 28 | 26,National Assembly of the Republic of Poland 29 | 27,Parliament of Wallonia 30 | 28,Senate of France 31 | 29,National Assembly of Hungary 32 | 30,Senate of the Netherlands 33 | 31,National Assembly of South Korea 34 | 32,National Assembly of Serbia 35 | 33,National Assembly of Panama 36 | 34,Parliament of Fiji 37 | 35,National Assembly of Pakistan 38 | 36,Italian Senate 39 | 37,"City Council of Cincinnati, Ohio elections" 40 | 38,General Assembly of Nova Scotia 41 | 39,Parliament of France 42 | 40,Belgian Senate 43 | 41,Assembly of the Republic of Portugal 44 | 42,Italian Chamber of Deputies 45 | 43,Senate of Romania 46 | 44,Chamber of Deputies of Romania 47 | 45,Federal Assembly of Russia 48 | 46,Federation Council of Russia 49 | 47,25th Alberta Legislature 50 | 48,Alberta Legislature 51 | 49,Parliament of Italy 52 | 50,Arizona Legislature 53 | 51,House of Representatives of Trinidad and Tobago 54 | 52,Senate of Trinidad and Tobago 55 | 53,Landtag of the Free State of Saxony 56 | 54,Central Commission for Discipline Inspection 57 | 55,Greater Chennai Corporation 58 | 56,National Congress of Bolivia 59 | 57,Chamber of Deputies of Brazil 60 | 58,National Congress of Argentina 61 | 59,National Assembly of Nicaragua 62 | 60,National Council of Monaco 63 | 61,House of Elders 64 | 62,Federal Assembly of Switzerland 65 | 63,National Assembly of Nigeria 66 | 64,People's Assembly of Egypt 67 | 65,General Council (Andorra) 68 | 66,National Assembly of Bulgaria 69 | 67,National Assembly of Belize 70 | 68,Parliament of Austria 71 | 69,House of Representatives of Bosnia and Herzegovina 72 | 70,National Assembly of Burundi 73 | 71,Senate of Burundi 74 | 72,Chamber of Deputies of Tunisia 75 | 73,Urban Council 76 | 74,Urban Council (Hong Kong) 77 | 75,National Assembly of Djibouti 78 | 76,Chamber of Deputies of Luxembourg 79 | 77,Chamber of Deputies of Rwanda 80 | 78,House of Representatives of Belize 81 | 79,Majlis of the Maldives 82 | 80,House of Representatives of Liberia 83 | 81,Senate of Liberia 84 | 82,National Assembly of Bahrain 85 | 83,Council of Representatives of Bahrain 86 | 84,Consultative Council of Bahrain 87 | 85,Senate of the Democratic Republic of the Congo 88 | 86,National Parliament of East Timor 89 | 87,National Assembly of Botswana 90 | 88,National Assembly of Cape Verde 91 | 89,National Assembly of Namibia 92 | 90,Assembly of the Republic of Mozambique 93 | 91,House of Assembly of Zimbabwe 94 | 92,National Council of Namibia 95 | 93,National Assembly of Niger 96 | 94,National Assembly of the Central African Republic 97 | 95,National Assembly of Zambia 98 | 96,National Assembly of Seychelles 99 | 97,National Assembly of Guinea 100 | 98,National Assembly of Lesotho 101 | 99,Supreme Council (Transnistria) 102 | 100,Chamber of Deputies (Equatorial Guinea) 103 | 101,National Assembly of Togo 104 | 102,National People's Assembly of Guinea-Bissau 105 | 103,House of Representatives of Nigeria 106 | 104,National Assembly of Mauritania 107 | 105,National Legislature of Sudan 108 | 106,Zanzibar House of Representatives 109 | 107,House of Representatives of Zanzibar 110 | 108,House of Representatives of Antigua and Barbuda 111 | 109,National Assembly of Burkina Faso 112 | 110,National Assembly of Mali 113 | 111,National Assembly of Kenya 114 | 112,Belgian Chamber of Representatives 115 | 113,National Assembly of Suriname 116 | 114,National Assembly of Afghanistan 117 | 115,National Assembly of Angola 118 | 116,National Assembly of Armenia 119 | 117,National Assembly of Azerbaijan 120 | 118,Senate of Thailand 121 | 119,House of Representatives of Thailand 122 | 120,National Assembly of the Republika Srpska 123 | 121,People's Assembly (Republika Srpska) 124 | 122,National Assembly of Vietnam 125 | 123,National Assembly of Benin 126 | 124,Senate of Belize 127 | 125,Senate of Antigua and Barbuda 128 | 126,National Assembly of Côte d'Ivoire 129 | 127,Assembly of Representatives of Yemen 130 | 128,Supreme Assembly of Uzbekistan 131 | 129,National Assembly of Tanzania 132 | 130,National Assembly of Cameroon 133 | 131,Senate of Cambodia 134 | 132,National Assembly of Cambodia 135 | 133,National Assembly of Chad 136 | 134,National Assembly of Eritrea 137 | 135,House of Representatives of Cyprus 138 | 136,National Assembly of Guyana 139 | 137,House of Peoples of Bosnia and Herzegovina 140 | 138,Chamber of Deputies of Haiti 141 | 139,National Assembly of Laos 142 | 140,House of Assembly of Kiribati 143 | 141,National Assembly of Malawi 144 | 142,National Assembly of Mauritius 145 | 143,National Parliament of the Solomon Islands 146 | 144,National Assembly of Saint Kitts and Nevis 147 | 145,Assembly of the Republic of Macedonia 148 | 146,Supreme Assembly of Tajikistan 149 | 147,Assembly of Representatives of Morocco 150 | 148,Parliament of Mauritania 151 | 149,Supreme Council of Kyrgyzstan 152 | 150,Council of States of Sudan 153 | 151,National Assembly of Sudan 154 | 152,Peterborough City Council 155 | 153,Senate of Brazil 156 | 154,National Assembly of the Democratic Republic of the Congo 157 | 155,Senate of Haiti 158 | 156,State Council of Crimea 159 | 157,Länderkammer 160 | 158,Imperial Legislative Council 161 | 159,Prussian House of Lords 162 | 160,Iraqi Kurdistan Parliament 163 | 161,Charlotte City Council 164 | 162,Madrid Assembly 165 | 163,General Assembly of Prince Edward Island 166 | 164,National Assembly of Thailand 167 | 165,KwaZulu–Natal Legislature 168 | 166,New Brunswick Legislature 169 | 167,National Council of Bhutan 170 | 168,National Assembly of Bhutan 171 | 169,Penang Island City Council 172 | 170,27th Alberta Legislature 173 | 171,Wigan Metropolitan Borough Council 174 | 172,Senate of Ceylon 175 | 173,Borough of Poole 176 | 174,Chamber of Councillors of Tunisia 177 | 175,House of Representatives of Ceylon 178 | 176,Landtag of the Saarland 179 | 177,City of Bradford Metropolitan District Council 180 | 178,"Assembly of the Community of Municipalities, Autonomous Province of Kosovo and Metohija" 181 | 179,Assembly of the Community of Serbian municipalities 182 | 180,Ards and North Down Borough Council 183 | 181,Antrim and Newtownabbey Borough Council 184 | 182,Mid and East Antrim Borough Council 185 | 183,General Assembly of Newfoundland and Labrador 186 | 184,State Assembly of the Republic of Bashkortostan 187 | 185,House of Commons of South Korea 188 | 186,Senate of South Korea 189 | 187,Legislative Assembly of the Autonomous Region of the Azores 190 | 188,Cumberland County Council 191 | 189,Current members of the Bolivian Plurinational Legislative Assembly 192 | 190,Senate of Kenya 193 | 191,City of York Council 194 | 192,Soviet of the Republic 195 | 193,National Assembly of Ecuador 196 | 194,Pyidaungsu Hluttaw 197 | 195,Amyotha Hluttaw 198 | 196,House of Representatives (Myanmar) 199 | 197,Pyithu Hluttaw 200 | 198,Orissa Legislative Assembly 201 | 199,Legislative Assembly of Odisha 202 | 200,Parliament of the Ottoman Empire 203 | 201,National Legislature of South Sudan 204 | 202,National Legislative Assembly of South Sudan 205 | 203,Council of States of South Sudan 206 | 204,Parliament of British Columbia 207 | 205,Chavakacheri Urban Council 208 | 206,Coimbatore Municipal Corporation 209 | 207,Chavakacheri Divisional Council 210 | 208,Saskatchewan Legislature 211 | 209,Egyptian Constituent Assembly of 2012 212 | 210,Constituent Assembly of Portugal 213 | 211,List of constituencies of Gujarat Legislative Assembly 214 | 212,General Junta of the Principality of Asturias 215 | 213,Diet of Galicia and Lodomeria 216 | 214,Legislative Assembly of Emilia-Romagna 217 | 215,History of the National Assembly of Pakistan 218 | -------------------------------------------------------------------------------- /src/autofj/benchmark/BasketballTeam/gt.csv: -------------------------------------------------------------------------------- 1 | id_l,title_l,id_r,title_r 2 | 8,New Orleans Hornets,0,New Orleans Pelicans 3 | 30,Charlotte Bobcats,1,Charlotte Hornets 4 | 58,Polonia Warbud Warszawa,2,Polonia Warszawa (basketball) 5 | 59,Asseco Prokom Gdynia,3,Asseco Gdynia 6 | 96,Melbourne Tigers,4,Melbourne United 7 | 99,Liaoning Dinosaurs,5,Liaoning Flying Leopards 8 | 103,Shandong Lions,6,Shandong Golden Stars 9 | 106,Fujian Xunxing,7,Fujian Sturgeons 10 | 117,P.A.O.K. B.C.,8,P.A.O.K. BC 11 | 120,Ulriken Eagles,9,Ulriken Elite 12 | 121,Mens Sana Basket,10,Mens Sana 1871 Basket 13 | 124,Tulsa 66ers,11,Oklahoma City Blue 14 | 125,Austin Toros,12,Austin Spurs 15 | 130,KK Union Olimpija,13,KK Olimpija 16 | 137,Surrey Heat,14,Surrey Scorchers 17 | 137,Surrey Heat,15,Guildford Heat 18 | 137,Surrey Heat,16,Surrey United (basketball) 19 | 139,Dongguan Leopards,17,Shenzhen Leopards 20 | 150,Pallacanestro Treviso,18,Treviso Basket 21 | 150,Pallacanestro Treviso,19,Universo Treviso Basket 22 | 152,FC Barcelona-Institut Guttmann,20,UNES FC Barcelona 23 | 163,Canberra Capitals,21,University of Canberra Capitals 24 | 167,OceanaGold Nuggets,22,Otago Nuggets 25 | 181,Torpan Pojat,23,Helsinki Seagulls 26 | 181,Torpan Pojat,24,BC Torpan Pojat 27 | 183,London Lions (basketball),25,Milton Keynes Lions 28 | 196,Brooklyn Kings,26,Brooklyn Kings (basketball) 29 | 199,Bilbao Basket,27,CB Bilbao Berri 30 | 202,Euroins Cherno More,28,BC Cherno More Port Varna 31 | 225,KK Vojvodina,29,KK Vojvodina Srbijagas 32 | 242,Dumbarton Dodgers Basketball Club,30,Dumbarton Dodgers 33 | 251,F.C. Porto (basketball),31,FC Porto (basketball) 34 | 253,Oliveirense Basquetebol,32,U.D. Oliveirense (basketball) 35 | 255,C.A. Queluz,33,CA Queluz 36 | 259,Neckar Riesen Ludwigsburg,34,MHP Riesen Ludwigsburg 37 | 259,Neckar Riesen Ludwigsburg,35,EnBW Ludwigsburg 38 | 262,TBB Trier,36,Gladiators Trier 39 | 266,Basket Livorno,37,Pallacanestro Don Bosco Livorno 40 | 279,PBC Ural Great Perm,38,PBC Ural Great 41 | 281,Troon Tornadoes,39,Ayrshire Tornadoes 42 | 295,Bree BBC,40,Bree B.B.C. 43 | 301,Utah Flash,41,Delaware 87ers 44 | 317,Taiwan Mobile Clouded Leopards,42,Fubon Braves 45 | 317,Taiwan Mobile Clouded Leopards,43,Fubon Braves Basketball Team 46 | 319,Wisconsin Flyers,44,Omaha Racers 47 | 324,Pentland Tigers Basketball Club,45,Edinburgh Tigers 48 | 338,PBC Lokomotiv Kuban,46,PBC Lokomotiv-Kuban 49 | 344,Bristol Academy Flyers,47,Bristol Flyers 50 | 348,London Capital,48,PAWS London Capital 51 | 349,Trabzonspor Basketball,49,Trabzonspor B.K. 52 | 350,Olympias Patras B.C.,50,Olympiada Patras BC 53 | 352,Sporting B.C.,51,Sporting BC 54 | 357,Galatasaray Medical Park (men's basketball),52,Galatasaray S.K. (men's basketball) 55 | 366,Cheshire Phoenix,53,Cheshire Jets 56 | 367,Cherkaski Mavpy,54,BC Cherkasy 57 | 368,KK Mega Vizura,55,KK Mega Basket 58 | 368,KK Mega Vizura,56,KK Mega Leks 59 | 371,Nuova AMG Sebastiani Basket Napoli,57,Nuova AMG Sebastiani Basket Rieti 60 | 374,BC Enisey,58,BC Yenisey Krasnoyarsk 61 | 375,BC Krka,59,KK Krka 62 | 376,Roseto Sharks,60,Roseto Basket 63 | 388,Societa Veroli Basket,61,Veroli Basket 64 | 389,Guerino Vanoli Basket,62,Gruppo Triboldi Basket 65 | 390,A.E.L. 1964 B.C.,63,AEL 1964 B.C. 66 | 393,Bnei Herzliya,64,Bnei HaSharon 67 | 401,Star of the Sea (basketball),65,Belfast Star 68 | 403,Bulleen Boomers,66,Melbourne Boomers 69 | 406,BC Triumph Lyubertsy,67,B.C. Zenit Saint Petersburg 70 | 407,Allianz Swans Gmunden,68,Swans Gmunden 71 | 411,GasTerra Flames,69,Donar (basketball club) 72 | 413,Rethymno Aegean B.C.,70,Rethymno Cretan Kings B.C. 73 | 418,Egaleo B.C.,71,Egaleo BC 74 | 438,CB Atapuerca,72,CB Tizona 75 | 452,South West Slammers,73,Bunbury Slammers 76 | 467,Ilysiakos B.C.,74,Ilissiakos B.C. 77 | 468,M.E.N.T. B.C.,75,MENT B.C. 78 | 471,Amyntas Dafnis B.C.,76,Dafni BC 79 | 472,Xanthi BC,77,Xanthi B.C. 80 | 473,Paleo Faliro B.C.,78,Athlitikos Omilos Paleou Falirou BC 81 | 474,Chalkida BC,79,AGEH Gymnastikos B.C. 82 | 475,Kavala B.C.,80,Union Kavala B.C. 83 | 477,ICBS BC,81,Peramatos Ermis B.C. 84 | 477,ICBS BC,82,ICBS B.C. 85 | 481,Ionikos Lamias B.C.,83,Ionikos Lamias BC 86 | 482,Pagrati B.C.,84,AO Pagrati BC 87 | 485,Toros de Los Dos Laredos,85,Toros de Nuevo Laredo 88 | 490,Trikala B.C.,86,Trikala 2000 B.C. 89 | 490,Trikala B.C.,87,A.S. Trikala 2000 BC 90 | 495,Irakleio B.C.,88,Iraklio BC 91 | 501,BC Strumica 2005,89,KK Millenium Strumica 92 | 502,Incheon ET Land Elephants,90,Incheon Electroland Elephants 93 | 505,Goyang Orions,91,Goyang Orion Orions 94 | 506,Gigantes de Carolina (basketball),92,Gigantes de Carolina (men's basketball) 95 | 508,SCAA Basketball,93,South China AA (basketball) 96 | 513,Galatasaray SK (women's basketball),94,Galatasaray S.K. (women's basketball) 97 | 513,Galatasaray SK (women's basketball),95,Galatasaray Medical Park (women's basketball) 98 | 517,Habik'a B.C.,96,Elitzur Givat Shmuel 99 | 519,Medway Park Crusaders,97,Kent Crusaders (basketball) 100 | 520,Ionikos N.F. B.C.,98,AS Ionikos Neas Filadelfeias BC 101 | 521,Ionikos Nikaias B.C.,99,Ionikos Nikaias BC 102 | 526,BC Politekhnika-Halychyna,100,Polytekhnika-Halychyna Lviv 103 | 527,Galatasaray SK (wheelchair basketball),101,Galatasaray S.K. (wheelchair basketball) 104 | 527,Galatasaray SK (wheelchair basketball),102,Galatasaray Wheelchair Basketball Team 105 | 535,BK Valmiera,103,SK Valmiera 106 | 553,BC Rakvere Tarvas,104,Rakvere Tarvas 107 | 558,Al-Ahly (Benghazi) Men's Basketball Team,105,Al-Ahli Benghazi (basketball club) 108 | 563,B.C. Partizani Tirana,106,BC Partizani Tirana 109 | 567,Gymnastikos S. Larissas,107,Gymnastikos S. Larissas B.C. 110 | 593,BC Budivelnyk,108,BC Budivelnik 111 | 602,Bintulu Rainbow BC,109,Bintulu Eagles B.C. 112 | 602,Bintulu Rainbow BC,110,Bintulu Rainbow B.C. 113 | 603,Perak YSL Farmcochem BC,111,Perak Farmcochem B.C. 114 | 605,CS Otopeni (basketball),112,CS Otopeni (Basketball) 115 | 606,CS Energia Rovinari,113,CS Energia 116 | 614,Hapoel Afula,114,Hapoel Afula B.C. 117 | 615,Satria Muda BritAma Jakarta,115,Satria Muda Pertamina Jakarta 118 | 617,Chang Thailand Slammers,116,Hi-Tech Bangkok City 119 | 617,Chang Thailand Slammers,117,Sports Rev Thailand Slammers 120 | 626,Maccabi Raanana,118,Maccabi Ra'anana 121 | 632,Logan Thunder,119,Logan Thunder (WNBL) 122 | 639,Aramex (Jordan),120,Al Riyadi Amman 123 | 642,Ezzahra Sport Rades,121,Ezzahra Sports 124 | 644,Barak Netanya,122,Barak Netanya B.C. 125 | 657,Sony Athinaikos Athens,123,Athinaikos women's basketball 126 | 658,Ikaros Kallitheas B.C.,124,Ikaros Chalkidas B.C. 127 | 670,BBC Bayreuth,125,Medi Bayreuth 128 | 672,Rosa Radom,126,RosaSport Radom 129 | 675,Optima Gent,127,Gent Hawks 130 | 677,Halcones UV Xalapa,128,Halcones de Xalapa 131 | 688,Dell Aspac Jakarta,129,Aspac Jakarta 132 | 689,Leeds Carnegie (basketball),130,Leeds Force 133 | 690,Pelita Jaya Esia,131,Pelita Jaya Energi Mega Persada 134 | 702,Nuvo CLS Knights,132,CLS Knights Surabaya 135 | 704,Yongin Samsung Life Blue Minx,133,Yongin Samsung Blueminx 136 | 704,Yongin Samsung Life Blue Minx,134,Yongin Samsung Life Bichumi 137 | 713,Rapla KK,135,Piimameister Otto/Rapla 138 | 713,Rapla KK,136,TYCO Rapla 139 | 715,Muba Hangtuah Indonesia Muda Sumatera Selatan,137,Muba Hangtuah Sumatera Selatan 140 | 717,Black Water Elite,138,Blackwater Sports 141 | 717,Black Water Elite,139,Black Water Sports 142 | 722,Juventus (basketball club),140,BC Juventus 143 | 723,BC Prienai,141,BC Rūdupis 144 | 724,Palanga Triobet,142,BC Palanga 145 | 724,Palanga Triobet,143,BC Naglis 146 | 754,Cuxhaven Bascats,144,Cuxhaven BasCats 147 | 762,Oshawa Power,145,Mississauga Power 148 | 771,ADB Pas,146,AB Pas 149 | 795,BC Tsmoki-Minsk,147,BC Minsk-2006 150 | 800,Jalaa FC,148,Jalaa SC (men's basketball) 151 | 811,Porta XI CBF,149,Porta XI Ensino CBF 152 | 812,UNIQA Euroleasing Sopron,150,UNIQA Sopron 153 | 816,Al Rayyan Basketball Team,151,Al Rayan SC Basketball Team 154 | 824,Leuven Bears,152,Stella Artois Leuven Bears 155 | 826,Barsy Atyrau,153,BC Barsy Atyrau 156 | 830,Al-Ittihad Alexandria,154,El Ittihad Alexandria (basketball) 157 | 830,Al-Ittihad Alexandria,155,Al Ittihad Alexandria (basketball) 158 | 832,Zamalek (basketball club),156,Zamalek (basketball) 159 | 840,Boracay Rum Waves,157,Tanduay Light Rhum Masters 160 | 871,Kuwait SC (basketball),158,Al Kuwait SC (basketball) 161 | 875,Stade Olympique Maritime Boulonnais,159,SOMB Boulogne-sur-Mer 162 | 876,KK Brod,160,KK Slavonski Brod 163 | 916,Club Baloncesto Ciudad de Algeciras,161,CB Ciudad de Algeciras 164 | 918,Pacific Caesar,162,Pacific Caesar Surabaya 165 | 923,Primeiro de Agosto Basketball,163,C.D. Primeiro de Agosto (basketball) 166 | 925,Gruppo Sportivo FIAT,164,G.S. FIAT 167 | 927,Brampton A's,165,Orangeville A's 168 | -------------------------------------------------------------------------------- /src/autofj/benchmark/ShoppingMall/gt.csv: -------------------------------------------------------------------------------- 1 | id_l,title_l,id_r,title_r 2 | 0,The Galleria (Houston),0,The Galleria 3 | 1,Shops at Prudential Center,1,The Shops at Prudential Center 4 | 2,The Fashion Centre at Pentagon City,2,Fashion Centre at Pentagon City 5 | 3,Westfield Sydney Central Plaza,3,Sydney Central Plaza 6 | 6,Downtown Disney (Walt Disney World),5,Disney Springs 7 | 6,Downtown Disney (Walt Disney World),6,Downtown Disney (Walt Disney World Resort) 8 | 7,Bluewater (shopping centre),7,Bluewater 9 | 9,"The Summit (Birmingham, Alabama)",9,The Summit (Birmingham) 10 | 10,Centro Box Hill,10,Box Hill Central Shopping Centre 11 | 12,Irvine Spectrum Center,12,Irvine Spectrum 12 | 13,Liffey Valley Shopping Centre,13,Liffey Valley 13 | 16,Metropolis at Metrotown,16,Metrotown 14 | 17,St. Louis Outlet Mall,17,St. Louis Mills 15 | 18,Northlake Mall (Charlotte),18,"Northlake Mall (Charlotte, North Carolina)" 16 | 21,Touchwood,21,"Touchwood, Solihull" 17 | 22,Atlantic Terminal (shopping mall),22,Atlantic Terminal (Shopping Mall) 18 | 26,Hamilton Place (shopping mall),26,Hamilton Place 19 | 27,Langham Place (Hong Kong),27,"Langham Place, Hong Kong" 20 | 28,Centrale (Croydon),28,Centrale 21 | 30,Broadmarsh (shopping centre),30,Broadmarsh 22 | 31,Rhodes Shopping Centre,31,Rhodes Waterside 23 | 32,The CentrePoint,32,The Centrepoint 24 | 33,Wilton Mall,33,Wilton Mall at Saratoga 25 | 34,Palm Beach Mall,34,Palm Beach Outlets 26 | 35,Great Northern Mall,35,Great Northern Mall (New York) 27 | 36,Centro Bankstown,36,Bankstown Central Shopping Centre 28 | 38,HarbourFront Centre,38,HarbourFront Centre (Singapore) 29 | 39,CityPlace,39,CityPlace (West Palm Beach) 30 | 40,Galleria Shopping Centre,40,Galleria Shopping Centre (Toronto) 31 | 41,"Westgate Shopping Centre, Oxford",41,"Westgate, Oxford" 32 | 42,Westfield Annapolis,42,Annapolis Mall 33 | 43,Brentwood Town Centre,43,Brentwood Town Centre (mall) 34 | 44,St. David's (Cardiff),44,"St David's, Cardiff" 35 | 45,Centro Toombul,45,Toombul Shopping Centre 36 | 46,Grand Indonesia Shopping Town,46,Grand Indonesia 37 | 48,Westfield Connecticut Post,48,Connecticut Post Mall 38 | 49,Fox Run Mall,49,The Mall at Fox Run 39 | 51,Downtown Plaza (Sacramento),50,Downtown Commons 40 | 51,Downtown Plaza (Sacramento),51,Westfield Downtown Plaza 41 | 52,Westfield MainPlace,52,MainPlace Mall 42 | 54,Westfield Fox Valley,54,Fox Valley Mall 43 | 55,Westfield Chicago Ridge,55,Chicago Ridge Mall 44 | 56,Louis Joliet Mall,56,Westfield Louis Joliet 45 | 58,Westfield Franklin Park,58,Franklin Park Mall 46 | 61,Westfield Belden Village,61,Belden Village Mall 47 | 62,Solano Town Center,62,Westfield Solano 48 | 64,Westfield West Covina,64,Plaza West Covina 49 | 66,Sahara Mall (Riyadh),66,Riyadh Sahara Mall 50 | 67,Crossroads Center,67,"Crossroads Center (St. Cloud, Minnesota)" 51 | 68,St. Laurent Centre,68,St. Laurent Shopping Centre 52 | 70,"Zona Rosa (Kansas City, Missouri)",70,Zona Rosa (Kansas City) 53 | 71,Centro Lutwyche,71,Lutwyche City Shopping Centre 54 | 72,Les Promenades de l'Outaouais,72,Les Promenades Gatineau 55 | 76,El Con Mall,76,El Con Center 56 | 77,Epping Plaza,77,Pacific Epping 57 | 79,Auburn Mall,79,Auburn Mall (Massachusetts) 58 | 79,Auburn Mall,80,"Auburn Mall (Auburn, Massachusetts)" 59 | 81,Cataraqui Town Centre,81,Cataraqui Centre 60 | 82,Royal Victoria Place,82,Westfield Royal Victoria Place 61 | 84,"Conestoga Mall (Waterloo, Ontario)",84,Conestoga Mall 62 | 85,Maple Hill Pavilion,85,Maple Hill Mall 63 | 86,Centro Roselands,86,Roselands Shopping Centre 64 | 87,Mall at The Source,87,The Mall at the Source 65 | 89,Santa Rosa Mall (Florida),89,Santa Rosa Mall 66 | 90,"Crossroads Center (Waterloo, IA)",90,"Crossroads Center (Waterloo, Iowa)" 67 | 91,Spires Shopping Centre,91,The Spires Shopping Centre 68 | 92,Change Alley (Singapore),92,"Change Alley, Singapore" 69 | 93,Seacon Square,93,Seacon Square Srinakarin 70 | 94,Northfield Square,94,Northfield Square Mall 71 | 96,Paradise Park (Mall),96,Paradise Park (mall) 72 | 97,Centro Colonnades,97,Colonnades Shopping Centre 73 | 98,Centro The Glen,98,The Glen Shopping Centre 74 | 88,The Oaks Mall,101,The Oaks Mall (Florida) 75 | 103,Jantzen Beach SuperCenter,103,Jantzen Beach Center 76 | 107,Northwest Plaza,107,The Crossings at Northwest 77 | 108,Acadiana Mall,108,Mall of Acadiana 78 | 109,Arsenal Mall,109,The Arsenal Project of Watertown 79 | 110,Antioch Center,110,Antioch Crossing 80 | 111,Omni Park Shopping Centre,111,Omni Park 81 | 78,The Paragon,112,"The Paragon, Singapore" 82 | 114,Mail Champlain,114,Champlain Mall 83 | 115,Splendid China Tower,115,Splendid China Mall 84 | 116,Brunswick shopping centre,116,Brunswick Shopping Centre 85 | 120,Westfield Warrawong,120,Warrawong Plaza 86 | 121,Westfield Figtree,121,Figtree Grove 87 | 122,Westfield Pakuranga,122,Pakuranga Plaza 88 | 50,Downtown Shopping Centre,123,Westfield Downtown 89 | 124,Knollwood Mall,124,Shoppes at Knollwood 90 | 125,Shangri-la Plaza Mall,125,Shangri-La Plaza 91 | 125,Shangri-la Plaza Mall,126,Shangri-La Plaza (shopping mall) 92 | 127,Lakeshore Mall (Florida),127,Lakeshore Mall 93 | 128,Menara Great Eastern,128,Great Eastern Tower 94 | 130,Avenue Carriage Crossing,130,Carriage Crossing 95 | 132,The Mall at Shelter Cove,132,Shelter Cove Towne Centre 96 | 133,Bishops Corner (West Hartford),133,"Bishops Corner, West Hartford" 97 | 137,Tallahassee Mall,137,Centre of Tallahassee 98 | 138,La Encantada,138,La Encantada (shopping center) 99 | 141,Forest Lake Shopping Centre,141,Forest Lake Village Shopping Centre 100 | 142,MegaBox (shopping mall),142,Megabox (shopping mall) 101 | 143,Westfield CastleCourt,143,CastleCourt 102 | 145,Deira City Centre,145,City Centre Deira 103 | 147,The Promenade Shopping Centre,147,Promenade (shopping centre) 104 | 148,The Mall at Cortana,148,Cortana Mall 105 | 150,Bentley Bridge Retail Park,150,Bentley Bridge 106 | 152,City Centre Plaza,152,"City Centre Plaza, Rockhampton" 107 | 156,NewPark Mall,156,Newpark Mall 108 | 158,"Regency Square Mall (Florence, Alabama)",158,Florence Mall (Alabama) 109 | 160,Centro Taigum,160,Taigum Square Shopping Centre 110 | 161,Sta. Lucia East Grand Mall,161,Sta. Lucia East Grandmall 111 | 162,Florence Mall,162,Florence Mall (Kentucky) 112 | 164,Centro Karingal,164,Karingal Hub Shopping Centre 113 | 165,Regency Square Mall (Jacksonville),165,"Regency Square Mall (Jacksonville, Florida)" 114 | 168,Dembel City Center,168,Bole Dembel Shopping Center 115 | 171,Winrock Shopping Center,171,Winrock Center 116 | 174,Chesapeake Square Mall,174,Chesapeake Square 117 | 175,Lulu Mall,175,LuLu International Shopping Mall 118 | 175,Lulu Mall,176,Lulu Cochin Mall 119 | 177,Mirdif City Centre,177,City Centre Mirdif 120 | 178,"Shaktan Thampuran Private Bus Stand, Thrissur",178,Shaktan Thampuran Private Bus Stand 121 | 179,"Star City, Seoul",179,Star City (shopping mall) 122 | 182,Exchange Ilford,182,"The Exchange, Ilford" 123 | 183,My Mall Limassol,183,MY MALL Limassol 124 | 184,Castletown Shoppingworld,184,CastleTown Shoppingworld 125 | 185,The Market Common Myrtle Beach,185,The Market Common 126 | 187,The Mall Pavilions,187,The Pavilions 127 | 188,Settlers' Green Outlet Village,188,Settlers Green 128 | 189,"The Summit (Reno, Nevada)",189,The Summit (Reno) 129 | 190,"The Summit (Wheatfield, New York)",190,The Summit (Wheatfield) 130 | 191,Domain Central,191,"Domain Central, Townsville" 131 | 192,Granada Center,192,Granada Centre 132 | 193,The Base (mall),193,The Base (shopping centre) 133 | 193,The Base (mall),194,The Base (Shopping Centre) 134 | 195,Tulsa Promenade Mall,195,Tulsa Promenade 135 | 196,West Manchester Mall,196,West Manchester Town Center 136 | 197,Lakeview Square,197,Lakeview Square Mall 137 | 198,Palladium Square,198,Palladium World 138 | 199,Centro Lavington,199,Lavington Square Shopping Centre 139 | 200,Bahrain City Centre,200,City Centre Bahrain 140 | 201,Spinderiet (Copenhagen),201,Spinderiet 141 | 203,Kukui Grove Shopping Center,203,Kukui Grove Center 142 | 207,"University Mall (Chapel Hill, North Carolina)",207,"University Place (Chapel Hill, North Carolina)" 143 | 208,The Outlets at Sands Bethlehem,208,The Shoppes at Sands 144 | 209,Enfield Square,209,Enfield Square Mall 145 | 210,"Harbor Point, Subic",210,Harbor Point (Subic) 146 | 211,Crystal Palace Complex (Dieppe),211,Bass Pro Complex (Dieppe) 147 | 212,Centre at Glen Burnie,212,Glen Burnie Mall 148 | 213,South City (shopping mall),213,South City Mall 149 | 214,The Gallery at Military Circle,214,Military Circle Mall 150 | 215,West 12 Shepherds Bush,215,West 12 151 | 215,West 12 Shepherds Bush,216,West 12 Shepherd's Bush 152 | 217,The Brentwood Country Mart,217,Brentwood Country Mart 153 | 218,South Point (shopping mall),218,South Point Mall 154 | 219,Cross County Plaza,219,Cross County Mall 155 | 220,Werribee Plaza,220,Pacific Werribee 156 | 221,Monroe Crossing Mall,221,Monroe Crossing 157 | 222,"Northwoods Mall (Peoria, Illinois)",222,Northwoods Mall (Illinois) 158 | 223,Square 2 (Shopping Mall),223,Square 2 159 | 223,Square 2 (Shopping Mall),224,Square 2 (shopping mall) 160 | 225,Westshore Mall,225,The Shops at Westshore 161 | -------------------------------------------------------------------------------- /src/autofj/benchmark/Country/right.csv: -------------------------------------------------------------------------------- 1 | id,title 2 | 0,Kosovo (region) 3 | 1,Myanmar 4 | 2,Qing dynasty 5 | 3,Special administrative regions of China 6 | 4,Kingdom of Northumbria 7 | 5,Sui dynasty 8 | 6,Tang dynasty 9 | 7,Ming dynasty 10 | 8,Qin dynasty 11 | 9,Shang dynasty 12 | 10,Zhou dynasty 13 | 11,Xia dynasty 14 | 12,Han dynasty 15 | 13,Cape Colony 16 | 14,Duchy of Lorraine 17 | 15,Liu Song dynasty 18 | 16,Song dynasty 19 | 17,Côte d'Ivoire 20 | 18,Episcopal principality of Utrecht 21 | 19,Free France 22 | 20,Malacca Sultanate 23 | 21,Emirate of Transjordan 24 | 22,Qara Khitai 25 | 23,Nassau (state) 26 | 24,Lan Na 27 | 25,Schaumburg-Lippe 28 | 26,Upper Mustang 29 | 27,Archbishopric of Mainz 30 | 28,Xin dynasty 31 | 29,Dzungar Khanate 32 | 30,Communist Romania 33 | 31,Hephthalite 34 | 32,Kingdom of Iberia 35 | 33,Buyeo 36 | 34,Zimbabwe Rhodesia 37 | 35,Former Zhao 38 | 36,French protectorate of Cambodia 39 | 37,Polish People's Republic 40 | 38,Nakhchivan Autonomous Republic 41 | 39,Nakhchivan 42 | 40,Moche culture 43 | 41,Moche 44 | 42,History of the Lands of the Bohemian Crown (Middle Ages) 45 | 43,Isle de France 46 | 44,Chen dynasty 47 | 45,Liang dynasty 48 | 46,Later Liang (Sixteen Kingdoms) 49 | 47,Western Liang (Sixteen Kingdoms) 50 | 48,Ryūkyū Kingdom 51 | 49,Rustamid 52 | 50,Hammadid 53 | 51,Later Liang (Five Dynasties) 54 | 52,Later Tang 55 | 53,Croatia in the union with Hungary 56 | 54,Indo-Greek Kingdom 57 | 55,French protectorate of Morocco 58 | 56,Nanda Dynasty 59 | 57,Samanid Empire 60 | 58,Nueva Navarra 61 | 59,Shunga Empire 62 | 60,Pandyan dynasty 63 | 61,Jordanian occupation of the West Bank and East Jerusalem 64 | 62,Spanish protectorate in Morocco 65 | 63,Spanish Protectorate of Morocco 66 | 64,Mandatory Iraq 67 | 65,Early Dynastic Period (Egypt) 68 | 66,Bavarian Council Republic 69 | 67,Kadamba dynasty 70 | 68,Dali Kingdom 71 | 69,Later Baekje 72 | 70,Kartl-Kakheti 73 | 71,Tierra Firma 74 | 72,Merkit 75 | 73,Mamluk Dynasty (Delhi) 76 | 74,Kerma Culture 77 | 75,Principality of Reuss Younger Line 78 | 76,Keraites 79 | 77,Khereid 80 | 78,Provisional Government of Autonomous Siberia 81 | 79,Saar Protectorate 82 | 80,Nanyo (Japanese mandated territory) 83 | 81,Muzaffarids (Iran) 84 | 82,Reorganized National Government of the Republic of China 85 | 83,Kingdom of Soissons 86 | 84,Indian-Ocean Rim Association 87 | 85,Restoration (Spain) 88 | 86,Imperial County of Ortenburg 89 | 87,Bushmanland 90 | 88,Kingdom of Albania (medieval) 91 | 89,Colony of Fiji 92 | 90,Kediri Kingdom 93 | 91,Pasai 94 | 92,Free and Independent Republic of West Florida 95 | 93,"Eastern Slavonia, Baranja and Western Syrmia" 96 | 94,Kurt dynasty 97 | 95,Kartids 98 | 96,Republic of Ancona 99 | 97,Ghurid dynasty 100 | 98,Kingdom of Limmu-Ennarea 101 | 99,Yamataikoku 102 | 100,First Austrian Republic 103 | 101,Aztec Empire 104 | 102,Qi (Li Maozhen's state) 105 | 103,Sultanate of Ifat 106 | 104,Nawabs of Bengal & Murshidabad 107 | 105,United Belgian States 108 | 106,Hotak dynasty 109 | 107,Hotaki Empire 110 | 108,Chera dynasty 111 | 109,Later Jin (Five Dynasties) 112 | 110,Rai dynasty 113 | 111,Chavda dynasty 114 | 112,Hadiya Sultanate 115 | 113,Mahdist Sudan 116 | 114,Odanad 117 | 115,First Republic of Armenia 118 | 116,Rashtrakuta dynasty 119 | 117,Osraige 120 | 118,Kingdom of Osraige 121 | 119,Jalairid Sultanate 122 | 120,Jalayirids 123 | 121,Belarusian Central Council 124 | 122,Margravate of Meissen 125 | 123,Kingdom of the Suebi 126 | 124,Tuyuhun 127 | 125,Tarkhan dynasty 128 | 126,Western Ganga dynasty 129 | 127,Arghun dynasty 130 | 128,Samma dynasty 131 | 129,Wadai Empire 132 | 130,Alid dynasties of northern Iran 133 | 131,Alavids 134 | 132,Restoration and Regeneration in Switzerland 135 | 133,French Protectorate of Laos 136 | 134,Mleccha dynasty 137 | 135,Panchala Kingdom 138 | 136,Ganja Khanate 139 | 137,Kurdistansky Uyezd 140 | 138,Japanese occupation of Malaya 141 | 139,Erivan khanate 142 | 140,Beylik of Dulkadir 143 | 141,German military administration in occupied France during World War II 144 | 142,Hungarian People's Republic 145 | 143,Prince-Bishopric of Strasbourg 146 | 144,Realm of Stefan Dragutin 147 | 145,Occupation of Turkish Armenia 148 | 146,Duchy of Luxemburg 149 | 147,"County, Duchy and Grand Duchy of Luxembourg" 150 | 148,Sultanate of Aussa 151 | 149,County of Brabant 152 | 150,Sasanian Empire 153 | 151,Colony of Singapore 154 | 152,Nayakas of Keladi 155 | 153,Jabal al-Druze (state) 156 | 154,Japanese occupation of the Dutch East Indies 157 | 155,Sajid dynasty 158 | 156,County of Luxemburg 159 | 157,Duchy of Neopatras 160 | 158,Albona Republic 161 | 159,First Hungarian Republic 162 | 160,Zanzibar Sultanate 163 | 161,Kingdom of Fouta Tooro 164 | 162,Kingdom of Fouta Djallon 165 | 163,Later Han (Five Dynasties) 166 | 164,Later Zhou 167 | 165,Italian Islands of the Aegean 168 | 166,Yin (Five Dynasties period) 169 | 167,Yan (Five Dynasties period) 170 | 168,Duchy of Gascony 171 | 169,Shun dynasty 172 | 170,Anhalt 173 | 171,Crown Colony of Malacca 174 | 172,Malacca (British Crown colony) 175 | 173,Sallarid 176 | 174,Kingdom of Tashir-Dzoraget 177 | 175,Dutch Ceylon 178 | 176,Yueban 179 | 177,St. Ulrich's and St. Afra's Abbey 180 | 178,Kingdom of Italy (1861–1946) 181 | 179,Principality of Reuss Elder Line 182 | 180,Bavand dynasty 183 | 181,Heungyo 184 | 182,Lordship of Negroponte 185 | 183,"Yanam, French India" 186 | 184,Margraviate of Austria 187 | 185,Arminiya 188 | 186,Emirate of Armenia 189 | 187,Byzantium under the Komnenos dynasty 190 | 188,Byzantium under the Palaiologoi 191 | 189,Trust Territory of Somaliland 192 | 190,Lordship of Glamorgan 193 | 191,Seljuk Empire 194 | 192,Seljuq Empire 195 | 193,Byzantine Empire under the Angeloi 196 | 194,Byzantium under the Heraclians 197 | 195,Jeongan 198 | 196,Mamluk dynasty of Iraq 199 | 197,Lordship of Winneburg and Beilstein 200 | 198,Byzantium under the Isaurians 201 | 199,Byzantium under the Macedonians 202 | 200,Principality of Iberia 203 | 201,Austro-Hungarian rule in Bosnia and Herzegovina 204 | 202,Condominium of Bosnia and Herzegovina 205 | 203,K'iche' kingdom of Q'umarkaj 206 | 204,Provisional Administration of South Ossetia 207 | 205,Tanganyika (territory) 208 | 206,Bagratid Armenia 209 | 207,Unification of Hispaniola 210 | 208,Maha-Meghavahana Dynasty 211 | 209,Pudukkottai State 212 | 210,Namayan 213 | 211,Independent State of Macedonia 214 | 212,British Kenya 215 | 213,Yuan dynasty 216 | 214,Principality of Arbër 217 | 215,Principality of Albania (medieval) 218 | 216,Kingdom of Hejaz and Nejd 219 | 217,British invasion of Manila 220 | 218,Rajahnate of Butuan 221 | 219,Northern Yuan dynasty 222 | 220,Post-Soviet transition in Ukraine 223 | 221,Commonwealth of Independent States Free Trade Area 224 | 222,Adriatic Ionian Euroregion 225 | 223,Sultanate of Ternate 226 | 224,Bengal Sultanate 227 | 225,Kingdom of Chiang Mai 228 | 226,German military administration in occupied Poland 229 | 227,Four Oirat 230 | 228,Liao dynasty 231 | 229,Principality of Turov 232 | 230,Sultanate of the Geledi 233 | 231,Gobroon dynasty 234 | 232,Byzantium under the Justinian dynasty 235 | 233,Zhou (Zhang Shicheng's kingdom) 236 | 234,Ettaiyapuram estate 237 | 235,Kingdom of Ava 238 | 236,Arab Kingdom of Syria 239 | 237,Ror dynasty 240 | 238,History of Iraq under Ba'athist rule 241 | 239,Japanese colonial empire 242 | 240,Kingdom of Spain under Joseph Bonaparte 243 | 241,Spain under Joseph Bonaparte 244 | 242,Provisional Government of Bangladesh 245 | 243,Eurasian Economic Union 246 | 244,Gazikumukh Shamkhalate 247 | 245,Shamkhalate of Kazi-Kumukh 248 | 246,Kition (ancient state) 249 | 247,Gazikumukh Khanate 250 | 248,Khanate of Kazi-Kumukh 251 | 249,People's Republic of Zanzibar 252 | 250,Imperial Abbey of Kempten 253 | 251,Liao (Zhou dynasty state) 254 | 252,Kalingga Kingdom 255 | 253,Insular Government 256 | 254,Zhao (Five Dynasties period) 257 | 255,Vaspurakan Kingdom 258 | 256,Interim Government of Iran 259 | 257,Cyrenaica Emirate 260 | 258,General Government of Belgium 261 | 259,Federation of Nigeria 262 | 260,Dominion of Mauritius 263 | 261,Grand Principality of Serbia 264 | 262,Amecatl 265 | 263,Duchy of Dol-Combourg 266 | 264,Shirvan Baylarbaylik 267 | 265,Imperial Throne (micronation) 268 | 266,Imperial Throne (Sovereign Nation) 269 | 267,Kalinyamat Sultanate 270 | 268,Karabakh Beylerbeylik 271 | 269,Kara Del 272 | 270,Crown Colony of Labuan 273 | 271,Jin (Later Tang precursor) 274 | 272,United States Military Government in Cuba 275 | 273,Colony of Santiago 276 | 274,Spanish occupation of Santiago (Jamaica) 277 | 275,Kempten (Imperial Free City) 278 | 276,Carniola (Early Middle Ages) 279 | 277,Armi (Syria) 280 | 278,Armani (Ancient kingdom) 281 | 279,Turkish Provisional Government 282 | 280,Cupul 283 | 281,Duchy of Croatia 284 | 282,Rojava 285 | 283,Kingdom of Qocho 286 | 284,Transnistria autonomous territorial unit with special legal status 287 | 285,Passports issued by the European Union candidate states 288 | 286,Nagas of Padmavati 289 | 287,Margraviate of Moravia 290 | 288,Crown Colony of Malta 291 | 289,Gozo (independent state) 292 | 290,Gozo (1798–1800) 293 | -------------------------------------------------------------------------------- /src/autofj/autofj.py: -------------------------------------------------------------------------------- 1 | from .join_function_space.autofj_join_function_space import AutoFJJoinFunctionSpace 2 | from .blocker.autofj_blocker import AutoFJBlocker 3 | from .optimizer.autofj_multi_column_greedy_algorithm import \ 4 | AutoFJMulticolGreedyAlgorithm 5 | import pandas as pd 6 | from .utils import print_log 7 | import os 8 | from .negative_rule import NegativeRule 9 | import numpy as np 10 | 11 | 12 | class AutoFJ(object): 13 | """ 14 | AutoFJ automatically produces record pairs that approximately match in 15 | two tables L and R. It proceeds to configure suitable parameters 16 | automatically, which when used to fuzzy-join L and R, meets the 17 | user-specified precision target, while maximizing recall. 18 | 19 | AutoFJ attempts to solve many-to-one join problems, where each record in R 20 | will be joined with at most one record in L, but each record in L can be 21 | joined with multiple records in R. In AutoFJ, L refers to a reference 22 | table, which is assumed to be almost "duplicate-free". 23 | 24 | Parameters 25 | ---------- 26 | precision_target: float, default=0.9 27 | Precision target. 28 | 29 | join_function_space: string or dict or list of objects, default="autofj_sm" 30 | There are following three ways to define the space of join functions: 31 | (1) Use the name of built-in join function space. There are three 32 | options, including "autofj_lg", "autofj_lg" and "autofj_sm" that use 33 | 136, 68 and 14 join functions, respectively. Using less join functions 34 | can improve efficiency but may worsen performance. 35 | (2) Use a dict specifying the options for preprocessing methods, 36 | tokenization methods, token weighting methods and distance functions. 37 | The space will be the cartesian product of all options in the dict. 38 | See ./join_function_space/options.py for defining join functions using 39 | a dict. 40 | (3) Use a list of customized JoinFunction objects. 41 | 42 | distance_threshold_space: int or list, default=50 43 | The number of candidate distance thresholds or a list of candidate 44 | distance thresholds in the space. If the number of distance thresholds 45 | (integer) is given, distance thresholds are spaced evenly from 0 to 1. 46 | Otherwise, it should be a list of floats from 0 to 1. 47 | 48 | column_weight_space: int or list, default=10 49 | The number of candidate column weights or a list of candidate 50 | column weights in the space. If the number of column weights 51 | (integer) is given, column weights are spaced evenly from 0 to 1. 52 | Otherwise, it should be a list of floats from 0 to 1. 53 | 54 | blocker: a Blocker object or None, default None 55 | A Blocker object that performs blocking on two tables. If None, use 56 | the built-in blocker. For customized blocker, see Blocker class. 57 | 58 | n_jobs : int, default=-1 59 | Number of CPU cores used. -1 means using all processors. 60 | 61 | verbose: bool, default=False 62 | Whether to print logging 63 | """ 64 | 65 | def __init__(self, 66 | precision_target=0.9, 67 | join_function_space="autofj_sm", 68 | distance_threshold_space=50, 69 | column_weight_space=10, 70 | blocker=None, 71 | n_jobs=-1, 72 | verbose=False): 73 | self.precision_target = precision_target 74 | self.join_function_space = join_function_space 75 | 76 | if type(distance_threshold_space) == int: 77 | self.distance_threshold_space = list( 78 | np.linspace(0, 1, distance_threshold_space)) 79 | else: 80 | self.distance_threshold_space = distance_threshold_space 81 | 82 | if type(column_weight_space) == int: 83 | self.column_weight_space = list( 84 | np.linspace(0, 1, column_weight_space)) 85 | else: 86 | self.column_weight_space = column_weight_space 87 | 88 | if blocker is None: 89 | self.blocker = AutoFJBlocker(n_jobs=n_jobs) 90 | else: 91 | self.blocker = blocker 92 | 93 | self.n_jobs = n_jobs if n_jobs > 0 else os.cpu_count() 94 | self.verbose = verbose 95 | 96 | def join(self, left_table, right_table, id_column, on=None): 97 | """Join left table and right table. 98 | 99 | Parameters 100 | ---------- 101 | left_table: pd.DataFrame 102 | Reference table. The left table is assumed to be almost 103 | duplicate-free, which means it has no or only few duplicates. 104 | 105 | right_table: pd.DataFrame 106 | Another input table. 107 | 108 | id_column: string 109 | The name of id column in the two tables. This column will not be 110 | used to join two tables. 111 | 112 | on: list or None 113 | A list of column names (multi-column fuzzy join) that the two tables 114 | will be joined on. If None, two tables will be joined on all columns 115 | that exist in both tables, excluding the id column. 116 | 117 | Returns: 118 | -------- 119 | result: pd.DataFrame 120 | A table of joining pairs. The columns of left table are 121 | suffixed with "_l" and the columns of right table are suffixed 122 | with "_r" 123 | """ 124 | left = left_table.copy(deep=True) 125 | right = right_table.copy(deep=True) 126 | 127 | # create internal id columns (use internal ids) 128 | left["autofj_id"] = range(len(left)) 129 | right["autofj_id"] = range(len(right)) 130 | 131 | # remove original ids 132 | left.drop(columns=id_column, inplace=True) 133 | right.drop(columns=id_column, inplace=True) 134 | 135 | # get names of columns to be joined 136 | if on is None: 137 | on = sorted(list(set(left.columns).intersection(right.columns))) 138 | left = left[on] 139 | right = right[on] 140 | 141 | # do blocking 142 | if self.verbose: 143 | print_log("Start blocking") 144 | LL_blocked = self.blocker.block(left, left, "autofj_id") 145 | LR_blocked = self.blocker.block(left, right, "autofj_id") 146 | 147 | # remove equi-joins on LL 148 | LL_blocked = LL_blocked[ 149 | LL_blocked["autofj_id_l"] != LL_blocked["autofj_id_r"]] 150 | 151 | # learn and apply negative rules 152 | nr = NegativeRule(left, right, "autofj_id") 153 | nr.learn(LL_blocked) 154 | LR_blocked = nr.apply(LR_blocked) 155 | 156 | # create join function space 157 | jf_space = AutoFJJoinFunctionSpace(self.join_function_space, 158 | n_jobs=self.n_jobs) 159 | 160 | # compute distance 161 | if self.verbose: 162 | print_log("Start computing distances. Size of join function space: {}" 163 | .format(len(jf_space.join_functions))) 164 | 165 | LL_distance, LR_distance = jf_space.compute_distance(left, 166 | right, 167 | LL_blocked, 168 | LR_blocked) 169 | 170 | # run greedy algorithm 171 | if self.verbose: 172 | print_log("Start running greedy algorithm.") 173 | 174 | optimizer = AutoFJMulticolGreedyAlgorithm( 175 | LL_distance, 176 | LR_distance, 177 | precision_target=self.precision_target, 178 | candidate_thresholds=self.distance_threshold_space, 179 | candidate_column_weights=self.column_weight_space, 180 | n_jobs=self.n_jobs 181 | ) 182 | 183 | self.selected_column_weights, self.selected_join_configs, LR_joins = \ 184 | optimizer.run() 185 | 186 | if LR_joins is None: 187 | print("Warning: The precision target cannot be achieved.", 188 | "Try a lower precision target or a larger space of join functions,", 189 | "distance thresholds and column weights.") 190 | LR_joins = pd.DataFrame(columns=[c+"_l" for c in left_table.columns]+ 191 | [c+"_r" for c in right_table.columns]) 192 | return LR_joins 193 | 194 | # merge with original left and right tables 195 | left_idx = [l for l, r in LR_joins] 196 | right_idx = [r for l, r in LR_joins] 197 | L = left_table.iloc[left_idx].add_suffix("_l").reset_index(drop=True) 198 | R = right_table.iloc[right_idx].add_suffix("_r").reset_index(drop=True) 199 | result = pd.concat([L, R], axis=1).sort_values(by=id_column + "_r") 200 | return result 201 | -------------------------------------------------------------------------------- /src/autofj/benchmark/ClericalAdministrativeRegion/right.csv: -------------------------------------------------------------------------------- 1 | id,title 2 | 0,Latin Patriarchate of Jerusalem 3 | 1,Baptist Union of Australia 4 | 2,Diocese of New Westminster 5 | 3,Roman Catholic Diocese of Sankt Gallen 6 | 4,Diocese of Gibraltar in Europe 7 | 5,Patriarch of Lisbon 8 | 6,Roman Catholic Diocese of Busan 9 | 7,Roman Catholic Diocese of Lafayette in Indiana 10 | 8,Episcopal see of Carthage 11 | 9,Roman Catholic Diocese of Kon Tum 12 | 10,Syro-Malabar Catholic Archeparchy of Changanassery 13 | 11,Syro-Malabar Catholic Major Archeparchy of Ernakulam-Angamaly 14 | 12,Diocese of Niassa 15 | 13,Eparchy of Gornji Karlovac 16 | 14,Syro-Malabar Catholic Eparchy of St. Thomas of Chicago 17 | 15,St. Thomas Syro-Malabar Catholic Eparchy of Chicago 18 | 16,Melkite Greek Catholic Archeparchy of Zahle and Forzol 19 | 17,Anglican Diocese of Eastern Newfoundland and Labrador 20 | 18,Anglican Diocese of Central Newfoundland 21 | 19,Anglican Diocese of Western Newfoundland 22 | 20,Anglican Diocese of Nova Scotia and Prince Edward Island 23 | 21,Diocese of Ripon 24 | 22,Anglican Diocese of Niagara 25 | 23,Anglican Diocese of Qu'Appelle 26 | 24,Ukrainian Catholic Eparchy of Saints Peter and Paul of Melbourne 27 | 25,Diocese of Jos 28 | 26,Diocese of Makurdi 29 | 27,Diocese of Yola 30 | 28,Syro-Malabar Catholic Archeparchy of Tellicherry 31 | 29,Roman Catholic Archdiocese of Valencia in Spain 32 | 30,Episcopal Church in Minnesota 33 | 31,Roman Catholic Archdiocese of Kingston in Canada 34 | 32,Apostolic Vicariate of Southern Arabia 35 | 33,Apostolic Vicariate of Iles Saint Pierre and Miquelon 36 | 34,Diocese of Namibia 37 | 35,Diocese of Natal 38 | 36,Syro-Malabar Catholic Archeparchy of Thrissur 39 | 37,Anglican Diocese of Grafton 40 | 38,Syro-Malabar Catholic Eparchy of Kanjirappally 41 | 39,Syro-Malabar Catholic Eparchy of Kothamangalam 42 | 40,Syro-Malabar Catholic Eparchy of Idukki 43 | 41,Syro-Malabar Catholic Eparchy of Belthangady 44 | 42,Romanian Catholic Eparchy of Cluj-Gherla 45 | 43,Syro-Malabar Catholic Eparchy of Rajkot 46 | 44,Syro-Malabar Catholic Eparchy of Sagar 47 | 45,Syro-Malabar Catholic Eparchy of Adilabad 48 | 46,Syro-Malabar Catholic Eparchy of Bijnor 49 | 47,Syro-Malabar Catholic Eparchy of Chanda 50 | 48,Syro-Malabar Catholic Eparchy of Gorakhpur 51 | 49,Syro-Malabar Catholic Eparchy of Kalyan 52 | 50,Syro-Malabar Catholic Eparchy of Irinjalakuda 53 | 51,Anglican Diocese of Bendigo 54 | 52,Anglican Diocese of North West Australia 55 | 53,Syro-Malabar Catholic Eparchy of Jagdalpur 56 | 54,Syro-Malabar Catholic Eparchy of Satna 57 | 55,Syro-Malabar Catholic Eparchy of Thamarassery 58 | 56,Syro-Malabar Catholic Eparchy of Thuckalay 59 | 57,Ukrainian Catholic Eparchy of the Holy Family of London 60 | 58,Apostolic Exarchate for Ukrainians in Great Britain 61 | 59,Melkite Greek Catholic Eparchy of Saint Michael Archangel in Sydney 62 | 60,Roman Catholic Diocese of Bathurst in Australia 63 | 61,Maronite Catholic Eparchy of Saint Maron of Sydney 64 | 62,Ukrainian Catholic Eparchy of Saint Vladimir the Great of Paris 65 | 63,"Apostolic Exarchate in France, Benelux and Switzerland for the Ukrainians" 66 | 64,Roman Catholic Archdiocese of Xalapa 67 | 65,Anglican Diocese of Keewatin 68 | 66,Anglican Diocese of British Columbia 69 | 67,Anglican Diocese of Ontario 70 | 68,Roman Catholic Archdiocese of Santiago de Guatemala 71 | 69,Apostolic Vicariate of Izabal 72 | 70,Roman Catholic Diocese of Victoria in Canada 73 | 71,Roman Catholic Diocese of Bathurst in Canada 74 | 72,Roman Catholic Diocese of Concordia in Argentina 75 | 73,Roman Catholic Archdiocese of La Plata in Argentina 76 | 74,Suburbicarian Diocese of Porto e Santa Rufina 77 | 75,Anglican Diocese of Pretoria 78 | 76,Anglican Diocese of Huron 79 | 77,Anglican Diocese of Arctic 80 | 78,Anglican Diocese of Rupert's Land 81 | 79,Anglican Diocese of Algoma 82 | 80,Archdiocese of Russian Orthodox churches in Western Europe 83 | 81,Ruthenian Catholic Archeparchy of Pittsburgh 84 | 82,Ruthenian Catholic Eparchy of Passaic 85 | 83,Ruthenian Catholic Eparchy of Parma 86 | 84,Byzantine Catholic Apostolic Exarchate of Serbia 87 | 85,Croatian Catholic Apostolic Exarchate of Serbia 88 | 86,Apostolic Exarchate of Serbia 89 | 87,Roman Catholic Archdiocese of Ho Chi Minh City 90 | 88,Roman Catholic Archdiocese of Mary Most Holy in Astana 91 | 89,Roman Catholic Diocese of Sora-Cassino-Aquino-Pontecorvo 92 | 90,Diocese of Novgorod 93 | 91,Archbishop of Novgorod 94 | 92,Diocese of Novgorod and Staraya Russa 95 | 93,"Roman Catholic Diocese of Santa Rosa, Argentina" 96 | 94,Roman Catholic Diocese of Port Harcourt 97 | 95,Anglican Diocese of San Joaquin 98 | 96,Roman Catholic Diocese of Bethlehem in South Africa 99 | 97,Roman Catholic Diocese of Pietersburg 100 | 98,Roman Catholic Diocese of Lwiza 101 | 99,Ukrainian Catholic Eparchy of Toronto 102 | 100,Diocese of Grahamstown 103 | 101,Roman Catholic Diocese of Aire and Dax 104 | 102,Roman Catholic Diocese of La Rochelle 105 | 103,Syro-Malabar Catholic Eparchy of Bhadravathi 106 | 104,Roman Catholic Diocese of Brugge 107 | 105,Roman Catholic Archdiocese of Dodoma 108 | 106,Chaldean Catholic Eparchy of Saint Peter the Apostle of San Diego 109 | 107,"Roman Catholic Archdiocese of Naxos, Tinos, Andros and Mykonos" 110 | 108,Diocese of Medak of the Church of South India 111 | 109,Roman Catholic Archdiocese of Toamasina 112 | 110,Roman Catholic Diocese of Alessandria 113 | 111,Italo-Albanese Eparchy of Piana degli Albanesi 114 | 112,Slovak Catholic Eparchy of Bratislava 115 | 113,Albanian Catholic Apostolic Administration of Southern Albania 116 | 114,Roman Catholic Diocese of Syros 117 | 115,Romanian Catholic Eparchy of Oradea Mare 118 | 116,Roman Catholic Archdiocese of Cuzco 119 | 117,Roman Catholic Archdiocese of Cartagena in Colombia 120 | 118,Roman Catholic Diocese of San Juan de Calama 121 | 119,Maronite Catholic Eparchy of Saint Maron of Montreal 122 | 120,Eparchy of Saint Maron of Montreal Maronites 123 | 121,Anglican Diocese of Zululand 124 | 122,Roman Catholic Diocese of Coari 125 | 123,Roman Catholic Diocese of Nicopoli 126 | 124,Anglican Diocese of Athabasca 127 | 125,Anglican Diocese of Saint Helena 128 | 126,Roman Catholic Diocese of Trincomalee 129 | 127,Diocese of Lucknow of the Church of North India 130 | 128,Roman Catholic Diocese of Jalandhar 131 | 129,Roman Catholic Diocese of Simla and Chandigarh 132 | 130,Diocese of Angola 133 | 131,Diocese of Lebombo 134 | 132,Anglican Diocese of Lesotho 135 | 133,Latin Catholic Diocese of Punalur 136 | 134,Diocese of the Highveld 137 | 135,Diocese of Swaziland 138 | 136,Diocese of St Mark the Evangelist 139 | 137,Diocese of Mpumalanga 140 | 138,Latin Catholic Archdiocese of Baghdad 141 | 139,Anglican Diocese of Fredericton 142 | 140,Anglican Diocese of Saskatchewan 143 | 141,Anglican Diocese of Caledonia 144 | 142,Anglican Diocese of Brandon 145 | 143,Anglican Diocese of Kootenay 146 | 144,Anglican Diocese of Yukon 147 | 145,Territorial Prelature of the Mission de France at Pontigny 148 | 146,Apostolic Vicariate of Anatolia 149 | 147,Diocese of Karimnagar of the Church of South India 150 | 148,Diocese of Coimbatore of the Church of South India 151 | 149,Diocese of Coimbatore 152 | 150,Ukrainian Catholic Archeparchy of Ivano-Frankivsk 153 | 151,Ruthenian Catholic Eparchy of Mukacheve 154 | 152,Ruthenian Catholic Apostolic Exarchate of Czech Republic 155 | 153,Trichy-Tanjore Diocese of the Church of South India 156 | 154,Roman Catholic Diocese of Yarmouth 157 | 155,Military Ordinariate of the South African Defence Force 158 | 156,Military Ordinariate of Bolivia 159 | 157,Military Ordinariate of Chile 160 | 158,Military Ordinariate of Colombia 161 | 159,Military Ordinariate of the Dominican Republic 162 | 160,Military Bishopric of Dominican Republic 163 | 161,Military Ordinariate of Ecuador 164 | 162,Military Ordinariate of El Salvador 165 | 163,Military Ordinariate of Paraguay 166 | 164,Military Ordinariate of Peru 167 | 165,Military Ordinariate of Italy 168 | 166,Syro-Malankara Catholic Eparchy of the United States of America and Canada 169 | 167,Syro-Malankara Catholic Apostolic Exarchate in the United States 170 | 168,Metropolis of Servia and Kozani 171 | 169,Chaldean Catholic Archeparchy of Amida 172 | 170,Syro-Malabar Catholic Eparchy of Ramanathapuram 173 | 171,Diocese of South Kerala of the Church of South India 174 | 172,"Metropolitanate of Zagreb, Ljubljana and all Italy" 175 | 173,Roman Catholic Diocese of Roskilde 176 | 174,Diocese of Madhya Kerala of the Church of South India 177 | 175,Melkite Greek Catholic Archeparchy of Beirut and Jbeil 178 | 176,Syro-Malabar Catholic Eparchy of Mandya 179 | 177,Diocese of Iran 180 | 178,Archbishopric of Belgrade and Karlovci 181 | 179,Maronite Catholic Eparchy of Our Lady of Lebanon of Paris 182 | 180,Melkite Greek Catholic Archeparchy of Akka 183 | 181,Sufes 184 | 182,Maronite Catholic Eparchy of Saint Maron of Brooklyn 185 | 183,Armenian Catholic Eparchy of Our Lady of Nareg in the United States of America and Canada 186 | 184,Maronite Catholic Eparchy of San Charbel in Buenos Aires 187 | 185,Roman Catholic Apostolic Vicariate of El Beni 188 | 186,Roman Catholic Vicariate Apostolic of El Beni 189 | 187,Melkite Greek Catholic Patriarchal Archeparchy of Jerusalem 190 | 188,Melkite Greek Catholic Archeparchy of Jerusalem 191 | 189,Maronite Catholic Archeparchy of Haifa and the Holy Land 192 | -------------------------------------------------------------------------------- /src/autofj/benchmark/Artwork/right.csv: -------------------------------------------------------------------------------- 1 | id,title 2 | 0,Portlandia 3 | 1,La Danse (Bouguereau) 4 | 2,La Danse 5 | 3,The Return of Spring 6 | 4,The Night Watch (painting) 7 | 5,Guernica (Picasso) 8 | 6,The Chess Players (Eakins painting) 9 | 7,Alone in the World (Bouguereau) 10 | 8,Alone in the World 11 | 9,The Gilded Cage (Evelyn De Morgan painting) 12 | 10,The Virgin and Child with St. Anne (Leonardo da Vinci) 13 | 11,Adoration of the Magi (Leonardo da Vinci) 14 | 12,Polyptych of the Misericordia (Piero della Francesca) 15 | 13,"The Bride Stripped Bare by Her Bachelors, Even" 16 | 14,"Number 11, 1952 (painting)" 17 | 15,Bijin-ga 18 | 16,The Shepherdess (1889) 19 | 17,Portrait of Bindo Altoviti 20 | 18,La Fornarina 21 | 19,The Art of Painting (Vermeer) 22 | 20,Madonna of Foligno 23 | 21,Self-portrait with a friend 24 | 22,Madonna of the Book 25 | 23,Christ Falling on the Way to Calvary 26 | 24,Boy Bitten by a Lizard 27 | 25,The Little Street (Vermeer) 28 | 26,Neptune and Triton 29 | 27,Charity with Four Children 30 | 28,The Death of Sardanapalus 31 | 29,"Saint Jerome Writing (Caravaggio, Valletta)" 32 | 30,The Bohemian (Bouguereau) 33 | 31,The Bohemian 34 | 32,San Giorgio Maggiore at Dusk 35 | 33,Stone Flower (sculpture) 36 | 34,Sacred Cod 37 | 35,Farms near Auvers 38 | 36,The Artist's Studio 39 | 37,La maja vestida 40 | 38,Declaration of Independence (Trumbull) 41 | 39,The Roulin Family 42 | 40,Flowering Orchards 43 | 41,Sleeping Hermaphroditus 44 | 42,St. Michael Vanquishing Satan (Raphael) 45 | 43,Battle of Ostia (Raphael's painting) 46 | 44,"Adoration of the Magi (Bosch, Madrid)" 47 | 45,"Christ Carrying the Cross (Bosch, Vienna)" 48 | 46,The Hermit Saint 49 | 47,Blessed Ludovica Albertoni 50 | 48,Haystacks (Monet series) 51 | 49,The Monarch of the Glen (painting) 52 | 50,Corpus (Bernini) 53 | 51,28th Regiment at Quatre Bras (painting) 54 | 52,Portrait of a Princess (Pisanello) 55 | 53,Portrait of a princess (Pisanello) 56 | 54,Rouen Cathedral (Monet series) 57 | 55,April Love (painting) 58 | 56,The Open Window (Matisse) 59 | 57,"Queen Victoria Statue, Bristol" 60 | 58,Queen Victoria Statue 61 | 59,Boreas (painting) 62 | 60,Put Down Your Whip (painting) 63 | 61,St. Peter's baldachin 64 | 62,Cigarette (sculpture) 65 | 63,Gloria Victis (sculpture) 66 | 64,The Prodigal Son in the Brothel 67 | 65,Enigma of the Hour 68 | 66,Dippy 69 | 67,Large Interior Form 70 | 68,Saint Augustine (Botticelli) 71 | 69,Portrait of a Lady Known as Smeralda Brandini 72 | 70,Portrait of a Lady known as Smeralda Brandini 73 | 71,Non Violence 74 | 72,Young Knight in a Landscape 75 | 73,Butcher's Shop 76 | 74,Landscape with the Flight into Egypt 77 | 75,Landscape with the Flight into Egypt (Annibale Carracci) 78 | 76,Salting Madonna 79 | 77,The Judgement of Solomon (Giorgione) 80 | 78,Saint Augustine in His Cell (Botticelli) 81 | 79,Resurrection (Piero della Francesca) 82 | 80,Portrait of Adele Bloch-Bauer II 83 | 81,Manfred on the Jungfrau (Martin) 84 | 82,The Black Brunswicker (Millais) 85 | 83,Manfred on the Jungfrau (Madox Brown) 86 | 84,The Great Day of His Wrath 87 | 85,The Sortie Made by the Garrison of Gibraltar 88 | 86,Pinkie (painting) 89 | 87,The pioneer (painting) 90 | 88,Maman (sculpture) 91 | 89,The Goose Girl (Bouguereau) 92 | 90,Lincoln Monument 93 | 91,St. Jerome in the Wilderness (Leonardo) 94 | 92,The Bookworm (painting) 95 | 93,The Tribute Money 96 | 94,Frog Baby Fountain 97 | 95,Bathsheba at Her Bath 98 | 96,Walking on a mountain path in spring 99 | 97,Crucifixion with the Virgin and St John 100 | 98,The Crucifixion with the Virgin and St John (Hendrick ter Brugghen) 101 | 99,The Fortune Teller (de La Tour painting) 102 | 100,Aristotle with a Bust of Homer 103 | 101,The Needlewoman (painting) 104 | 102,George Washington (statue) 105 | 103,George Washington (1840 statue) 106 | 104,La Parisienne 107 | 105,Annunciation of Ustyug 108 | 106,Andromeda Chained to the Rocks 109 | 107,Garden at Sainte-Adresse 110 | 108,Insane Woman (La Monomane de l'envie) 111 | 109,Langlois Bridge at Arles 112 | 110,The Jester Don John of Austria 113 | 111,Prince Balthasar Charles with a Dwarf 114 | 112,Old Woman Frying Eggs 115 | 113,Convergence (Pollock) 116 | 114,"Portrait of a Young Woman (Vermeer, New York)" 117 | 115,Abraham Lincoln (1920 statue) 118 | 116,The Falconer (Simonds) 119 | 117,The Falconer (sculpture) 120 | 118,Traffic Light Tree 121 | 119,Iron Man (Minnesota statue) 122 | 120,Barrow (sculpture) 123 | 121,26 October 1993 124 | 122,"Adoration of the Magi (Rubens, Cambridge)" 125 | 123,Portrait of a Man in Red Chalk 126 | 124,Self-portrait (Leonardo da Vinci) 127 | 125,Portrait of a man in red chalk (Leonardo) 128 | 126,Bathsheba at Bath 129 | 127,La Parisienne (Hidalgo painting) 130 | 128,Farmhouses Among Trees 131 | 129,"Two Open Rectangles, Excentric, Variation VI" 132 | 130,Penitent Magdalene (Donatello) 133 | 131,Self-Portrait with Two Circles 134 | 132,Self-Portrait with Beret and Turned-Up Collar 135 | 133,"Portrait of a Young Woman (Botticelli, Frankfurt)" 136 | 134,View of Delft (Vermeer) 137 | 135,A Girl Asleep (Vermeer) 138 | 136,Portrait of Pope Julius II 139 | 137,Abraham Lincoln (1912 statue) 140 | 138,1.26 141 | 139,Madonna and Child Playing with the Veil 142 | 140,A Road at Saint-Remy with Female Figure 143 | 141,A Lane near Arles 144 | 142,A Young Tiger Playing with Its Mother 145 | 143,The Judgment of Paris (Rubens) 146 | 144,Battle of Vigo Bay (painting) 147 | 145,Christopher Columbus (Vittori) 148 | 146,George Washington (DeLue) 149 | 147,Edwin B. Hay (bust) 150 | 148,Eight Stone Lions 151 | 149,"Sundial, Boy with Spider" 152 | 150,George Washington (1785-1792 statue) 153 | 151,Francesca da Rimini and Paolo Malatesta Appraised by Dante and Virgil 154 | 152,R. D. Whitehead Monument 155 | 153,Lieutenant General George Washington (statue) 156 | 154,Aurora (di Suvero) 157 | 155,Alexander Pushkin (Bourganov) 158 | 156,"Philip Jaisohn (Washington, DC)" 159 | 157,Seated Woman 160 | 158,Still Life: Vase with Pink Roses 161 | 159,"Bottle, Glass, Fork" 162 | 160,On the wallaby track 163 | 161,The Last Supper (Ghirlandaio) 164 | 162,Almond Blossoms 165 | 163,Hospital in Arles 166 | 164,Saint Francis Receiving the Stigmata (Giotto) 167 | 165,Church Pew with Worshippers 168 | 166,Hermes (Sculpture) 169 | 167,The Great Day of Girona 170 | 168,A Young Man Being Introduced to the Seven Liberal Arts 171 | 169,Venus and the Three Graces Presenting Gifts to a Young Woman (Botticelli) 172 | 170,Saint Jerome in His Study (after van Eyck) 173 | 171,Portrait of the Duke of Wellington 174 | 172,"The Entombment (Titian, 1525)" 175 | 173,Captain Nathan Hale 176 | 174,Drunkenness of Noah 177 | 175,Martyrdom of Saint Lawrence 178 | 176,Bust of Thomas Baker 179 | 177,"Statue of Yuri Gagarin, Greenwich" 180 | 178,Two Busts of Cardinal Scipione Borghese 181 | 179,Lamentation of Christ (van der Weyden) 182 | 180,Head of a Woman (Leonardo da Vinci) 183 | 181,Memorial to Maria Raggi 184 | 182,Babe's Dream 185 | 183,"Equestrian statue of Charles I, Charing Cross" 186 | 184,The Virgin and Child with Two Angels (Andrea del Verrochio) 187 | 185,Madonna and Child (Lippi) 188 | 186,The Hope of a Condemned Man 189 | 187,Cafe Gurzuf 190 | 188,Salvator Mundi (Leonardo da Vinci) 191 | 189,Saint Sebastian (Bernini) 192 | 190,Bust of Francesco I d'Este 193 | 191,Bust of Camilla Barbadoni 194 | 192,Bust of Francesco Barberini 195 | 193,Bust of Cardinal Richelieu 196 | 194,Bust of Cardinal Richilieu 197 | 195,Saint Bibiana (Bernini) 198 | 196,Double Ascension 199 | 197,James Garfield Memorial 200 | 198,Robert Burns (Stevenson) 201 | 199,Fallen Firefighters Memorial (Wu) 202 | 200,General Thaddeus Kosciuszko 203 | 201,Wandering Rocks (Smith) 204 | 202,Fishing (Carracci) 205 | 203,Hunting (Carracci) 206 | 204,The Beggars (Bruegel) 207 | 205,Magdalene with the Smoking Flame 208 | 206,Pia de' Tolomei (Rossetti painting) 209 | 207,Thatched Cottages and Houses 210 | 208,"Statue of Margaret Thatcher, Palace of Westminster" 211 | 209,Adrianus Jacobus Zuyderland 212 | 210,Self-Portrait as the Allegory of Painting 213 | 211,Trio (Sugarman) 214 | 212,Queen Califias Magic Circle 215 | 213,Frontal from La Seu d'Urgell or of The Apostles 216 | 214,The Consecration of Saint Augustine 217 | 215,"South Wind, Clear Sky" 218 | 216,"The Elder Sister (Bouguereau, 1869)" 219 | 217,Bust of Pope Paul V 220 | 218,"Ritual wine server (guang), Indianapolis" 221 | 219,Ritual wine server 222 | 220,"Ritual wine server (guang), Indianapolis Museum of Art, 60.43" 223 | 221,Jupiter and Antiope (van Dyck) 224 | 222,Saint Peter and Saint Paul (El Greco) 225 | 223,In the Loge 226 | 224,The Descent from the Cross (David Folley) 227 | 225,"Equestrian statue of George IV, Trafalgar Square" 228 | 226,Footballer (Nolan) 229 | 227,Bust of Alessandro Peretti di Montalto 230 | 228,White on White 231 | 229,Apse from the Carthedral of Urgell 232 | 230,Middlebury to Her Soldiers 233 | 231,Statue of Pope Clement X 234 | 232,Les Orangers 235 | 233,John Harvard (statue) 236 | 234,Before the Race 237 | 235,The Potato Harvest 238 | 236,Allegory of the Element Earth 239 | 237,The Duel After the Masquerade 240 | 238,The Archdukes Albert and Isabella Visiting a Collector's Cabinet 241 | 239,Angel of the Resurrection (Tiffany Studios stained glass window) 242 | 240,Charing Cross Bridge (Monet series) 243 | 241,Wall Street (photograph) 244 | 242,(Untitled) Blue Lady 245 | 243,First Steps (painting) 246 | 244,Sumbanese woman's ceremonial skirt (Indianapolis Museum of Art) 247 | --------------------------------------------------------------------------------