├── tests
├── __init__.py
└── test_autofj.py
├── src
├── autofj
│ ├── blocker
│ │ ├── __init__.py
│ │ └── blocker.py
│ ├── optimizer
│ │ └── __init__.py
│ ├── join_function_space
│ │ ├── __init__.py
│ │ ├── join_function
│ │ │ ├── __init__.py
│ │ │ ├── tokenizer.py
│ │ │ ├── join_function.py
│ │ │ ├── preprocessor.py
│ │ │ ├── token_weight.py
│ │ │ └── distance_function.py
│ │ └── options.py
│ ├── __init__.py
│ ├── benchmark
│ │ ├── Galaxy
│ │ │ ├── right.csv
│ │ │ └── gt.csv
│ │ ├── NationalFootballLeagueSeason
│ │ │ ├── right.csv
│ │ │ └── gt.csv
│ │ ├── TennisTournament
│ │ │ ├── right.csv
│ │ │ └── gt.csv
│ │ ├── NCAATeamSeason
│ │ │ ├── right.csv
│ │ │ └── gt.csv
│ │ ├── ArtificialSatellite
│ │ │ ├── right.csv
│ │ │ └── gt.csv
│ │ ├── NaturalEvent
│ │ │ ├── right.csv
│ │ │ └── gt.csv
│ │ ├── Enzyme
│ │ │ ├── right.csv
│ │ │ └── gt.csv
│ │ ├── RugbyLeague
│ │ │ ├── right.csv
│ │ │ └── gt.csv
│ │ ├── SoccerClubSeason
│ │ │ ├── right.csv
│ │ │ └── gt.csv
│ │ ├── FootballMatch
│ │ │ ├── right.csv
│ │ │ └── gt.csv
│ │ ├── GivenName
│ │ │ ├── right.csv
│ │ │ └── gt.csv
│ │ ├── Drug
│ │ │ ├── right.csv
│ │ │ └── gt.csv
│ │ ├── BasketballTeam
│ │ │ ├── right.csv
│ │ │ └── gt.csv
│ │ ├── Race
│ │ │ └── right.csv
│ │ ├── ShoppingMall
│ │ │ ├── left.csv
│ │ │ ├── right.csv
│ │ │ └── gt.csv
│ │ ├── Monarch
│ │ │ └── right.csv
│ │ ├── Magazine
│ │ │ └── right.csv
│ │ ├── SoccerLeague
│ │ │ └── right.csv
│ │ ├── Legislature
│ │ │ └── right.csv
│ │ ├── Country
│ │ │ └── right.csv
│ │ ├── ClericalAdministrativeRegion
│ │ │ └── right.csv
│ │ └── Artwork
│ │ │ └── right.csv
│ ├── utils.py
│ ├── datasets.py
│ ├── 50-single-column-datasets.md
│ ├── negative_rule.py
│ └── autofj.py
└── autofj.egg-info
│ ├── dependency_links.txt
│ ├── top_level.txt
│ ├── requires.txt
│ ├── SOURCES.txt
│ └── PKG-INFO
├── .gitignore
├── MANIFEST.in
├── dist
├── autofj-0.0.6.tar.gz
└── autofj-0.0.6-py3-none-any.whl
├── pyproject.toml
├── setup.py
├── LISENCE
└── README.md
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/autofj/blocker/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/autofj/optimizer/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | *.pyc
3 |
--------------------------------------------------------------------------------
/src/autofj/join_function_space/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/autofj.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/src/autofj.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | autofj
2 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include src/autofj/benchmark/ *
--------------------------------------------------------------------------------
/src/autofj/__init__.py:
--------------------------------------------------------------------------------
1 | from .autofj import AutoFJ
2 |
--------------------------------------------------------------------------------
/src/autofj/join_function_space/join_function/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/autofj.egg-info/requires.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | pandas
3 | nltk
4 | ngram
5 | editdistance
6 | jellyfish
7 | spacy
8 |
--------------------------------------------------------------------------------
/dist/autofj-0.0.6.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chu-data-lab/AutomaticFuzzyJoin/HEAD/dist/autofj-0.0.6.tar.gz
--------------------------------------------------------------------------------
/dist/autofj-0.0.6-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chu-data-lab/AutomaticFuzzyJoin/HEAD/dist/autofj-0.0.6-py3-none-any.whl
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [
3 | "setuptools>=42",
4 | "wheel"
5 | ]
6 | build-backend = "setuptools.build_meta"
--------------------------------------------------------------------------------
/src/autofj/benchmark/Galaxy/right.csv:
--------------------------------------------------------------------------------
1 | id,title
2 | 0,Sagittarius Dwarf Spheroidal Galaxy
3 | 1,RX J1242-11
4 | 2,Canis Major Overdensity
5 | 3,Carina Dwarf Spheroidal Galaxy
6 | 4,NGC 34
7 | 5,NGC 6872
8 | 6,GR 8
9 | 7,NGC 1265
10 | 8,3C 433
11 | 9,MS 1512-cB58
12 | 10,NGC 935/IC 1801
13 | 11,Arp 302
14 | 12,DDO 169
15 | 13,Segue 2
16 | 14,NGC 5562
17 | 15,DDO 190
18 | 16,Carina Dwarf Spheroidal galaxy
19 |
--------------------------------------------------------------------------------
/src/autofj/benchmark/NationalFootballLeagueSeason/right.csv:
--------------------------------------------------------------------------------
1 | id,title
2 | 0,1900 Homestead Library & Athletic Club football season
3 | 1,2003 FC Barcelona Dragons season
4 | 2,1905 Canton Athletic Club season
5 | 3,1911 Canton Professionals season
6 | 4,1912 Canton Professionals season
7 | 5,1914 Canton Professionals season
8 | 6,1913 Canton Professionals season
9 | 7,2002 FC Barcelona Dragons season
10 | 8,1921 Detroit Tigers season (NFL)
11 | 9,1996 Minnesota Fighting Pike Season
12 |
--------------------------------------------------------------------------------
/src/autofj/utils.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | import os
3 |
def print_log(message):
    """Print a message prefixed with the current local time.

    Parameters
    ----------
    message: object
        Content to print; it is rendered with its default string formatting.
    """
    # The timestamp has second resolution and is followed directly by a colon
    timestamp = datetime.now().strftime('%H:%M:%S')
    print(f"{timestamp}:{message}")
6 |
def makedir(dir_list, file=None, remove_old_dir=False):
    """Create a directory from path components and return the resulting path.

    Parameters
    ----------
    dir_list: list of string
        Path components that are joined to form the directory path.

    file: string, optional
        A file name appended to the returned path. The file itself is not
        created; only its parent directory is.

    remove_old_dir: bool
        If True, an already existing directory is deleted (with all of its
        content) before being re-created. This only happens when file is
        None, so asking for a file path never wipes its parent directory.

    Returns
    -------
    save_dir: string
        The directory path, or the path of file inside that directory when
        file is given.
    """
    # Imported locally because the module header does not import shutil;
    # without it the rmtree call below raised NameError.
    import shutil

    save_dir = os.path.join(*dir_list)

    if remove_old_dir and os.path.exists(save_dir) and file is None:
        shutil.rmtree(save_dir)

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    if file is not None:
        save_dir = os.path.join(save_dir, file)
    return save_dir
--------------------------------------------------------------------------------
/src/autofj/datasets.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import os
3 | from os.path import dirname
4 |
def load_data(name):
    """Load one of the benchmark datasets shipped next to this module.

    Parameters
    ----------
    name: string
        Name of the dataset, i.e. the name of a sub-directory of the
        "benchmark" directory located beside this file.

    Returns
    -------
    left_table, right_table, gt_table: pd.DataFrame
        The content of left.csv, right.csv and gt.csv of the dataset.

    Raises
    ------
    Exception
        If the dataset directory does not exist.
    """
    dataset_dir = os.path.join(dirname(__file__), "benchmark", name)

    # Fail early when the requested dataset directory is missing
    if not os.path.exists(dataset_dir):
        raise Exception("Dataset {} does not exist.".format(name))

    left_table, right_table, gt_table = (
        pd.read_csv(os.path.join(dataset_dir, csv_name))
        for csv_name in ("left.csv", "right.csv", "gt.csv")
    )
    return left_table, right_table, gt_table
14 |
--------------------------------------------------------------------------------
/src/autofj/benchmark/Galaxy/gt.csv:
--------------------------------------------------------------------------------
1 | id_l,title_l,id_r,title_r
2 | 10,Sagittarius Dwarf Elliptical Galaxy,0,Sagittarius Dwarf Spheroidal Galaxy
3 | 14,RXJ1242-11,1,RX J1242-11
4 | 65,Canis Major Dwarf Galaxy,2,Canis Major Overdensity
5 | 94,Carina Dwarf,3,Carina Dwarf Spheroidal Galaxy
6 | 94,Carina Dwarf,16,Carina Dwarf Spheroidal galaxy
7 | 104,NGC 17,4,NGC 34
8 | 190,NGC 6872 and IC 4970,5,NGC 6872
9 | 213,UGC 8091,6,GR 8
10 | 317,3C 83.1B,7,NGC 1265
11 | 324,QSO B2121+248,8,3C 433
12 | 330,MS 1512 +36-cB58,9,MS 1512-cB58
13 | 354,Arp 276,10,NGC 935/IC 1801
14 | 370,UGC 9618,11,Arp 302
15 | 423,UGC 8331,12,DDO 169
16 | 446,Segue 2 (dwarf galaxy),13,Segue 2
17 | 509,NGC 5662,14,NGC 5562
18 | 528,UGC 9240,15,DDO 190
19 |
--------------------------------------------------------------------------------
/src/autofj/benchmark/NationalFootballLeagueSeason/gt.csv:
--------------------------------------------------------------------------------
1 | id_l,title_l,id_r,title_r
2 | 1976,1921 Detroit Tigers (NFL) season,8,1921 Detroit Tigers season (NFL)
3 | 2057,1900 Homestead Library & Athletic Club football team,0,1900 Homestead Library & Athletic Club football season
4 | 2339,2003 Barcelona Dragons season,1,2003 FC Barcelona Dragons season
5 | 2469,1905 Canton Bulldogs season,2,1905 Canton Athletic Club season
6 | 2471,1911 Canton Bulldogs season,3,1911 Canton Professionals season
7 | 2477,1912 Canton Bulldogs season,4,1912 Canton Professionals season
8 | 2478,1914 Canton Bulldogs season,5,1914 Canton Professionals season
9 | 2479,1913 Canton Bulldogs season,6,1913 Canton Professionals season
10 | 2634,2002 Barcelona Dragons season,7,2002 FC Barcelona Dragons season
11 | 2647,1996 Minnesota Fighting Pike season,9,1996 Minnesota Fighting Pike Season
12 |
--------------------------------------------------------------------------------
/src/autofj/blocker/blocker.py:
--------------------------------------------------------------------------------
class Blocker(object):
    """Template for customized blockers.

    A customized blocker must provide a block method with the signature
    shown below. The constructor can be overridden.
    """

    def __init__(self):
        pass

    def block(self, left_table, right_table, id_column):
        """Perform blocking on two tables.

        Parameters
        ----------
        left_table: pd.DataFrame
            Reference table. The left table is assumed to be almost
            duplicate-free, which means it has no or only a few duplicates.

        right_table: pd.DataFrame
            Another input table.

        id_column: string
            The name of the id column in the two tables.

        Returns
        -------
        result: pd.DataFrame
            A table of the record pairs that survived blocking, with columns
            named id_column + "_l" and id_column + "_r". This template
            performs no blocking and returns None.
        """
        return None
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 |
# The README doubles as the long description of the distribution
with open("README.md", mode="r", encoding="utf-8") as readme_file:
    long_description = readme_file.read()

setuptools.setup(
    # --- Project metadata ---
    name="autofj",
    version="0.0.6",
    author="Peng Li",
    author_email="lipengpublic@gmail.com",
    description="Auto-Program Fuzzy Similarity Joins Without Labeled Examples",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/chu-data-lab/AutomaticFuzzyJoin",
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    # --- Package discovery: sources live under src/ ---
    package_dir={"": "src"},
    packages=setuptools.find_packages(where="src"),
    include_package_data=True,
    # --- Runtime requirements ---
    python_requires=">=3.7",
    install_requires=[
        "numpy",
        "pandas",
        "nltk",
        "ngram",
        "editdistance",
        "jellyfish",
        "spacy",
    ],
)
--------------------------------------------------------------------------------
/src/autofj/benchmark/TennisTournament/right.csv:
--------------------------------------------------------------------------------
1 | id,title
2 | 0,2000 Tennis Masters Cup and ATP Tour World Championships
3 | 1,2008 Challenge Bell
4 | 2,2010 Aegon Championships
5 | 3,2009 Challenge Bell
6 | 4,1996 Challenge Bell
7 | 5,1997 Challenge Bell
8 | 6,1998 Challenge Bell
9 | 7,2010 Challenger Banque Nationale de Rimouski
10 | 8,2009 San Benedetto Tennis Cup
11 | 9,2010 Challenge Bell
12 | 10,2010 Blu-express.com Tennis Cup
13 | 11,2011 Challenger Banque Nationale de Rimouski
14 | 12,2011 Aegon Championships
15 | 13,2011 Aegon Classic
16 | 14,2011 Challenge Bell
17 | 15,2011 Challenger Banque Nationale de Saguenay
18 | 16,2012 Challenger Banque Nationale de Rimouski
19 | 17,2012 Aegon Championships
20 | 18,2012 Challenge Bell
21 | 19,2012 Blu-express.com Tennis Cup
22 | 20,2012 Korea Open
23 | 21,2012 Arimex Challenger Trophy
24 | 22,2012 Challenger Banque Nationale de Saguenay
25 | 23,2012 Aegon Pro-Series Loughborough
26 | 24,2013 Garanti Koza WTA Tournament of Champions
27 | 25,2013 Challenger Banque Nationale de Rimouski
28 | 26,2012 Commonwealth Bank Tournament of Champions
29 |
--------------------------------------------------------------------------------
/LISENCE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2018 The Python Packaging Authority
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | SOFTWARE.
--------------------------------------------------------------------------------
/tests/test_autofj.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import time
3 | import os
4 | from autofj import AutoFJ
5 | from autofj.datasets import load_data
6 |
def evaluate(pred_joins, gt_joins):
    """Evaluate the performance of fuzzy joins.

    Parameters
    ----------
    pred_joins: iterable
        Pairs (id_l, id_r) that are predicted to be matches. Duplicated
        pairs are counted once.

    gt_joins: iterable
        Pairs (id_l, id_r) of the ground truth matches.

    Returns
    -------
    precision: float
        Precision score. 0.0 if there is no prediction.

    recall: float
        Recall score. 0.0 if the ground truth is empty.

    f1: float
        F1 score. 0.0 if precision and recall are both zero.
    """
    pred = {(id_l, id_r) for id_l, id_r in pred_joins}
    gt = {(id_l, id_r) for id_l, id_r in gt_joins}
    num_tp = len(pred & gt)

    # Guard every ratio: an empty prediction set, an empty ground truth or
    # zero true positives previously raised ZeroDivisionError.
    precision = num_tp / len(pred) if pred else 0.0
    recall = num_tp / len(gt) if gt else 0.0

    score_sum = precision + recall
    f1 = 2 * precision * recall / score_sum if score_sum else 0.0
    return precision, recall, f1
37 |
def test_autofj(dataset):
    """Run AutoFJ on a benchmark dataset and print its join quality.

    Parameters
    ----------
    dataset: string
        Name of the benchmark dataset passed to load_data.
    """
    left, right, gt = load_data(dataset)

    joiner = AutoFJ(verbose=True)
    predicted = joiner.join(left, right, id_column="id")
    print(predicted)

    # Only the id pairs are compared against the ground truth
    id_columns = ["id_l", "id_r"]
    p, r, f1 = evaluate(predicted[id_columns].values, gt[id_columns].values)
    print("Precision:", p, "Recall:", r, "F1:", f1)


if __name__ == '__main__':
    test_autofj("TennisTournament")
51 |
52 |
--------------------------------------------------------------------------------
/src/autofj/benchmark/NCAATeamSeason/right.csv:
--------------------------------------------------------------------------------
1 | id,title
2 | 0,1893 LSU football team
3 | 1,1894 LSU football team
4 | 2,1895 LSU football team
5 | 3,1892 Alabama Cadets football team
6 | 4,1894 Alabama Crimson White football team
7 | 5,1893 Alabama Crimson White football team
8 | 6,1895 Alabama Crimson White football team
9 | 7,1898 Alabama Crimson White football team
10 | 8,1897 Alabama Crimson White football team
11 | 9,1896 Alabama Crimson White football team
12 | 10,1899 Alabama Crimson White football team
13 | 11,1900 Alabama Crimson White football team
14 | 12,1901 Alabama Crimson White football team
15 | 13,1902 Alabama Crimson White football team
16 | 14,1903 Alabama Crimson White football team
17 | 15,1904 Alabama Crimson White football team
18 | 16,1906 Alabama Crimson White football team
19 | 17,1905 Alabama Crimson White football team
20 | 18,1906 Arkansas Cardinals football team
21 | 19,2011 Austin Peay Governors football team
22 | 20,1907 Notre Dame football team
23 | 21,2012 Austin Peay Governors football team
24 | 22,1919 Washington Sun Dodgers football team
25 | 23,2013 Hawaii Rainbow Warriors football team
26 | 24,1887 Notre Dame football team
27 | 25,2002 Florida Atlantic Owls baseball team
28 | 26,1974 Oregon Ducks football team
29 | 27,1964 Oregon Ducks football team
30 | 28,1963 Oregon Ducks football team
31 | 29,2010 FIU Golden Panthers football team
32 | 30,1975 Oregon Ducks football team
33 | 31,2011 FIU Golden Panthers football team
34 | 32,1979 UCF Golden Knights football team
35 | 33,2012 FIU Golden Panthers football team
36 |
--------------------------------------------------------------------------------
/src/autofj/benchmark/ArtificialSatellite/right.csv:
--------------------------------------------------------------------------------
1 | id,title
2 | 0,Soyuz 7K-T No.39
3 | 1,Pad Abort Test 1
4 | 2,AS-101 (spacecraft)
5 | 3,AS-102 (spacecraft)
6 | 4,Spirit rover
7 | 5,Opportunity rover
8 | 6,UO-11
9 | 7,AS-104 (spacecraft)
10 | 8,AS-103 (spacecraft)
11 | 9,AS-105 (spacecraft)
12 | 10,Pad Abort Test 2
13 | 11,Soyuz 7K-ST No. 16L
14 | 12,SpaceShipOne Flight 17P
15 | 13,Nuclear Spectroscopic Telescope Array
16 | 14,Eutelsat 33C
17 | 15,Seasat
18 | 16,Foton-M No.2
19 | 17,CHAMP (satellite)
20 | 18,Kosmos 605
21 | 19,Kosmos 1667
22 | 20,CUTE-1.7 + APD
23 | 21,Eutelsat 31A
24 | 22,Eutelsat 16B
25 | 23,GEOTAIL
26 | 24,Fengyun 2D
27 | 25,ACRIMSAT
28 | 26,Resurs-DK No.1
29 | 27,HYLAS
30 | 28,Landsat 8
31 | 29,PROBA2
32 | 30,Solwind
33 | 31,UoSAT-1
34 | 32,UoSat-OSCAR 9
35 | 33,ABRIXAS
36 | 34,ABS-3
37 | 35,Sentinel-3
38 | 36,Soil Moisture and Ocean Salinity
39 | 37,MightySat-2.1
40 | 38,Aditya (satellite)
41 | 39,Eutelsat 48D
42 | 40,Eutelsat 48B
43 | 41,Afghansat 1
44 | 42,Hot Bird 13C
45 | 43,AMC-1
46 | 44,Prisma (satellite project)
47 | 45,Eutelsat 4A
48 | 46,CP6 (satellite)
49 | 47,Eutelsat 113 West A
50 | 48,Orion 3
51 | 49,Azerspace
52 | 50,LightSail 2
53 | 51,Eutelsat 33B
54 | 52,Eutelsat 25C
55 | 53,COTS Demo Flight 1
56 | 54,COTS Demo Flight 2
57 | 55,Dragon C3
58 | 56,NEE-01 Pegaso
59 | 57,SES-7
60 | 58,Laplace-P
61 | 59,Intelsat 28
62 | 60,Ziyuan III-01
63 | 61,Venera 3MV-1 No.2
64 | 62,Cygnus Orb-D1
65 | 63,Ekspress AM4
66 | 64,IRNSS-1A
67 | 65,Dragon C4
68 | 66,Telstar 14
69 | 67,USA-242
70 | 68,Mars Orbiter Mission
71 | 69,AIDA (mission)
72 | 70,TDRS-11
73 | 71,Eutelsat 117 West A
74 |
--------------------------------------------------------------------------------
/src/autofj/join_function_space/join_function/tokenizer.py:
--------------------------------------------------------------------------------
1 | import ngram
2 | import pandas as pd
3 | import numpy as np
4 | import time
5 |
# Module-level NGram instance configured with N=3; it is created once at
# import time and reused by threeGram for every value.
three_gramer = ngram.NGram(N=3)
7 |
def splitBySpace(x):
    """Tokenize a string into its whitespace-separated words.

    Parameters
    ----------
    x: string
        Input text.

    Returns
    -------
    words: list
        The words of x; runs of whitespace act as a single separator.
    """
    words = x.split()
    return words
10 |
def threeGram(x):
    """Tokenize a string with the module-level 3-gram splitter.

    Parameters
    ----------
    x: string
        Input text.

    Returns
    -------
    grams: list
        The tokens produced by three_gramer.split on the normalized text.
    """
    # Collapse every run of whitespace into a single blank before splitting
    normalized = " ".join(x.split())
    grams = list(three_gramer.split(normalized))
    return grams
14 |
class Tokenizer:
    """Tokenize data with a named tokenization method.

    Parameters
    ----------
    method: string or None
        Tokenization method. The available methods are listed as follows.
        - splitBySpace
        - threeGram
        - None (no tokenization)
    """

    def __init__(self, method):
        self.method = method

        # Resolve the method name to the function applied to each value
        if method is None:
            chosen = None
        elif method == "splitBySpace":
            chosen = splitBySpace
        elif method == "threeGram":
            chosen = threeGram
        else:
            raise Exception("{} is an invalid tokenization method"
                            .format(method))
        self.func = chosen

    def tokenize(self, X):
        """Tokenize input data.

        Parameters
        ----------
        X: pd.Series
            Input data

        Returns
        -------
        X: pd.Series
            The result of applying the tokenization function to every value,
            or the input itself when no tokenization method is set.
        """
        if self.func is None:
            return X
        return X.apply(self.func)
49 |
50 | # data = pd.read_csv("../../data/left.csv")["title"]
51 | # X = np.concatenate([data.values for _ in range(20)])
52 | # X = pd.Series(X)
53 | #
54 | # tokenizer = Tokenizer("threeGram")
55 | # tic = time.time()
56 | # tokenizer.tokenize(X)
57 | # print(time.time() - tic)
--------------------------------------------------------------------------------
/src/autofj/benchmark/NaturalEvent/right.csv:
--------------------------------------------------------------------------------
1 | id,title
2 | 0,1988 Armenian earthquake
3 | 1,1935 Quetta earthquake
4 | 2,2006 Yogyakarta earthquake
5 | 3,2006 Pangandaran earthquake and tsunami
6 | 4,2006 Hengchun earthquakes
7 | 5,January 2001 El Salvador earthquake
8 | 6,1509 Constantinople earthquake
9 | 7,1959 Hebgen Lake earthquake
10 | 8,February 1998 Afghanistan earthquake
11 | 9,May 1998 Afghanistan earthquake
12 | 10,2004 Al Hoceima earthquake
13 | 11,1968 Dasht-e Bayaz and Ferdows earthquakes
14 | 12,2000 Enggano earthquake
15 | 13,1980 Oaxaca earthquake
16 | 14,1894 Tokyo earthquake
17 | 15,1953 Ionian earthquake
18 | 16,2009 Papua earthquakes
19 | 17,1929 Kopet Dag earthquake
20 | 18,2008 Qeshm earthquake
21 | 19,1755 Cape Ann earthquake
22 | 20,749 Galilee earthquake
23 | 21,2009 Samoa earthquake and tsunami
24 | 22,1896 Sanriku earthquake
25 | 23,1854 Nankai earthquake
26 | 24,1940 New Hampshire earthquakes
27 | 25,2010 Solomon Islands earthquake
28 | 26,1996 Duvall earthquake
29 | 27,1903 Manzikert earthquake
30 | 28,1653 East Smyrna earthquake
31 | 29,1688 Smyrna earthquake
32 | 30,1855 Edo earthquake
33 | 31,1927 Jericho earthquake
34 | 32,1909 Provence earthquake
35 | 33,869 Sanriku earthquake
36 | 34,2011 Myanmar earthquake
37 | 35,1911 Guerrero earthquake
38 | 36,1611 Sanriku earthquake
39 | 37,1932 Jalisco earthquakes
40 | 38,2012 Afghanistan earthquakes
41 | 39,2012 Yangzhou earthquake
42 | 40,2008 Bandar Abbas earthquake
43 | 41,March 2013 Nantou earthquake
44 | 42,1985 Santiago earthquake
45 | 43,1962 Bou'in-Zahra earthquake
46 | 44,Near East earthquake of 1759
47 | 45,2010 Kaohsiung earthquake
48 | 46,1995 Egypt earthquake
49 | 47,1914 Afyon-Bolvadin earthquake
50 | 48,Great Adelaide Earthquake
51 | 49,847 Antioch earthquake
52 | 50,2012 Indian Ocean earthquake
53 |
--------------------------------------------------------------------------------
/src/autofj/benchmark/Enzyme/right.csv:
--------------------------------------------------------------------------------
1 | id,title
2 | 0,Adenylyl cyclase
3 | 1,NADH:ubiquinone reductase (H+-translocating)
4 | 2,Ribonuclease H
5 | 3,Guanylyl cyclase
6 | 4,Bovine pancreatic ribonuclease
7 | 5,Gelatinase A
8 | 6,Carboxypeptidase U
9 | 7,3beta-Hydroxysteroid dehydrogenase
10 | 8,Aspartate kinase
11 | 9,Aralkylamine N-acetyltransferase
12 | 10,CTP synthase
13 | 11,Alpha-Amylase
14 | 12,Carboxypeptidase C
15 | 13,Diamine oxidase
16 | 14,Alpha-N-acetylgalactosaminidase
17 | 15,"1,3-beta-glucan synthase"
18 | 16,4-Hydroxybutyrate dehydrogenase
19 | 17,Sn-glycerol-1-phosphate dehydrogenase
20 | 18,3-Ketosteroid reductase
21 | 19,(iso)eugenol O-methyltransferase
22 | 20,(myelin basic protein)-arginine N-methyltransferase
23 | 21,(ribulose-bisphosphate carboxylase)-lysine N-methyltransferase
24 | 22,(formate-C-acetyltransferase)-activating enzyme
25 | 23,Sulfhydrogenase
26 | 24,4-hydroxyphenylacetate 3-monooxygenase
27 | 25,Desacetoxyvindoline 4-hydroxylase
28 | 26,L-2-hydroxyglutarate dehydrogenase
29 | 27,4-cresol dehydrogenase (hydroxylating)
30 | 28,(methionine synthase) reductase
31 | 29,"5,10-methenyltetrahydromethanopterin hydrogenase"
32 | 30,Chlorite dismutase
33 | 31,NAD(P)+ transhydrogenase (Re/Si-specific)
34 | 32,NAD(P)+ transhydrogenase (Si-specific)
35 | 33,Myosin-light-chain phosphatase
36 | 34,2-hydroxyacylsphingosine 1-beta-galactosyltransferase
37 | 35,(isocitrate dehydrogenase (NADP+)) kinase
38 | 36,TRNA cytidylyltransferase
39 | 37,4-hydroxy-3-methylbut-2-enyl diphosphate reductase
40 | 38,Cyanase
41 | 39,Formylglycine-generating enzyme
42 | 40,Nucleotide pyrophosphatase/phosphodiesterase
43 | 41,IgA specific serine endopeptidase
44 | 42,5-beta-reductase
45 | 43,ALG10 (enzyme class)
46 | 44,ALG8 (enzyme class)
47 | 45,ALG6 (enzyme class)
48 | 46,Methionine transaminase
49 | 47,4-Sulfomuconolactone hydrolase
50 |
--------------------------------------------------------------------------------
/src/autofj/benchmark/TennisTournament/gt.csv:
--------------------------------------------------------------------------------
1 | id_l,title_l,id_r,title_r
2 | 4,2000 Tennis Masters Cup,0,2000 Tennis Masters Cup and ATP Tour World Championships
3 | 25,2008 Bell Challenge,1,2008 Challenge Bell
4 | 64,2010 Queen's Club Championships,2,2010 Aegon Championships
5 | 70,2009 Bell Challenge,3,2009 Challenge Bell
6 | 73,1996 Bell Challenge,4,1996 Challenge Bell
7 | 75,1997 Bell Challenge,5,1997 Challenge Bell
8 | 83,1998 Bell Challenge,6,1998 Challenge Bell
9 | 84,2010 Challenger Banque Nationale,7,2010 Challenger Banque Nationale de Rimouski
10 | 115,2009 Carisap Tennis Cup,8,2009 San Benedetto Tennis Cup
11 | 151,2010 Bell Challenge,9,2010 Challenge Bell
12 | 165,2010 Internazionali di Tennis dell'Umbria,10,2010 Blu-express.com Tennis Cup
13 | 183,2011 Men's Rimouski Challenger,11,2011 Challenger Banque Nationale de Rimouski
14 | 187,2011 AEGON Championships,12,2011 Aegon Championships
15 | 188,2011 AEGON Classic,13,2011 Aegon Classic
16 | 200,2011 Bell Challenge,14,2011 Challenge Bell
17 | 202,2011 National Bank Challenger Saguenay,15,2011 Challenger Banque Nationale de Saguenay
18 | 219,2012 Qatar Airways Tournament of Champions,26,2012 Commonwealth Bank Tournament of Champions
19 | 224,2012 Men's Rimouski Challenger,16,2012 Challenger Banque Nationale de Rimouski
20 | 229,2012 AEGON Championships,17,2012 Aegon Championships
21 | 261,2012 Bell Challenge,18,2012 Challenge Bell
22 | 266,2012 Internazionali di Tennis dell'Umbria,19,2012 Blu-express.com Tennis Cup
23 | 267,2012 Hansol Korea Open,20,2012 Korea Open
24 | 269,2012 ATP Challenger Trophy,21,2012 Arimex Challenger Trophy
25 | 284,2012 National Bank Challenger Saguenay,22,2012 Challenger Banque Nationale de Saguenay
26 | 296,2012 AEGON Pro-Series Loughborough,23,2012 Aegon Pro-Series Loughborough
27 | 308,2013 Qatar Airways Tournament of Champions,24,2013 Garanti Koza WTA Tournament of Champions
28 | 319,2013 Challenger Banque Nationale,25,2013 Challenger Banque Nationale de Rimouski
29 |
--------------------------------------------------------------------------------
/src/autofj/join_function_space/options.py:
--------------------------------------------------------------------------------
1 | """Options of join functions"""
2 |
# Largest of the three option sets defined in this module: every listed
# preprocessing, tokenization, weighting and distance option is included.
autofj_lg = dict(
    preprocess_methods=[
        "lower",
        "lowerStem",
        "lowerRemovePunctuation",
        "lowerRemovePunctuationStem",
    ],
    tokenize_methods=["threeGram", "splitBySpace"],
    token_weights=["uniformWeight", "idfWeight"],
    char_distance_functions=["editDistance", "jaroDistance"],
    set_distance_functions=[
        "containJaccardDistance",
        "containCosineDistance",
        "containDiceDistance",
        "intersectDistance",
        "jaccardDistance",
        "cosineDistance",
        "diceDistance",
        "maxincDistance",
    ],
)
18 |
# Medium option set: two preprocessing methods combined with every listed
# tokenization, weighting and distance option.
autofj_md = dict(
    preprocess_methods=["lower", "lowerRemovePunctuationStem"],
    tokenize_methods=["threeGram", "splitBySpace"],
    token_weights=["uniformWeight", "idfWeight"],
    char_distance_functions=["editDistance", "jaroDistance"],
    set_distance_functions=[
        "containJaccardDistance",
        "containCosineDistance",
        "containDiceDistance",
        "intersectDistance",
        "jaccardDistance",
        "cosineDistance",
        "diceDistance",
        "maxincDistance",
    ],
)
33 |
# Smallest option set: a single token weight and character distance, and
# three set distance functions.
autofj_sm = dict(
    preprocess_methods=["lower", "lowerRemovePunctuationStem"],
    tokenize_methods=["threeGram", "splitBySpace"],
    token_weights=["idfWeight"],
    char_distance_functions=["jaroDistance"],
    set_distance_functions=[
        "containCosineDistance",
        "jaccardDistance",
        "maxincDistance",
    ],
)
--------------------------------------------------------------------------------
/src/autofj/benchmark/RugbyLeague/right.csv:
--------------------------------------------------------------------------------
1 | id,title
2 | 0,World Rugby Sevens Series
3 | 1,World Club Series
4 | 2,Mitre 10 Cup
5 | 3,Pro12
6 | 4,League Cup (rugby league)
7 | 5,Top League Champions Cup
8 | 6,World Rugby Pacific Nations Cup
9 | 7,Intrust Super Premiership NSW
10 | 8,World Rugby Pacific Challenge
11 | 9,Tom Richards Cup
12 | 10,World Rugby Nations Cup
13 | 11,France Sevens
14 | 12,Charity Shield (NRL)
15 | 13,Caledonia Regional League
16 | 14,Asia Rugby Women's Championship
17 | 15,List of Rugby World Cup finals
18 | 16,Scottish Premiership (rugby)
19 | 17,Asia Rugby Championship
20 | 18,African Development Trophy
21 | 19,Rugby League European Championship B
22 | 20,World Rugby Under 20 Championship
23 | 21,World Rugby Under 20 Trophy
24 | 22,Sri Lanka Sevens
25 | 23,ANZAC Day Cup
26 | 24,Women's Nations Cup (rugby union)
27 | 25,Rugby League European Championship C
28 | 26,Oceania Rugby Cup
29 | 27,GIO Schoolboy Cup
30 | 28,South Premier
31 | 29,National Women's Rugby Championship
32 | 30,2. Rugby-Bundesliga
33 | 31,All Stars match
34 | 32,Asian Women's Sevens Championship
35 | 33,African Women's Sevens Championship
36 | 34,North America and Caribbean Women's Sevens Championship
37 | 35,Oceania Women's Sevens Championship
38 | 36,Women's rugby sevens in South America
39 | 37,Ekstraliga (rugby)
40 | 38,North East Rugby League Premier Division
41 | 39,Division 1-A Rugby
42 | 40,SARU Gold Cup
43 | 41,World Rugby Women's Sevens Series
44 | 42,National Championship of Excellence (Italian premiership)
45 | 43,All Japan University Rugby Championship
46 | 44,Colonial Cup
47 | 45,Pacific Nations Cup
48 | 46,Bundaberg Red Cup
49 | 47,"London, South and East Merit League"
50 | 48,Tonga National Rugby League
51 | 49,Rugby League Conference South Premier
52 | 50,St. Patrick's Day Test
53 | 51,Rugby League Conference North East Division
54 | 52,Rugby League Conference London & South Division
55 | 53,International Origin Match
56 | 54,The Ron Coote Cup
57 | 55,South American Women's Sevens
58 | 56,Asian Rugby Championship
59 | 57,Sevens World Series
60 |
--------------------------------------------------------------------------------
/src/autofj/join_function_space/join_function/join_function.py:
--------------------------------------------------------------------------------
class JoinFunction(object):
    """Template for customized join functions.

    A customized join function must have a unique name attribute and a
    method named compute_distance with the signature shown below. The
    constructor can be overridden.
    """

    def __init__(self):
        self.name = "jf_example"

    def compute_distance(self, left, right, LL_blocked, LR_blocked,
                         cache_dir=None):
        """Compute the distance of each tuple pair in the LL and LR blocked
        tables.

        Parameters
        ----------
        left: pd.DataFrame
            A subset of the left table that contains the id column and the
            column to be processed. The id column is named autofj_id and the
            column to be processed is named value.

        right: pd.DataFrame
            A subset of the right table that contains the id column and the
            column to be processed. The id column is named autofj_id and the
            column to be processed is named value.

        LL_blocked: pd.DataFrame
            The LL blocked table that consists of the id columns and the
            columns to be processed. The id columns are named autofj_id_l
            and autofj_id_r. The columns to be processed are named value_l
            and value_r.

        LR_blocked: pd.DataFrame
            The LR blocked table that consists of the id columns and the
            columns to be processed. The id columns are named autofj_id_l
            and autofj_id_r. The columns to be processed are named value_l
            and value_r.

        cache_dir: optional
            Not used by this template.

        Returns
        -------
        LL_distance: pd.Series
            Distance of each tuple pair in the LL blocked table.

        LR_distance: pd.Series
            Distance of each tuple pair in the LR blocked table.

        This template computes nothing and returns (None, None).
        """
        return None, None
47 |
--------------------------------------------------------------------------------
/src/autofj/benchmark/SoccerClubSeason/right.csv:
--------------------------------------------------------------------------------
1 | id,title
2 | 0,2006 Australia national soccer team season
3 | 1,2007 Australia national soccer team season
4 | 2,2008 Australia national soccer team season
5 | 3,2005 Australia national soccer team season
6 | 4,2009 Australia national soccer team season
7 | 5,2010 Australia national soccer team season
8 | 6,2003 Cienciano season
9 | 7,2009 Helsingin Jalkapalloklubi season
10 | 8,2010 Jeonbuk Hyundai Motors FC season
11 | 9,2010 Down football season
12 | 10,2011 Down football season
13 | 11,2011 Daejeon Citizen FC season
14 | 12,2011 Australia national soccer team season
15 | 13,2011 Orlando City SC season
16 | 14,2011 Jeonbuk Hyundai Motors FC season
17 | 15,2011 Jeonnam Dragons season
18 | 16,2004 Australia national soccer team season
19 | 17,2011 Incheon United FC season
20 | 18,2003 Australia national soccer team season
21 | 19,2002 Australia national soccer team season
22 | 20,1999 Australia national soccer team season
23 | 21,1998 Australia national soccer team season
24 | 22,2001 Australia national soccer team season
25 | 23,2000 Australia national soccer team season
26 | 24,2012 Down football season
27 | 25,1997 Australia national soccer team season
28 | 26,1996 Australia national soccer team season
29 | 27,2012 Kuala Lumpur FA season
30 | 28,2012 Daejeon Citizen FC season
31 | 29,2012 Orlando City SC season
32 | 30,2012 Australia national soccer team season
33 | 31,2012 Jeonbuk Hyundai Motors FC season
34 | 32,2012 Jeonnam Dragons season
35 | 33,2012 Incheon United FC season
36 | 34,2012 Jeju United FC season
37 | 35,2012 Woodlands Wellington FC season
38 | 36,2011 Woodlands Wellington FC season
39 | 37,2010 Woodlands Wellington FC season
40 | 38,2013 Down football season
41 | 39,2013 Orlando City SC season
42 | 40,2013 Woodlands Wellington FC season
43 | 41,2013 Negeri Sembilan FA season
44 | 42,2009 Down football season
45 | 43,2013 Australia national soccer team season
46 | 44,2013 Carolina RailHawks season
47 | 45,2013 Incheon United FC season
48 | 46,Derry football season 2008
49 | 47,Derry football season 2009
50 | 48,Derry football season 2010
51 | 49,2010 Down GAA Senior Football
52 | 50,2011 Down GAA Senior Football
53 |
--------------------------------------------------------------------------------
/src/autofj/benchmark/FootballMatch/right.csv:
--------------------------------------------------------------------------------
1 | id,title
2 | 0,Battle of Santiago (1962 FIFA World Cup)
3 | 1,Battle of Berne (1954 FIFA World Cup)
4 | 2,2005 international rules series
5 | 3,Austria v Switzerland (1954 FIFA World Cup)
6 | 4,2001 Germany v England football match
7 | 5,Shamrock Rovers XI v Brazil
8 | 6,2006 International Rules series
9 | 7,2006 international rules series
10 | 8,Poland v Brazil (1938 FIFA World Cup)
11 | 9,2001 international rules series
12 | 10,1998 international rules series
13 | 11,Austria v West Germany (1978 FIFA World Cup)
14 | 12,2000 England v Germany football match
15 | 13,Battle of Nuremberg (2006 FIFA World Cup)
16 | 14,2008 Conference Premier play-off Final
17 | 15,2008 international rules series
18 | 16,1968 DFB-Pokal Final
19 | 17,1988 DFB-Pokal Final
20 | 18,1999 international rules series
21 | 19,1989 MISL All-Star Game
22 | 20,Argentina 2–1 England (1986 FIFA World Cup)
23 | 21,2009 African Championship of Nations Final
24 | 22,Hungary v El Salvador (1982 FIFA World Cup)
25 | 23,Hungary 10–1 El Salvador (1982)
26 | 24,2009 Conference Premier play-off Final
27 | 25,2009 WPS All-Star Game
28 | 26,1871 Scotland versus England rugby union match
29 | 27,1870–71 Home Nations rugby union matches
30 | 28,2009 Republic of Ireland v France football matches
31 | 29,France 1–1 Ireland (18 November 2009)
32 | 30,1985 China v Hong Kong football match
33 | 31,1876 Scotland v Wales football match
34 | 32,2010 international rules series
35 | 33,1993 Dutch Supercup
36 | 34,2010 Conference Premier play-off Final
37 | 35,2010 WPS All-Star Game
38 | 36,West Germany v France (1982 FIFA World Cup)
39 | 37,2002 international rules series
40 | 38,2010–11 W-League Grand Final
41 | 39,2006 Copa Indonesia final
42 | 40,Brazil v Italy (1982 FIFA World Cup)
43 | 41,1992 Dutch Supercup
44 | 42,1991 Dutch Supercup
45 | 43,2011 Conference Premier play-off Final
46 | 44,2011 All-Ireland Minor Hurling Championship
47 | 45,2011 international rules series
48 | 46,2011 J.League Cup Final
49 | 47,2013 Kenyan Super Cup (pre-season)
50 | 48,2012 Conference Premier play-off Final
51 | 49,2011 UEFA European Under-21 Championship Final
52 | 50,2012 Albanian Supercup
53 | 51,1985 Wales v Scotland football match
54 | 52,Soccer Bowl 2013
55 |
--------------------------------------------------------------------------------
/src/autofj/join_function_space/join_function/preprocessor.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from nltk.stem.porter import PorterStemmer
3 | from nltk.stem import SnowballStemmer
4 | import time
5 | import pandas as pd
6 | import re
7 |
# Module-level Porter stemmer instance shared by stem(), which calls ps.stem
# on every whitespace-separated word.
ps = PorterStemmer()
# ps = SnowballStemmer("english")
10 |
def lower(x):
    """Return the lowercase string form of ``x``.

    ``x`` is converted with ``str`` first, so non-string values are accepted.
    """
    text = str(x)
    return text.lower()
13 |
def removePunctuation(x):
    """Return ``x`` with every character removed that is neither a word
    character (``\\w``) nor whitespace (``\\s``)."""
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', x)
16 |
def stem(x):
    """Stem each whitespace-separated word of ``x`` with the module-level
    stemmer ``ps`` and join the results with single spaces."""
    stemmed_words = [ps.stem(word) for word in x.split()]
    return " ".join(stemmed_words)
19 |
def lowerStem(x):
    """Lowercase ``x`` and then stem every word of the result."""
    return stem(lower(x))
24 |
def lowerRemovePunctuation(x):
    """Lowercase ``x`` and then remove punctuation from the result."""
    return removePunctuation(lower(x))
29 |
def lowerRemovePunctuationStem(x):
    """Lowercase ``x``, remove punctuation and stem, in that order."""
    # Apply the three steps as a pipeline; each step consumes the output of
    # the previous one.
    for step in (lower, removePunctuation, stem):
        x = step(x)
    return x
35 |
class Preprocessor:
    """Preprocess data

    Parameters
    ----------
    method: string
        Preprocessing method. The available methods are listed as follows.
        - lower: lowercase
        - lowerStem: lowercase and stem
        - lowerRemovePunctuation: lowercase and remove punctuation
        - lowerRemovePunctuationStem: lowercase, remove punctuation and stem

    Raises
    ------
    ValueError
        If ``method`` is not one of the available methods.
    """

    # Names of the supported methods; each is also the name of the
    # module-level function that implements it.
    _METHODS = (
        "lower",
        "lowerStem",
        "lowerRemovePunctuation",
        "lowerRemovePunctuationStem",
    )

    def __init__(self, method):
        # Validate first (equality-based membership, so unhashable values are
        # rejected with the same message rather than a TypeError).
        if method not in self._METHODS:
            # ValueError is a subclass of Exception, so callers that caught
            # the previous bare Exception keep working.
            raise ValueError("{} is an invalid preprocessing method"
                             .format(method))

        self.method = method
        dispatch = {
            "lower": lower,
            "lowerStem": lowerStem,
            "lowerRemovePunctuation": lowerRemovePunctuation,
            "lowerRemovePunctuationStem": lowerRemovePunctuationStem,
        }
        self.func = dispatch[method]

    def preprocess(self, X):
        """Preprocess the given data

        Parameters
        ----------
        X: pd.Series
            Input data

        Returns
        -------
        The result of applying the selected preprocessing function to every
        element of ``X`` via ``X.apply``.
        """
        return X.apply(self.func)
71 |
72 | # data = pd.read_csv("../../data/left.csv")["title"]
73 | # X = np.concatenate([data.values for _ in range(20)])
74 | # X = pd.Series(X)
75 |
76 | # pre1 = Preprocessor("lowerRemovePunctuationStem")
77 | # tic = time.time()
78 | # pre1.preprocess(X)
79 | # print(time.time() - tic)
80 |
81 | # pre2 = OldPreprocess(X)
82 | # tic = time.time()
83 | # pre2.process(("lower", "remove_punctuation", "stem"))
84 | # print(time.time() - tic)
85 |
86 |
--------------------------------------------------------------------------------
/src/autofj/50-single-column-datasets.md:
--------------------------------------------------------------------------------
1 | |Dataset |Left|Right|Matches|
2 | |----------------------------|----|-----|-------|
3 | |Amphibian |3663|1161 |1161 |
4 | |ArtificialSatellite |1801|72 |72 |
5 | |Artwork |3112|245 |245 |
6 | |Award |3380|384 |384 |
7 | |BasketballTeam |928 |166 |166 |
8 | |Case |2474|380 |380 |
9 | |ChristianBishop |5363|494 |494 |
10 | |ClericalAdministrativeRegion|2547|190 |190 |
11 | |Country |2791|291 |291 |
12 | |Device |6933|658 |658 |
13 | |Drug |5356|157 |157 |
14 | |Election |6565|727 |727 |
15 | |Enzyme |3917|48 |48 |
16 | |EthnicGroup |4317|946 |946 |
17 | |FootballLeagueSeason |4457|280 |280 |
18 | |FootballMatch |1999|53 |53 |
19 | |Galaxy |555 |17 |17 |
20 | |GivenName |3021|154 |154 |
21 | |GovernmentAgency |3977|571 |571 |
22 | |HistoricBuilding |5064|512 |512 |
23 | |Hospital |2424|257 |257 |
24 | |Legislature |1314|216 |216 |
25 | |Magazine |4005|274 |274 |
26 | |MemberOfParliament |5774|503 |503 |
27 | |Monarch |2033|242 |242 |
28 | |MotorsportSeason |1465|388 |388 |
29 | |Museum |3982|305 |305 |
30 | |NCAATeamSeason |5619|34 |34 |
31 | |NationalFootballLeagueSeason|3003|10 |10 |
32 | |NaturalEvent |970 |51 |51 |
33 | |Noble |3609|364 |364 |
34 | |PoliticalParty |5254|495 |495 |
35 | |Race |2382|175 |175 |
36 | |RailwayLine |2189|298 |298 |
37 | |Reptile |666 |819 |562 |
38 | |RugbyLeague |418 |58 |58 |
39 | |ShoppingMall |201 |227 |159 |
40 | |SoccerClubSeason |1197|51 |51 |
41 | |SoccerLeague |1315|238 |238 |
42 | |SoccerTournament |2714|290 |290 |
43 | |Song |5726|440 |440 |
44 | |SportFacility |6392|672 |672 |
45 | |SportsLeague |3106|481 |481 |
46 | |Stadium |5105|619 |619 |
47 | |TelevisionStation |6752|1152 |1152 |
48 | |TennisTournament |324 |27 |27 |
49 | |Tournament |4858|459 |459 |
50 | |UnitOfWork |2483|380 |380 |
51 | |Venue |4079|384 |384 |
52 | |Wrestler |3150|464 |464 |
--------------------------------------------------------------------------------
/src/autofj/join_function_space/join_function/token_weight.py:
--------------------------------------------------------------------------------
1 | import collections
2 | import math
3 | import pandas as pd
4 | import time
5 | import numpy as np
6 |
class defaultdict(dict):
    """Dictionary that falls back to a configurable default value.

    Unlike ``collections.defaultdict`` the default is a plain value (not a
    factory) and must be set with ``set_default_value`` before a missing key
    is looked up.
    """

    def set_default_value(self, default_value):
        """Set the value used for keys that are not in the dictionary."""
        self.default_value = default_value

    def __missing__(self, key):
        """Store the default value under ``key`` and return it."""
        return self.setdefault(key, self.default_value)
14 |
def uniformWeight(document):
    """Uniform weight: every token gets weight 1.

    ``document`` is accepted for signature parity with the other weighting
    functions but is not read.
    """
    uniform = defaultdict()
    uniform.set_default_value(1)
    return uniform
20 |
def idfWeight(document):
    """Compute idf weight for tokens

    Parameters:
    -----------
    document: list of sets
        A list of token sets, which is the document on which the idf is
        computed.

    Return:
    -------
    weight: dict
        idf weight of tokens. A token that occurs in ``df`` rows gets
        ``log(len(document) / (df + 1))``; a token not seen in the document
        gets ``log(len(document))`` on lookup.
    """
    # Map each token to the set of row positions that contain it, so a token
    # is counted once per row.
    rows_with_token = {}
    for row_idx, tokens in enumerate(document):
        for token in tokens:
            rows_with_token.setdefault(token, set()).add(row_idx)

    n_rows = len(document)

    # Unseen tokens behave like df == 0.
    weight = defaultdict()
    weight.set_default_value(math.log(n_rows))
    weight.update(
        (token, math.log(n_rows / (len(rows) + 1)))
        for token, rows in rows_with_token.items()
    )
    return weight
48 |
class TokenWeight(object):
    """Token weight

    Parameters
    ----------
    method: string
        Token weighting schema. The available methods are listed as follows.
        - uniformWeight
        - idfWeight
        - None (no weights)

    Raises
    ------
    ValueError
        If ``method`` is neither None nor one of the available schemas.
    """

    # Names of the supported schemas; each is also the name of the
    # module-level function that implements it.
    _METHODS = ("uniformWeight", "idfWeight")

    def __init__(self, method):
        self.method = method

        # None means "no weighting": weight() will return None.
        if method is None:
            self.func = None
            return

        if method not in self._METHODS:
            # ValueError is a subclass of Exception, so callers that caught
            # the previous bare Exception keep working.
            raise ValueError("{} is an invalid weighting schema"
                             .format(method))

        dispatch = {
            "uniformWeight": uniformWeight,
            "idfWeight": idfWeight,
        }
        self.func = dispatch[method]

    def weight(self, X):
        """Weight tokens

        Parameters
        ----------
        X: pd.Series
            Input data

        Return
        ------
        weight: dict
            weight of tokens, or None when no weighting schema was selected
        """
        if self.func is None:
            return None
        return self.func(X)
90 | #
91 | # data = pd.read_csv("../../data/left.csv")["title"]
92 | # X = np.concatenate([data.values for _ in range(20)])
93 | # X = pd.Series(X)
94 | #
95 | # weight = TokenWeight("idfWeight")
96 | # tic = time.time()
97 | # weight.weight(X)
98 | # print(time.time() - tic)
99 |
--------------------------------------------------------------------------------
/src/autofj/benchmark/ArtificialSatellite/gt.csv:
--------------------------------------------------------------------------------
1 | id_l,title_l,id_r,title_r
2 | 119,Soyuz 18a,0,Soyuz 7K-T No.39
3 | 200,Pad Abort Test-1 (Apollo),1,Pad Abort Test 1
4 | 201,A-101 (spacecraft),2,AS-101 (spacecraft)
5 | 202,A-102 (spacecraft),3,AS-102 (spacecraft)
6 | 219,Spirit (rover),4,Spirit rover
7 | 220,Opportunity (rover),5,Opportunity rover
8 | 302,UoSAT-2,6,UO-11
9 | 374,A-104 (spacecraft),7,AS-104 (spacecraft)
10 | 375,A-103 (spacecraft),8,AS-103 (spacecraft)
11 | 376,A-105 (spacecraft),9,AS-105 (spacecraft)
12 | 378,Pad Abort Test-2 (Apollo),10,Pad Abort Test 2
13 | 410,Soyuz T-10-1,11,Soyuz 7K-ST No. 16L
14 | 450,SpaceShipOne flight 17P,12,SpaceShipOne Flight 17P
15 | 527,NuSTAR,13,Nuclear Spectroscopic Telescope Array
16 | 530,Eutelsat 28A,14,Eutelsat 33C
17 | 542,SEASAT,15,Seasat
18 | 544,Foton-M2,16,Foton-M No.2
19 | 584,CHAMP,17,CHAMP (satellite)
20 | 607,Bion 1,18,Kosmos 605
21 | 608,Bion 7,19,Kosmos 1667
22 | 619,CUTE-1.7,20,CUTE-1.7 + APD
23 | 622,Eutelsat 33A,21,Eutelsat 31A
24 | 640,Nilesat 103,22,Eutelsat 16B
25 | 669,Geotail,23,GEOTAIL
26 | 688,Fengyun 2-05,24,Fengyun 2D
27 | 716,Active Cavity Radiometer Irradiance Monitor Satellite,25,ACRIMSAT
28 | 733,Resurs-DK1,26,Resurs-DK No.1
29 | 759,HYLAS-1,27,HYLAS
30 | 783,Landsat Data Continuity Mission,28,Landsat 8
31 | 786,Proba-2,29,PROBA2
32 | 796,P78-1,30,Solwind
33 | 798,UoSat-1,31,UoSAT-1
34 | 798,UoSat-1,32,UoSat-OSCAR 9
35 | 804,A Broadband Imaging X-ray All-sky Survey,33,ABRIXAS
36 | 831,Agila 2,34,ABS-3
37 | 832,Sentinel 3,35,Sentinel-3
38 | 840,Soil Moisture and Ocean Salinity satellite,36,Soil Moisture and Ocean Salinity
39 | 854,MightySat-2,37,MightySat-2.1
40 | 886,Aditya (spacecraft),38,Aditya (satellite)
41 | 898,Eutelsat 28B,39,Eutelsat 48D
42 | 898,Eutelsat 28B,40,Eutelsat 48B
43 | 898,Eutelsat 28B,41,Afghansat 1
44 | 899,Hot Bird 9,42,Hot Bird 13C
45 | 911,AMC 1,43,AMC-1
46 | 924,Prisma,44,Prisma (satellite project)
47 | 953,Eurobird 4A,45,Eutelsat 4A
48 | 959,CP-6,46,CP6 (satellite)
49 | 1089,Satmex 6,47,Eutelsat 113 West A
50 | 1093,Orion 3 (satellite),48,Orion 3
51 | 1219,Azerspace-1/Africasat-1a,49,Azerspace
52 | 1274,LightSail-1,50,LightSail 2
53 | 1277,Eutelsat 70A,51,Eutelsat 33B
54 | 1277,Eutelsat 70A,52,Eutelsat 25C
55 | 1337,SpaceX COTS Demo Flight 1,53,COTS Demo Flight 1
56 | 1370,Dragon C2+,54,COTS Demo Flight 2
57 | 1371,SpaceX CRS-1,55,Dragon C3
58 | 1415,NEE-01 Pegasus,56,NEE-01 Pegaso
59 | 1421,SES7,57,SES-7
60 | 1423,Europa Lander,58,Laplace-P
61 | 1460,New Dawn (satellite),59,Intelsat 28
62 | 1470,Ziyuan 3,60,Ziyuan III-01
63 | 1472,Zond 3MV-1 No.2,61,Venera 3MV-1 No.2
64 | 1490,Cygnus 1,62,Cygnus Orb-D1
65 | 1491,Ekspress-AM4,63,Ekspress AM4
66 | 1495,IRNSS-1,64,IRNSS-1A
67 | 1575,SpaceX CRS-2,65,Dragon C4
68 | 1577,Estrela do Sul 1,66,Telstar 14
69 | 1646,GPS IIF-4,67,USA-242
70 | 1699,Mangalyaan,68,Mars Orbiter Mission
71 | 1703,AIDA (spacecraft),69,AIDA (mission)
72 | 1774,TDRS-K,70,TDRS-11
73 | 1798,Satmex 8,71,Eutelsat 117 West A
74 |
--------------------------------------------------------------------------------
/src/autofj/benchmark/NCAATeamSeason/gt.csv:
--------------------------------------------------------------------------------
1 | id_l,title_l,id_r,title_r
2 | 24,1893 LSU Tigers football team,0,1893 LSU football team
3 | 25,1894 LSU Tigers football team,1,1894 LSU football team
4 | 26,1895 LSU Tigers football team,2,1895 LSU football team
5 | 380,2002 Florida Atlantic Blue Wave baseball team,25,2002 Florida Atlantic Owls baseball team
6 | 450,1892 Alabama Crimson Tide football team,3,1892 Alabama Cadets football team
7 | 451,1894 Alabama Crimson Tide football team,4,1894 Alabama Crimson White football team
8 | 452,1893 Alabama Crimson Tide football team,5,1893 Alabama Crimson White football team
9 | 453,1895 Alabama Crimson Tide football team,6,1895 Alabama Crimson White football team
10 | 454,1898 Alabama Crimson Tide football team,7,1898 Alabama Crimson White football team
11 | 455,1897 Alabama Crimson Tide football team,8,1897 Alabama Crimson White football team
12 | 456,1896 Alabama Crimson Tide football team,9,1896 Alabama Crimson White football team
13 | 457,1899 Alabama Crimson Tide football team,10,1899 Alabama Crimson White football team
14 | 458,1900 Alabama Crimson Tide football team,11,1900 Alabama Crimson White football team
15 | 459,1901 Alabama Crimson Tide football team,12,1901 Alabama Crimson White football team
16 | 460,1902 Alabama Crimson Tide football team,13,1902 Alabama Crimson White football team
17 | 461,1903 Alabama Crimson Tide football team,14,1903 Alabama Crimson White football team
18 | 462,1904 Alabama Crimson Tide football team,15,1904 Alabama Crimson White football team
19 | 463,1906 Alabama Crimson Tide football team,16,1906 Alabama Crimson White football team
20 | 464,1905 Alabama Crimson Tide football team,17,1905 Alabama Crimson White football team
21 | 482,1974 Oregon Webfoots football team,26,1974 Oregon Ducks football team
22 | 496,1964 Oregon Webfoots football team,27,1964 Oregon Ducks football team
23 | 964,1906 Arkansas Razorbacks football team,18,1906 Arkansas Cardinals football team
24 | 1712,1963 Oregon Webfoots football team,28,1963 Oregon Ducks football team
25 | 2880,2010 FIU Panthers football team,29,2010 FIU Golden Panthers football team
26 | 2978,1975 Oregon Webfoots football team,30,1975 Oregon Ducks football team
27 | 3227,2011 FIU Panthers football team,31,2011 FIU Golden Panthers football team
28 | 3335,2011 Austin Peay State Governors football team,19,2011 Austin Peay Governors football team
29 | 3389,1907 Notre Dame Fighting Irish football team,20,1907 Notre Dame football team
30 | 3843,1979 UCF Knights football team,32,1979 UCF Golden Knights football team
31 | 4222,2012 FIU Panthers football team,33,2012 FIU Golden Panthers football team
32 | 4301,2012 Austin Peay State Governors football team,21,2012 Austin Peay Governors football team
33 | 5402,1919 Washington football team,22,1919 Washington Sun Dodgers football team
34 | 5480,2013 Hawaii Warriors football team,23,2013 Hawaii Rainbow Warriors football team
35 | 5496,1887 Notre Dame Fighting Irish football team,24,1887 Notre Dame football team
36 |
--------------------------------------------------------------------------------
/src/autofj/benchmark/NaturalEvent/gt.csv:
--------------------------------------------------------------------------------
1 | id_l,title_l,id_r,title_r
2 | 21,1988 Spitak earthquake,0,1988 Armenian earthquake
3 | 44,1935 Balochistan earthquake,1,1935 Quetta earthquake
4 | 54,May 2006 Java earthquake,2,2006 Yogyakarta earthquake
5 | 59,July 2006 Java earthquake,3,2006 Pangandaran earthquake and tsunami
6 | 79,2006 Hengchun earthquake,4,2006 Hengchun earthquakes
7 | 91,2001 El Salvador earthquakes,5,January 2001 El Salvador earthquake
8 | 103,1509 Istanbul earthquake,6,1509 Constantinople earthquake
9 | 108,1959 Yellowstone earthquake,7,1959 Hebgen Lake earthquake
10 | 116,1985 Algarrobo earthquake,42,1985 Santiago earthquake
11 | 134,"February 4, 1998 Afghanistan earthquake",8,February 1998 Afghanistan earthquake
12 | 135,"May 30, 1998 Afghanistan earthquake",9,May 1998 Afghanistan earthquake
13 | 142,2004 Morocco earthquake,10,2004 Al Hoceima earthquake
14 | 153,1968 Dasht-e Bayaz and Ferdows earthquake,11,1968 Dasht-e Bayaz and Ferdows earthquakes
15 | 173,2000 Sumatra earthquake,12,2000 Enggano earthquake
16 | 194,1980 Central Mexico earthquake,13,1980 Oaxaca earthquake
17 | 197,1894 Meiji Tokyo earthquake,14,1894 Tokyo earthquake
18 | 207,1953 Ionian Earthquake,15,1953 Ionian earthquake
19 | 215,2009 Papua earthquake,16,2009 Papua earthquakes
20 | 219,1929 Koppeh Dagh earthquake,17,1929 Kopet Dag earthquake
21 | 225,1962 Buin Zahra earthquake,43,1962 Bou'in-Zahra earthquake
22 | 240,2008 Bandar Abbas earthquake,18,2008 Qeshm earthquake
23 | 250,Near East earthquakes of 1759,44,Near East earthquake of 1759
24 | 256,1755 Cape Ann Earthquake,19,1755 Cape Ann earthquake
25 | 282,The Seventh Earthquake,20,749 Galilee earthquake
26 | 284,2009 Samoa earthquake,21,2009 Samoa earthquake and tsunami
27 | 294,1896 Meiji-Sanriku earthquake,22,1896 Sanriku earthquake
28 | 368,1854 Ansei-Nankai earthquake,23,1854 Nankai earthquake
29 | 539,1940 New Hampshire earthquake,24,1940 New Hampshire earthquakes
30 | 677,January 2010 Solomon Islands earthquake,25,2010 Solomon Islands earthquake
31 | 699,Duvall earthquake,26,1996 Duvall earthquake
32 | 705,2010 Kaohsiung earthquakes,45,2010 Kaohsiung earthquake
33 | 719,1903 Malazgirt earthquake,27,1903 Manzikert earthquake
34 | 735,1995 Gulf of Aqaba earthquake,46,1995 Egypt earthquake
35 | 748,1914 Burdur earthquake,47,1914 Afyon-Bolvadin earthquake
36 | 750,1653 East Symirna earthquake,28,1653 East Smyrna earthquake
37 | 751,1688 Izmir earthquake,29,1688 Smyrna earthquake
38 | 759,1855 Ansei Edo earthquake,30,1855 Edo earthquake
39 | 778,1927 earthquake in Palestine,31,1927 Jericho earthquake
40 | 810,1909 Lambesc earthquake,32,1909 Provence earthquake
41 | 829,869 Jogan Sanriku earthquake,33,869 Sanriku earthquake
42 | 832,2011 Burma earthquake,34,2011 Myanmar earthquake
43 | 844,1954 Adelaide earthquake,48,Great Adelaide Earthquake
44 | 867,December 1911 Guerrero earthquake,35,1911 Guerrero earthquake
45 | 887,1611 Keicho Sanriku earthquake,36,1611 Sanriku earthquake
46 | 894,1932 Jalisco earthquake,37,1932 Jalisco earthquakes
47 | 899,847 Damascus earthquake,49,847 Antioch earthquake
48 | 915,2012 Indian Ocean earthquakes,50,2012 Indian Ocean earthquake
49 | 924,June 2012 Afghanistan earthquakes,38,2012 Afghanistan earthquakes
50 | 932,2012 Yangzhou Earthquake,39,2012 Yangzhou earthquake
51 | 962,2008 Qeshm earthquake,40,2008 Bandar Abbas earthquake
52 | 969,2013 Nantou earthquake,41,March 2013 Nantou earthquake
53 |
--------------------------------------------------------------------------------
/src/autofj/benchmark/GivenName/right.csv:
--------------------------------------------------------------------------------
1 | id,title
2 | 0,Luke (name)
3 | 1,Julia
4 | 2,Fatima (name)
5 | 3,Fanny (name)
6 | 4,Lyfing
7 | 5,Adele
8 | 6,Jeffrey (name)
9 | 7,Kathryn
10 | 8,Cadwallader
11 | 9,Gita (given name)
12 | 10,Banu
13 | 11,Casper
14 | 12,Hana (name)
15 | 13,Abd al-Aziz
16 | 14,Guillaume
17 | 15,Dustin (name)
18 | 16,Shahrokh (mythical bird)
19 | 17,William (given name)
20 | 18,Arif (given name)
21 | 19,Corinne
22 | 20,Haruna (given name)
23 | 21,Brianna
24 | 22,Takeru (name)
25 | 23,Yasmin (given name)
26 | 24,Calvin (name)
27 | 25,Alexis
28 | 26,Hjalmar (disambiguation)
29 | 27,Rei (given name)
30 | 28,Asad (name)
31 | 29,Mervin
32 | 30,Margaret
33 | 31,Aurora (given name)
34 | 32,Michelle (name)
35 | 33,Jahsh
36 | 34,Mark (name)
37 | 35,Sujata (name)
38 | 36,Anupama (given name)
39 | 37,Paulina (given name)
40 | 38,Padraic
41 | 39,Bojan
42 | 40,Chetan (name)
43 | 41,Juanfran (disambiguation)
44 | 42,Gun (Swedish name)
45 | 43,Dalia (given name)
46 | 44,Travis
47 | 45,Merwin
48 | 46,Daniel
49 | 47,Arun (given name)
50 | 48,Michele
51 | 49,Zakiah
52 | 50,Parvati (given name)
53 | 51,Medad
54 | 52,Joseph
55 | 53,Joseph (name)
56 | 54,Aida (name)
57 | 55,Edwina
58 | 56,Zvonimir
59 | 57,Annetta (given name)
60 | 58,Asa (name)
61 | 59,Hannu (disambiguation)
62 | 60,Hayley (given name)
63 | 61,Bauyrzhan
64 | 62,Abd al-Rahman
65 | 63,Bram (given name)
66 | 64,Hannah (name)
67 | 65,Katherine
68 | 66,Hideyoshi (disambiguation)
69 | 67,Alexandru
70 | 68,Lee (English given name)
71 | 69,Leonard
72 | 70,Vivian (given name)
73 | 71,Marvin (given name)
74 | 72,Faiz (disambiguation)
75 | 73,Anthony (name)
76 | 74,Stanley (name)
77 | 75,Cory
78 | 76,Gerard
79 | 77,Lindita (given name)
80 | 78,Sophie (given name)
81 | 79,Leonie
82 | 80,Raizo
83 | 81,Raffaello (disambiguation)
84 | 82,Sorin (given name)
85 | 83,Ljubica
86 | 84,Ludovica
87 | 85,Alessia
88 | 86,Coralie
89 | 87,Navneet
90 | 88,Aisling (name)
91 | 89,Nelofar
92 | 90,Miley (given name)
93 | 91,Mary (name)
94 | 92,Madeleine (name)
95 | 93,Schuyler (name)
96 | 94,Milica
97 | 95,Laimonis
98 | 96,Uldis
99 | 97,Dzintars
100 | 98,Modris
101 | 99,Gatis
102 | 100,Indulis
103 | 101,Priya (given name)
104 | 102,Maki (name)
105 | 103,Junichi
106 | 104,Alison (given name)
107 | 105,Kanye (name)
108 | 106,Kaj
109 | 107,Tawfik
110 | 108,Kayo (name)
111 | 109,Pia (given name)
112 | 110,Hristo
113 | 111,Heather (given name)
114 | 112,Lubomir
115 | 113,Ctirad
116 | 114,Tom (given name)
117 | 115,Aileen
118 | 116,Bita (Persian)
119 | 117,Christopher
120 | 118,Abid (name)
121 | 119,Brooke (name)
122 | 120,Mira (given name)
123 | 121,Parisa (disambiguation)
124 | 122,Tuukka
125 | 123,Soo-young (name)
126 | 124,Roosevelt (name)
127 | 125,Hamad
128 | 126,Ziemowit
129 | 127,Graciela (disambiguation)
130 | 128,Dobroslav
131 | 129,Tess (given name)
132 | 130,Ellen
133 | 131,Kalina (name)
134 | 132,Kaede
135 | 133,Ranald
136 | 134,Sandy (given name)
137 | 135,Maytham
138 | 136,Damayanti (disambiguation)
139 | 137,Faith (name)
140 | 138,Llewellyn (name)
141 | 139,Veronica (name)
142 | 140,Amos (name)
143 | 141,Bora (Turkish name)
144 | 142,Oladapo
145 | 143,Jerry (given name)
146 | 144,Sania (disambiguation)
147 | 145,Domagoj (given name)
148 | 146,Okonma
149 | 147,Keon
150 | 148,Enn
151 | 149,Haruchika (given name)
152 | 150,Farah (name)
153 | 151,Harutyun
154 | 152,Sora (Japanese given name)
155 | 153,Verity
156 |
--------------------------------------------------------------------------------
/src/autofj/benchmark/Enzyme/gt.csv:
--------------------------------------------------------------------------------
1 | id_l,title_l,id_r,title_r
2 | 0,Adenylate cyclase,0,Adenylyl cyclase
3 | 15,NADH dehydrogenase (ubiquinone),1,NADH:ubiquinone reductase (H+-translocating)
4 | 42,RNase H,2,Ribonuclease H
5 | 49,Guanylate cyclase,3,Guanylyl cyclase
6 | 151,Ribonuclease A,4,Bovine pancreatic ribonuclease
7 | 153,Gelatinase a,5,Gelatinase A
8 | 189,Carboxypeptidase u,6,Carboxypeptidase U
9 | 234,3-beta-HSD,7,3beta-Hydroxysteroid dehydrogenase
10 | 236,Aspartokinase,8,Aspartate kinase
11 | 261,Serotonin N-acetyltransferase,9,Aralkylamine N-acetyltransferase
12 | 262,CTP synthetase,10,CTP synthase
13 | 328,Alpha-amylase,11,Alpha-Amylase
14 | 333,Carboxypeptidase c,12,Carboxypeptidase C
15 | 350,Amine oxidase,13,Diamine oxidase
16 | 365,A-N-acetylgalactosaminidase,14,Alpha-N-acetylgalactosaminidase
17 | 376,"1,3-Beta-glucan synthase",15,"1,3-beta-glucan synthase"
18 | 380,4-hydroxybutyrate dehydrogenase,16,4-Hydroxybutyrate dehydrogenase
19 | 469,Glycerol-1-phosphate dehydrogenase (NAD(P)+),17,Sn-glycerol-1-phosphate dehydrogenase
20 | 610,3-keto-steroid reductase,18,3-Ketosteroid reductase
21 | 684,(Iso)eugenol O-methyltransferase,19,(iso)eugenol O-methyltransferase
22 | 702,(Myelin basic protein)-arginine N-methyltransferase,20,(myelin basic protein)-arginine N-methyltransferase
23 | 723,(Ribulose-bisphosphate carboxylase)-lysine N-methyltransferase,21,(ribulose-bisphosphate carboxylase)-lysine N-methyltransferase
24 | 855,(Formate-C-acetyltransferase)-activating enzyme,22,(formate-C-acetyltransferase)-activating enzyme
25 | 858,Sulfur reductase,23,Sulfhydrogenase
26 | 898,4-Hydroxyphenylacetate 3-monooxygenase,24,4-hydroxyphenylacetate 3-monooxygenase
27 | 945,Deacetoxyvindoline 4-hydroxylase,25,Desacetoxyvindoline 4-hydroxylase
28 | 1043,2-hydroxyglutarate dehydrogenase,26,L-2-hydroxyglutarate dehydrogenase
29 | 1185,4-Cresol dehydrogenase (hydroxylating),27,4-cresol dehydrogenase (hydroxylating)
30 | 1199,(Methionine synthase) reductase,28,(methionine synthase) reductase
31 | 1220,"5,10-Methenyltetrahydromethanopterin hydrogenase",29,"5,10-methenyltetrahydromethanopterin hydrogenase"
32 | 1233,Chlorite O2-lyase,30,Chlorite dismutase
33 | 1363,NAD(P)+ transhydrogenase (AB-specific),31,NAD(P)+ transhydrogenase (Re/Si-specific)
34 | 1364,NAD(P)+ transhydrogenase (B-specific),32,NAD(P)+ transhydrogenase (Si-specific)
35 | 2192,(myosin-light-chain) phosphatase,33,Myosin-light-chain phosphatase
36 | 2728,2-Hydroxyacylsphingosine 1-beta-galactosyltransferase,34,2-hydroxyacylsphingosine 1-beta-galactosyltransferase
37 | 3031,Isocitrate dehydrogenase (NADP+) kinase,35,(isocitrate dehydrogenase (NADP+)) kinase
38 | 3118,CCA tRNA nucleotidyltransferase,36,TRNA cytidylyltransferase
39 | 3151,4-Hydroxy-3-methylbut-2-enyl diphosphate reductase,37,4-hydroxy-3-methylbut-2-enyl diphosphate reductase
40 | 3179,Cyanate hydratase,38,Cyanase
41 | 3180,Formylglycine-generating sulfatase enzyme,39,Formylglycine-generating enzyme
42 | 3185,Nucleotide Pyrophosphatase/Phosphodiesterase (NPP),40,Nucleotide pyrophosphatase/phosphodiesterase
43 | 3188,IgA protease,41,IgA specific serine endopeptidase
44 | 3286,Cortisone b-reductase,42,5-beta-reductase
45 | 3736,"Dolichyl-P-Glc:Glc2Man9GlcNAc2-PP-dolichol alpha-1,2-glucosyltransferase",43,ALG10 (enzyme class)
46 | 3745,"Dolichyl-P-Glc:Glc1Man9GlcNAc2-PP-dolichol alpha-1,3-glucosyltransferase",44,ALG8 (enzyme class)
47 | 3747,"Dolichyl-P-Glc:Man9GlcNAc2-PP-dolichol alpha-1,3-glucosyltransferase",45,ALG6 (enzyme class)
48 | 3828,Metionin transaminase,46,Methionine transaminase
49 | 3915,4-sulfomuconolactone hydrolase,47,4-Sulfomuconolactone hydrolase
50 |
--------------------------------------------------------------------------------
/src/autofj/benchmark/RugbyLeague/gt.csv:
--------------------------------------------------------------------------------
1 | id_l,title_l,id_r,title_r
2 | 5,IRB Sevens World Series,0,World Rugby Sevens Series
3 | 5,IRB Sevens World Series,57,Sevens World Series
4 | 13,World Club Challenge,1,World Club Series
5 | 19,ITM Cup,2,Mitre 10 Cup
6 | 21,Pro 12,3,Pro12
7 | 25,National Championship of Excellence,42,National Championship of Excellence (Italian premiership)
8 | 29,Regal Trophy,4,League Cup (rugby league)
9 | 34,Microsoft Cup,5,Top League Champions Cup
10 | 35,All-Japan University Rugby Championship,43,All Japan University Rugby Championship
11 | 48,Colonial Cup (rugby union),44,Colonial Cup
12 | 59,IRB Pacific Nations Cup,6,World Rugby Pacific Nations Cup
13 | 59,IRB Pacific Nations Cup,45,Pacific Nations Cup
14 | 60,Ron Massey Cup,46,Bundaberg Red Cup
15 | 62,New South Wales Cup,7,Intrust Super Premiership NSW
16 | 66,London and South East Merit League,47,"London, South and East Merit League"
17 | 79,Pacific Rugby Cup,8,World Rugby Pacific Challenge
18 | 85,Tom Richards Trophy,9,Tom Richards Cup
19 | 88,IRB Nations Cup,10,World Rugby Nations Cup
20 | 98,Paris Sevens,11,France Sevens
21 | 119,Rugby League Charity Shield (Australia),12,Charity Shield (NRL)
22 | 139,Caledonia Regional League (rugby union),13,Caledonia Regional League
23 | 143,Tongan National Rugby League,48,Tonga National Rugby League
24 | 150,ARFU Women's Rugby Championship,14,Asia Rugby Women's Championship
25 | 152,Rugby World Cup Final,15,List of Rugby World Cup finals
26 | 153,Scottish Premiership,16,Scottish Premiership (rugby)
27 | 163,Asian Five Nations,17,Asia Rugby Championship
28 | 163,Asian Five Nations,56,Asian Rugby Championship
29 | 166,CAR Development Trophy,18,African Development Trophy
30 | 177,Rugby League European Shield,19,Rugby League European Championship B
31 | 181,IRB Junior World Championship,20,World Rugby Under 20 Championship
32 | 183,IRB Junior World Rugby Trophy,21,World Rugby Under 20 Trophy
33 | 188,Singer Sri Lankan Airlines Rugby 7's,22,Sri Lanka Sevens
34 | 190,Club ANZAC Game,23,ANZAC Day Cup
35 | 193,Nations Cup (women's rugby union),24,Women's Nations Cup (rugby union)
36 | 203,Rugby League European Bowl,25,Rugby League European Championship C
37 | 206,FORU Oceania Cup,26,Oceania Rugby Cup
38 | 212,ARL Schoolboy Cup,27,GIO Schoolboy Cup
39 | 213,South Premier (rugby league),28,South Premier
40 | 213,South Premier (rugby league),49,Rugby League Conference South Premier
41 | 233,National Women's Championship,29,National Women's Rugby Championship
42 | 235,2nd Rugby-Bundesliga,30,2. Rugby-Bundesliga
43 | 267,All Stars Match,31,All Stars match
44 | 308,Asian Women's Sevens,32,Asian Women's Sevens Championship
45 | 309,African Women's Sevens,33,African Women's Sevens Championship
46 | 311,Caribbean Women's Sevens Championship,34,North America and Caribbean Women's Sevens Championship
47 | 312,Pacific Women's Sevens Championship,35,Oceania Women's Sevens Championship
48 | 313,South American Women's Sevens Championship,36,Women's rugby sevens in South America
49 | 313,South American Women's Sevens Championship,55,South American Women's Sevens
50 | 315,Rugby Ekstraliga,37,Ekstraliga (rugby)
51 | 319,Saint Patrick's Day Test,50,St. Patrick's Day Test
52 | 327,North East Rugby League,38,North East Rugby League Premier Division
53 | 327,North East Rugby League,51,Rugby League Conference North East Division
54 | 329,London & South East Men's League,52,Rugby League Conference London & South Division
55 | 332,International Origin,53,International Origin Match
56 | 336,College Premier Division,39,Division 1-A Rugby
57 | 358,Ron Coote Cup,54,The Ron Coote Cup
58 | 411,SARU Community Cup,40,SARU Gold Cup
59 | 413,IRB Women's Sevens World Series,41,World Rugby Women's Sevens Series
60 |
--------------------------------------------------------------------------------
/src/autofj/benchmark/Drug/right.csv:
--------------------------------------------------------------------------------
1 | id,title
2 | 0,Bacillus Calmette-Guérin
3 | 1,"N,N-Dimethyltryptamine"
4 | 2,Hyoscine hydrobromide
5 | 3,Valproate
6 | 4,Interferon beta 1a
7 | 5,4-Androstenedione
8 | 6,Percodan
9 | 7,5-MeO-aMT
10 | 8,FluMist
11 | 9,Tenofovir disoproxil
12 | 10,Ursodeoxycholic acid
13 | 11,Quinacrine
14 | 12,Glycopyrrolate
15 | 13,Neomycin/polymyxin B/bacitracin
16 | 14,Enoxolone
17 | 15,Hyoscine butylbromide
18 | 16,Paromomycin sulfate
19 | 17,Bacitracin/polymyxin B
20 | 18,Ethinylestradiol
21 | 19,Dactinomycin
22 | 20,Interferon beta 1b
23 | 21,Metandienone
24 | 22,Caffeine/ergotamine
25 | 23,Roxatidine acetate
26 | 24,HPV vaccines
27 | 25,Tenoretic
28 | 26,Xyrem
29 | 27,Crotalidae polyvalent immune fab
30 | 28,4-Hydroxyamphetamine
31 | 29,Chlordiazepoxide/clidinium bromide
32 | 30,Umifenovir
33 | 31,Efavirenz/emtricitabine/tenofovir
34 | 32,N1-Methyl-lysergic acid diethylamide
35 | 33,Dimethyllysergamide
36 | 34,Gestonorone caproate
37 | 35,Zoster vaccine
38 | 36,Politor
39 | 37,"1,8-Dihydroxyanthraquinone"
40 | 38,Corbadrine
41 | 39,Hydroxyprogesterone caproate
42 | 40,Mestranol/noretynodrel
43 | 41,Ethylamphetamine
44 | 42,GHRP-6
45 | 43,Abiraterone acetate
46 | 44,Phenyramidol
47 | 45,Digoxin immune fab
48 | 46,Senna glycosides
49 | 47,Norbolethone
50 | 48,ASAQ
51 | 49,Methopholine
52 | 50,Betamethylfentanyl
53 | 51,MN-18
54 | 52,4-Fluoro-N-methylamphetamine
55 | 53,Interferon alfa 2b
56 | 54,GW501516
57 | 55,W-18
58 | 56,Regular insulin
59 | 57,Thiobromadol
60 | 58,Droxidopa
61 | 59,Thiambutene
62 | 60,MDV3100
63 | 61,Periciazine
64 | 62,SB-242084
65 | 63,Carbidopa/levodopa/entacapone
66 | 64,Carbaldrate
67 | 65,Docusate
68 | 66,Bradanicline
69 | 67,EMD-386088
70 | 68,SB-258585
71 | 69,SB-399885
72 | 70,SB-357134
73 | 71,Obinutuzumab
74 | 72,4-Bromomethcathinone
75 | 73,SB-271046
76 | 74,Entolimod
77 | 75,A-372159
78 | 76,SB-699551
79 | 77,RS-127445
80 | 78,SB-204741
81 | 79,RS-102221
82 | 80,CP-94253
83 | 81,Suloctidil
84 | 82,Insulin (medication)
85 | 83,Ro 04-6790
86 | 84,SB-216641
87 | 85,SB-269970
88 | 86,Tedizolid
89 | 87,Dasotraline
90 | 88,Radium-223
91 | 89,Radium-223 chloride
92 | 90,MDMAI
93 | 91,Pivhydrazine
94 | 92,MMDMA (drug)
95 | 93,2-Methoxymethyl salvinorin B
96 | 94,Afegostat
97 | 95,SB-215505
98 | 96,Meclinertant
99 | 97,GR-127935
100 | 98,Homarylamine
101 | 99,BDPC
102 | 100,Norethisterone enanthate
103 | 101,CP-809101
104 | 102,5-IT
105 | 103,Chromium(III) nicotinate
106 | 104,Clazakizumab
107 | 105,Fabomotizole
108 | 106,Oxomemazine/guaifenesin
109 | 107,Surfaxin
110 | 108,Idalopirdine
111 | 109,Acetylsalicylic acid/dipyridamole
112 | 110,Lavoltidine
113 | 111,Phencyclamine
114 | 112,Erbuzole
115 | 113,6-Methyl-2-ethyl-3-hydroxypyridine
116 | 114,LY-293284
117 | 115,Triflunordazepam
118 | 116,S-14671
119 | 117,Zoptarelin doxorubicin
120 | 118,Meclizine
121 | 119,CP-93129
122 | 120,25TFM-NBOMe
123 | 121,25C-NBOMe
124 | 122,1-(Thiophen-2-yl)-2-aminopropane
125 | 123,TM38837
126 | 124,Ethinyl estradiol/drospirenone/levomefolic acid
127 | 125,Carphenazine
128 | 126,Perfosfamide
129 | 127,Org 12962
130 | 128,Censavudine
131 | 129,Omecamtiv Mecarbil (CK-1827452)
132 | 130,SB-258719
133 | 131,GR-113808
134 | 132,LY-310762
135 | 133,SB-204070
136 | 134,CJ-033466
137 | 135,SB-206553
138 | 136,CP-135807
139 | 137,VX-809
140 | 138,Tenofovir alafenamide
141 | 139,GS 7340
142 | 140,PSI-7977
143 | 141,Abaloparatide
144 | 142,RG7795
145 | 143,ANA773
146 | 144,C16 (drug)
147 | 145,Valsartan/sacubitril
148 | 146,APINACA
149 | 147,Trifluridine/tipiracil
150 | 148,Rapastinel
151 | 149,APICA (synthetic cannabinoid drug)
152 | 150,Andexanet alfa
153 | 151,Doravirine
154 | 152,Ledipasvir
155 | 153,Deleobuvir
156 | 154,QUPIC
157 | 155,QUCHIC
158 | 156,Elafibranor
159 |
--------------------------------------------------------------------------------
/src/autofj/benchmark/FootballMatch/gt.csv:
--------------------------------------------------------------------------------
1 | id_l,title_l,id_r,title_r
2 | 1,Battle of Santiago,0,Battle of Santiago (1962 FIFA World Cup)
3 | 2,Battle of Berne,1,Battle of Berne (1954 FIFA World Cup)
4 | 9,2005 International Rules Series,2,2005 international rules series
5 | 13,Austria v Switzerland (1954),3,Austria v Switzerland (1954 FIFA World Cup)
6 | 41,2001 Germany vs England football match,4,2001 Germany v England football match
7 | 67,Shamrock Rovers XI vs Brazil,5,Shamrock Rovers XI v Brazil
8 | 94,2006 International Rules Series,6,2006 International Rules series
9 | 94,2006 International Rules Series,7,2006 international rules series
10 | 101,Poland v Brazil (1938),8,Poland v Brazil (1938 FIFA World Cup)
11 | 287,2001 International Rules Series,9,2001 international rules series
12 | 294,1998 International Rules Series,10,1998 international rules series
13 | 358,Miracle of Cordoba,11,Austria v West Germany (1978 FIFA World Cup)
14 | 363,2000 England vs Germany football match,12,2000 England v Germany football match
15 | 394,Battle of Nuremberg (association football),13,Battle of Nuremberg (2006 FIFA World Cup)
16 | 442,2008 Conference National play-off Final,14,2008 Conference Premier play-off Final
17 | 466,2008 International Rules Series,15,2008 international rules series
18 | 545,1968 DFB Cup Final,16,1968 DFB-Pokal Final
19 | 546,1988 DFB Cup Final,17,1988 DFB-Pokal Final
20 | 568,1999 International Rules Series,18,1999 international rules series
21 | 597,1989 Major Indoor Soccer League All-Star Game,19,1989 MISL All-Star Game
22 | 607,Argentina v England (1986 FIFA World Cup),20,Argentina 2â1 England (1986 FIFA World Cup)
23 | 702,2009 African Nations Championship Final,21,2009 African Championship of Nations Final
24 | 704,Hungary vs El Salvador (1982),22,Hungary v El Salvador (1982 FIFA World Cup)
25 | 704,Hungary vs El Salvador (1982),23,Hungary 10â1 El Salvador (1982)
26 | 706,2009 Conference National play-off Final,24,2009 Conference Premier play-off Final
27 | 777,WPS All-Star 2009,25,2009 WPS All-Star Game
28 | 811,1871 England versus Scotland rugby union match,26,1871 Scotland versus England rugby union match
29 | 811,1871 England versus Scotland rugby union match,27,1870â71 Home Nations rugby union matches
30 | 846,2009 Republic of Ireland vs France football matches,28,2009 Republic of Ireland v France football matches
31 | 846,2009 Republic of Ireland vs France football matches,29,France 1â1 Ireland (18 November 2009)
32 | 991,19 May incident,30,1985 China v Hong Kong football match
33 | 1005,1876 Scotland vs Wales football match,31,1876 Scotland v Wales football match
34 | 1036,2010 International Rules Series,32,2010 international rules series
35 | 1051,1993 PTT Telecom Cup,33,1993 Dutch Supercup
36 | 1052,2010 Conference National play-off Final,34,2010 Conference Premier play-off Final
37 | 1091,WPS All-Star 2010,35,2010 WPS All-Star Game
38 | 1141,West Germany vs France (1982),36,West Germany v France (1982 FIFA World Cup)
39 | 1159,2002 International Rules Series,37,2002 international rules series
40 | 1244,2011 W-League Grand Final,38,2010â11 W-League Grand Final
41 | 1250,2006 Copa Indonesia Final,39,2006 Copa Indonesia final
42 | 1273,Brazil vs Italy (1982),40,Brazil v Italy (1982 FIFA World Cup)
43 | 1314,1992 PTT Telecom Cup,41,1992 Dutch Supercup
44 | 1315,1991 PTT Telecom Cup,42,1991 Dutch Supercup
45 | 1319,2011 Conference National play-off Final,43,2011 Conference Premier play-off Final
46 | 1347,All-Ireland Minor Hurling Championship 2011,44,2011 All-Ireland Minor Hurling Championship
47 | 1362,2011 International Rules Series,45,2011 international rules series
48 | 1828,2011 J. League Cup Final,46,2011 J.League Cup Final
49 | 1843,2013 Kenyan Super Cup,47,2013 Kenyan Super Cup (pre-season)
50 | 1859,2012 Conference National play-off Final,48,2012 Conference Premier play-off Final
51 | 1879,2011 UEFA European Under-21 Football Championship Final,49,2011 UEFA European Under-21 Championship Final
52 | 1901,Albanian Supercup 2012,50,2012 Albanian Supercup
53 | 1925,1985 Wales vs Scotland football match,51,1985 Wales v Scotland football match
54 | 1972,2013 Soccer Bowl,52,Soccer Bowl 2013
55 |
--------------------------------------------------------------------------------
/src/autofj/benchmark/SoccerClubSeason/gt.csv:
--------------------------------------------------------------------------------
1 | id_l,title_l,id_r,title_r
2 | 1,2006 Australia national football team season,0,2006 Australia national soccer team season
3 | 2,2007 Australia national football team season,1,2007 Australia national soccer team season
4 | 9,2008 Australia national football team season,2,2008 Australia national soccer team season
5 | 14,2008 Derry Gaelic football season,46,Derry football season 2008
6 | 18,2005 Australia national football team season,3,2005 Australia national soccer team season
7 | 26,2009 Australia national football team season,4,2009 Australia national soccer team season
8 | 40,2009 Derry Gaelic football season,47,Derry football season 2009
9 | 41,2010 Australia national football team season,5,2010 Australia national soccer team season
10 | 93,Cienciano season 2003,6,2003 Cienciano season
11 | 106,HJK Helsinki season 2009,7,2009 Helsingin Jalkapalloklubi season
12 | 141,2010 Derry Gaelic football season,48,Derry football season 2010
13 | 172,2010 Jeonbuk Hyundai Motors season,8,2010 Jeonbuk Hyundai Motors FC season
14 | 211,2010 Down Senior Football season,9,2010 Down football season
15 | 211,2010 Down Senior Football season,49,2010 Down GAA Senior Football
16 | 212,2011 Down Senior Football season,10,2011 Down football season
17 | 212,2011 Down Senior Football season,50,2011 Down GAA Senior Football
18 | 247,2011 Daejeon Citizen season,11,2011 Daejeon Citizen FC season
19 | 258,2011 Australia national football team season,12,2011 Australia national soccer team season
20 | 277,2011 Orlando City S.C. season,13,2011 Orlando City SC season
21 | 281,2011 Jeonbuk Hyundai Motors season,14,2011 Jeonbuk Hyundai Motors FC season
22 | 300,2011 Chunnam Dragons season,15,2011 Jeonnam Dragons season
23 | 306,2004 Australia national football team season,16,2004 Australia national soccer team season
24 | 395,2011 Incheon United season,17,2011 Incheon United FC season
25 | 496,2003 Australia national football team season,18,2003 Australia national soccer team season
26 | 571,2002 Australia national football team season,19,2002 Australia national soccer team season
27 | 572,1999 Australia national football team season,20,1999 Australia national soccer team season
28 | 575,1998 Australia national football team season,21,1998 Australia national soccer team season
29 | 576,2001 Australia national football team season,22,2001 Australia national soccer team season
30 | 577,2000 Australia national football team season,23,2000 Australia national soccer team season
31 | 795,2012 Down GAA Senior Football,24,2012 Down football season
32 | 796,1997 Australia national football team season,25,1997 Australia national soccer team season
33 | 797,1996 Australia national football team season,26,1996 Australia national soccer team season
34 | 874,Kuala Lumpur FA season 2012,27,2012 Kuala Lumpur FA season
35 | 881,2012 Daejeon Citizen season,28,2012 Daejeon Citizen FC season
36 | 887,2012 Orlando City S.C. season,29,2012 Orlando City SC season
37 | 899,2012 Australia national football team season,30,2012 Australia national soccer team season
38 | 934,2012 Jeonbuk Hyundai Motors season,31,2012 Jeonbuk Hyundai Motors FC season
39 | 936,2012 Chunnam Dragons season,32,2012 Jeonnam Dragons season
40 | 978,2012 Incheon United season,33,2012 Incheon United FC season
41 | 979,2012 Jeju United season,34,2012 Jeju United FC season
42 | 1015,2012 Woodlands Wellington Season,35,2012 Woodlands Wellington FC season
43 | 1016,2011 Woodlands Wellington Season,36,2011 Woodlands Wellington FC season
44 | 1020,2010 Woodlands Wellington Season,37,2010 Woodlands Wellington FC season
45 | 1044,2013 Down Senior Football season,38,2013 Down football season
46 | 1073,2013 Orlando City S.C. season,39,2013 Orlando City SC season
47 | 1075,2013 Woodlands Wellington Season,40,2013 Woodlands Wellington FC season
48 | 1080,Negeri Sembilan FA Season 2013,41,2013 Negeri Sembilan FA season
49 | 1151,2009 Down Senior Football season,42,2009 Down football season
50 | 1158,2013 Australia national football team season,43,2013 Australia national soccer team season
51 | 1161,2013 Carolina RailHawks FC season,44,2013 Carolina RailHawks season
52 | 1187,2013 Incheon United season,45,2013 Incheon United FC season
53 |
--------------------------------------------------------------------------------
/src/autofj/negative_rule.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from nltk.stem.porter import PorterStemmer
3 | import re
4 |
5 |
class NegativeRule(object):
    """Learn and apply negative rules that veto candidate join pairs.

    A negative rule is an ordered pair of stemmed tokens ``(a, b)``. Rules
    are learned from blocked left-left pairs whose token sets differ by
    exactly one token on each side, and are then used to drop blocked
    left-right pairs whose token differences contain such a pair.

    Parameters
    ----------
    left: pd.DataFrame
        Left table. Must contain the column ``id_column``.

    right: pd.DataFrame
        Right table. Must contain the column ``id_column``.

    id_column: string
        The name of the id column in both tables.
    """

    # Matches every character that is neither a word character nor
    # whitespace. Compiled once instead of once per record.
    _PUNCTUATION = re.compile(r"[^\w\s]")

    def __init__(self, left, right, id_column):
        self.left = self._preprocess(left, id_column)
        self.right = self._preprocess(right, id_column)
        self.id_column = id_column
        # set of (token, token) tuples; filled by learn()
        self.negative_rules = set()

    def get_tokens_diff(self, l_tokens, r_tokens):
        """Compute the pairwise set differences of two token-set sequences.

        Parameters
        ----------
        l_tokens: iterable of set
            Token sets of the left records.

        r_tokens: iterable of set
            Token sets of the right records, aligned with ``l_tokens``.

        Returns
        -------
        l_diff: list of set
            For every pair, the tokens only present on the left side.

        r_diff: list of set
            For every pair, the tokens only present on the right side.
        """
        l_diff = [l - r for l, r in zip(l_tokens, r_tokens)]
        r_diff = [r - l for l, r in zip(l_tokens, r_tokens)]
        return l_diff, r_diff

    def _preprocess(self, df, id_column):
        """Preprocess the records: (1) concatenate all columns, (2) lowercase,
        replace punctuation by spaces, (3) split by whitespace, (4) stem each
        token and collect the stems into a set.

        Parameters
        ----------
        df: pd.DataFrame
            Original table

        id_column: string
            The name of id column in two tables.

        Returns
        -------
        result: pd.DataFrame
            Preprocessed table that has two columns: the id column (named
            ``id_column``) and a column named "value" holding the set of
            stemmed tokens of each record.
        """
        # get column names except id
        columns = [c for c in df.columns if c != id_column]
        ps = PorterStemmer()

        new_value = []
        for x in df[columns].values:
            concat_x = " ".join([str(i) for i in x])
            lower_x = self._PUNCTUATION.sub(" ", concat_x.lower())
            new_value.append({ps.stem(w) for w in lower_x.split()})

        return pd.DataFrame({id_column: df[id_column].values,
                             "value": new_value})

    def learn(self, LL_blocked):
        """Learn negative rules from blocked left-left pairs.

        Parameters
        ----------
        LL_blocked: pd.DataFrame
            Blocked left-left pairs. Must contain the columns
            ``id_column + "_l"`` and ``id_column + "_r"``.
        """
        # merge LL with left on both sides
        LL = self._merge(self.left, self.left, LL_blocked)

        # get token difference
        l_diff, r_diff = self.get_tokens_diff(LL["value_l"].values,
                                              LL["value_r"].values)

        # A rule is created when both records differ by exactly one token on
        # each side. Left records consisting of a single token are skipped:
        # in that case the two records share no token at all.
        for l, r, l_set in zip(l_diff, r_diff, LL["value_l"]):
            if len(l) == 1 and len(r) == 1 and len(l_set) != 1:
                (l_token,) = l
                (r_token,) = r
                # rules are symmetric, so store both orientations
                self.negative_rules.add((l_token, r_token))
                self.negative_rules.add((r_token, l_token))

    def _merge(self, left, right, LR_blocked):
        """Attach the token sets of both sides to the blocked id pairs.

        Parameters
        ----------
        left: pd.DataFrame
            Preprocessed table joined on ``id_column + "_l"``.

        right: pd.DataFrame
            Preprocessed table joined on ``id_column + "_r"``.

        LR_blocked: pd.DataFrame
            Blocked pairs with columns ``id_column + "_l"`` and
            ``id_column + "_r"``.

        Returns
        -------
        LR: pd.DataFrame
            Table with columns ``id_column + "_l"``, ``id_column + "_r"``,
            "value_l" and "value_r".
        """
        id_column = self.id_column
        LR = LR_blocked[[id_column + "_l", id_column + "_r"]]
        LR = LR.merge(left, left_on=id_column + "_l", right_on=id_column)\
            .drop(columns=id_column) \
            .merge(right, left_on=id_column + "_r", right_on=id_column,
                   suffixes=("_l", "_r"))\
            .drop(columns=id_column)
        return LR

    def apply(self, LR_blocked):
        """Drop the blocked left-right pairs that meet a negative rule.

        Parameters
        ----------
        LR_blocked: pd.DataFrame
            Blocked left-right pairs. Must contain the columns
            ``id_column + "_l"`` and ``id_column + "_r"``.

        Returns
        -------
        LR_blocked: pd.DataFrame
            The remaining pairs, with the two columns ``id_column + "_l"``
            and ``id_column + "_r"``.
        """
        # derive the column names from id_column, consistently with _merge
        id_l = self.id_column + "_l"
        id_r = self.id_column + "_r"

        # merge LR with left, right
        LR = self._merge(self.left, self.right, LR_blocked)

        # get token difference
        l_diff, r_diff = self.get_tokens_diff(LR["value_l"].values,
                                              LR["value_r"].values)

        # a pair is kept unless some (left-only, right-only) token pair is a
        # known negative rule; the generator stops at the first hit
        rules = self.negative_rules
        keep = [
            not any((l, r) in rules for l in l_d for r in r_d)
            for l_d, r_d in zip(l_diff, r_diff)
        ]

        # An explicit boolean Series keeps this a row filter even when there
        # are no pairs (a bare empty list would be read as a column list).
        keep = pd.Series(keep, index=LR.index, dtype=bool)
        return LR.loc[keep, [id_l, id_r]]
106 |
--------------------------------------------------------------------------------
/src/autofj/benchmark/BasketballTeam/right.csv:
--------------------------------------------------------------------------------
1 | id,title
2 | 0,New Orleans Pelicans
3 | 1,Charlotte Hornets
4 | 2,Polonia Warszawa (basketball)
5 | 3,Asseco Gdynia
6 | 4,Melbourne United
7 | 5,Liaoning Flying Leopards
8 | 6,Shandong Golden Stars
9 | 7,Fujian Sturgeons
10 | 8,P.A.O.K. BC
11 | 9,Ulriken Elite
12 | 10,Mens Sana 1871 Basket
13 | 11,Oklahoma City Blue
14 | 12,Austin Spurs
15 | 13,KK Olimpija
16 | 14,Surrey Scorchers
17 | 15,Guildford Heat
18 | 16,Surrey United (basketball)
19 | 17,Shenzhen Leopards
20 | 18,Treviso Basket
21 | 19,Universo Treviso Basket
22 | 20,UNES FC Barcelona
23 | 21,University of Canberra Capitals
24 | 22,Otago Nuggets
25 | 23,Helsinki Seagulls
26 | 24,BC Torpan Pojat
27 | 25,Milton Keynes Lions
28 | 26,Brooklyn Kings (basketball)
29 | 27,CB Bilbao Berri
30 | 28,BC Cherno More Port Varna
31 | 29,KK Vojvodina Srbijagas
32 | 30,Dumbarton Dodgers
33 | 31,FC Porto (basketball)
34 | 32,U.D. Oliveirense (basketball)
35 | 33,CA Queluz
36 | 34,MHP Riesen Ludwigsburg
37 | 35,EnBW Ludwigsburg
38 | 36,Gladiators Trier
39 | 37,Pallacanestro Don Bosco Livorno
40 | 38,PBC Ural Great
41 | 39,Ayrshire Tornadoes
42 | 40,Bree B.B.C.
43 | 41,Delaware 87ers
44 | 42,Fubon Braves
45 | 43,Fubon Braves Basketball Team
46 | 44,Omaha Racers
47 | 45,Edinburgh Tigers
48 | 46,PBC Lokomotiv-Kuban
49 | 47,Bristol Flyers
50 | 48,PAWS London Capital
51 | 49,Trabzonspor B.K.
52 | 50,Olympiada Patras BC
53 | 51,Sporting BC
54 | 52,Galatasaray S.K. (men's basketball)
55 | 53,Cheshire Jets
56 | 54,BC Cherkasy
57 | 55,KK Mega Basket
58 | 56,KK Mega Leks
59 | 57,Nuova AMG Sebastiani Basket Rieti
60 | 58,BC Yenisey Krasnoyarsk
61 | 59,KK Krka
62 | 60,Roseto Basket
63 | 61,Veroli Basket
64 | 62,Gruppo Triboldi Basket
65 | 63,AEL 1964 B.C.
66 | 64,Bnei HaSharon
67 | 65,Belfast Star
68 | 66,Melbourne Boomers
69 | 67,B.C. Zenit Saint Petersburg
70 | 68,Swans Gmunden
71 | 69,Donar (basketball club)
72 | 70,Rethymno Cretan Kings B.C.
73 | 71,Egaleo BC
74 | 72,CB Tizona
75 | 73,Bunbury Slammers
76 | 74,Ilissiakos B.C.
77 | 75,MENT B.C.
78 | 76,Dafni BC
79 | 77,Xanthi B.C.
80 | 78,Athlitikos Omilos Paleou Falirou BC
81 | 79,AGEH Gymnastikos B.C.
82 | 80,Union Kavala B.C.
83 | 81,Peramatos Ermis B.C.
84 | 82,ICBS B.C.
85 | 83,Ionikos Lamias BC
86 | 84,AO Pagrati BC
87 | 85,Toros de Nuevo Laredo
88 | 86,Trikala 2000 B.C.
89 | 87,A.S. Trikala 2000 BC
90 | 88,Iraklio BC
91 | 89,KK Millenium Strumica
92 | 90,Incheon Electroland Elephants
93 | 91,Goyang Orion Orions
94 | 92,Gigantes de Carolina (men's basketball)
95 | 93,South China AA (basketball)
96 | 94,Galatasaray S.K. (women's basketball)
97 | 95,Galatasaray Medical Park (women's basketball)
98 | 96,Elitzur Givat Shmuel
99 | 97,Kent Crusaders (basketball)
100 | 98,AS Ionikos Neas Filadelfeias BC
101 | 99,Ionikos Nikaias BC
102 | 100,Polytekhnika-Halychyna Lviv
103 | 101,Galatasaray S.K. (wheelchair basketball)
104 | 102,Galatasaray Wheelchair Basketball Team
105 | 103,SK Valmiera
106 | 104,Rakvere Tarvas
107 | 105,Al-Ahli Benghazi (basketball club)
108 | 106,BC Partizani Tirana
109 | 107,Gymnastikos S. Larissas B.C.
110 | 108,BC Budivelnik
111 | 109,Bintulu Eagles B.C.
112 | 110,Bintulu Rainbow B.C.
113 | 111,Perak Farmcochem B.C.
114 | 112,CS Otopeni (Basketball)
115 | 113,CS Energia
116 | 114,Hapoel Afula B.C.
117 | 115,Satria Muda Pertamina Jakarta
118 | 116,Hi-Tech Bangkok City
119 | 117,Sports Rev Thailand Slammers
120 | 118,Maccabi Ra'anana
121 | 119,Logan Thunder (WNBL)
122 | 120,Al Riyadi Amman
123 | 121,Ezzahra Sports
124 | 122,Barak Netanya B.C.
125 | 123,Athinaikos women's basketball
126 | 124,Ikaros Chalkidas B.C.
127 | 125,Medi Bayreuth
128 | 126,RosaSport Radom
129 | 127,Gent Hawks
130 | 128,Halcones de Xalapa
131 | 129,Aspac Jakarta
132 | 130,Leeds Force
133 | 131,Pelita Jaya Energi Mega Persada
134 | 132,CLS Knights Surabaya
135 | 133,Yongin Samsung Blueminx
136 | 134,Yongin Samsung Life Bichumi
137 | 135,Piimameister Otto/Rapla
138 | 136,TYCO Rapla
139 | 137,Muba Hangtuah Sumatera Selatan
140 | 138,Blackwater Sports
141 | 139,Black Water Sports
142 | 140,BC Juventus
143 | 141,BC Rūdupis
144 | 142,BC Palanga
145 | 143,BC Naglis
146 | 144,Cuxhaven BasCats
147 | 145,Mississauga Power
148 | 146,AB Pas
149 | 147,BC Minsk-2006
150 | 148,Jalaa SC (men's basketball)
151 | 149,Porta XI Ensino CBF
152 | 150,UNIQA Sopron
153 | 151,Al Rayan SC Basketball Team
154 | 152,Stella Artois Leuven Bears
155 | 153,BC Barsy Atyrau
156 | 154,El Ittihad Alexandria (basketball)
157 | 155,Al Ittihad Alexandria (basketball)
158 | 156,Zamalek (basketball)
159 | 157,Tanduay Light Rhum Masters
160 | 158,Al Kuwait SC (basketball)
161 | 159,SOMB Boulogne-sur-Mer
162 | 160,KK Slavonski Brod
163 | 161,CB Ciudad de Algeciras
164 | 162,Pacific Caesar Surabaya
165 | 163,C.D. Primeiro de Agosto (basketball)
166 | 164,G.S. FIAT
167 | 165,Orangeville A's
168 |
--------------------------------------------------------------------------------
/src/autofj/benchmark/Race/right.csv:
--------------------------------------------------------------------------------
1 | id,title
2 | 0,2000 Guineas Stakes
3 | 1,St Leger Stakes
4 | 2,W.S. Cox Plate
5 | 3,1000 Guineas Stakes
6 | 4,Duchess of Cambridge Stakes
7 | 5,Champagne Stakes (ATC)
8 | 6,Breeders' Cup Distaff
9 | 7,Vincent O'Brien National Stakes
10 | 8,National Stakes
11 | 9,Champions Cup (horse race)
12 | 10,The Metropolitan (ATC)
13 | 11,Darley Classic
14 | 12,Futurity Stakes (MRC)
15 | 13,The BMW
16 | 14,Queen Elizabeth Stakes (VRC)
17 | 15,Australian Oaks (ATC)
18 | 16,Queen Elizabeth Stakes (ATC)
19 | 17,Sires' Produce Stakes (ATC)
20 | 18,Sires Produce Stakes (ATC)
21 | 19,Schweppes Oaks
22 | 20,Myer Classic
23 | 21,Lightning Stakes
24 | 22,Vinery Stud Stakes
25 | 23,TJ Smith Stakes
26 | 24,The J. J. Atkins
27 | 25,Bing Crosby Stakes
28 | 26,Los Alamitos Futurity
29 | 27,Starlet Stakes
30 | 28,Oak Leaf Stakes
31 | 29,Rodeo Drive Stakes
32 | 30,Belmont Derby
33 | 31,Baring Bingham Novices' Hurdle
34 | 32,Wise Dan Handicap
35 | 33,Red Carpet Handicap
36 | 34,Summertime Oaks
37 | 35,Los Alamitos Derby
38 | 36,Goodwood Stakes
39 | 37,Norfolk Stakes (United States)
40 | 38,Lady's Secret Stakes
41 | 39,Oak Tree Mile Stakes
42 | 40,Santa Ana Stakes
43 | 41,Linlithgow Stakes
44 | 42,A.V. Kewney Stakes
45 | 43,Makybe Diva Stakes
46 | 44,Sires' Produce Stakes (VRC)
47 | 45,Sires' Produce Stakes (BRC)
48 | 46,Cape Town Cycle Tour
49 | 47,Cape Argus Cycle Race
50 | 48,The Goodwood
51 | 49,South Australian Derby
52 | 50,Robert Sangster Stakes
53 | 51,The Galaxy (ATC)
54 | 52,Miracle Mile (harness race)
55 | 53,Grosser Preis von Bayern
56 | 54,Dubai Turf
57 | 55,Dato' Tan Chin Nam Stakes
58 | 56,Dato Tan Chin Nam Stakes
59 | 57,Welsh Grand National
60 | 58,Peter Young Stakes
61 | 59,Caulfield Classic
62 | 60,Caulfield Guineas Prelude
63 | 61,Gazet van Antwerpen Trophy
64 | 62,Moonee Valley Vase
65 | 63,British Champions Fillies & Mares Stakes
66 | 64,King Richard III Stakes
67 | 65,People's Choice Classic
68 | 66,Tour of Wellington
69 | 67,Yallambee Classic
70 | 68,SA Fillies Classic
71 | 69,Arrowfield 3YO Sprint
72 | 70,UCI Track Cycling World Cup
73 | 71,Tour de Filipinas
74 | 72,Breeders' Stakes (SAJC)
75 | 73,Lord Reims Stakes
76 | 74,National Stakes (SAJC)
77 | 75,Sires' Produce Stakes (SAJC)
78 | 76,Spring Stakes (SAJC)
79 | 77,Prix Bertrand du Breuil
80 | 78,Emakumeen Bira
81 | 79,Grand Prix of Aargau Canton
82 | 80,Settimana internazionale di Coppi e Bartali
83 | 81,Rund um die Hainleite
84 | 82,Munster Oaks
85 | 83,Denny Cordell Lavarack Fillies Stakes
86 | 84,Caulfield Sprint
87 | 85,Strade Bianche - Eroica Pro
88 | 86,Crystal Mile
89 | 87,A J Moir Stakes
90 | 88,Moonee Valley Gold Cup
91 | 89,Tesio Stakes
92 | 90,Eliza Park International Stakes
93 | 91,Matriarch Stakes (VRC)
94 | 92,Matriarch Stakes (Australia)
95 | 93,Moonee Valley Fillies Classic
96 | 94,Australia Stakes
97 | 95,The Marathon (horse race)
98 | 96,Las Vegas Marathon (horse race)
99 | 97,Champagne Stakes (MVRC)
100 | 98,Telstra Phonewords Stakes
101 | 99,Walther J. Jacobs-Stutenpreis
102 | 100,Hamburger Stutenpreis
103 | 101,Frankfurter Stutenpreis
104 | 102,Grafenberger Meilen-Trophy
105 | 103,Excelsior Stakes
106 | 104,Play the King Stakes
107 | 105,T.S. Carlyon Cup
108 | 106,Caulfield Autumn Classic
109 | 107,Pol Roger Stakes
110 | 108,BRC Sprint
111 | 109,Eagle Farm Cup
112 | 110,Grand Prix Stakes
113 | 111,Dane Ripper Stakes
114 | 112,Tattersall's Tiara
115 | 113,Bucks County Classic
116 | 114,Herald Champion Novice Hurdle
117 | 115,Tattersalls Ireland Champion Novice Hurdle
118 | 116,Tour of Iran (Azerbaijan)
119 | 117,Ryanair Gold Cup
120 | 118,December Gold Cup
121 | 119,Spinal Research The Atlantic 4 Gold Cup
122 | 120,Turf Sprint Stakes
123 | 121,Mathis Brothers Mile
124 | 122,1965 Chase
125 | 123,Dance Design Stakes
126 | 124,Mildmay of Flete Challenge Cup
127 | 125,Brown Advisory and Merriebelle Stable Plate
128 | 126,Appleton Handicap
129 | 127,Tour des Fjords
130 | 128,The Run to the Rose
131 | 129,The Run to the Roses
132 | 130,Bobbie Lewis Quality
133 | 131,Tramway Stakes
134 | 132,Golden Pendant
135 | 133,Spring Stakes (NJC)
136 | 134,Blazer Stakes
137 | 135,Rose Of Kingston Stakes
138 | 136,The Shorts (ATC)
139 | 137,Carbine Club Stakes (VRC)
140 | 138,Glenfarclas Cross Country Chase
141 | 139,Chairman's Handicap (ATC)
142 | 140,Chairmans Handicap (ATC)
143 | 141,Sapphire Stakes (ATC)
144 | 142,Sires' Produce Stakes (WA)
145 | 143,Sires Produce Stakes (WA)
146 | 144,Gunsynd Classic
147 | 145,Victory Stakes
148 | 146,Chairman's Handicap (BRC)
149 | 147,Champagne Classic (BRC)
150 | 148,Air Force Association Cycling Classic
151 | 149,Prince Of Wales Stakes (Australia)
152 | 150,Eclipse Stakes (MRC)
153 | 151,Summer Cup (ATC)
154 | 152,Summer Cup (horse racing)
155 | 153,Autumn Stakes (MRC)
156 | 154,Breeders Classic
157 | 155,Geoffrey Belmaine Stakes
158 | 156,Schweppervescence Trophy
159 | 157,Challenge Stakes (ATC)
160 | 158,AJC Challenge Stakes
161 | 159,Fort Lauderdale Stakes
162 | 160,MTB Himalaya
163 | 161,Velothon Berlin
164 | 162,Scandinavian Race Uppsala
165 | 163,Tour Series
166 | 164,Halfords Tour Series
167 | 165,Okolo Slovenska
168 | 166,World Ports Cycling Classic
169 | 167,Tour of Faroe Islands
170 | 168,Irish St Leger Trial Stakes
171 | 169,Tour of Norway
172 | 170,Teio Sho
173 | 171,Sodexo Gold Cup
174 | 172,Murphy Group Handicap Chase
175 | 173,Bow Mistress Trophy
176 | 174,South African National Road Race Championships
177 |
--------------------------------------------------------------------------------
/src/autofj/benchmark/ShoppingMall/left.csv:
--------------------------------------------------------------------------------
1 | id,title
2 | 0,The Galleria (Houston)
3 | 1,Shops at Prudential Center
4 | 2,The Fashion Centre at Pentagon City
5 | 3,Westfield Sydney Central Plaza
6 | 4,Mantri Square
7 | 6,Downtown Disney (Walt Disney World)
8 | 7,Bluewater (shopping centre)
9 | 8,Lancaster Mall
10 | 9,"The Summit (Birmingham, Alabama)"
11 | 10,Centro Box Hill
12 | 11,Victoria Square Shopping Centre
13 | 12,Irvine Spectrum Center
14 | 13,Liffey Valley Shopping Centre
15 | 14,ISQUARE
16 | 15,Asheville Mall
17 | 16,Metropolis at Metrotown
18 | 17,St. Louis Outlet Mall
19 | 18,Northlake Mall (Charlotte)
20 | 19,Grand Central Mall
21 | 20,Philadelphia Premium Outlets
22 | 21,Touchwood
23 | 22,Atlantic Terminal (shopping mall)
24 | 23,Vintage Faire Mall
25 | 24,RioCan Centre Kingston
26 | 25,City Central
27 | 26,Hamilton Place (shopping mall)
28 | 27,Langham Place (Hong Kong)
29 | 28,Centrale (Croydon)
30 | 29,Centrio
31 | 30,Broadmarsh (shopping centre)
32 | 31,Rhodes Shopping Centre
33 | 32,The CentrePoint
34 | 33,Wilton Mall
35 | 34,Palm Beach Mall
36 | 35,Great Northern Mall
37 | 36,Centro Bankstown
38 | 37,Sunway Carnival Mall
39 | 38,HarbourFront Centre
40 | 39,CityPlace
41 | 40,Galleria Shopping Centre
42 | 41,"Westgate Shopping Centre, Oxford"
43 | 42,Westfield Annapolis
44 | 43,Brentwood Town Centre
45 | 44,St. David's (Cardiff)
46 | 45,Centro Toombul
47 | 46,Grand Indonesia Shopping Town
48 | 47,Rockville Mall
49 | 48,Westfield Connecticut Post
50 | 49,Fox Run Mall
51 | 50,Downtown Shopping Centre
52 | 51,Downtown Plaza (Sacramento)
53 | 52,Westfield MainPlace
54 | 53,Capital City Mall
55 | 54,Westfield Fox Valley
56 | 55,Westfield Chicago Ridge
57 | 56,Louis Joliet Mall
58 | 57,Indian Mall
59 | 58,Westfield Franklin Park
60 | 59,Westfield Southland
61 | 61,Westfield Belden Village
62 | 62,Solano Town Center
63 | 63,Eastland Center
64 | 64,Westfield West Covina
65 | 65,Parkway Mall
66 | 66,Sahara Mall (Riyadh)
67 | 67,Crossroads Center
68 | 68,St. Laurent Centre
69 | 69,Market Place Mall
70 | 70,"Zona Rosa (Kansas City, Missouri)"
71 | 71,Centro Lutwyche
72 | 72,Les Promenades de l'Outaouais
73 | 73,Westdale Mall
74 | 74,Victoria Mall
75 | 75,Woodbury Common Premium Outlets
76 | 76,El Con Mall
77 | 77,Epping Plaza
78 | 78,The Paragon
79 | 79,Auburn Mall
80 | 81,Cataraqui Town Centre
81 | 82,Royal Victoria Place
82 | 83,Pier Park (Florida)
83 | 84,"Conestoga Mall (Waterloo, Ontario)"
84 | 85,Maple Hill Pavilion
85 | 86,Centro Roselands
86 | 87,Mall at The Source
87 | 88,The Oaks Mall
88 | 89,Santa Rosa Mall (Florida)
89 | 90,"Crossroads Center (Waterloo, IA)"
90 | 91,Spires Shopping Centre
91 | 92,Change Alley (Singapore)
92 | 93,Seacon Square
93 | 94,Northfield Square
94 | 96,Paradise Park (Mall)
95 | 97,Centro Colonnades
96 | 98,Centro The Glen
97 | 99,CentrO
98 | 100,Eastridge
99 | 101,The Florida Mall
100 | 102,Rivergate Mall
101 | 103,Jantzen Beach SuperCenter
102 | 104,St. Charles Towne Center
103 | 106,Galeria Kazimierz
104 | 107,Northwest Plaza
105 | 108,Acadiana Mall
106 | 109,Arsenal Mall
107 | 110,Antioch Center
108 | 111,Omni Park Shopping Centre
109 | 113,West Mall
110 | 114,Mail Champlain
111 | 115,Splendid China Tower
112 | 116,Brunswick shopping centre
113 | 117,Mid Rivers Mall
114 | 118,Centro Mayor
115 | 119,Direct Factory Outlets
116 | 120,Westfield Warrawong
117 | 121,Westfield Figtree
118 | 122,Westfield Pakuranga
119 | 124,Knollwood Mall
120 | 125,Shangri-la Plaza Mall
121 | 127,Lakeshore Mall (Florida)
122 | 128,Menara Great Eastern
123 | 129,Hickory Ridge Mall
124 | 130,Avenue Carriage Crossing
125 | 132,The Mall at Shelter Cove
126 | 133,Bishops Corner (West Hartford)
127 | 135,The Mall in Columbia
128 | 137,Tallahassee Mall
129 | 138,La Encantada
130 | 139,Woodfield Mall
131 | 140,The Shoppes at Eastchase
132 | 141,Forest Lake Shopping Centre
133 | 142,MegaBox (shopping mall)
134 | 143,Westfield CastleCourt
135 | 145,Deira City Centre
136 | 146,Paddock Mall
137 | 147,The Promenade Shopping Centre
138 | 148,The Mall at Cortana
139 | 149,Merry Hill Shopping Centre
140 | 150,Bentley Bridge Retail Park
141 | 152,City Centre Plaza
142 | 155,Hillside Shopping Centre
143 | 156,NewPark Mall
144 | 157,Springfield Mall (Pennsylvania)
145 | 158,"Regency Square Mall (Florence, Alabama)"
146 | 159,Pace Shopping Mall
147 | 160,Centro Taigum
148 | 161,Sta. Lucia East Grand Mall
149 | 162,Florence Mall
150 | 163,SM City
151 | 164,Centro Karingal
152 | 165,Regency Square Mall (Jacksonville)
153 | 166,Broadway Mall
154 | 168,Dembel City Center
155 | 169,Colton Plaza
156 | 171,Winrock Shopping Center
157 | 172,Nex
158 | 174,Chesapeake Square Mall
159 | 175,Lulu Mall
160 | 177,Mirdif City Centre
161 | 178,"Shaktan Thampuran Private Bus Stand, Thrissur"
162 | 179,"Star City, Seoul"
163 | 182,Exchange Ilford
164 | 183,My Mall Limassol
165 | 184,Castletown Shoppingworld
166 | 185,The Market Common Myrtle Beach
167 | 186,Kenwood Towne Centre
168 | 187,The Mall Pavilions
169 | 188,Settlers' Green Outlet Village
170 | 189,"The Summit (Reno, Nevada)"
171 | 190,"The Summit (Wheatfield, New York)"
172 | 191,Domain Central
173 | 192,Granada Center
174 | 193,The Base (mall)
175 | 195,Tulsa Promenade Mall
176 | 196,West Manchester Mall
177 | 197,Lakeview Square
178 | 198,Palladium Square
179 | 199,Centro Lavington
180 | 200,Bahrain City Centre
181 | 201,Spinderiet (Copenhagen)
182 | 202,Central Park (shopping complex)
183 | 203,Kukui Grove Shopping Center
184 | 205,Albany Mall
185 | 207,"University Mall (Chapel Hill, North Carolina)"
186 | 208,The Outlets at Sands Bethlehem
187 | 209,Enfield Square
188 | 210,"Harbor Point, Subic"
189 | 211,Crystal Palace Complex (Dieppe)
190 | 212,Centre at Glen Burnie
191 | 213,South City (shopping mall)
192 | 214,The Gallery at Military Circle
193 | 215,West 12 Shepherds Bush
194 | 217,The Brentwood Country Mart
195 | 218,South Point (shopping mall)
196 | 219,Cross County Plaza
197 | 220,Werribee Plaza
198 | 221,Monroe Crossing Mall
199 | 222,"Northwoods Mall (Peoria, Illinois)"
200 | 223,Square 2 (Shopping Mall)
201 | 225,Westshore Mall
202 | 226,Florin Mall
203 |
--------------------------------------------------------------------------------
/src/autofj/benchmark/GivenName/gt.csv:
--------------------------------------------------------------------------------
1 | id_l,title_l,id_r,title_r
2 | 7,Luke,0,Luke (name)
3 | 12,Julia (given name),1,Julia
4 | 16,Fatima (given name),2,Fatima (name)
5 | 20,Fanny (given name),3,Fanny (name)
6 | 22,Lyfing (given name),4,Lyfing
7 | 70,Adele (given name),5,Adele
8 | 72,Jeffrey (given name),6,Jeffrey (name)
9 | 73,Kathryn (name),7,Kathryn
10 | 85,Cadwaladr (name),8,Cadwallader
11 | 86,Geeta,9,Gita (given name)
12 | 102,Banu (name),10,Banu
13 | 104,Casper (name),11,Casper
14 | 140,Hana (given name),12,Hana (name)
15 | 170,Abdul Aziz,13,Abd al-Aziz
16 | 178,Guillaume (given name),14,Guillaume
17 | 190,Dustin (given name),15,Dustin (name)
18 | 200,Shahrokh,16,Shahrokh (mythical bird)
19 | 233,William (name),17,William (given name)
20 | 238,Arif,18,Arif (given name)
21 | 254,Corinne (name),19,Corinne
22 | 334,Haruna,20,Haruna (given name)
23 | 342,Brianna (given name),21,Brianna
24 | 360,Takeru,22,Takeru (name)
25 | 389,Yasmin,23,Yasmin (given name)
26 | 431,Calvin (given name),24,Calvin (name)
27 | 440,Alexis (disambiguation),25,Alexis
28 | 449,Hjalmar (given name),26,Hjalmar (disambiguation)
29 | 464,Rei,27,Rei (given name)
30 | 473,Asad,28,Asad (name)
31 | 486,Mervin (given name),29,Mervin
32 | 495,Margaret (name),30,Margaret
33 | 514,Aurora (name),31,Aurora (given name)
34 | 521,Michelle (given name),32,Michelle (name)
35 | 524,Jahsh (name),33,Jahsh
36 | 530,Mark (given name),34,Mark (name)
37 | 582,Sujata,35,Sujata (name)
38 | 605,Anupama,36,Anupama (given name)
39 | 610,Paulina (name),37,Paulina (given name)
40 | 619,Padraig,38,Padraic
41 | 625,Bojan (name),39,Bojan
42 | 629,Chetan,40,Chetan (name)
43 | 640,Juanfran,41,Juanfran (disambiguation)
44 | 652,Gun (name),42,Gun (Swedish name)
45 | 669,Dalia (name),43,Dalia (given name)
46 | 672,Travis (given name),44,Travis
47 | 773,Merwin (name),45,Merwin
48 | 786,Daniel (name),46,Daniel
49 | 798,Arun (name),47,Arun (given name)
50 | 835,Michele (given name),48,Michele
51 | 861,Zakiah (female name),49,Zakiah
52 | 880,Parvathy,50,Parvati (given name)
53 | 882,Medad (name),51,Medad
54 | 886,Joseph (given name),52,Joseph
55 | 886,Joseph (given name),53,Joseph (name)
56 | 889,Aida (given name),54,Aida (name)
57 | 892,Edwina (given name),55,Edwina
58 | 909,Zvonimir (name),56,Zvonimir
59 | 915,Annetta (name),57,Annetta (given name)
60 | 936,Asa (given name),58,Asa (name)
61 | 944,Hannu (given name),59,Hannu (disambiguation)
62 | 948,Hayley,60,Hayley (given name)
63 | 957,Baurzhan,61,Bauyrzhan
64 | 961,Abdur Rahman,62,Abd al-Rahman
65 | 964,Bram (name),63,Bram (given name)
66 | 1005,Hannah (given name),64,Hannah (name)
67 | 1019,Katherine (given name),65,Katherine
68 | 1023,Hideyoshi (given name),66,Hideyoshi (disambiguation)
69 | 1035,Alexandru (name),67,Alexandru
70 | 1042,Lee (given name),68,Lee (English given name)
71 | 1044,Leonard (name),69,Leonard
72 | 1049,Vivienne,70,Vivian (given name)
73 | 1057,Marvin (name),71,Marvin (given name)
74 | 1127,Faiz,72,Faiz (disambiguation)
75 | 1145,Anthony (given name),73,Anthony (name)
76 | 1150,Stanley (surname),74,Stanley (name)
77 | 1165,Cory (name),75,Cory
78 | 1172,Gerhard,76,Gerard
79 | 1253,Lindita,77,Lindita (given name)
80 | 1268,Sophie,78,Sophie (given name)
81 | 1281,Leonie (given name),79,Leonie
82 | 1282,Raizo (given name),80,Raizo
83 | 1283,Raffaello,81,Raffaello (disambiguation)
84 | 1291,Sorin (first name),82,Sorin (given name)
85 | 1317,Ljubica (name),83,Ljubica
86 | 1336,Ludovica (given name),84,Ludovica
87 | 1337,Alessia (given name),85,Alessia
88 | 1366,Coralie (given name),86,Coralie
89 | 1372,Navneet (given name),87,Navneet
90 | 1385,Aisling (given name),88,Aisling (name)
91 | 1395,Niloufar,89,Nelofar
92 | 1404,Miley (name),90,Miley (given name)
93 | 1430,Mary (given name),91,Mary (name)
94 | 1432,Madeleine (given name),92,Madeleine (name)
95 | 1443,Schuyler (given name),93,Schuyler (name)
96 | 1471,Milica (given name),94,Milica
97 | 1482,Laimonis (name),95,Laimonis
98 | 1483,Uldis (name),96,Uldis
99 | 1484,Dzintars (name),97,Dzintars
100 | 1485,Modris (name),98,Modris
101 | 1487,Gatis (name),99,Gatis
102 | 1488,Indulis (name),100,Indulis
103 | 1497,Priya,101,Priya (given name)
104 | 1544,Maki (given name),102,Maki (name)
105 | 1547,Jun'ichi,103,Junichi
106 | 1588,Alison (name),104,Alison (given name)
107 | 1589,Kanye (Igbo name),105,Kanye (name)
108 | 1596,Kaj (name),106,Kaj
109 | 1618,Tawfik (given name),107,Tawfik
110 | 1625,Kayo (Nigerian name),108,Kayo (name)
111 | 1656,Pia (name),109,Pia (given name)
112 | 1669,Khristo,110,Hristo
113 | 1708,Heather (name),111,Heather (given name)
114 | 1709,Lubomir (given name),112,Lubomir
115 | 1712,Ctirad (name),113,Ctirad
116 | 1716,Tom (name),114,Tom (given name)
117 | 1719,Aileen (given name),115,Aileen
118 | 1729,Bita,116,Bita (Persian)
119 | 1730,Christopher (given name),117,Christopher
120 | 1734,Abid,118,Abid (name)
121 | 1760,Brooke (given name),119,Brooke (name)
122 | 1761,Mira (name),120,Mira (given name)
123 | 1796,Parisa (given name),121,Parisa (disambiguation)
124 | 1806,Tuukka (given name),122,Tuukka
125 | 1834,Soo-young,123,Soo-young (name)
126 | 1842,Roosevelt (surname),124,Roosevelt (name)
127 | 1862,Hamad (name),125,Hamad
128 | 1877,Ziemowit (given name),126,Ziemowit
129 | 1913,Graciela (given name),127,Graciela (disambiguation)
130 | 1922,Dobroslaw (name),128,Dobroslav
131 | 1955,Tess,129,Tess (given name)
132 | 1976,Ellen (given name),130,Ellen
133 | 1993,Kalina (given name),131,Kalina (name)
134 | 2027,Kaede (disambiguation),132,Kaede
135 | 2039,Ranald (given name),133,Ranald
136 | 2046,Sandy (name),134,Sandy (given name)
137 | 2052,Maytham (name),135,Maytham
138 | 2055,Damayanti (given name),136,Damayanti (disambiguation)
139 | 2091,Faith (given name),137,Faith (name)
140 | 2094,Llywelyn (name),138,Llewellyn (name)
141 | 2110,Veronica (given name),139,Veronica (name)
142 | 2121,Amos,140,Amos (name)
143 | 2162,Bora (given name),141,Bora (Turkish name)
144 | 2173,Oladapo (name),142,Oladapo
145 | 2198,Jerry (name),143,Jerry (given name)
146 | 2199,Sania,144,Sania (disambiguation)
147 | 2207,Domagoj,145,Domagoj (given name)
148 | 2210,Okonma (surname),146,Okonma
149 | 2247,Keon (given name),147,Keon
150 | 2336,Enn (given name),148,Enn
151 | 2506,Haruchika,149,Haruchika (given name)
152 | 2968,Farah (given name),150,Farah (name)
153 | 2999,Harutyun (given name),151,Harutyun
154 | 3007,Sora (given name),152,Sora (Japanese given name)
155 | 3008,Verity (given name),153,Verity
156 |
--------------------------------------------------------------------------------
/src/autofj/benchmark/Drug/gt.csv:
--------------------------------------------------------------------------------
1 | id_l,title_l,id_r,title_r
2 | 4,BCG vaccine,0,Bacillus Calmette-Guérin
3 | 10,Dimethyltryptamine,1,"N,N-Dimethyltryptamine"
4 | 30,Scopolamine,2,Hyoscine hydrobromide
5 | 46,Valproic acid,3,Valproate
6 | 161,Interferon beta-1a,4,Interferon beta 1a
7 | 180,Androstenedione,5,4-Androstenedione
8 | 224,Oxycodone/aspirin,6,Percodan
9 | 342,5-MeO-AMT,7,5-MeO-aMT
10 | 437,Live attenuated influenza vaccine,8,FluMist
11 | 530,Tenofovir,9,Tenofovir disoproxil
12 | 563,Ursodiol,10,Ursodeoxycholic acid
13 | 565,Mepacrine,11,Quinacrine
14 | 628,Glycopyrronium bromide,12,Glycopyrrolate
15 | 664,Neosporin,13,Neomycin/polymyxin B/bacitracin
16 | 699,Glycyrrhetinic acid,14,Enoxolone
17 | 752,Butylscopolamine,15,Hyoscine butylbromide
18 | 761,Paromomycin,16,Paromomycin sulfate
19 | 809,Polysporin,17,Bacitracin/polymyxin B
20 | 831,Ethinyl estradiol,18,Ethinylestradiol
21 | 889,Actinomycin,19,Dactinomycin
22 | 904,Interferon beta-1b,20,Interferon beta 1b
23 | 912,Methandrostenolone,21,Metandienone
24 | 994,Cafergot,22,Caffeine/ergotamine
25 | 1009,Roxatidine,23,Roxatidine acetate
26 | 1016,HPV vaccine,24,HPV vaccines
27 | 1050,Atenolol/chlorthalidone,25,Tenoretic
28 | 1051,Sodium oxybate,26,Xyrem
29 | 1114,CroFab,27,Crotalidae polyvalent immune fab
30 | 1274,Hydroxyamfetamine,28,4-Hydroxyamphetamine
31 | 1377,Librax,29,Chlordiazepoxide/clidinium bromide
32 | 1386,Arbidol,30,Umifenovir
33 | 1428,Emtricitabine/tenofovir/efavirenz,31,Efavirenz/emtricitabine/tenofovir
34 | 1435,MLD-41,32,N1-Methyl-lysergic acid diethylamide
35 | 1439,DAM-57,33,Dimethyllysergamide
36 | 1580,Gestonorone,34,Gestonorone caproate
37 | 1735,Zostavax,35,Zoster vaccine
38 | 1786,Pioglitazone/metformin,36,Politor
39 | 1904,Dantron,37,"1,8-Dihydroxyanthraquinone"
40 | 1915,Levonordefrin,38,Corbadrine
41 | 1999,17-Hydroxyprogesterone caproate,39,Hydroxyprogesterone caproate
42 | 2027,Mestranol/norethynodrel,40,Mestranol/noretynodrel
43 | 2051,Etilamfetamine,41,Ethylamphetamine
44 | 2114,Growth hormone releasing hexapeptide,42,GHRP-6
45 | 2151,Abiraterone,43,Abiraterone acetate
46 | 2252,Fenyramidol,44,Phenyramidol
47 | 2260,Digoxin Immune Fab,45,Digoxin immune fab
48 | 2536,Senna glycoside,46,Senna glycosides
49 | 2586,Norboletone,47,Norbolethone
50 | 2622,Artesunate/amodiaquine,48,ASAQ
51 | 2649,Metofoline,49,Methopholine
52 | 2676,Beta-Methylfentanyl,50,Betamethylfentanyl
53 | 2718,MN 18,51,MN-18
54 | 2753,4-Fluoromethamphetamine,52,4-Fluoro-N-methylamphetamine
55 | 2763,Interferon alfa-2b,53,Interferon alfa 2b
56 | 2863,GW 501516,54,GW501516
57 | 2940,1-(4-Nitrophenylethyl)piperidylidene-2-(4-chlorophenyl)sulfonamide,55,W-18
58 | 2977,Humulin,56,Regular insulin
59 | 3006,C-8813,57,Thiobromadol
60 | 3038,L-DOPS,58,Droxidopa
61 | 3113,Thiambutenes,59,Thiambutene
62 | 3122,Enzalutamide,60,MDV3100
63 | 3185,Pericyazine,61,Periciazine
64 | 3277,"SB-242,084",62,SB-242084
65 | 3359,Stalevo,63,Carbidopa/levodopa/entacapone
66 | 3392,Dihydroxialumini sodium carbonate,64,Carbaldrate
67 | 3403,Dioctyl sodium sulfosuccinate,65,Docusate
68 | 3454,TC-5619,66,Bradanicline
69 | 3481,"EMD-386,088",67,EMD-386088
70 | 3482,"SB-258,585",68,SB-258585
71 | 3483,"SB-399,885",69,SB-399885
72 | 3484,"SB-357,134",70,SB-357134
73 | 3492,Afutuzumab,71,Obinutuzumab
74 | 3541,4-Bromo-N-methylcathinone,72,4-Bromomethcathinone
75 | 3552,"SB-271,046",73,SB-271046
76 | 3595,CBLB502,74,Entolimod
77 | 3633,"A-372,159",75,A-372159
78 | 3635,"SB-699,551",76,SB-699551
79 | 3692,"RS-127,445",77,RS-127445
80 | 3693,"SB-204,741",78,SB-204741
81 | 3694,"RS-102,221",79,RS-102221
82 | 3713,"CP-94,253",80,CP-94253
83 | 3720,Sulcotidil,81,Suloctidil
84 | 3773,Insulin therapy,82,Insulin (medication)
85 | 3821,Ro04-6790,83,Ro 04-6790
86 | 3830,"SB-216,641",84,SB-216641
87 | 3835,"SB-269,970",85,SB-269970
88 | 3889,Torezolid,86,Tedizolid
89 | 3913,SEP-225289,87,Dasotraline
90 | 3918,Alpharadin,88,Radium-223
91 | 3918,Alpharadin,89,Radium-223 chloride
92 | 3921,"5,6-Methylenedioxy-N-methyl-2-aminoindane",90,MDMAI
93 | 3960,Pivalylbenzhydrazine,91,Pivhydrazine
94 | 3974,MMDMA,92,MMDMA (drug)
95 | 4013,Salvinorin B methoxymethyl ether,93,2-Methoxymethyl salvinorin B
96 | 4014,Isofagomine tartrate,94,Afegostat
97 | 4044,"SB-215,505",95,SB-215505
98 | 4045,SR-48692,96,Meclinertant
99 | 4047,"GR-127,935",97,GR-127935
100 | 4081,Methylenedioxymethylphenethylamine,98,Homarylamine
101 | 4096,Bromadol,99,BDPC
102 | 4103,Norethindrone enanthate,100,Norethisterone enanthate
103 | 4114,"CP-809,101",101,CP-809101
104 | 4152,5-(2-Aminopropyl)indole,102,5-IT
105 | 4154,Chromium polynicotinate,103,Chromium(III) nicotinate
106 | 4215,BMS-945429,104,Clazakizumab
107 | 4223,Afobazole,105,Fabomotizole
108 | 4264,Toplexil,106,Oxomemazine/guaifenesin
109 | 4288,Lucinactant,107,Surfaxin
110 | 4347,Lu AE58054,108,Idalopirdine
111 | 4368,Asasantin,109,Acetylsalicylic acid/dipyridamole
112 | 4451,Loxtidine,110,Lavoltidine
113 | 4475,PCPr,111,Phencyclamine
114 | 4484,Erbulozole,112,Erbuzole
115 | 4507,Emoxypine,113,6-Methyl-2-ethyl-3-hydroxypyridine
116 | 4551,"LY-293,284",114,LY-293284
117 | 4588,Ro5-2904,115,Triflunordazepam
118 | 4597,"S-14,671",116,S-14671
119 | 4611,AEZS-108,117,Zoptarelin doxorubicin
120 | 4658,Meclozine,118,Meclizine
121 | 4662,"CP-93,129",119,CP-93129
122 | 4687,2C-TFM-NBOMe,120,25TFM-NBOMe
123 | 4688,2C-C-NBOMe,121,25C-NBOMe
124 | 4703,Thiopropamine,122,1-(Thiophen-2-yl)-2-aminopropane
125 | 4713,TM-38837,123,TM38837
126 | 4727,Beyaz (drug),124,Ethinyl estradiol/drospirenone/levomefolic acid
127 | 4735,Carfenazine,125,Carphenazine
128 | 4736,4-Hydroxycyclophosphamide,126,Perfosfamide
129 | 4746,"Org 12,962",127,Org 12962
130 | 4748,Festinavir,128,Censavudine
131 | 4780,Omecamtiv mecarbil,129,Omecamtiv Mecarbil (CK-1827452)
132 | 4926,"SB-258,719",130,SB-258719
133 | 4927,"GR-113,808",131,GR-113808
134 | 4931,"LY-310,762",132,LY-310762
135 | 4932,"SB-204,070",133,SB-204070
136 | 4933,"CJ-033,466",134,CJ-033466
137 | 4934,"SB-206,553",135,SB-206553
138 | 4935,"CP-135,807",136,CP-135807
139 | 4984,Lumacaftor,137,VX-809
140 | 4990,Tenofovir alafenamide fumarate,138,Tenofovir alafenamide
141 | 4990,Tenofovir alafenamide fumarate,139,GS 7340
142 | 4997,Sofosbuvir,140,PSI-7977
143 | 4998,BA058,141,Abaloparatide
144 | 5017,Ana773,142,RG7795
145 | 5017,Ana773,143,ANA773
146 | 5031,C16 (PKR inhibitor),144,C16 (drug)
147 | 5052,LCZ696,145,Valsartan/sacubitril
148 | 5224,AKB48 (drug),146,APINACA
149 | 5249,TAS-102,147,Trifluridine/tipiracil
150 | 5262,GLYX-13,148,Rapastinel
151 | 5273,SDB-001,149,APICA (synthetic cannabinoid drug)
152 | 5332,PRT064445,150,Andexanet alfa
153 | 5333,Mk-1439,151,Doravirine
154 | 5334,GS-5885,152,Ledipasvir
155 | 5336,Bi 207127,153,Deleobuvir
156 | 5344,PB-22,154,QUPIC
157 | 5345,BB-22 (drug),155,QUCHIC
158 | 5348,GFT505,156,Elafibranor
159 |
--------------------------------------------------------------------------------
/src/autofj/benchmark/Monarch/right.csv:
--------------------------------------------------------------------------------
1 | id,title
2 | 0,Emperor Kinmei
3 | 1,Emperor Tenmu
4 | 2,Emperor Monmu
5 | 3,Anastasius I Dicorus
6 | 4,Theodoric the Great
7 | 5,Xerxes I
8 | 6,Herod Agrippa
9 | 7,Theodosius III
10 | 8,Michael V
11 | 9,Michael VI Stratiotikos
12 | 10,Berenice II of Egypt
13 | 11,Oswiu
14 | 12,Antiochus V Eupator
15 | 13,Hiram I
16 | 14,Khosrow I
17 | 15,Khosrow II
18 | 16,Phetsarath Ratanavongsa
19 | 17,Stefan Radoslav
20 | 18,"Nur ad-Din, atabeg of Aleppo"
21 | 19,Dost Mohammad Khan (Emir of Afghanistan)
22 | 20,Dost Mohammad Barakzai
23 | 21,Yax Nuun Ahiin I
24 | 22,Cyaxares
25 | 23,Bardylis
26 | 24,Ajen Yohl Mat
27 | 25,K'uk' Bahlam I
28 | 26,Ahkal Mo' Nahb I
29 | 27,Kan Bahlam I
30 | 28,K'inich Ahkal Mo' Nahb III
31 | 29,Idris I
32 | 30,Nicias (Indo-Greek king)
33 | 31,Avittom Thirunal Balarama Varma
34 | 32,Dutthagamani of Anuradhapura
35 | 33,Fatâ²h-Ali Shah Qajar
36 | 34,Batbayan
37 | 35,Shuja Shah Durrani
38 | 36,Artemidoros Aniketos
39 | 37,Qutb al-Din Aibak
40 | 38,Qutbu l-Din Aibak
41 | 39,An-Nasir Muhammad
42 | 40,Demetrius III Aniketos
43 | 41,Ahmad ibn Ibrahim al-Ghazi
44 | 42,Zhanibek Khan
45 | 43,Phraates II
46 | 44,Phraates I
47 | 45,Artabanus V of Parthia
48 | 46,Artabanus IV of Parthia
49 | 47,Artabanus III of Parthia
50 | 48,Artabanus II of Parthia
51 | 49,Yaqub al-Mansur
52 | 50,Phriapatius
53 | 51,Gotarzes I
54 | 52,Yasovarman
55 | 53,Gelawdewos of Ethiopia
56 | 54,Yaqob of Ethiopia
57 | 55,Susenyos of Ethiopia
58 | 56,Fasilides of Ethiopia
59 | 57,Yohannes I of Ethiopia
60 | 58,Iyasu I of Ethiopia
61 | 59,Amda Seyon I of Ethiopia
62 | 60,Dawit I of Ethiopia
63 | 61,Telephos Euergetes
64 | 62,Baeda Maryam of Ethiopia
65 | 63,Ya'qub-i Laith Saffari
66 | 64,Iyoas I of Ethiopia
67 | 65,Susenyos II of Ethiopia
68 | 66,Tekle Giyorgis I of Ethiopia
69 | 67,Salomon II of Ethiopia
70 | 68,Hezqeyas of Ethiopia
71 | 69,Baeda Maryam III of Ethiopia
72 | 70,Wazir Akbar Khan
73 | 71,Asaf ad-Dawlah Mir Ali Salabat Jang
74 | 72,Ayub Khan (Emir of Afghanistan)
75 | 73,Ghazi Ayub Khan
76 | 74,Ssuuna II of Buganda
77 | 75,Caranus (king)
78 | 76,Coenus (king)
79 | 77,Empress Ma (Han dynasty)
80 | 78,Asaf-ud-Daula
81 | 79,Izz al-Dawla
82 | 80,'Izz al-Daula
83 | 81,"Yusuf II, Almohad caliph"
84 | 82,Abd al-Wahid I
85 | 83,Mu'izz al-Daula
86 | 84,Imad al-Dawla
87 | 85,'Imad al-Daula
88 | 86,Majd al-Dawla
89 | 87,Ljudevit
90 | 88,Constantine Doukas
91 | 89,Idris II
92 | 90,Ismail Samani
93 | 91,Rajasinha II of Kandy
94 | 92,Drest X
95 | 93,Bridei VII
96 | 94,Ciniod II
97 | 95,Bridei VI
98 | 96,Ciniod I
99 | 97,Raja Dahir
100 | 98,Man Singh II
101 | 99,Shao Kang
102 | 100,Ram Singh I
103 | 101,Nasir ad-Din Qabacha
104 | 102,Ilyas ibn Asad
105 | 103,Baraq (Golden Horde)
106 | 104,Olafr Godredsson
107 | 105,Amr Saffari
108 | 106,Ahmad I bin Mohammed
109 | 107,Khalaf I
110 | 108,Mohammad Khodabanda
111 | 109,Muhammad of Ghazni
112 | 110,Mawdud of Ghazni
113 | 111,Maw'dud of Ghazni
114 | 112,Jaswant Singh of Bharatpur
115 | 113,Keshri Singh
116 | 114,Mohammadu Maccido
117 | 115,Agrasen
118 | 116,Abdullah ibn Tahir al-Khurasani
119 | 117,Parakramabahu I of Polonnaruwa
120 | 118,Yahya ibn al-Qasim
121 | 119,Al-Hajjam al-Hasan ibn Muhammad ibn al-Qasim
122 | 120,Al-Qasim Guennoun
123 | 121,Abul-Aish Ahmad
124 | 122,Fadl ibn Muhammad
125 | 123,Fadl I
126 | 124,Muhammad ibn Abi'l-Saj
127 | 125,Yusuf Ibn Abi'l-Saj
128 | 126,Mirwais Hotak
129 | 127,Pandara Vanniyan
130 | 128,Saadatullah Khan I
131 | 129,Tidal (king)
132 | 130,Alexander II Mircea
133 | 131,Mahmud Hotak
134 | 132,Beorna of East Anglia
135 | 133,Ashraf Hotak
136 | 134,Chashtana
137 | 135,Tia (princess)
138 | 136,Abd al-Latif ibn Muhammad Taraghay Ulughbek
139 | 137,Shivaji of Thanjavur
140 | 138,Amar Singh of Thanjavur
141 | 139,Muhammad ibn Shaddad
142 | 140,Lashkari ibn Muhammad
143 | 141,Marzuban ibn Muhammad ibn Shaddad
144 | 142,Marzuban ibn Muhammad (Shaddadid)
145 | 143,Abu'l-Fath Musa
146 | 144,Lashkari ibn Musa
147 | 145,Lashkari ibn Fadl
148 | 146,Anushirvan ibn Lashkari
149 | 147,Fadl ibn Shavur
150 | 148,Fadl II
151 | 149,Abu'l-Aswar Shavur ibn Fadl
152 | 150,Abu'l-Asvar Shavur I
153 | 151,Ashot ibn Shavur
154 | 152,"Vikramabahu, Prince of Ruhuna"
155 | 153,Tode Mongke
156 | 154,Vicar-ul-Umra
157 | 155,Chaiyasiri
158 | 156,Najmuddin Ali Khan
159 | 157,Najabat Ali Khan
160 | 158,Mansur Ali Khan of Bengal
161 | 159,Lutf Allah (Sarbadar)
162 | 160,Uthram Thirunal Marthanda Varma
163 | 161,Monunius I
164 | 162,Monunius
165 | 163,Bato (Dardanian chieftain)
166 | 164,Cleitus (Dardania)
167 | 165,Mytilus (Dardania)
168 | 166,Mytilus
169 | 167,Yax Nuun Ahiin II
170 | 168,Kapeliele Faupala
171 | 169,Mahathammaracha II
172 | 170,Kaloyan and Desislava
173 | 171,Vimaladharmasuriya II of Kandy
174 | 172,Maravarman Rajasimha III
175 | 173,"As-Salih Ismail, Emir of Damascus"
176 | 174,Akhsitan I
177 | 175,"Kavan Tissa, Prince of Ruhuna"
178 | 176,"Gothabhaya, Prince of Ruhuna"
179 | 177,"Mahanaga, Prince of Ruhuna"
180 | 178,"Yatala Tissa, Prince of Ruhuna"
181 | 179,Muhammad ibn Suri
182 | 180,Ghiyath al-Din Muhammad
183 | 181,Hussain Hotak
184 | 182,Vistahm
185 | 183,Parameswara (king)
186 | 184,George I (Miskito)
187 | 185,Monunius II
188 | 186,Monunius of Dardania
189 | 187,Indradyumna (Mythological King)
190 | 188,"Idris I, Almohad Caliph"
191 | 189,"Idris II, Almohad Caliph"
192 | 190,Bhuvanaikabahu VII of Kotte
193 | 191,Parakramabahu IX of Kotte
194 | 192,Sihyaj Chan K'awiil II
195 | 193,"Al-Ashraf Musa, Emir of Homs"
196 | 194,Moggallana II
197 | 195,Khosrow IV
198 | 196,Khosrow III
199 | 197,Haytham b. Khalid
200 | 198,Muhammad I of Shirvan
201 | 199,Muhammad II Shirvanshah
202 | 200,Haytham II of Shirvan
203 | 201,Ali I of Shirvan
204 | 202,Muhammad II of Shirvan
205 | 203,Muhammad III Shirvanshah
206 | 204,Ahmad of Shirvan
207 | 205,Muhammad IV of Shirvan
208 | 206,Muhammad V Shirvanshah
209 | 207,Yazid II of Shirvan
210 | 208,Manuchihr I of Shirvan
211 | 209,Ali II of Shirvan
212 | 210,Qubad of Shirvan
213 | 211,Ali III of Shirvan
214 | 212,Sallar of Shirvan
215 | 213,Fariburz I
216 | 214,Manuchihr II of Shirvan
217 | 215,Afridun I
218 | 216,Manuchihr III of Shirvan
219 | 217,Afridun II
220 | 218,Shahanshah (Shirvanshah)
221 | 219,Fariburz II
222 | 220,Farrukhzad I
223 | 221,Gushtasb I
224 | 222,Fariburz III
225 | 223,Akhsitan II
226 | 224,Farrukhzad II
227 | 225,Akhsitan III
228 | 226,Keykavus I (Shirvanshah)
229 | 227,Kayqubad I of Shirvan
230 | 228,Kavus I
231 | 229,Hushang of Shirvan
232 | 230,Daniyal (Mughal prince)
233 | 231,Lu'lu' al-Kabir
234 | 232,Senekerim-Hovhannes Artsruni
235 | 233,Senekerim-Hovhannes
236 | 234,Parakramabahu Epa of Gampola
237 | 235,Parakramabahu V of Gampola
238 | 236,Varaz-Tiridates I
239 | 237,Varaz Trdat I
240 | 238,Gaumata
241 | 239,Parakramabahu II of Dambadeniya
242 | 240,Parakramabahu III of Dambadeniya
243 | 241,Parakramabahu IV of Dambadeniya
244 |
--------------------------------------------------------------------------------
/src/autofj/benchmark/ShoppingMall/right.csv:
--------------------------------------------------------------------------------
1 | id,title
2 | 0,The Galleria
3 | 1,The Shops at Prudential Center
4 | 2,Fashion Centre at Pentagon City
5 | 3,Sydney Central Plaza
6 | 4,Mahatma Jyotiba Phule Mandai
7 | 5,Disney Springs
8 | 6,Downtown Disney (Walt Disney World Resort)
9 | 7,Bluewater
10 | 8,"The Triangle, Manchester"
11 | 9,The Summit (Birmingham)
12 | 10,Box Hill Central Shopping Centre
13 | 11,Victoria Centre
14 | 12,Irvine Spectrum
15 | 13,Liffey Valley
16 | 14,The Square Tallaght
17 | 15,Asheville Outlets
18 | 16,Metrotown
19 | 17,St. Louis Mills
20 | 18,"Northlake Mall (Charlotte, North Carolina)"
21 | 19,"Grand Central, Birmingham"
22 | 20,Philadelphia Mills
23 | 21,"Touchwood, Solihull"
24 | 22,Atlantic Terminal (Shopping Mall)
25 | 23,Forest Fair Village
26 | 24,Kingston Collection
27 | 25,Knox City Shopping Centre
28 | 26,Hamilton Place
29 | 27,"Langham Place, Hong Kong"
30 | 28,Centrale
31 | 29,Strathpine Centre
32 | 30,Broadmarsh
33 | 31,Rhodes Waterside
34 | 32,The Centrepoint
35 | 33,Wilton Mall at Saratoga
36 | 34,Palm Beach Outlets
37 | 35,Great Northern Mall (New York)
38 | 36,Bankstown Central Shopping Centre
39 | 37,Sunway Putra Mall
40 | 38,HarbourFront Centre (Singapore)
41 | 39,CityPlace (West Palm Beach)
42 | 40,Galleria Shopping Centre (Toronto)
43 | 41,"Westgate, Oxford"
44 | 42,Annapolis Mall
45 | 43,Brentwood Town Centre (mall)
46 | 44,"St David's, Cardiff"
47 | 45,Toombul Shopping Centre
48 | 46,Grand Indonesia
49 | 47,Yorkville Village
50 | 48,Connecticut Post Mall
51 | 49,The Mall at Fox Run
52 | 50,Downtown Commons
53 | 51,Westfield Downtown Plaza
54 | 52,MainPlace Mall
55 | 53,Capital Mall
56 | 54,Fox Valley Mall
57 | 55,Chicago Ridge Mall
58 | 56,Westfield Louis Joliet
59 | 57,Southlake Mall (Indiana)
60 | 58,Franklin Park Mall
61 | 59,Westfield SouthPark
62 | 60,Great Northern Mall (Ohio)
63 | 61,Belden Village Mall
64 | 62,Westfield Solano
65 | 63,Westfield Eastland
66 | 64,Plaza West Covina
67 | 65,Parkway Plaza
68 | 66,Riyadh Sahara Mall
69 | 67,"Crossroads Center (St. Cloud, Minnesota)"
70 | 68,St. Laurent Shopping Centre
71 | 69,Artegon Marketplace
72 | 70,Zona Rosa (Kansas City)
73 | 71,Lutwyche City Shopping Centre
74 | 72,Les Promenades Gatineau
75 | 73,Westgate Entertainment District
76 | 74,Victoria Gardens (Rancho Cucamonga)
77 | 75,Woodburn Premium Outlets
78 | 76,El Con Center
79 | 77,Pacific Epping
80 | 78,Parmatown Mall
81 | 79,Auburn Mall (Massachusetts)
82 | 80,"Auburn Mall (Auburn, Massachusetts)"
83 | 81,Cataraqui Centre
84 | 82,Westfield Royal Victoria Place
85 | 83,Playground Pier
86 | 84,Conestoga Mall
87 | 85,Maple Hill Mall
88 | 86,Roselands Shopping Centre
89 | 87,The Mall at the Source
90 | 88,The Oaks Shopping Center
91 | 89,Santa Rosa Mall
92 | 90,"Crossroads Center (Waterloo, Iowa)"
93 | 91,The Spires Shopping Centre
94 | 92,"Change Alley, Singapore"
95 | 93,Seacon Square Srinakarin
96 | 94,Northfield Square Mall
97 | 95,GreenStreet
98 | 96,Paradise Park (mall)
99 | 97,Colonnades Shopping Centre
100 | 98,The Glen Shopping Centre
101 | 99,North Rocks Shopping Centre
102 | 100,Eastridge Mall (Gastonia)
103 | 101,The Oaks Mall (Florida)
104 | 102,The Outlet Collection at Riverwalk
105 | 103,Jantzen Beach Center
106 | 104,The Quad St. Charles
107 | 105,Galleria Shopping Centre (Perth)
108 | 106,Galleria 220
109 | 107,The Crossings at Northwest
110 | 108,Mall of Acadiana
111 | 109,The Arsenal Project of Watertown
112 | 110,Antioch Crossing
113 | 111,Omni Park
114 | 112,"The Paragon, Singapore"
115 | 113,The Legends at Village West
116 | 114,Champlain Mall
117 | 115,Splendid China Mall
118 | 116,Brunswick Shopping Centre
119 | 117,River Drive Mall
120 | 118,Plaza Mayor (Oklahoma)
121 | 119,Uni Hill Factory Outlets
122 | 120,Warrawong Plaza
123 | 121,Figtree Grove
124 | 122,Pakuranga Plaza
125 | 123,Westfield Downtown
126 | 124,Shoppes at Knollwood
127 | 125,Shangri-La Plaza
128 | 126,Shangri-La Plaza (shopping mall)
129 | 127,Lakeshore Mall
130 | 128,Great Eastern Tower
131 | 129,Hickory Hollow Mall
132 | 130,Carriage Crossing
133 | 131,Victoria Gate
134 | 132,Shelter Cove Towne Centre
135 | 133,"Bishops Corner, West Hartford"
136 | 134,Harbor Square
137 | 135,The Columbia Mall
138 | 136,The Centre on Barton
139 | 137,Centre of Tallahassee
140 | 138,La Encantada (shopping center)
141 | 139,Westfield Gateway
142 | 140,The Shoppes at Gateway
143 | 141,Forest Lake Village Shopping Centre
144 | 142,Megabox (shopping mall)
145 | 143,CastleCourt
146 | 144,Plaza Central (Texas)
147 | 145,City Centre Deira
148 | 146,Paddock Shops
149 | 147,Promenade (shopping centre)
150 | 148,Cortana Mall
151 | 149,Mercury Shopping Centre
152 | 150,Bentley Bridge
153 | 151,Iluma
154 | 152,"City Centre Plaza, Rockhampton"
155 | 153,SuperMall of the Great Northwest
156 | 154,Downtown Summerlin (shopping center)
157 | 155,Hillside Village
158 | 156,Newpark Mall
159 | 157,Springfield Mall (Virginia)
160 | 158,Florence Mall (Alabama)
161 | 159,Ellsworth Place
162 | 160,Taigum Square Shopping Centre
163 | 161,Sta. Lucia East Grandmall
164 | 162,Florence Mall (Kentucky)
165 | 163,SM City Taguig
166 | 164,Karingal Hub Shopping Centre
167 | 165,"Regency Square Mall (Jacksonville, Florida)"
168 | 166,"The Broadway, Bradford"
169 | 167,Motherwell Shopping Centre
170 | 168,Bole Dembel Shopping Center
171 | 169,Worcester Common Outlets
172 | 170,City Mall (Amman)
173 | 171,Winrock Center
174 | 172,"Northway Mall (Colonie, New York)"
175 | 173,"Saratoga Mall (Wilton, New York)"
176 | 174,Chesapeake Square
177 | 175,LuLu International Shopping Mall
178 | 176,Lulu Cochin Mall
179 | 177,City Centre Mirdif
180 | 178,Shaktan Thampuran Private Bus Stand
181 | 179,Star City (shopping mall)
182 | 180,"Auburn Mall (Auburn, Alabama)"
183 | 181,"Village Mall (Auburn, Alabama)"
184 | 182,"The Exchange, Ilford"
185 | 183,MY MALL Limassol
186 | 184,CastleTown Shoppingworld
187 | 185,The Market Common
188 | 186,The Kenwood Collection
189 | 187,The Pavilions
190 | 188,Settlers Green
191 | 189,The Summit (Reno)
192 | 190,The Summit (Wheatfield)
193 | 191,"Domain Central, Townsville"
194 | 192,Granada Centre
195 | 193,The Base (shopping centre)
196 | 194,The Base (Shopping Centre)
197 | 195,Tulsa Promenade
198 | 196,West Manchester Town Center
199 | 197,Lakeview Square Mall
200 | 198,Palladium World
201 | 199,Lavington Square Shopping Centre
202 | 200,City Centre Bahrain
203 | 201,Spinderiet
204 | 202,Fars Shopping Complex
205 | 203,Kukui Grove Center
206 | 204,Riverdale Village
207 | 205,Festival Alabang
208 | 206,Toa Payoh Entertainment Centre
209 | 207,"University Place (Chapel Hill, North Carolina)"
210 | 208,The Shoppes at Sands
211 | 209,Enfield Square Mall
212 | 210,Harbor Point (Subic)
213 | 211,Bass Pro Complex (Dieppe)
214 | 212,Glen Burnie Mall
215 | 213,South City Mall
216 | 214,Military Circle Mall
217 | 215,West 12
218 | 216,West 12 Shepherd's Bush
219 | 217,Brentwood Country Mart
220 | 218,South Point Mall
221 | 219,Cross County Mall
222 | 220,Pacific Werribee
223 | 221,Monroe Crossing
224 | 222,Northwoods Mall (Illinois)
225 | 223,Square 2
226 | 224,Square 2 (shopping mall)
227 | 225,The Shops at Westshore
228 | 226,ViaPort Florida
229 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AutoFJ
2 |
3 | The official code for our SIGMOD 2021 paper: [Auto-FuzzyJoin: Auto-Program Fuzzy Similarity Joins Without Labeled Examples](https://arxiv.org/abs/2103.04489). To reproduce the main results in our paper, switch to the `reproduce` branch.
4 |
5 | AutoFJ automatically produces record pairs that approximately match in two input
6 | tables without requiring explicit human input such as labeled training data. Using AutoFJ,
7 | users only need to provide two input tables, and a desired precision target (say 0.9).
8 | AutoFJ leverages the fact that one of the inputs is a reference table to
9 | automatically program fuzzy-joins that meet the precision target in expectation,
10 | while maximizing fuzzy-join recall (defined as the number of correctly joined records).
11 |
12 | In AutoFJ, the left table refers to a reference table, which is assumed to be almost "duplicate-free". AutoFJ attempts to solve many-to-one join problems, where each record in the right table will be joined with at most one record in the left table, but each record in the left table can be joined with multiple records in the right table.
13 |
14 | AutoFJ also provides a benchmark that contains [50 diverse datasets](https://github.com/chu-data-lab/AutomaticFuzzyJoin/blob/master/src/autofj/50-single-column-datasets.md) for single-column fuzzy-join tasks constructed from [DBPedia](https://www.dbpedia.org).
15 |
16 | ## Installation
17 |
18 | Install the package using pip
19 |
20 | ```
21 | pip install autofj
22 | ```
23 |
24 | ## Usage
25 |
26 | Let `left_table` be the reference table and `right_table` be another input table. The two tables are assumed to have the same schema and have an id column named `id_column`. To join `left_table` and `right_table` with
27 | precision target 0.9, run the following code. The result will be a joined table of record pairs that are identified as matches from two input tables.
28 | ```python
29 | from autofj import AutoFJ
30 | fj = AutoFJ(precision_target=0.9)
31 | result = fj.join(left_table, right_table, id_column)
32 | ```
33 |
34 | To load a benchmark dataset named as `dataset_name`, run the following code. Each dataset contains a left table (reference table), a right table and a ground-truth table of matched record pairs. The id column of each dataset is named as "id" and the column to be joined is named as "title". The names of all benchmark datasets are listed [here](https://github.com/chu-data-lab/AutomaticFuzzyJoin/blob/master/src/autofj/50-single-column-datasets.md).
35 | ```python
36 | from autofj.datasets import load_data
37 | left_table, right_table, gt_table = load_data(dataset_name)
38 | ```
39 | ## Example
40 | Run the following code to join the left and right table of TennisTournament dataset.
41 | ```python
42 | from autofj.datasets import load_data
43 | from autofj import AutoFJ
44 | left_table, right_table, gt_table = load_data("TennisTournament")
45 | fj = AutoFJ(precision_target=0.9)
46 | result = fj.join(left_table, right_table, "id")
47 | ```
48 |
49 | ## Documentation
50 | ```python
51 | class AutoFJ(object):
52 | def __init__(self,
53 | precision_target=0.9,
54 | join_function_space="autofj_sm",
55 | distance_threshold_space=50,
56 | column_weight_space=10,
57 | blocker=None,
58 | n_jobs=-1,
59 | verbose=False):
60 | ```
61 |
62 | ### Parameters
63 | * **precision_target: *float*, default=0.9**
64 | Precision target. The value is taken from 0-1. The default value is 0.9.
65 |
66 | * **join_function_space: *string, dict or list of objects*, default="autofj_sm"**
67 | Space of join functions. There are three ways to define the space of join functions:
68 | 1. Use the name (string) of built-in join function space. There are three
69 | options, including "autofj_lg", "autofj_md" and "autofj_sm" that use
70 | 136, 68 and 14 join functions, respectively. Using fewer join functions
71 | can improve efficiency but may worsen performance.
72 | 2. Use a dict specifying the options for preprocessing methods,
73 | tokenization methods, token weighting methods and distance functions.
74 | The space will be the cartesian product of all options in the dict.
75 | See [options.py](https://github.com/chu-data-lab/AutomaticFuzzyJoin/blob/master/src/autofj/join_function_space/options.py) for defining join functions using
76 | a dict.
77 | 3. Use a list of customized JoinFunction objects. Define a JoinFunction class following the prototype in [join_function.py](https://github.com/chu-data-lab/AutomaticFuzzyJoin/blob/master/src/autofj/join_function_space/join_function/join_function.py).
78 |
79 | * **distance_threshold_space: *int or list of floats*, default=50**
80 | The number of candidate distance thresholds or a list of candidate
81 | distance thresholds in the space. If the number of distance thresholds
82 | (integer) is given, distance thresholds are spaced evenly from 0 to 1.
83 | Otherwise, it should be a list of floats from 0 to 1. Using fewer candidates
84 | can improve efficiency but may worsen performance.
85 |
86 | * **column_weight_space: *int or list of floats*, default=10**
87 | The number of candidate column weights or a list of candidate
88 | column weights in the space. If the number of column weights
89 | (integer) is given, column weights are spaced evenly from 0 to 1.
90 | Otherwise, it should be a list of floats from 0 to 1. Using fewer candidates
91 | can improve efficiency but may worsen performance.
92 |
93 |
94 | * **blocker: *None or a Blocker object*, default None**
95 | A Blocker object that performs blocking on two tables. If None, use
96 | the built-in blocker. To use a customized blocker, define a Blocker class following the prototype in [blocker.py](https://github.com/chu-data-lab/AutomaticFuzzyJoin/blob/master/src/autofj/blocker/blocker.py).
97 |
98 | * **n_jobs : *int*, default=-1**
99 | Number of CPU cores used. -1 means using all processors.
100 |
101 | * **verbose: *bool*, default=False**
102 | Whether to print log messages.
103 |
104 | ### Attributes
105 | * **selected_column_weights: *dict***
106 | The columns and column weights selected by the algorithm. The key is the
107 | column name, the value is the weight selected for the column.
108 |
109 | * **selected_join_configs: *list of tuples***
110 | The union of join configurations selected by the algorithm. Each tuple
111 | (join_function, threshold) in the list is a join configuration that
112 | consists of the name of the join function and its distance threshold.
113 |
114 | ### Methods
115 | ```python
116 | join(left_table, right_table, id_column, on=None)
117 | ```
118 |
119 | Join left table and right table.
120 |
121 | #### Parameters
122 | * **left_table: *pandas.DataFrame***
123 | Reference table. The left table is assumed to be almost duplicate-free, which means it has no or only few duplicates.
124 |
125 | * **right_table: *pandas.DataFrame***
126 | Another input table.
127 |
128 | * **id_column: *string***
129 | The name of id column in the two tables. This column will not be
130 | used to join two tables.
131 |
132 | * **on: *list or None*, default=None**
133 | A list of column names (multi-column fuzzy join) that the two tables
134 | will be joined on. If None, two tables will be joined on all columns
135 | that exist in both tables, excluding the id column.
136 |
137 | #### Return
138 | * ***pandas.DataFrame***
139 | A table of joining pairs. The columns of left table are
140 | suffixed with "_l" and the columns of right table are suffixed
141 | with "_r".
--------------------------------------------------------------------------------
/src/autofj/benchmark/Magazine/right.csv:
--------------------------------------------------------------------------------
1 | id,title
2 | 0,The Raven: Anarchist Quarterly
3 | 1,Next Magazine (Chinese magazine)
4 | 2,Men's Health
5 | 3,Seventeen (American magazine)
6 | 4,Liberty (libertarian magazine)
7 | 5,Q (magazine)
8 | 6,Run (magazine)
9 | 7,Net (magazine)
10 | 8,Foreign Policy (magazine)
11 | 9,This England (magazine)
12 | 10,City Journal (New York City)
13 | 11,L'Obs
14 | 12,Cat Fancy
15 | 13,Answer Me! (magazine)
16 | 14,MikroBitti
17 | 15,Painkiller Magazine
18 | 16,The Australian Women's Weekly
19 | 17,PCWorld (magazine)
20 | 18,The Wipers Times
21 | 19,Chart Attack
22 | 20,Panorama (magazine)
23 | 21,Now (magazine)
24 | 22,Automobile (magazine)
25 | 23,MIT Technology Review
26 | 24,L'Express
27 | 25,Hotdog (magazine)
28 | 26,Storm Track
29 | 27,Next Generation Magazine
30 | 28,Playboy magazine (Brazilian issue)
31 | 29,Perfect 10
32 | 30,O: The Oprah Magazine
33 | 31,Monthly Afternoon
34 | 32,CPC Attack!
35 | 33,Comics Scene (magazine)
36 | 34,X-One
37 | 35,Compute!'s Gazette
38 | 36,Slam (magazine)
39 | 37,Custom PC (magazine)
40 | 38,Owl (magazine)
41 | 39,Back Issue!
42 | 40,Atari Age (magazine)
43 | 41,Sugarscape.com
44 | 42,Sugar Magazine
45 | 43,Railroad Magazine
46 | 44,Relevant (magazine)
47 | 45,Weekly Young Magazine
48 | 46,Dilema Veche
49 | 47,PC User
50 | 48,The New Atlantis
51 | 49,Maayan (magazine)
52 | 50,Armchair General (magazine)
53 | 51,Chip-India
54 | 52,Chronicle of Current Events
55 | 53,Animation Magazine
56 | 54,Gempak Starz
57 | 55,Hero (magazine)
58 | 56,Hero (gay magazine)
59 | 57,ANALOG Computing
60 | 58,Linux For You
61 | 59,NW (magazine)
62 | 60,Ray Li (magazine)
63 | 61,Dazed
64 | 62,AirForces Monthly
65 | 63,True Detective (magazine)
66 | 64,SKY Magazine
67 | 65,PlayStation Magazine (Italy)
68 | 66,Amtix
69 | 67,In-Training (magazine)
70 | 68,America (Jesuit magazine)
71 | 69,"Brick, A Literary Journal"
72 | 70,Monthly Magazine Z
73 | 71,Swimming World
74 | 72,Humanism Ireland
75 | 73,Wholphin (DVD magazine)
76 | 74,Blitz (magazine)
77 | 75,North & South (New Zealand magazine)
78 | 76,Leading Edge (magazine)
79 | 77,Leading Edge (fiction magazine)
80 | 78,Executive Travel
81 | 79,CHIP (magazine)
82 | 80,Bowlers Journal
83 | 81,Out Front (newspaper)
84 | 82,Sporting Life (magazine)
85 | 83,BlackBook (magazine)
86 | 84,Washingtonian (magazine)
87 | 85,CKM (adult magazine)
88 | 86,Your Family Tree Magazine
89 | 87,The New-York Magazine
90 | 88,Backstage (magazine)
91 | 89,Borderline Comics Magazine
92 | 90,CHARGE!
93 | 91,Now (UK magazine)
94 | 92,NOW (British magazine)
95 | 93,Flux Magazine
96 | 94,Flux Magazine (US)
97 | 95,16 Magazine
98 | 96,Inquiry (magazine)
99 | 97,Xbox World 360
100 | 98,YOU (South African magazine)
101 | 99,FACTS (magazine)
102 | 100,Bluff (magazine)
103 | 101,Kvant (magazine)
104 | 102,Samakalika Malayalam Vaarika
105 | 103,Wink (manhwa)
106 | 104,Resurgence
107 | 105,Essentials (magazine)
108 | 106,Storyworks
109 | 107,Tilllate Magazine
110 | 108,The Messenger Magazine
111 | 109,Guernica (magazine)
112 | 110,The Fed (newspaper)
113 | 111,Familia (magazine)
114 | 112,Commodore Power/Play
115 | 113,Renditions (magazine)
116 | 114,House and Home
117 | 115,The Blast (magazine)
118 | 116,Epoch (magazine)
119 | 117,Sheeko magazine
120 | 118,CURSOR
121 | 119,Computing (Urdu magazine)
122 | 120,PlayStation Official Magazine - UK
123 | 121,The Boulevard Magazine
124 | 122,OffBeat
125 | 123,Urban Realm
126 | 124,Thirteen Minutes Magazine
127 | 125,Comic BomBom
128 | 126,The New Times (magazine)
129 | 127,Down East (magazine)
130 | 128,Monthly Asuka Fantasy DX
131 | 129,Due South Magazine
132 | 130,SPORT (magazine)
133 | 131,Safari (magazine)
134 | 132,Abitare
135 | 133,Touchstone (magazine)
136 | 134,Nash Country Weekly
137 | 135,Next City
138 | 136,T: The New York Times Style Magazine
139 | 137,Morbid Curiosity (magazine)
140 | 138,BeE Woman
141 | 139,Hinduism Today Magazine
142 | 140,Fortnight (magazine)
143 | 141,The Caterer
144 | 142,The Caterer (formerly Caterer and Hotelkeeper)
145 | 143,Australian 4WD Monthly
146 | 144,PlayStation Official Magazine - Australia
147 | 145,Car (magazine)
148 | 146,DIY (magazine)
149 | 147,Gulf Coast (magazine)
150 | 148,Lighting & Sound International
151 | 149,Veckorevyn
152 | 150,Light (journal)
153 | 151,Sport (magazine)
154 | 152,Venture Magazine
155 | 153,OC Metro magazine
156 | 154,Revolution magazine
157 | 155,Faithworks Magazine
158 | 156,Sidewalk magazine
159 | 157,Popstar! Magazine
160 | 158,Louisville (magazine)
161 | 159,Rattle (magazine)
162 | 160,ShortList
163 | 161,Icon (lifestyle magazine)
164 | 162,Icon Magazine
165 | 163,Modern Dog (magazine)
166 | 164,OPEN (North Dakota magazine)
167 | 165,Business Matters
168 | 166,Muziekkrant OOR
169 | 167,Boutique Design
170 | 168,EMS World
171 | 169,'47 (magazine)
172 | 170,The Hispanic Outlook in Higher Education
173 | 171,Booking (manhwa)
174 | 172,Rouleur Magazine
175 | 173,PRISM international
176 | 174,Noggin Magazine
177 | 175,Blanco y Negro (magazine)
178 | 176,Ukrainskyi Tyzhden
179 | 177,Avotaynu (magazine)
180 | 178,Success (magazine)
181 | 179,Wave Magazine
182 | 180,FACT (United Kingdom magazine)
183 | 181,Caravan magazine
184 | 182,Kiss (Japanese magazine)
185 | 183,The European (1953 magazine)
186 | 184,Maxim India (magazine)
187 | 185,The Zamboni
188 | 186,Max (German magazine)
189 | 187,Oxonian Review
190 | 188,Huntin' Fool Magazine
191 | 189,FHM India
192 | 190,The Fountain (magazine)
193 | 191,Full Circle Magazine
194 | 192,The Middle East in London
195 | 193,WildTomato
196 | 194,Classical Music (magazine)
197 | 195,Environment and Rights
198 | 196,Junk Jet
199 | 197,Online (magazine)
200 | 198,Ciak
201 | 199,Cincinnati (magazine)
202 | 200,Forza Milan!
203 | 201,REM (magazine)
204 | 202,Gariyoshi
205 | 203,Uralsky Sledopyt
206 | 204,Frank Leslie's Weekly
207 | 205,Chorus (magazine)
208 | 206,You (Japanese magazine)
209 | 207,Professional Pilot
210 | 208,Tank Magazine
211 | 209,National Wildlife
212 | 210,Suspense Magazine
213 | 211,Top Gear (Indian magazine)
214 | 212,Disco 45
215 | 213,Grafik Magazine
216 | 214,Irish America (magazine)
217 | 215,OPEN (magazine)
218 | 216,Proto (magazine)
219 | 217,Huck (magazine)
220 | 218,Minerva (archaeology magazine)
221 | 219,Port Folio (magazine)
222 | 220,Square Mile (magazine)
223 | 221,Aeroplane (magazine)
224 | 222,Canadian Immigrant
225 | 223,Art Collector (magazine)
226 | 224,Fenuxe Magazine
227 | 225,Sisterhood Magazine
228 | 226,Black Sea Security
229 | 227,Delayed Gratification magazine
230 | 228,"Studia theologica, Czech Republic"
231 | 229,Bass Musician
232 | 230,Nightshift (Oxford Music Magazine)
233 | 231,Chief Investment Officer Magazine
234 | 232,SoGlos
235 | 233,Manga Action
236 | 234,The Family Friend (magazine)
237 | 235,Antiques info magazine
238 | 236,In Out
239 | 237,WORD Magazine
240 | 238,Women in Music (periodical)
241 | 239,Illustrated Rhodesia Life
242 | 240,Royal flush magazine
243 | 241,The Trades
244 | 242,Science Reporter
245 | 243,Exame
246 | 244,Glass Mountain (magazine)
247 | 245,Musica e dischi
248 | 246,She Kicks
249 | 247,Climbing Magazine
250 | 248,Weekly Shonen Jump Alpha
251 | 249,Natural Home & Garden
252 | 250,Sportsnet Magazine
253 | 251,The Connoisseur (magazine)
254 | 252,Women with Vision!
255 | 253,Explore (magazine)
256 | 254,Zest (magazine)
257 | 255,4-Wheel & Off-Road (magazine)
258 | 256,FACTA (magazine)
259 | 257,Teenage Survival Handbook
260 | 258,Stone Soup Magazine
261 | 259,Literary Club bulgarian
262 | 260,Orange Coast Magazine
263 | 261,Respect. (magazine)
264 | 262,Shokun!
265 | 263,Today's Trucking
266 | 264,SPUR (Australian newspaper)
267 | 265,Das Gedicht
268 | 266,Film Magazine (magazine)
269 | 267,Chalachithram
270 | 268,Rock Australia Magazine
271 | 269,Ideas and Discoveries
272 | 270,Web Techniques
273 | 271,C California Style Magazine
274 | 272,Contemporary Review (Chinese magazine)
275 | 273,Dhanam (business magazine)
276 |
--------------------------------------------------------------------------------
/src/autofj/join_function_space/join_function/distance_function.py:
--------------------------------------------------------------------------------
1 | """Compute distance"""
2 | import editdistance
3 | import jellyfish
4 | import collections
5 | from collections import Counter
6 | import time
7 | import numpy as np
8 | import pandas as pd
9 | import spacy
10 |
11 |
12 | """Distance Functions"""
def jaccardDistance(x, y, w=None):
    """Return the (optionally weighted) Jaccard distance between two token collections.

    x, y: collections of hashable tokens; duplicates are ignored.
    w: optional mapping from token to weight. When None, every token counts 1.
    """
    tokens_x, tokens_y = set(x), set(y)
    shared = tokens_x.intersection(tokens_y)
    combined = tokens_x.union(tokens_y)
    if w is not None:
        shared_mass = sum([w[t] for t in shared])
        combined_mass = sum([w[t] for t in combined])
    else:
        shared_mass = len(shared)
        combined_mass = len(combined)
    # The 1e-9 term keeps the division defined when both inputs are empty.
    return 1 - shared_mass / (combined_mass + 1e-9)
24 |
def cosineDistance(x, y, w=None):
    """Return the cosine distance between the token-count vectors of x and y.

    x, y: collections of hashable tokens; repeated tokens raise their count.
    w: optional mapping from token to weight; each count is scaled by its
       token's weight. When None, raw counts are used.
    """
    counts_x, counts_y = Counter(x), Counter(y)
    tokens_x, tokens_y = set(x), set(y)
    shared = tokens_x.intersection(tokens_y)

    if w is not None:
        dot = sum([w[t] * counts_x[t] * w[t] * counts_y[t] for t in shared])
        norm_x = np.sqrt(sum([(w[t] * counts_x[t]) ** 2 for t in tokens_x]))
        norm_y = np.sqrt(sum([(w[t] * counts_y[t]) ** 2 for t in tokens_y]))
    else:
        dot = sum([counts_x[t] * counts_y[t] for t in shared])
        norm_x = np.sqrt(sum([counts_x[t] ** 2 for t in tokens_x]))
        norm_y = np.sqrt(sum([counts_y[t] ** 2 for t in tokens_y]))

    # The 1e-9 term keeps the division defined when either vector is empty.
    return 1 - dot / (norm_x * norm_y + 1e-9)
41 |
def diceDistance(x, y, w=None):
    """Return the (optionally weighted) Dice distance between two token collections.

    x, y: collections of hashable tokens; duplicates are ignored.
    w: optional mapping from token to weight. When None, every token counts 1.
    """
    tokens_x, tokens_y = set(x), set(y)
    shared = tokens_x.intersection(tokens_y)
    combined = tokens_x.union(tokens_y)
    if w is not None:
        shared_mass = sum([w[t] for t in shared])
        combined_mass = sum([w[t] for t in combined])
    else:
        shared_mass = len(shared)
        combined_mass = len(combined)
    # intersection + union equals the summed sizes of the two sets.
    similarity = 2 * shared_mass / (shared_mass + combined_mass + 1e-9)
    return 1 - similarity
53 |
def maxincDistance(x, y, w=None):
    """Return the max-inclusion distance between two token collections.

    The shared token mass is divided by the mass of the smaller side, so a
    collection fully contained in the other yields a distance close to 0.

    x, y: collections of hashable tokens; duplicates are ignored.
    w: optional mapping from token to weight. When None, every token counts 1.
    """
    tokens_x, tokens_y = set(x), set(y)
    shared = tokens_x.intersection(tokens_y)
    if w is None:
        shared_mass = len(shared)
        mass_x = len(tokens_x)
        mass_y = len(tokens_y)
    else:
        shared_mass = sum([w[t] for t in shared])
        mass_x = sum([w[t] for t in tokens_x])
        mass_y = sum([w[t] for t in tokens_y])
    smaller_mass = min(mass_x, mass_y)
    # The 1e-9 term keeps the division defined when one side is empty.
    return 1 - (shared_mass / (smaller_mass + 1e-9))
70 |
def intersectDistance(x, y, w=None):
    """Return 1 - |x & y| / (|x & y| + |x | y|) for two token collections.

    x, y: collections of hashable tokens; duplicates are ignored.
    w: optional mapping from token to weight. When None, every token counts 1.
    """
    tokens_x, tokens_y = set(x), set(y)
    shared = tokens_x.intersection(tokens_y)
    combined = tokens_x.union(tokens_y)
    if w is not None:
        shared_mass = sum([w[t] for t in shared])
        combined_mass = sum([w[t] for t in combined])
    else:
        shared_mass = len(shared)
        combined_mass = len(combined)
    # The 1e-9 term keeps the division defined when both inputs are empty.
    return 1 - shared_mass / (shared_mass + combined_mass + 1e-9)
82 |
def isContain(x, y):
    """Return True when the smaller token set is a subset of the larger one.

    x, y: collections of hashable tokens; duplicates are ignored. When both
    sets have the same size, x is tested as the subset of y.
    """
    tokens_x, tokens_y = set(x), set(y)
    if len(tokens_x) > len(tokens_y):
        smaller, larger = tokens_y, tokens_x
    else:
        smaller, larger = tokens_x, tokens_y
    return smaller.issubset(larger)
91 |
def containCosineDistance(x, y, w=None):
    """Return the cosine distance when one token set contains the other, else 1.

    x, y: collections of hashable tokens.
    w: optional mapping from token to weight, forwarded to cosineDistance.
    """
    if not isContain(x, y):
        return 1
    return cosineDistance(x, y, w)
97 |
def containJaccardDistance(x, y, w=None):
    """Return the Jaccard distance when one token set contains the other, else 1.

    x, y: collections of hashable tokens.
    w: optional mapping from token to weight, forwarded to jaccardDistance.
    """
    if not isContain(x, y):
        return 1
    return jaccardDistance(x, y, w)
103 |
def containDiceDistance(x, y, w=None):
    """Return the Dice distance when one token set contains the other, else 1.

    x, y: collections of hashable tokens.
    w: optional mapping from token to weight, forwarded to diceDistance.
    """
    if not isContain(x, y):
        return 1
    return diceDistance(x, y, w)
109 |
def editDistance(x, y):
    """Return the edit distance between x and y as computed by editdistance.eval."""
    return editdistance.eval(x, y)
113 |
def jaroDistance(x, y):
    """Return 1 minus the Jaro-Winkler similarity of x and y."""
    similarity = jellyfish.jaro_winkler_similarity(x, y)
    return 1 - similarity
117 |
def embedDistance(x, y, embedding):
    """Return 1 minus the similarity of the embedded forms of x and y.

    embedding: callable applied to each value; the object it returns for x
    must expose similarity(other), which is called with the object for y.
    """
    embedded_x, embedded_y = embedding(x), embedding(y)
    return 1 - embedded_x.similarity(embedded_y)
123 |
class DistanceFunction(object):
    """Distance function

    Parameters
    ----------
    method: string
        Method of computing distance. The available methods are listed as
        follows.
        Set-based distance
        - jaccardDistance
        - cosineDistance
        - diceDistance
        - maxincDistance
        - intersectDistance
        - containCosineDistance
        - containJaccardDistance
        - containDiceDistance
        Char-based distance
        - editDistance
        - jaroDistance
        Embedding-based distance
        - embedDistance (loads the spaCy "en_core_web_lg" model)

    Raises
    ------
    ValueError
        If method is not one of the names listed above.
    """
    def __init__(self, method):
        # The lookup table lives inside __init__ so the distance functions
        # are resolved when an instance is created, not at class definition.
        available = {
            "jaccardDistance": jaccardDistance,
            "cosineDistance": cosineDistance,
            "diceDistance": diceDistance,
            "maxincDistance": maxincDistance,
            "intersectDistance": intersectDistance,
            "editDistance": editDistance,
            "jaroDistance": jaroDistance,
            "containCosineDistance": containCosineDistance,
            "containJaccardDistance": containJaccardDistance,
            "containDiceDistance": containDiceDistance,
            "embedDistance": embedDistance,
        }
        self.method = method
        if method not in available:
            raise ValueError("{} is an invalid distance function"
                             .format(method))
        self.func = available[method]
        if method == "embedDistance":
            # Only the embedding-based distance needs the language model.
            self.embedding = spacy.load("en_core_web_lg")

    def compute_distance(self, LR, weight=None):
        """Compute distance score between tuple pairs

        Parameters:
        ----------
        LR: pd.DataFrame
            A table of tuple pairs. The columns of left and right values are
            named as "value_l" and "value_r". For char-based distance the type
            of values are string. For set-based distance the type of values are
            token set.

        weight: dict, default=None
            Weighting schema. If none, uniform weight or no weight is used.
            Ignored by embedDistance, which always receives the embedding
            model as its third argument.

        Return:
        -------
        distance: pd.Series
            distance between tuple pairs for each row
        """
        if len(LR) == 0:
            # DataFrame.apply(axis=1) on a frame without rows may hand back a
            # DataFrame; return the documented Series type explicitly.
            return pd.Series([], index=LR.index, dtype=float)

        if self.method == "embedDistance":
            # The third positional slot of embedDistance is the embedding,
            # never the token weights.
            extra_args = (self.embedding,)
        elif weight is None:
            extra_args = ()
        else:
            extra_args = (weight,)

        func = self.func
        distance = LR.apply(
            lambda row: func(row.value_l, row.value_r, *extra_args), axis=1)
        return distance
202 |
203 | # data = pd.read_csv("../../data/left.csv")["title"]
204 | # X = np.concatenate([data.values for _ in range(20)])
205 | # X = pd.Series(X)
206 | #
207 | # L = X
208 | # R = X.sample(frac=1)
209 | #
210 | # from tokenizer import Tokenizer
211 | # tokenizer = Tokenizer("splitBySpace")
212 | # L = tokenizer.tokenize(L)
213 | # R = tokenizer.tokenize(R)
214 | # LR = pd.DataFrame({"value_l":L, "value_r":R})
215 | #
216 | # tic = time.time()
217 | # methods = ["jaccardDistance", "maxincDistance", "containCosineDistance"]
218 | # distance_function = DistanceFunction("jaccardDistance")
219 | # distance_function.compute_distance(LR)
220 | # distance_function = DistanceFunction("maxincDistance")
221 | # distance_function.compute_distance(LR)
222 | # distance_function = DistanceFunction("containCosineDistance")
223 | # distance_function.compute_distance(LR)
224 | # print(time.time() - tic)
225 |
--------------------------------------------------------------------------------
/src/autofj.egg-info/SOURCES.txt:
--------------------------------------------------------------------------------
1 | MANIFEST.in
2 | README.md
3 | pyproject.toml
4 | setup.py
5 | src/autofj/__init__.py
6 | src/autofj/autofj.py
7 | src/autofj/datasets.py
8 | src/autofj/negative_rule.py
9 | src/autofj/utils.py
10 | src/autofj.egg-info/PKG-INFO
11 | src/autofj.egg-info/SOURCES.txt
12 | src/autofj.egg-info/dependency_links.txt
13 | src/autofj.egg-info/requires.txt
14 | src/autofj.egg-info/top_level.txt
15 | src/autofj/benchmark/.DS_Store
16 | src/autofj/benchmark/Amphibian/gt.csv
17 | src/autofj/benchmark/Amphibian/left.csv
18 | src/autofj/benchmark/Amphibian/right.csv
19 | src/autofj/benchmark/ArtificialSatellite/gt.csv
20 | src/autofj/benchmark/ArtificialSatellite/left.csv
21 | src/autofj/benchmark/ArtificialSatellite/right.csv
22 | src/autofj/benchmark/Artwork/gt.csv
23 | src/autofj/benchmark/Artwork/left.csv
24 | src/autofj/benchmark/Artwork/right.csv
25 | src/autofj/benchmark/Award/gt.csv
26 | src/autofj/benchmark/Award/left.csv
27 | src/autofj/benchmark/Award/right.csv
28 | src/autofj/benchmark/BasketballTeam/gt.csv
29 | src/autofj/benchmark/BasketballTeam/left.csv
30 | src/autofj/benchmark/BasketballTeam/right.csv
31 | src/autofj/benchmark/Case/gt.csv
32 | src/autofj/benchmark/Case/left.csv
33 | src/autofj/benchmark/Case/right.csv
34 | src/autofj/benchmark/ChristianBishop/gt.csv
35 | src/autofj/benchmark/ChristianBishop/left.csv
36 | src/autofj/benchmark/ChristianBishop/right.csv
37 | src/autofj/benchmark/ClericalAdministrativeRegion/gt.csv
38 | src/autofj/benchmark/ClericalAdministrativeRegion/left.csv
39 | src/autofj/benchmark/ClericalAdministrativeRegion/right.csv
40 | src/autofj/benchmark/Country/gt.csv
41 | src/autofj/benchmark/Country/left.csv
42 | src/autofj/benchmark/Country/right.csv
43 | src/autofj/benchmark/Device/gt.csv
44 | src/autofj/benchmark/Device/left.csv
45 | src/autofj/benchmark/Device/right.csv
46 | src/autofj/benchmark/Drug/gt.csv
47 | src/autofj/benchmark/Drug/left.csv
48 | src/autofj/benchmark/Drug/right.csv
49 | src/autofj/benchmark/Election/gt.csv
50 | src/autofj/benchmark/Election/left.csv
51 | src/autofj/benchmark/Election/right.csv
52 | src/autofj/benchmark/Enzyme/gt.csv
53 | src/autofj/benchmark/Enzyme/left.csv
54 | src/autofj/benchmark/Enzyme/right.csv
55 | src/autofj/benchmark/EthnicGroup/gt.csv
56 | src/autofj/benchmark/EthnicGroup/left.csv
57 | src/autofj/benchmark/EthnicGroup/right.csv
58 | src/autofj/benchmark/FootballLeagueSeason/gt.csv
59 | src/autofj/benchmark/FootballLeagueSeason/left.csv
60 | src/autofj/benchmark/FootballLeagueSeason/right.csv
61 | src/autofj/benchmark/FootballMatch/gt.csv
62 | src/autofj/benchmark/FootballMatch/left.csv
63 | src/autofj/benchmark/FootballMatch/right.csv
64 | src/autofj/benchmark/Galaxy/gt.csv
65 | src/autofj/benchmark/Galaxy/left.csv
66 | src/autofj/benchmark/Galaxy/right.csv
67 | src/autofj/benchmark/GivenName/gt.csv
68 | src/autofj/benchmark/GivenName/left.csv
69 | src/autofj/benchmark/GivenName/right.csv
70 | src/autofj/benchmark/GovernmentAgency/gt.csv
71 | src/autofj/benchmark/GovernmentAgency/left.csv
72 | src/autofj/benchmark/GovernmentAgency/right.csv
73 | src/autofj/benchmark/HistoricBuilding/gt.csv
74 | src/autofj/benchmark/HistoricBuilding/left.csv
75 | src/autofj/benchmark/HistoricBuilding/right.csv
76 | src/autofj/benchmark/Hospital/gt.csv
77 | src/autofj/benchmark/Hospital/left.csv
78 | src/autofj/benchmark/Hospital/right.csv
79 | src/autofj/benchmark/Legislature/gt.csv
80 | src/autofj/benchmark/Legislature/left.csv
81 | src/autofj/benchmark/Legislature/right.csv
82 | src/autofj/benchmark/Magazine/gt.csv
83 | src/autofj/benchmark/Magazine/left.csv
84 | src/autofj/benchmark/Magazine/right.csv
85 | src/autofj/benchmark/MemberOfParliament/gt.csv
86 | src/autofj/benchmark/MemberOfParliament/left.csv
87 | src/autofj/benchmark/MemberOfParliament/right.csv
88 | src/autofj/benchmark/Monarch/gt.csv
89 | src/autofj/benchmark/Monarch/left.csv
90 | src/autofj/benchmark/Monarch/right.csv
91 | src/autofj/benchmark/MotorsportSeason/gt.csv
92 | src/autofj/benchmark/MotorsportSeason/left.csv
93 | src/autofj/benchmark/MotorsportSeason/right.csv
94 | src/autofj/benchmark/Museum/gt.csv
95 | src/autofj/benchmark/Museum/left.csv
96 | src/autofj/benchmark/Museum/right.csv
97 | src/autofj/benchmark/NCAATeamSeason/gt.csv
98 | src/autofj/benchmark/NCAATeamSeason/left.csv
99 | src/autofj/benchmark/NCAATeamSeason/right.csv
100 | src/autofj/benchmark/NationalFootballLeagueSeason/gt.csv
101 | src/autofj/benchmark/NationalFootballLeagueSeason/left.csv
102 | src/autofj/benchmark/NationalFootballLeagueSeason/right.csv
103 | src/autofj/benchmark/NaturalEvent/gt.csv
104 | src/autofj/benchmark/NaturalEvent/left.csv
105 | src/autofj/benchmark/NaturalEvent/right.csv
106 | src/autofj/benchmark/Noble/gt.csv
107 | src/autofj/benchmark/Noble/left.csv
108 | src/autofj/benchmark/Noble/right.csv
109 | src/autofj/benchmark/PoliticalParty/gt.csv
110 | src/autofj/benchmark/PoliticalParty/left.csv
111 | src/autofj/benchmark/PoliticalParty/right.csv
112 | src/autofj/benchmark/Race/gt.csv
113 | src/autofj/benchmark/Race/left.csv
114 | src/autofj/benchmark/Race/right.csv
115 | src/autofj/benchmark/RailwayLine/gt.csv
116 | src/autofj/benchmark/RailwayLine/left.csv
117 | src/autofj/benchmark/RailwayLine/right.csv
118 | src/autofj/benchmark/Reptile/gt.csv
119 | src/autofj/benchmark/Reptile/left.csv
120 | src/autofj/benchmark/Reptile/right.csv
121 | src/autofj/benchmark/RugbyLeague/gt.csv
122 | src/autofj/benchmark/RugbyLeague/left.csv
123 | src/autofj/benchmark/RugbyLeague/right.csv
124 | src/autofj/benchmark/ShoppingMall/gt.csv
125 | src/autofj/benchmark/ShoppingMall/left.csv
126 | src/autofj/benchmark/ShoppingMall/right.csv
127 | src/autofj/benchmark/SoccerClubSeason/gt.csv
128 | src/autofj/benchmark/SoccerClubSeason/left.csv
129 | src/autofj/benchmark/SoccerClubSeason/right.csv
130 | src/autofj/benchmark/SoccerLeague/gt.csv
131 | src/autofj/benchmark/SoccerLeague/left.csv
132 | src/autofj/benchmark/SoccerLeague/right.csv
133 | src/autofj/benchmark/SoccerTournament/gt.csv
134 | src/autofj/benchmark/SoccerTournament/left.csv
135 | src/autofj/benchmark/SoccerTournament/right.csv
136 | src/autofj/benchmark/Song/gt.csv
137 | src/autofj/benchmark/Song/left.csv
138 | src/autofj/benchmark/Song/right.csv
139 | src/autofj/benchmark/SportFacility/gt.csv
140 | src/autofj/benchmark/SportFacility/left.csv
141 | src/autofj/benchmark/SportFacility/right.csv
142 | src/autofj/benchmark/SportsLeague/gt.csv
143 | src/autofj/benchmark/SportsLeague/left.csv
144 | src/autofj/benchmark/SportsLeague/right.csv
145 | src/autofj/benchmark/Stadium/gt.csv
146 | src/autofj/benchmark/Stadium/left.csv
147 | src/autofj/benchmark/Stadium/right.csv
148 | src/autofj/benchmark/TelevisionStation/gt.csv
149 | src/autofj/benchmark/TelevisionStation/left.csv
150 | src/autofj/benchmark/TelevisionStation/right.csv
151 | src/autofj/benchmark/TennisTournament/gt.csv
152 | src/autofj/benchmark/TennisTournament/left.csv
153 | src/autofj/benchmark/TennisTournament/right.csv
154 | src/autofj/benchmark/Tournament/gt.csv
155 | src/autofj/benchmark/Tournament/left.csv
156 | src/autofj/benchmark/Tournament/right.csv
157 | src/autofj/benchmark/UnitOfWork/gt.csv
158 | src/autofj/benchmark/UnitOfWork/left.csv
159 | src/autofj/benchmark/UnitOfWork/right.csv
160 | src/autofj/benchmark/Venue/gt.csv
161 | src/autofj/benchmark/Venue/left.csv
162 | src/autofj/benchmark/Venue/right.csv
163 | src/autofj/benchmark/Wrestler/gt.csv
164 | src/autofj/benchmark/Wrestler/left.csv
165 | src/autofj/benchmark/Wrestler/right.csv
166 | src/autofj/blocker/__init__.py
167 | src/autofj/blocker/autofj_blocker.py
168 | src/autofj/blocker/blocker.py
169 | src/autofj/join_function_space/__init__.py
170 | src/autofj/join_function_space/autofj_join_function_space.py
171 | src/autofj/join_function_space/options.py
172 | src/autofj/join_function_space/join_function/__init__.py
173 | src/autofj/join_function_space/join_function/autofj_join_function.py
174 | src/autofj/join_function_space/join_function/distance_function.py
175 | src/autofj/join_function_space/join_function/join_function.py
176 | src/autofj/join_function_space/join_function/preprocessor.py
177 | src/autofj/join_function_space/join_function/token_weight.py
178 | src/autofj/join_function_space/join_function/tokenizer.py
179 | src/autofj/optimizer/__init__.py
180 | src/autofj/optimizer/autofj_multi_column_greedy_algorithm.py
181 | src/autofj/optimizer/autofj_single_column_greedy_algorithm.py
--------------------------------------------------------------------------------
/src/autofj.egg-info/PKG-INFO:
--------------------------------------------------------------------------------
1 | Metadata-Version: 2.1
2 | Name: autofj
3 | Version: 0.0.6
4 | Summary: Auto-Program Fuzzy Similarity Joins Without Labeled Examples
5 | Home-page: https://github.com/chu-data-lab/AutomaticFuzzyJoin
6 | Author: Peng Li
7 | Author-email: lipengpublic@gmail.com
8 | License: UNKNOWN
9 | Platform: UNKNOWN
10 | Classifier: Programming Language :: Python :: 3
11 | Classifier: License :: OSI Approved :: MIT License
12 | Classifier: Operating System :: OS Independent
13 | Requires-Python: >=3.7
14 | Description-Content-Type: text/markdown
15 |
16 | # AutoFJ
17 |
18 | The official code for our SIGMOD 2021 paper: [Auto-FuzzyJoin: Auto-Program Fuzzy Similarity Joins Without Labeled Examples](https://arxiv.org/abs/2103.04489). To reproduce the main results in our paper, switch to `reproduce` branch.
19 |
20 | AutoFJ automatically produces record pairs that approximately match in two input
21 | tables without requiring explicit human input such as labeled training data. Using AutoFJ,
22 | users only need to provide two input tables, and a desired precision target (say 0.9).
23 | AutoFJ leverages the fact that one of the inputs is a reference table to
24 | automatically program fuzzy-joins that meet the precision target in expectation,
25 | while maximizing fuzzy-join recall (defined as the number of correctly joined records).
26 |
27 | In AutoFJ, the left table is the reference table, which is assumed to be almost "duplicate-free". AutoFJ attempts to solve many-to-one join problems, where each record in the right table will be joined with at most one record in the left table, but each record in the left table can be joined with multiple records in the right table.
28 |
29 | AutoFJ also provides a benchmark that contains [50 diverse datasets](https://github.com/chu-data-lab/AutomaticFuzzyJoin/blob/master/src/autofj/50-single-column-datasets.md) for single-column fuzzy-join tasks constructed from [DBPedia](https://www.dbpedia.org).
30 |
31 | ## Installation
32 |
33 | Install the package using pip
34 |
35 | ```
36 | pip install autofj
37 | ```
38 |
39 | ## Usage
40 |
41 | Let `left_table` be the reference table and `right_table` be another input table. The two tables are assumed to have the same schema and have an id column named `id_column`. To join `left_table` and `right_table` with
42 | precision target 0.9, run the following code. The result will be a joined table of record pairs that are identified as matches from the two input tables.
43 | ```python
44 | from autofj import AutoFJ
45 | fj = AutoFJ(precision_target=0.9)
46 | result = fj.join(left_table, right_table, id_column)
47 | ```
48 |
49 | To load a benchmark dataset named `dataset_name`, run the following code. Each dataset contains a left table (reference table), a right table and a ground-truth table of matched record pairs. The id column of each dataset is named "id" and the column to be joined is named "title". The names of all benchmark datasets are listed [here](https://github.com/chu-data-lab/AutomaticFuzzyJoin/blob/master/src/autofj/50-single-column-datasets.md).
50 | ```python
51 | from autofj.datasets import load_data
52 | left_table, right_table, gt_table = load_data(dataset_name)
53 | ```
54 | ## Example
55 | Run the following code to join the left and right tables of the TennisTournament dataset.
56 | ```python
57 | from autofj.datasets import load_data
58 | from autofj import AutoFJ
59 | left_table, right_table, gt_table = load_data("TennisTournament")
60 | fj = AutoFJ(precision_target=0.9)
61 | result = fj.join(left_table, right_table, "id")
62 | ```
63 |
64 | ## Documentation
65 | ```python
66 | class AutoFJ(object):
67 | def __init__(self,
68 | precision_target=0.9,
69 | join_function_space="autofj_sm",
70 | distance_threshold_space=50,
71 | column_weight_space=10,
72 | blocker=None,
73 | n_jobs=-1,
74 | verbose=False):
75 | ```
76 |
77 | ### Parameters
78 | * **precision_target: *float*, default=0.9**
79 | Precision target. The value must be between 0 and 1. The default value is 0.9.
80 |
81 | * **join_function_space: *string, dict or list of objects*, default="autofj_sm"**
82 | Space of join functions. There are three ways to define the space of join functions:
83 | 1. Use the name (string) of built-in join function space. There are three
84 | options, including "autofj_lg", "autofj_md" and "autofj_sm" that use
85 | 136, 68 and 14 join functions, respectively. Using fewer join functions
86 | can improve efficiency but may worsen performance.
87 | 2. Use a dict specifying the options for preprocessing methods,
88 | tokenization methods, token weighting methods and distance functions.
89 | The space will be the cartesian product of all options in the dict.
90 | See [options.py](https://github.com/chu-data-lab/AutomaticFuzzyJoin/blob/master/src/autofj/join_function_space/options.py) for defining join functions using
91 | a dict.
92 | 3. Use a list of customized JoinFunction objects. Define a JoinFunction class using the prototype in [join_function.py](https://github.com/chu-data-lab/AutomaticFuzzyJoin/blob/master/src/autofj/join_function_space/join_function/join_function.py).
93 |
94 | * **distance_threshold_space: *int or list of floats*, default=50**
95 | The number of candidate distance thresholds or a list of candidate
96 | distance thresholds in the space. If the number of distance thresholds
97 | (integer) is given, distance thresholds are spaced evenly from 0 to 1.
98 | Otherwise, it should be a list of floats from 0 to 1. Using fewer candidates
99 | can improve efficiency but may worsen performance.
100 |
101 | * **column_weight_space: *int or list of floats*, default=10**
102 | The number of candidate column weights or a list of candidate
103 | column weights in the space. If the number of column weights
104 | (integer) is given, column weights are spaced evenly from 0 to 1.
105 | Otherwise, it should be a list of floats from 0 to 1. Using fewer candidates
106 | can improve efficiency but may worsen performance.
107 |
108 |
109 | * **blocker: *None or a Blocker object*, default None**
110 | A Blocker object that performs blocking on two tables. If None, use
111 | the built-in blocker. To use a customized blocker, define a Blocker class using the prototype in [blocker.py](https://github.com/chu-data-lab/AutomaticFuzzyJoin/blob/master/src/autofj/blocker/blocker.py).
112 |
113 | * **n_jobs : *int*, default=-1**
114 | Number of CPU cores used. -1 means using all processors.
115 |
116 | * **verbose: *bool*, default=False**
117 | Whether to print logging messages.
118 |
119 | ### Attributes
120 | * **selected_column_weights: *dict***
121 | The columns and column weights selected by the algorithm. The key is the
122 | column name, the value is the weight selected for the column.
123 |
124 | * **selected_join_configs: *list of tuples***
125 | The union of join configurations selected by the algorithm. Each tuple
126 | (join_function, threshold) in the list is a join configuration that
127 | consists of the name of the join function and its distance threshold.
128 |
129 | ### Methods
130 | ```python
131 | join(left_table, right_table, id_column, on=None)
132 | ```
133 |
134 | Join left table and right table.
135 |
136 | #### Parameters
137 | * **left_table: *pandas.DataFrame***
138 | Reference table. The left table is assumed to be almost duplicate-free, which means it has no or only few duplicates.
139 |
140 | * **right_table: *pandas.DataFrame***
141 | Another input table.
142 |
143 | * **id_column: *string***
144 | The name of the id column in the two tables. This column will not be
145 | used to join two tables.
146 |
147 | * **on: *list or None*, default=None**
148 | A list of column names (multi-column fuzzy join) that the two tables
149 | will be joined on. If None, two tables will be joined on all columns
150 | that exist in both tables, excluding the id column.
151 |
152 | #### Return
153 | * ***pandas.DataFrame***
154 | A table of joined pairs. The columns of the left table are
155 | suffixed with "_l" and the columns of the right table are suffixed
156 | with "_r".
157 |
158 |
--------------------------------------------------------------------------------
/src/autofj/benchmark/SoccerLeague/right.csv:
--------------------------------------------------------------------------------
1 | id,title
2 | 0,National League (English football)
3 | 1,USL W-League
4 | 2,J1 League
5 | 3,Gibraltar Premier Division
6 | 4,Persian Gulf Pro League
7 | 5,Southern Counties East Football League
8 | 6,Kent League
9 | 7,National League North
10 | 8,National League South
11 | 9,Wessex League
12 | 10,National League (division)
13 | 11,Conference Premier
14 | 12,United Counties Football League
15 | 13,Southern Combination Football League
16 | 14,Professional Indoor Football League (1998)
17 | 15,Premier Development League
18 | 16,LigaPro
19 | 17,Liga de Honra
20 | 18,West Cheshire Amateur Football League
21 | 19,K-League
22 | 20,Croatian First Football League
23 | 21,Belgian Fourth Division
24 | 22,Croatian Second Football League
25 | 23,Swedish football Division 2
26 | 24,Norwegian First Division
27 | 25,Japan Women's Football League
28 | 26,Division 1 (Swedish football)
29 | 27,Swedish football Division 1
30 | 28,National Premier Leagues NSW
31 | 29,National Premier Leagues Victoria
32 | 30,Championnat de France Amateur
33 | 31,Russian Football National League
34 | 32,Football Championship of the National League
35 | 33,Russian Professional Football League
36 | 34,Czech First League
37 | 35,Gambrinus Liga
38 | 36,Croatian Prva HMNL
39 | 37,Championnat de France Amateur 2
40 | 38,"Brighton, Worthing & District Football League"
41 | 39,Cheshire Football League
42 | 40,Gloucestershire County League
43 | 41,Somerset County Football League
44 | 42,Northamptonshire Football Combination
45 | 43,Thames Valley Premier Football League
46 | 44,West Yorkshire League
47 | 45,First Capital Plus Premier League
48 | 46,Midland Football League (Scotland)
49 | 47,West of Scotland Super League Premier Division
50 | 48,West of Scotland Super League First Division
51 | 49,Capital League 1
52 | 50,Capital 1 League
53 | 51,NIFL Premiership
54 | 52,NAIA Men's Soccer Championship
55 | 53,Macedonian First Football League
56 | 54,Umaglesi Liga
57 | 55,Georgian Premier League
58 | 56,National Premier Leagues South Australia
59 | 57,FFSA Premier League
60 | 58,Bristol Downs Football League
61 | 59,Subroto Cup Football Tournament
62 | 60,Bristol and District League
63 | 61,NAIA Women's Soccer Championship
64 | 62,Ligat Nashim
65 | 63,Dorset Senior League
66 | 64,Plymouth and West Devon Football League
67 | 65,Plymouth and West Devon Combination
68 | 66,Doncaster & District Senior League
69 | 67,Bristol and Avon League
70 | 68,South Yorkshire Amateur League
71 | 69,I Liga (Slovakia)
72 | 70,Bath and North Somerset District Football League
73 | 71,Bath and District League
74 | 72,Andover and District Saturday Football League
75 | 73,Cheltenham League
76 | 74,Craven and District League
77 | 75,North Gloucestershire League
78 | 76,Stroud and District League
79 | 77,Taunton & District Saturday League
80 | 78,Weston super Mare and District League
81 | 79,Slovak Third League
82 | 80,Calcutta Football League
83 | 81,Kingdom Caledonian Amateur Football Association
84 | 82,Liga Portuguesa de Futebol Profissional
85 | 83,Portuguese Handball Super Cup
86 | 84,Czech National Football League
87 | 85,Cape Verdean Football Championship
88 | 86,V.League 1
89 | 87,V-League (Vietnam)
90 | 88,Oman Professional League
91 | 89,Rwanda National Football League
92 | 90,Rwandan Premier League
93 | 91,United Indoor Football League
94 | 92,SVB Hoofdklasse
95 | 93,Ligue 1 Mauritania
96 | 94,Thimphu League
97 | 95,Bangladesh Football Premier League
98 | 96,Bangladesh League
99 | 97,National Premier Leagues Northern NSW
100 | 98,Somali First Division
101 | 99,Somali League
102 | 100,Provo Premier League
103 | 101,MFL League
104 | 102,Saint Kitts Premier Division
105 | 103,SKNFA Super League
106 | 104,AFA Senior Male League
107 | 105,Anguillian League
108 | 106,Cayman Islands Premier League
109 | 107,Guam Soccer League
110 | 108,FFAS Senior League
111 | 109,Port Vila Football League
112 | 110,TVL League
113 | 111,Tahiti Ligue 1
114 | 112,International rules series
115 | 113,St Helens Combination
116 | 114,Swedish football Division 1 Norra
117 | 115,Southern Championship
118 | 116,Northern Championship
119 | 117,Vale of Clwyd and Conwy Football League
120 | 118,British Virgin Islands Championship
121 | 119,Norwegian Second Division
122 | 120,National Premier Leagues NSW 2
123 | 121,National Premier Leagues NSW 3
124 | 122,NSW State League Division 1
125 | 123,Northern Mariana Championship
126 | 124,Northern NSW State League Division 1
127 | 125,Vodacom League
128 | 126,K3 League
129 | 127,K3 Challengers League
130 | 128,National Premier Leagues Capital Football
131 | 129,National Premier Leagues ACT
132 | 130,Wilson Cup
133 | 131,Hampshire Premier Football League
134 | 132,II liiga
135 | 133,F.League
136 | 134,III liiga
137 | 135,Alberton Football Netball League
138 | 136,IV liiga
139 | 137,Swiss 1. Liga (football)
140 | 138,The Football League
141 | 139,Macedonian Second Football League
142 | 140,2. Oberliga West
143 | 141,K league
144 | 142,National Youth League (Australia)
145 | 143,Football West State League Division 1
146 | 144,Surrey Elite Intermediate League
147 | 145,Macedonian Third Football League
148 | 146,Mexican Primera División
149 | 147,Major Arena Soccer League
150 | 148,Yarra Valley Mountain District Football and Netball League
151 | 149,Oman Super Cup
152 | 150,Ballymena & Provincial Football League
153 | 151,Scottish Women's First Division
154 | 152,R League
155 | 153,Rugby League Conference Welsh Premier
156 | 154,Scottish National League (rugby league)
157 | 155,Rugby League Conference Scotland Division
158 | 156,NSW State League
159 | 157,NSW State League Division 2
160 | 158,WK League
161 | 159,National Premier Leagues Western Australia
162 | 160,SPFL Development League
163 | 161,Scottish Premier under-19 League
164 | 162,SPFL U20 League
165 | 163,Zone League One
166 | 164,Zone League Two
167 | 165,Zone League Three
168 | 166,League 1 (rugby league)
169 | 167,Tehran Province League
170 | 168,Tehran Province league
171 | 169,Iraq Division One
172 | 170,Liga de Ascenso
173 | 171,Belgian Futsal Division 1
174 | 172,FAM League
175 | 173,Azerbaijan Futsal Premier League
176 | 174,V.League 2
177 | 175,Israeli Noar Leumit League
178 | 176,Czech Futsal First League
179 | 177,Georgian Futsal Super League
180 | 178,Rugby League Conference South West Division
181 | 179,Russian women's football championship
182 | 180,Vietnamese National Football Second League
183 | 181,Afghan National League
184 | 182,FA WSL 1
185 | 183,J2 League
186 | 184,North Wales Championship
187 | 185,Ekstraliga (women's football)
188 | 186,Romanian Superliga (women's football)
189 | 187,Welsh Premier League (women)
190 | 188,Bosnian women's football championship
191 | 189,OK League
192 | 190,United Soccer League
193 | 191,Campeonato Nacional de Futebol de Praia
194 | 192,Slovenian Regional League
195 | 193,SVB Eerste Klasse
196 | 194,Cypriot Futsal First Division
197 | 195,Serbian Prva Futsal Liga
198 | 196,Pertiwi Cup
199 | 197,Indonesian Womens Football Tournament
200 | 198,West Cork League Premier Division
201 | 199,Slovenian under-19 League
202 | 200,Norwegian Third Division
203 | 201,UAE Arabian Gulf League
204 | 202,UAE Pro-League
205 | 203,Rugby League Conference East Division
206 | 204,Kyrgyzstan Women's Championship
207 | 205,I-League U18
208 | 206,I-League U19
209 | 207,Serbian Zone League
210 | 208,Albanian Women's National Championship
211 | 209,5. Liga (Slovakia)
212 | 210,Slovak Fifth League
213 | 211,Great Yarmouth and District League
214 | 212,Norwich and District Business Houses League
215 | 213,Delhi Senior Division
216 | 214,Korea Semi-Professional Football League
217 | 215,Bromley and District League
218 | 216,AFL Europe Championship
219 | 217,Professional Indoor Football League
220 | 218,Qatargas League
221 | 219,Qatari 2nd Division
222 | 220,Professional Football League (Algeria)
223 | 221,WPSL Elite League
224 | 222,East Entry League
225 | 223,Kenyan Regional Leagues
226 | 224,Kenyan Provincial Leagues
227 | 225,Kenyan County Leagues
228 | 226,Kenyan District Leagues
229 | 227,Phthiotis Football Clubs Association
230 | 228,Corinthia Football Clubs Association
231 | 229,Maldivian Second Division Football Tournament
232 | 230,Oberliga Mittelrhein
233 | 231,National Premier Leagues Tasmania
234 | 232,Oman Professional League Cup
235 | 233,FAO Football League
236 | 234,A1 Ethniki Women's Water Polo
237 | 235,National Premier Leagues Queensland
238 | 236,Maldivian Third Division Football Tournament
239 | 237,J3 League
240 |
--------------------------------------------------------------------------------
/src/autofj/benchmark/Legislature/right.csv:
--------------------------------------------------------------------------------
1 | id,title
2 | 0,Sejm
3 | 1,House of Representatives of the Netherlands
4 | 2,Parliament of Sweden
5 | 3,Diet of Japan
6 | 4,Storting
7 | 5,States General of the Netherlands
8 | 6,National Assembly of the Republic of China
9 | 7,House of Councillors (Japan)
10 | 8,House of Representatives of Japan
11 | 9,National Council of Austria
12 | 10,Federal Council of Austria
13 | 11,Secretariat of the Communist Party of China
14 | 12,National Assembly of France
15 | 13,National Council of Switzerland
16 | 14,Council of States of Switzerland
17 | 15,New Zealand House of Representatives
18 | 16,Croatian Parliament
19 | 17,Indian Parliament
20 | 18,Congress of Mexico
21 | 19,Chamber of Deputies of Mexico
22 | 20,National Assembly of Kuwait
23 | 21,The Folketing
24 | 22,Senate of Mexico
25 | 23,Senate of Poland
26 | 24,National Council of the Slovak Republic
27 | 25,National Assembly of Venezuela
28 | 26,National Assembly of the Republic of Poland
29 | 27,Parliament of Wallonia
30 | 28,Senate of France
31 | 29,National Assembly of Hungary
32 | 30,Senate of the Netherlands
33 | 31,National Assembly of South Korea
34 | 32,National Assembly of Serbia
35 | 33,National Assembly of Panama
36 | 34,Parliament of Fiji
37 | 35,National Assembly of Pakistan
38 | 36,Italian Senate
39 | 37,"City Council of Cincinnati, Ohio elections"
40 | 38,General Assembly of Nova Scotia
41 | 39,Parliament of France
42 | 40,Belgian Senate
43 | 41,Assembly of the Republic of Portugal
44 | 42,Italian Chamber of Deputies
45 | 43,Senate of Romania
46 | 44,Chamber of Deputies of Romania
47 | 45,Federal Assembly of Russia
48 | 46,Federation Council of Russia
49 | 47,25th Alberta Legislature
50 | 48,Alberta Legislature
51 | 49,Parliament of Italy
52 | 50,Arizona Legislature
53 | 51,House of Representatives of Trinidad and Tobago
54 | 52,Senate of Trinidad and Tobago
55 | 53,Landtag of the Free State of Saxony
56 | 54,Central Commission for Discipline Inspection
57 | 55,Greater Chennai Corporation
58 | 56,National Congress of Bolivia
59 | 57,Chamber of Deputies of Brazil
60 | 58,National Congress of Argentina
61 | 59,National Assembly of Nicaragua
62 | 60,National Council of Monaco
63 | 61,House of Elders
64 | 62,Federal Assembly of Switzerland
65 | 63,National Assembly of Nigeria
66 | 64,People's Assembly of Egypt
67 | 65,General Council (Andorra)
68 | 66,National Assembly of Bulgaria
69 | 67,National Assembly of Belize
70 | 68,Parliament of Austria
71 | 69,House of Representatives of Bosnia and Herzegovina
72 | 70,National Assembly of Burundi
73 | 71,Senate of Burundi
74 | 72,Chamber of Deputies of Tunisia
75 | 73,Urban Council
76 | 74,Urban Council (Hong Kong)
77 | 75,National Assembly of Djibouti
78 | 76,Chamber of Deputies of Luxembourg
79 | 77,Chamber of Deputies of Rwanda
80 | 78,House of Representatives of Belize
81 | 79,Majlis of the Maldives
82 | 80,House of Representatives of Liberia
83 | 81,Senate of Liberia
84 | 82,National Assembly of Bahrain
85 | 83,Council of Representatives of Bahrain
86 | 84,Consultative Council of Bahrain
87 | 85,Senate of the Democratic Republic of the Congo
88 | 86,National Parliament of East Timor
89 | 87,National Assembly of Botswana
90 | 88,National Assembly of Cape Verde
91 | 89,National Assembly of Namibia
92 | 90,Assembly of the Republic of Mozambique
93 | 91,House of Assembly of Zimbabwe
94 | 92,National Council of Namibia
95 | 93,National Assembly of Niger
96 | 94,National Assembly of the Central African Republic
97 | 95,National Assembly of Zambia
98 | 96,National Assembly of Seychelles
99 | 97,National Assembly of Guinea
100 | 98,National Assembly of Lesotho
101 | 99,Supreme Council (Transnistria)
102 | 100,Chamber of Deputies (Equatorial Guinea)
103 | 101,National Assembly of Togo
104 | 102,National People's Assembly of Guinea-Bissau
105 | 103,House of Representatives of Nigeria
106 | 104,National Assembly of Mauritania
107 | 105,National Legislature of Sudan
108 | 106,Zanzibar House of Representatives
109 | 107,House of Representatives of Zanzibar
110 | 108,House of Representatives of Antigua and Barbuda
111 | 109,National Assembly of Burkina Faso
112 | 110,National Assembly of Mali
113 | 111,National Assembly of Kenya
114 | 112,Belgian Chamber of Representatives
115 | 113,National Assembly of Suriname
116 | 114,National Assembly of Afghanistan
117 | 115,National Assembly of Angola
118 | 116,National Assembly of Armenia
119 | 117,National Assembly of Azerbaijan
120 | 118,Senate of Thailand
121 | 119,House of Representatives of Thailand
122 | 120,National Assembly of the Republika Srpska
123 | 121,People's Assembly (Republika Srpska)
124 | 122,National Assembly of Vietnam
125 | 123,National Assembly of Benin
126 | 124,Senate of Belize
127 | 125,Senate of Antigua and Barbuda
128 | 126,National Assembly of Côte d'Ivoire
129 | 127,Assembly of Representatives of Yemen
130 | 128,Supreme Assembly of Uzbekistan
131 | 129,National Assembly of Tanzania
132 | 130,National Assembly of Cameroon
133 | 131,Senate of Cambodia
134 | 132,National Assembly of Cambodia
135 | 133,National Assembly of Chad
136 | 134,National Assembly of Eritrea
137 | 135,House of Representatives of Cyprus
138 | 136,National Assembly of Guyana
139 | 137,House of Peoples of Bosnia and Herzegovina
140 | 138,Chamber of Deputies of Haiti
141 | 139,National Assembly of Laos
142 | 140,House of Assembly of Kiribati
143 | 141,National Assembly of Malawi
144 | 142,National Assembly of Mauritius
145 | 143,National Parliament of the Solomon Islands
146 | 144,National Assembly of Saint Kitts and Nevis
147 | 145,Assembly of the Republic of Macedonia
148 | 146,Supreme Assembly of Tajikistan
149 | 147,Assembly of Representatives of Morocco
150 | 148,Parliament of Mauritania
151 | 149,Supreme Council of Kyrgyzstan
152 | 150,Council of States of Sudan
153 | 151,National Assembly of Sudan
154 | 152,Peterborough City Council
155 | 153,Senate of Brazil
156 | 154,National Assembly of the Democratic Republic of the Congo
157 | 155,Senate of Haiti
158 | 156,State Council of Crimea
159 | 157,Länderkammer
160 | 158,Imperial Legislative Council
161 | 159,Prussian House of Lords
162 | 160,Iraqi Kurdistan Parliament
163 | 161,Charlotte City Council
164 | 162,Madrid Assembly
165 | 163,General Assembly of Prince Edward Island
166 | 164,National Assembly of Thailand
167 | 165,KwaZulu–Natal Legislature
168 | 166,New Brunswick Legislature
169 | 167,National Council of Bhutan
170 | 168,National Assembly of Bhutan
171 | 169,Penang Island City Council
172 | 170,27th Alberta Legislature
173 | 171,Wigan Metropolitan Borough Council
174 | 172,Senate of Ceylon
175 | 173,Borough of Poole
176 | 174,Chamber of Councillors of Tunisia
177 | 175,House of Representatives of Ceylon
178 | 176,Landtag of the Saarland
179 | 177,City of Bradford Metropolitan District Council
180 | 178,"Assembly of the Community of Municipalities, Autonomous Province of Kosovo and Metohija"
181 | 179,Assembly of the Community of Serbian municipalities
182 | 180,Ards and North Down Borough Council
183 | 181,Antrim and Newtownabbey Borough Council
184 | 182,Mid and East Antrim Borough Council
185 | 183,General Assembly of Newfoundland and Labrador
186 | 184,State Assembly of the Republic of Bashkortostan
187 | 185,House of Commons of South Korea
188 | 186,Senate of South Korea
189 | 187,Legislative Assembly of the Autonomous Region of the Azores
190 | 188,Cumberland County Council
191 | 189,Current members of the Bolivian Plurinational Legislative Assembly
192 | 190,Senate of Kenya
193 | 191,City of York Council
194 | 192,Soviet of the Republic
195 | 193,National Assembly of Ecuador
196 | 194,Pyidaungsu Hluttaw
197 | 195,Amyotha Hluttaw
198 | 196,House of Representatives (Myanmar)
199 | 197,Pyithu Hluttaw
200 | 198,Orissa Legislative Assembly
201 | 199,Legislative Assembly of Odisha
202 | 200,Parliament of the Ottoman Empire
203 | 201,National Legislature of South Sudan
204 | 202,National Legislative Assembly of South Sudan
205 | 203,Council of States of South Sudan
206 | 204,Parliament of British Columbia
207 | 205,Chavakacheri Urban Council
208 | 206,Coimbatore Municipal Corporation
209 | 207,Chavakacheri Divisional Council
210 | 208,Saskatchewan Legislature
211 | 209,Egyptian Constituent Assembly of 2012
212 | 210,Constituent Assembly of Portugal
213 | 211,List of constituencies of Gujarat Legislative Assembly
214 | 212,General Junta of the Principality of Asturias
215 | 213,Diet of Galicia and Lodomeria
216 | 214,Legislative Assembly of Emilia-Romagna
217 | 215,History of the National Assembly of Pakistan
218 |
--------------------------------------------------------------------------------
/src/autofj/benchmark/BasketballTeam/gt.csv:
--------------------------------------------------------------------------------
1 | id_l,title_l,id_r,title_r
2 | 8,New Orleans Hornets,0,New Orleans Pelicans
3 | 30,Charlotte Bobcats,1,Charlotte Hornets
4 | 58,Polonia Warbud Warszawa,2,Polonia Warszawa (basketball)
5 | 59,Asseco Prokom Gdynia,3,Asseco Gdynia
6 | 96,Melbourne Tigers,4,Melbourne United
7 | 99,Liaoning Dinosaurs,5,Liaoning Flying Leopards
8 | 103,Shandong Lions,6,Shandong Golden Stars
9 | 106,Fujian Xunxing,7,Fujian Sturgeons
10 | 117,P.A.O.K. B.C.,8,P.A.O.K. BC
11 | 120,Ulriken Eagles,9,Ulriken Elite
12 | 121,Mens Sana Basket,10,Mens Sana 1871 Basket
13 | 124,Tulsa 66ers,11,Oklahoma City Blue
14 | 125,Austin Toros,12,Austin Spurs
15 | 130,KK Union Olimpija,13,KK Olimpija
16 | 137,Surrey Heat,14,Surrey Scorchers
17 | 137,Surrey Heat,15,Guildford Heat
18 | 137,Surrey Heat,16,Surrey United (basketball)
19 | 139,Dongguan Leopards,17,Shenzhen Leopards
20 | 150,Pallacanestro Treviso,18,Treviso Basket
21 | 150,Pallacanestro Treviso,19,Universo Treviso Basket
22 | 152,FC Barcelona-Institut Guttmann,20,UNES FC Barcelona
23 | 163,Canberra Capitals,21,University of Canberra Capitals
24 | 167,OceanaGold Nuggets,22,Otago Nuggets
25 | 181,Torpan Pojat,23,Helsinki Seagulls
26 | 181,Torpan Pojat,24,BC Torpan Pojat
27 | 183,London Lions (basketball),25,Milton Keynes Lions
28 | 196,Brooklyn Kings,26,Brooklyn Kings (basketball)
29 | 199,Bilbao Basket,27,CB Bilbao Berri
30 | 202,Euroins Cherno More,28,BC Cherno More Port Varna
31 | 225,KK Vojvodina,29,KK Vojvodina Srbijagas
32 | 242,Dumbarton Dodgers Basketball Club,30,Dumbarton Dodgers
33 | 251,F.C. Porto (basketball),31,FC Porto (basketball)
34 | 253,Oliveirense Basquetebol,32,U.D. Oliveirense (basketball)
35 | 255,C.A. Queluz,33,CA Queluz
36 | 259,Neckar Riesen Ludwigsburg,34,MHP Riesen Ludwigsburg
37 | 259,Neckar Riesen Ludwigsburg,35,EnBW Ludwigsburg
38 | 262,TBB Trier,36,Gladiators Trier
39 | 266,Basket Livorno,37,Pallacanestro Don Bosco Livorno
40 | 279,PBC Ural Great Perm,38,PBC Ural Great
41 | 281,Troon Tornadoes,39,Ayrshire Tornadoes
42 | 295,Bree BBC,40,Bree B.B.C.
43 | 301,Utah Flash,41,Delaware 87ers
44 | 317,Taiwan Mobile Clouded Leopards,42,Fubon Braves
45 | 317,Taiwan Mobile Clouded Leopards,43,Fubon Braves Basketball Team
46 | 319,Wisconsin Flyers,44,Omaha Racers
47 | 324,Pentland Tigers Basketball Club,45,Edinburgh Tigers
48 | 338,PBC Lokomotiv Kuban,46,PBC Lokomotiv-Kuban
49 | 344,Bristol Academy Flyers,47,Bristol Flyers
50 | 348,London Capital,48,PAWS London Capital
51 | 349,Trabzonspor Basketball,49,Trabzonspor B.K.
52 | 350,Olympias Patras B.C.,50,Olympiada Patras BC
53 | 352,Sporting B.C.,51,Sporting BC
54 | 357,Galatasaray Medical Park (men's basketball),52,Galatasaray S.K. (men's basketball)
55 | 366,Cheshire Phoenix,53,Cheshire Jets
56 | 367,Cherkaski Mavpy,54,BC Cherkasy
57 | 368,KK Mega Vizura,55,KK Mega Basket
58 | 368,KK Mega Vizura,56,KK Mega Leks
59 | 371,Nuova AMG Sebastiani Basket Napoli,57,Nuova AMG Sebastiani Basket Rieti
60 | 374,BC Enisey,58,BC Yenisey Krasnoyarsk
61 | 375,BC Krka,59,KK Krka
62 | 376,Roseto Sharks,60,Roseto Basket
63 | 388,Societa Veroli Basket,61,Veroli Basket
64 | 389,Guerino Vanoli Basket,62,Gruppo Triboldi Basket
65 | 390,A.E.L. 1964 B.C.,63,AEL 1964 B.C.
66 | 393,Bnei Herzliya,64,Bnei HaSharon
67 | 401,Star of the Sea (basketball),65,Belfast Star
68 | 403,Bulleen Boomers,66,Melbourne Boomers
69 | 406,BC Triumph Lyubertsy,67,B.C. Zenit Saint Petersburg
70 | 407,Allianz Swans Gmunden,68,Swans Gmunden
71 | 411,GasTerra Flames,69,Donar (basketball club)
72 | 413,Rethymno Aegean B.C.,70,Rethymno Cretan Kings B.C.
73 | 418,Egaleo B.C.,71,Egaleo BC
74 | 438,CB Atapuerca,72,CB Tizona
75 | 452,South West Slammers,73,Bunbury Slammers
76 | 467,Ilysiakos B.C.,74,Ilissiakos B.C.
77 | 468,M.E.N.T. B.C.,75,MENT B.C.
78 | 471,Amyntas Dafnis B.C.,76,Dafni BC
79 | 472,Xanthi BC,77,Xanthi B.C.
80 | 473,Paleo Faliro B.C.,78,Athlitikos Omilos Paleou Falirou BC
81 | 474,Chalkida BC,79,AGEH Gymnastikos B.C.
82 | 475,Kavala B.C.,80,Union Kavala B.C.
83 | 477,ICBS BC,81,Peramatos Ermis B.C.
84 | 477,ICBS BC,82,ICBS B.C.
85 | 481,Ionikos Lamias B.C.,83,Ionikos Lamias BC
86 | 482,Pagrati B.C.,84,AO Pagrati BC
87 | 485,Toros de Los Dos Laredos,85,Toros de Nuevo Laredo
88 | 490,Trikala B.C.,86,Trikala 2000 B.C.
89 | 490,Trikala B.C.,87,A.S. Trikala 2000 BC
90 | 495,Irakleio B.C.,88,Iraklio BC
91 | 501,BC Strumica 2005,89,KK Millenium Strumica
92 | 502,Incheon ET Land Elephants,90,Incheon Electroland Elephants
93 | 505,Goyang Orions,91,Goyang Orion Orions
94 | 506,Gigantes de Carolina (basketball),92,Gigantes de Carolina (men's basketball)
95 | 508,SCAA Basketball,93,South China AA (basketball)
96 | 513,Galatasaray SK (women's basketball),94,Galatasaray S.K. (women's basketball)
97 | 513,Galatasaray SK (women's basketball),95,Galatasaray Medical Park (women's basketball)
98 | 517,Habik'a B.C.,96,Elitzur Givat Shmuel
99 | 519,Medway Park Crusaders,97,Kent Crusaders (basketball)
100 | 520,Ionikos N.F. B.C.,98,AS Ionikos Neas Filadelfeias BC
101 | 521,Ionikos Nikaias B.C.,99,Ionikos Nikaias BC
102 | 526,BC Politekhnika-Halychyna,100,Polytekhnika-Halychyna Lviv
103 | 527,Galatasaray SK (wheelchair basketball),101,Galatasaray S.K. (wheelchair basketball)
104 | 527,Galatasaray SK (wheelchair basketball),102,Galatasaray Wheelchair Basketball Team
105 | 535,BK Valmiera,103,SK Valmiera
106 | 553,BC Rakvere Tarvas,104,Rakvere Tarvas
107 | 558,Al-Ahly (Benghazi) Men's Basketball Team,105,Al-Ahli Benghazi (basketball club)
108 | 563,B.C. Partizani Tirana,106,BC Partizani Tirana
109 | 567,Gymnastikos S. Larissas,107,Gymnastikos S. Larissas B.C.
110 | 593,BC Budivelnyk,108,BC Budivelnik
111 | 602,Bintulu Rainbow BC,109,Bintulu Eagles B.C.
112 | 602,Bintulu Rainbow BC,110,Bintulu Rainbow B.C.
113 | 603,Perak YSL Farmcochem BC,111,Perak Farmcochem B.C.
114 | 605,CS Otopeni (basketball),112,CS Otopeni (Basketball)
115 | 606,CS Energia Rovinari,113,CS Energia
116 | 614,Hapoel Afula,114,Hapoel Afula B.C.
117 | 615,Satria Muda BritAma Jakarta,115,Satria Muda Pertamina Jakarta
118 | 617,Chang Thailand Slammers,116,Hi-Tech Bangkok City
119 | 617,Chang Thailand Slammers,117,Sports Rev Thailand Slammers
120 | 626,Maccabi Raanana,118,Maccabi Ra'anana
121 | 632,Logan Thunder,119,Logan Thunder (WNBL)
122 | 639,Aramex (Jordan),120,Al Riyadi Amman
123 | 642,Ezzahra Sport Rades,121,Ezzahra Sports
124 | 644,Barak Netanya,122,Barak Netanya B.C.
125 | 657,Sony Athinaikos Athens,123,Athinaikos women's basketball
126 | 658,Ikaros Kallitheas B.C.,124,Ikaros Chalkidas B.C.
127 | 670,BBC Bayreuth,125,Medi Bayreuth
128 | 672,Rosa Radom,126,RosaSport Radom
129 | 675,Optima Gent,127,Gent Hawks
130 | 677,Halcones UV Xalapa,128,Halcones de Xalapa
131 | 688,Dell Aspac Jakarta,129,Aspac Jakarta
132 | 689,Leeds Carnegie (basketball),130,Leeds Force
133 | 690,Pelita Jaya Esia,131,Pelita Jaya Energi Mega Persada
134 | 702,Nuvo CLS Knights,132,CLS Knights Surabaya
135 | 704,Yongin Samsung Life Blue Minx,133,Yongin Samsung Blueminx
136 | 704,Yongin Samsung Life Blue Minx,134,Yongin Samsung Life Bichumi
137 | 713,Rapla KK,135,Piimameister Otto/Rapla
138 | 713,Rapla KK,136,TYCO Rapla
139 | 715,Muba Hangtuah Indonesia Muda Sumatera Selatan,137,Muba Hangtuah Sumatera Selatan
140 | 717,Black Water Elite,138,Blackwater Sports
141 | 717,Black Water Elite,139,Black Water Sports
142 | 722,Juventus (basketball club),140,BC Juventus
143 | 723,BC Prienai,141,BC Rūdupis
144 | 724,Palanga Triobet,142,BC Palanga
145 | 724,Palanga Triobet,143,BC Naglis
146 | 754,Cuxhaven Bascats,144,Cuxhaven BasCats
147 | 762,Oshawa Power,145,Mississauga Power
148 | 771,ADB Pas,146,AB Pas
149 | 795,BC Tsmoki-Minsk,147,BC Minsk-2006
150 | 800,Jalaa FC,148,Jalaa SC (men's basketball)
151 | 811,Porta XI CBF,149,Porta XI Ensino CBF
152 | 812,UNIQA Euroleasing Sopron,150,UNIQA Sopron
153 | 816,Al Rayyan Basketball Team,151,Al Rayan SC Basketball Team
154 | 824,Leuven Bears,152,Stella Artois Leuven Bears
155 | 826,Barsy Atyrau,153,BC Barsy Atyrau
156 | 830,Al-Ittihad Alexandria,154,El Ittihad Alexandria (basketball)
157 | 830,Al-Ittihad Alexandria,155,Al Ittihad Alexandria (basketball)
158 | 832,Zamalek (basketball club),156,Zamalek (basketball)
159 | 840,Boracay Rum Waves,157,Tanduay Light Rhum Masters
160 | 871,Kuwait SC (basketball),158,Al Kuwait SC (basketball)
161 | 875,Stade Olympique Maritime Boulonnais,159,SOMB Boulogne-sur-Mer
162 | 876,KK Brod,160,KK Slavonski Brod
163 | 916,Club Baloncesto Ciudad de Algeciras,161,CB Ciudad de Algeciras
164 | 918,Pacific Caesar,162,Pacific Caesar Surabaya
165 | 923,Primeiro de Agosto Basketball,163,C.D. Primeiro de Agosto (basketball)
166 | 925,Gruppo Sportivo FIAT,164,G.S. FIAT
167 | 927,Brampton A's,165,Orangeville A's
168 |
--------------------------------------------------------------------------------
/src/autofj/benchmark/ShoppingMall/gt.csv:
--------------------------------------------------------------------------------
1 | id_l,title_l,id_r,title_r
2 | 0,The Galleria (Houston),0,The Galleria
3 | 1,Shops at Prudential Center,1,The Shops at Prudential Center
4 | 2,The Fashion Centre at Pentagon City,2,Fashion Centre at Pentagon City
5 | 3,Westfield Sydney Central Plaza,3,Sydney Central Plaza
6 | 6,Downtown Disney (Walt Disney World),5,Disney Springs
7 | 6,Downtown Disney (Walt Disney World),6,Downtown Disney (Walt Disney World Resort)
8 | 7,Bluewater (shopping centre),7,Bluewater
9 | 9,"The Summit (Birmingham, Alabama)",9,The Summit (Birmingham)
10 | 10,Centro Box Hill,10,Box Hill Central Shopping Centre
11 | 12,Irvine Spectrum Center,12,Irvine Spectrum
12 | 13,Liffey Valley Shopping Centre,13,Liffey Valley
13 | 16,Metropolis at Metrotown,16,Metrotown
14 | 17,St. Louis Outlet Mall,17,St. Louis Mills
15 | 18,Northlake Mall (Charlotte),18,"Northlake Mall (Charlotte, North Carolina)"
16 | 21,Touchwood,21,"Touchwood, Solihull"
17 | 22,Atlantic Terminal (shopping mall),22,Atlantic Terminal (Shopping Mall)
18 | 26,Hamilton Place (shopping mall),26,Hamilton Place
19 | 27,Langham Place (Hong Kong),27,"Langham Place, Hong Kong"
20 | 28,Centrale (Croydon),28,Centrale
21 | 30,Broadmarsh (shopping centre),30,Broadmarsh
22 | 31,Rhodes Shopping Centre,31,Rhodes Waterside
23 | 32,The CentrePoint,32,The Centrepoint
24 | 33,Wilton Mall,33,Wilton Mall at Saratoga
25 | 34,Palm Beach Mall,34,Palm Beach Outlets
26 | 35,Great Northern Mall,35,Great Northern Mall (New York)
27 | 36,Centro Bankstown,36,Bankstown Central Shopping Centre
28 | 38,HarbourFront Centre,38,HarbourFront Centre (Singapore)
29 | 39,CityPlace,39,CityPlace (West Palm Beach)
30 | 40,Galleria Shopping Centre,40,Galleria Shopping Centre (Toronto)
31 | 41,"Westgate Shopping Centre, Oxford",41,"Westgate, Oxford"
32 | 42,Westfield Annapolis,42,Annapolis Mall
33 | 43,Brentwood Town Centre,43,Brentwood Town Centre (mall)
34 | 44,St. David's (Cardiff),44,"St David's, Cardiff"
35 | 45,Centro Toombul,45,Toombul Shopping Centre
36 | 46,Grand Indonesia Shopping Town,46,Grand Indonesia
37 | 48,Westfield Connecticut Post,48,Connecticut Post Mall
38 | 49,Fox Run Mall,49,The Mall at Fox Run
39 | 51,Downtown Plaza (Sacramento),50,Downtown Commons
40 | 51,Downtown Plaza (Sacramento),51,Westfield Downtown Plaza
41 | 52,Westfield MainPlace,52,MainPlace Mall
42 | 54,Westfield Fox Valley,54,Fox Valley Mall
43 | 55,Westfield Chicago Ridge,55,Chicago Ridge Mall
44 | 56,Louis Joliet Mall,56,Westfield Louis Joliet
45 | 58,Westfield Franklin Park,58,Franklin Park Mall
46 | 61,Westfield Belden Village,61,Belden Village Mall
47 | 62,Solano Town Center,62,Westfield Solano
48 | 64,Westfield West Covina,64,Plaza West Covina
49 | 66,Sahara Mall (Riyadh),66,Riyadh Sahara Mall
50 | 67,Crossroads Center,67,"Crossroads Center (St. Cloud, Minnesota)"
51 | 68,St. Laurent Centre,68,St. Laurent Shopping Centre
52 | 70,"Zona Rosa (Kansas City, Missouri)",70,Zona Rosa (Kansas City)
53 | 71,Centro Lutwyche,71,Lutwyche City Shopping Centre
54 | 72,Les Promenades de l'Outaouais,72,Les Promenades Gatineau
55 | 76,El Con Mall,76,El Con Center
56 | 77,Epping Plaza,77,Pacific Epping
57 | 79,Auburn Mall,79,Auburn Mall (Massachusetts)
58 | 79,Auburn Mall,80,"Auburn Mall (Auburn, Massachusetts)"
59 | 81,Cataraqui Town Centre,81,Cataraqui Centre
60 | 82,Royal Victoria Place,82,Westfield Royal Victoria Place
61 | 84,"Conestoga Mall (Waterloo, Ontario)",84,Conestoga Mall
62 | 85,Maple Hill Pavilion,85,Maple Hill Mall
63 | 86,Centro Roselands,86,Roselands Shopping Centre
64 | 87,Mall at The Source,87,The Mall at the Source
65 | 89,Santa Rosa Mall (Florida),89,Santa Rosa Mall
66 | 90,"Crossroads Center (Waterloo, IA)",90,"Crossroads Center (Waterloo, Iowa)"
67 | 91,Spires Shopping Centre,91,The Spires Shopping Centre
68 | 92,Change Alley (Singapore),92,"Change Alley, Singapore"
69 | 93,Seacon Square,93,Seacon Square Srinakarin
70 | 94,Northfield Square,94,Northfield Square Mall
71 | 96,Paradise Park (Mall),96,Paradise Park (mall)
72 | 97,Centro Colonnades,97,Colonnades Shopping Centre
73 | 98,Centro The Glen,98,The Glen Shopping Centre
74 | 88,The Oaks Mall,101,The Oaks Mall (Florida)
75 | 103,Jantzen Beach SuperCenter,103,Jantzen Beach Center
76 | 107,Northwest Plaza,107,The Crossings at Northwest
77 | 108,Acadiana Mall,108,Mall of Acadiana
78 | 109,Arsenal Mall,109,The Arsenal Project of Watertown
79 | 110,Antioch Center,110,Antioch Crossing
80 | 111,Omni Park Shopping Centre,111,Omni Park
81 | 78,The Paragon,112,"The Paragon, Singapore"
82 | 114,Mail Champlain,114,Champlain Mall
83 | 115,Splendid China Tower,115,Splendid China Mall
84 | 116,Brunswick shopping centre,116,Brunswick Shopping Centre
85 | 120,Westfield Warrawong,120,Warrawong Plaza
86 | 121,Westfield Figtree,121,Figtree Grove
87 | 122,Westfield Pakuranga,122,Pakuranga Plaza
88 | 50,Downtown Shopping Centre,123,Westfield Downtown
89 | 124,Knollwood Mall,124,Shoppes at Knollwood
90 | 125,Shangri-la Plaza Mall,125,Shangri-La Plaza
91 | 125,Shangri-la Plaza Mall,126,Shangri-La Plaza (shopping mall)
92 | 127,Lakeshore Mall (Florida),127,Lakeshore Mall
93 | 128,Menara Great Eastern,128,Great Eastern Tower
94 | 130,Avenue Carriage Crossing,130,Carriage Crossing
95 | 132,The Mall at Shelter Cove,132,Shelter Cove Towne Centre
96 | 133,Bishops Corner (West Hartford),133,"Bishops Corner, West Hartford"
97 | 137,Tallahassee Mall,137,Centre of Tallahassee
98 | 138,La Encantada,138,La Encantada (shopping center)
99 | 141,Forest Lake Shopping Centre,141,Forest Lake Village Shopping Centre
100 | 142,MegaBox (shopping mall),142,Megabox (shopping mall)
101 | 143,Westfield CastleCourt,143,CastleCourt
102 | 145,Deira City Centre,145,City Centre Deira
103 | 147,The Promenade Shopping Centre,147,Promenade (shopping centre)
104 | 148,The Mall at Cortana,148,Cortana Mall
105 | 150,Bentley Bridge Retail Park,150,Bentley Bridge
106 | 152,City Centre Plaza,152,"City Centre Plaza, Rockhampton"
107 | 156,NewPark Mall,156,Newpark Mall
108 | 158,"Regency Square Mall (Florence, Alabama)",158,Florence Mall (Alabama)
109 | 160,Centro Taigum,160,Taigum Square Shopping Centre
110 | 161,Sta. Lucia East Grand Mall,161,Sta. Lucia East Grandmall
111 | 162,Florence Mall,162,Florence Mall (Kentucky)
112 | 164,Centro Karingal,164,Karingal Hub Shopping Centre
113 | 165,Regency Square Mall (Jacksonville),165,"Regency Square Mall (Jacksonville, Florida)"
114 | 168,Dembel City Center,168,Bole Dembel Shopping Center
115 | 171,Winrock Shopping Center,171,Winrock Center
116 | 174,Chesapeake Square Mall,174,Chesapeake Square
117 | 175,Lulu Mall,175,LuLu International Shopping Mall
118 | 175,Lulu Mall,176,Lulu Cochin Mall
119 | 177,Mirdif City Centre,177,City Centre Mirdif
120 | 178,"Shaktan Thampuran Private Bus Stand, Thrissur",178,Shaktan Thampuran Private Bus Stand
121 | 179,"Star City, Seoul",179,Star City (shopping mall)
122 | 182,Exchange Ilford,182,"The Exchange, Ilford"
123 | 183,My Mall Limassol,183,MY MALL Limassol
124 | 184,Castletown Shoppingworld,184,CastleTown Shoppingworld
125 | 185,The Market Common Myrtle Beach,185,The Market Common
126 | 187,The Mall Pavilions,187,The Pavilions
127 | 188,Settlers' Green Outlet Village,188,Settlers Green
128 | 189,"The Summit (Reno, Nevada)",189,The Summit (Reno)
129 | 190,"The Summit (Wheatfield, New York)",190,The Summit (Wheatfield)
130 | 191,Domain Central,191,"Domain Central, Townsville"
131 | 192,Granada Center,192,Granada Centre
132 | 193,The Base (mall),193,The Base (shopping centre)
133 | 193,The Base (mall),194,The Base (Shopping Centre)
134 | 195,Tulsa Promenade Mall,195,Tulsa Promenade
135 | 196,West Manchester Mall,196,West Manchester Town Center
136 | 197,Lakeview Square,197,Lakeview Square Mall
137 | 198,Palladium Square,198,Palladium World
138 | 199,Centro Lavington,199,Lavington Square Shopping Centre
139 | 200,Bahrain City Centre,200,City Centre Bahrain
140 | 201,Spinderiet (Copenhagen),201,Spinderiet
141 | 203,Kukui Grove Shopping Center,203,Kukui Grove Center
142 | 207,"University Mall (Chapel Hill, North Carolina)",207,"University Place (Chapel Hill, North Carolina)"
143 | 208,The Outlets at Sands Bethlehem,208,The Shoppes at Sands
144 | 209,Enfield Square,209,Enfield Square Mall
145 | 210,"Harbor Point, Subic",210,Harbor Point (Subic)
146 | 211,Crystal Palace Complex (Dieppe),211,Bass Pro Complex (Dieppe)
147 | 212,Centre at Glen Burnie,212,Glen Burnie Mall
148 | 213,South City (shopping mall),213,South City Mall
149 | 214,The Gallery at Military Circle,214,Military Circle Mall
150 | 215,West 12 Shepherds Bush,215,West 12
151 | 215,West 12 Shepherds Bush,216,West 12 Shepherd's Bush
152 | 217,The Brentwood Country Mart,217,Brentwood Country Mart
153 | 218,South Point (shopping mall),218,South Point Mall
154 | 219,Cross County Plaza,219,Cross County Mall
155 | 220,Werribee Plaza,220,Pacific Werribee
156 | 221,Monroe Crossing Mall,221,Monroe Crossing
157 | 222,"Northwoods Mall (Peoria, Illinois)",222,Northwoods Mall (Illinois)
158 | 223,Square 2 (Shopping Mall),223,Square 2
159 | 223,Square 2 (Shopping Mall),224,Square 2 (shopping mall)
160 | 225,Westshore Mall,225,The Shops at Westshore
161 |
--------------------------------------------------------------------------------
/src/autofj/benchmark/Country/right.csv:
--------------------------------------------------------------------------------
1 | id,title
2 | 0,Kosovo (region)
3 | 1,Myanmar
4 | 2,Qing dynasty
5 | 3,Special administrative regions of China
6 | 4,Kingdom of Northumbria
7 | 5,Sui dynasty
8 | 6,Tang dynasty
9 | 7,Ming dynasty
10 | 8,Qin dynasty
11 | 9,Shang dynasty
12 | 10,Zhou dynasty
13 | 11,Xia dynasty
14 | 12,Han dynasty
15 | 13,Cape Colony
16 | 14,Duchy of Lorraine
17 | 15,Liu Song dynasty
18 | 16,Song dynasty
19 | 17,Côte d'Ivoire
20 | 18,Episcopal principality of Utrecht
21 | 19,Free France
22 | 20,Malacca Sultanate
23 | 21,Emirate of Transjordan
24 | 22,Qara Khitai
25 | 23,Nassau (state)
26 | 24,Lan Na
27 | 25,Schaumburg-Lippe
28 | 26,Upper Mustang
29 | 27,Archbishopric of Mainz
30 | 28,Xin dynasty
31 | 29,Dzungar Khanate
32 | 30,Communist Romania
33 | 31,Hephthalite
34 | 32,Kingdom of Iberia
35 | 33,Buyeo
36 | 34,Zimbabwe Rhodesia
37 | 35,Former Zhao
38 | 36,French protectorate of Cambodia
39 | 37,Polish People's Republic
40 | 38,Nakhchivan Autonomous Republic
41 | 39,Nakhchivan
42 | 40,Moche culture
43 | 41,Moche
44 | 42,History of the Lands of the Bohemian Crown (Middle Ages)
45 | 43,Isle de France
46 | 44,Chen dynasty
47 | 45,Liang dynasty
48 | 46,Later Liang (Sixteen Kingdoms)
49 | 47,Western Liang (Sixteen Kingdoms)
50 | 48,Ryūkyū Kingdom
51 | 49,Rustamid
52 | 50,Hammadid
53 | 51,Later Liang (Five Dynasties)
54 | 52,Later Tang
55 | 53,Croatia in the union with Hungary
56 | 54,Indo-Greek Kingdom
57 | 55,French protectorate of Morocco
58 | 56,Nanda Dynasty
59 | 57,Samanid Empire
60 | 58,Nueva Navarra
61 | 59,Shunga Empire
62 | 60,Pandyan dynasty
63 | 61,Jordanian occupation of the West Bank and East Jerusalem
64 | 62,Spanish protectorate in Morocco
65 | 63,Spanish Protectorate of Morocco
66 | 64,Mandatory Iraq
67 | 65,Early Dynastic Period (Egypt)
68 | 66,Bavarian Council Republic
69 | 67,Kadamba dynasty
70 | 68,Dali Kingdom
71 | 69,Later Baekje
72 | 70,Kartl-Kakheti
73 | 71,Tierra Firma
74 | 72,Merkit
75 | 73,Mamluk Dynasty (Delhi)
76 | 74,Kerma Culture
77 | 75,Principality of Reuss Younger Line
78 | 76,Keraites
79 | 77,Khereid
80 | 78,Provisional Government of Autonomous Siberia
81 | 79,Saar Protectorate
82 | 80,Nanyo (Japanese mandated territory)
83 | 81,Muzaffarids (Iran)
84 | 82,Reorganized National Government of the Republic of China
85 | 83,Kingdom of Soissons
86 | 84,Indian-Ocean Rim Association
87 | 85,Restoration (Spain)
88 | 86,Imperial County of Ortenburg
89 | 87,Bushmanland
90 | 88,Kingdom of Albania (medieval)
91 | 89,Colony of Fiji
92 | 90,Kediri Kingdom
93 | 91,Pasai
94 | 92,Free and Independent Republic of West Florida
95 | 93,"Eastern Slavonia, Baranja and Western Syrmia"
96 | 94,Kurt dynasty
97 | 95,Kartids
98 | 96,Republic of Ancona
99 | 97,Ghurid dynasty
100 | 98,Kingdom of Limmu-Ennarea
101 | 99,Yamataikoku
102 | 100,First Austrian Republic
103 | 101,Aztec Empire
104 | 102,Qi (Li Maozhen's state)
105 | 103,Sultanate of Ifat
106 | 104,Nawabs of Bengal & Murshidabad
107 | 105,United Belgian States
108 | 106,Hotak dynasty
109 | 107,Hotaki Empire
110 | 108,Chera dynasty
111 | 109,Later Jin (Five Dynasties)
112 | 110,Rai dynasty
113 | 111,Chavda dynasty
114 | 112,Hadiya Sultanate
115 | 113,Mahdist Sudan
116 | 114,Odanad
117 | 115,First Republic of Armenia
118 | 116,Rashtrakuta dynasty
119 | 117,Osraige
120 | 118,Kingdom of Osraige
121 | 119,Jalairid Sultanate
122 | 120,Jalayirids
123 | 121,Belarusian Central Council
124 | 122,Margravate of Meissen
125 | 123,Kingdom of the Suebi
126 | 124,Tuyuhun
127 | 125,Tarkhan dynasty
128 | 126,Western Ganga dynasty
129 | 127,Arghun dynasty
130 | 128,Samma dynasty
131 | 129,Wadai Empire
132 | 130,Alid dynasties of northern Iran
133 | 131,Alavids
134 | 132,Restoration and Regeneration in Switzerland
135 | 133,French Protectorate of Laos
136 | 134,Mleccha dynasty
137 | 135,Panchala Kingdom
138 | 136,Ganja Khanate
139 | 137,Kurdistansky Uyezd
140 | 138,Japanese occupation of Malaya
141 | 139,Erivan khanate
142 | 140,Beylik of Dulkadir
143 | 141,German military administration in occupied France during World War II
144 | 142,Hungarian People's Republic
145 | 143,Prince-Bishopric of Strasbourg
146 | 144,Realm of Stefan Dragutin
147 | 145,Occupation of Turkish Armenia
148 | 146,Duchy of Luxemburg
149 | 147,"County, Duchy and Grand Duchy of Luxembourg"
150 | 148,Sultanate of Aussa
151 | 149,County of Brabant
152 | 150,Sasanian Empire
153 | 151,Colony of Singapore
154 | 152,Nayakas of Keladi
155 | 153,Jabal al-Druze (state)
156 | 154,Japanese occupation of the Dutch East Indies
157 | 155,Sajid dynasty
158 | 156,County of Luxemburg
159 | 157,Duchy of Neopatras
160 | 158,Albona Republic
161 | 159,First Hungarian Republic
162 | 160,Zanzibar Sultanate
163 | 161,Kingdom of Fouta Tooro
164 | 162,Kingdom of Fouta Djallon
165 | 163,Later Han (Five Dynasties)
166 | 164,Later Zhou
167 | 165,Italian Islands of the Aegean
168 | 166,Yin (Five Dynasties period)
169 | 167,Yan (Five Dynasties period)
170 | 168,Duchy of Gascony
171 | 169,Shun dynasty
172 | 170,Anhalt
173 | 171,Crown Colony of Malacca
174 | 172,Malacca (British Crown colony)
175 | 173,Sallarid
176 | 174,Kingdom of Tashir-Dzoraget
177 | 175,Dutch Ceylon
178 | 176,Yueban
179 | 177,St. Ulrich's and St. Afra's Abbey
180 | 178,Kingdom of Italy (1861â1946)
181 | 179,Principality of Reuss Elder Line
182 | 180,Bavand dynasty
183 | 181,Heungyo
184 | 182,Lordship of Negroponte
185 | 183,"Yanam, French India"
186 | 184,Margraviate of Austria
187 | 185,Arminiya
188 | 186,Emirate of Armenia
189 | 187,Byzantium under the Komnenos dynasty
190 | 188,Byzantium under the Palaiologoi
191 | 189,Trust Territory of Somaliland
192 | 190,Lordship of Glamorgan
193 | 191,Seljuk Empire
194 | 192,Seljuq Empire
195 | 193,Byzantine Empire under the Angeloi
196 | 194,Byzantium under the Heraclians
197 | 195,Jeongan
198 | 196,Mamluk dynasty of Iraq
199 | 197,Lordship of Winneburg and Beilstein
200 | 198,Byzantium under the Isaurians
201 | 199,Byzantium under the Macedonians
202 | 200,Principality of Iberia
203 | 201,Austro-Hungarian rule in Bosnia and Herzegovina
204 | 202,Condominium of Bosnia and Herzegovina
205 | 203,K'iche' kingdom of Q'umarkaj
206 | 204,Provisional Administration of South Ossetia
207 | 205,Tanganyika (territory)
208 | 206,Bagratid Armenia
209 | 207,Unification of Hispaniola
210 | 208,Maha-Meghavahana Dynasty
211 | 209,Pudukkottai State
212 | 210,Namayan
213 | 211,Independent State of Macedonia
214 | 212,British Kenya
215 | 213,Yuan dynasty
216 | 214,Principality of Arbër
217 | 215,Principality of Albania (medieval)
218 | 216,Kingdom of Hejaz and Nejd
219 | 217,British invasion of Manila
220 | 218,Rajahnate of Butuan
221 | 219,Northern Yuan dynasty
222 | 220,Post-Soviet transition in Ukraine
223 | 221,Commonwealth of Independent States Free Trade Area
224 | 222,Adriatic Ionian Euroregion
225 | 223,Sultanate of Ternate
226 | 224,Bengal Sultanate
227 | 225,Kingdom of Chiang Mai
228 | 226,German military administration in occupied Poland
229 | 227,Four Oirat
230 | 228,Liao dynasty
231 | 229,Principality of Turov
232 | 230,Sultanate of the Geledi
233 | 231,Gobroon dynasty
234 | 232,Byzantium under the Justinian dynasty
235 | 233,Zhou (Zhang Shicheng's kingdom)
236 | 234,Ettaiyapuram estate
237 | 235,Kingdom of Ava
238 | 236,Arab Kingdom of Syria
239 | 237,Ror dynasty
240 | 238,History of Iraq under Ba'athist rule
241 | 239,Japanese colonial empire
242 | 240,Kingdom of Spain under Joseph Bonaparte
243 | 241,Spain under Joseph Bonaparte
244 | 242,Provisional Government of Bangladesh
245 | 243,Eurasian Economic Union
246 | 244,Gazikumukh Shamkhalate
247 | 245,Shamkhalate of Kazi-Kumukh
248 | 246,Kition (ancient state)
249 | 247,Gazikumukh Khanate
250 | 248,Khanate of Kazi-Kumukh
251 | 249,People's Republic of Zanzibar
252 | 250,Imperial Abbey of Kempten
253 | 251,Liao (Zhou dynasty state)
254 | 252,Kalingga Kingdom
255 | 253,Insular Government
256 | 254,Zhao (Five Dynasties period)
257 | 255,Vaspurakan Kingdom
258 | 256,Interim Government of Iran
259 | 257,Cyrenaica Emirate
260 | 258,General Government of Belgium
261 | 259,Federation of Nigeria
262 | 260,Dominion of Mauritius
263 | 261,Grand Principality of Serbia
264 | 262,Amecatl
265 | 263,Duchy of Dol-Combourg
266 | 264,Shirvan Baylarbaylik
267 | 265,Imperial Throne (micronation)
268 | 266,Imperial Throne (Sovereign Nation)
269 | 267,Kalinyamat Sultanate
270 | 268,Karabakh Beylerbeylik
271 | 269,Kara Del
272 | 270,Crown Colony of Labuan
273 | 271,Jin (Later Tang precursor)
274 | 272,United States Military Government in Cuba
275 | 273,Colony of Santiago
276 | 274,Spanish occupation of Santiago (Jamaica)
277 | 275,Kempten (Imperial Free City)
278 | 276,Carniola (Early Middle Ages)
279 | 277,Armi (Syria)
280 | 278,Armani (Ancient kingdom)
281 | 279,Turkish Provisional Government
282 | 280,Cupul
283 | 281,Duchy of Croatia
284 | 282,Rojava
285 | 283,Kingdom of Qocho
286 | 284,Transnistria autonomous territorial unit with special legal status
287 | 285,Passports issued by the European Union candidate states
288 | 286,Nagas of Padmavati
289 | 287,Margraviate of Moravia
290 | 288,Crown Colony of Malta
291 | 289,Gozo (independent state)
292 | 290,Gozo (1798â1800)
293 |
--------------------------------------------------------------------------------
/src/autofj/autofj.py:
--------------------------------------------------------------------------------
1 | from .join_function_space.autofj_join_function_space import AutoFJJoinFunctionSpace
2 | from .blocker.autofj_blocker import AutoFJBlocker
3 | from .optimizer.autofj_multi_column_greedy_algorithm import \
4 | AutoFJMulticolGreedyAlgorithm
5 | import pandas as pd
6 | from .utils import print_log
7 | import os
8 | from .negative_rule import NegativeRule
9 | import numpy as np
10 |
11 |
class AutoFJ(object):
    """
    AutoFJ automatically produces record pairs that approximately match in
    two tables L and R. It proceeds to configure suitable parameters
    automatically, which when used to fuzzy-join L and R, meets the
    user-specified precision target, while maximizing recall.

    AutoFJ attempts to solve many-to-one join problems, where each record in R
    will be joined with at most one record in L, but each record in L can be
    joined with multiple records in R. In AutoFJ, L refers to a reference
    table, which is assumed to be almost "duplicate-free".

    Parameters
    ----------
    precision_target: float, default=0.9
        Precision target.

    join_function_space: string or dict or list of objects, default="autofj_sm"
        There are following three ways to define the space of join functions:
        (1) Use the name of built-in join function space. There are three
        options, including "autofj_lg", "autofj_md" and "autofj_sm" that use
        136, 68 and 14 join functions, respectively. Using less join functions
        can improve efficiency but may worsen performance.
        (2) Use a dict specifying the options for preprocessing methods,
        tokenization methods, token weighting methods and distance functions.
        The space will be the cartesian product of all options in the dict.
        See ./join_function_space/options.py for defining join functions using
        a dict.
        (3) Use a list of customized JoinFunction objects.

    distance_threshold_space: int or list, default=50
        The number of candidate distance thresholds or a list of candidate
        distance thresholds in the space. If the number of distance thresholds
        (integer) is given, distance thresholds are spaced evenly from 0 to 1.
        Otherwise, it should be a list of floats from 0 to 1.

    column_weight_space: int or list, default=10
        The number of candidate column weights or a list of candidate
        column weights in the space. If the number of column weights
        (integer) is given, column weights are spaced evenly from 0 to 1.
        Otherwise, it should be a list of floats from 0 to 1.

    blocker: a Blocker object or None, default None
        A Blocker object that performs blocking on two tables. If None, use
        the built-in blocker. For customized blocker, see Blocker class.

    n_jobs : int, default=-1
        Number of CPU cores used. -1 means using all processors.

    verbose: bool, default=False
        Whether to print logging

    Attributes
    ----------
    selected_column_weights, selected_join_configs
        Set by ``join``: the first two values returned by the optimizer's
        ``run()``.
    """

    def __init__(self,
                 precision_target=0.9,
                 join_function_space="autofj_sm",
                 distance_threshold_space=50,
                 column_weight_space=10,
                 blocker=None,
                 n_jobs=-1,
                 verbose=False):
        self.precision_target = precision_target
        self.join_function_space = join_function_space

        # An integer means "this many evenly spaced candidates in [0, 1]";
        # anything else is taken as the explicit list of candidates.
        self.distance_threshold_space = self._expand_space(
            distance_threshold_space)
        self.column_weight_space = self._expand_space(column_weight_space)

        if blocker is None:
            # The built-in blocker receives the raw n_jobs value (e.g. -1).
            self.blocker = AutoFJBlocker(n_jobs=n_jobs)
        else:
            self.blocker = blocker

        # Non-positive n_jobs means "all cores". os.cpu_count() may return
        # None when the count cannot be determined, so fall back to 1.
        self.n_jobs = n_jobs if n_jobs > 0 else (os.cpu_count() or 1)
        self.verbose = verbose

    @staticmethod
    def _expand_space(space):
        """Turn an integer count into evenly spaced candidates in [0, 1].

        Python and NumPy integers are expanded with ``np.linspace``; any
        other value (including ``bool``) is returned unchanged.
        """
        is_count = (isinstance(space, (int, np.integer))
                    and not isinstance(space, bool))
        if is_count:
            return list(np.linspace(0, 1, space))
        return space

    def join(self, left_table, right_table, id_column, on=None):
        """Join left table and right table.

        Parameters
        ----------
        left_table: pd.DataFrame
            Reference table. The left table is assumed to be almost
            duplicate-free, which means it has no or only few duplicates.

        right_table: pd.DataFrame
            Another input table.

        id_column: string
            The name of id column in the two tables. This column will not be
            used to join two tables.

        on: list, string or None
            A list of column names (multi-column fuzzy join) that the two
            tables will be joined on; a single column name may be given as a
            string. If None, two tables will be joined on all columns
            that exist in both tables, excluding the id column.

        Returns
        -------
        result: pd.DataFrame
            A table of joining pairs. The columns of left table are
            suffixed with "_l" and the columns of right table are suffixed
            with "_r". Rows are sorted by the right table's id column. If
            the precision target cannot be achieved, a warning is printed
            and an empty table with the same columns is returned.
        """
        # Work on copies so the caller's tables are never modified.
        left = left_table.copy(deep=True)
        right = right_table.copy(deep=True)

        # create internal id columns (use internal ids); they equal the row
        # position, which is what the final iloc lookup relies on
        left["autofj_id"] = range(len(left))
        right["autofj_id"] = range(len(right))

        # remove original ids
        left.drop(columns=id_column, inplace=True)
        right.drop(columns=id_column, inplace=True)

        # get names of columns to be joined
        if on is None:
            # "autofj_id" exists in both tables, so it is part of the result.
            on = sorted(set(left.columns).intersection(right.columns))
        else:
            # Copy so the caller's list is not mutated.
            on = [on] if isinstance(on, str) else list(on)
            # The blocker and negative rules below are told to use
            # "autofj_id" as the id column, so it must survive the column
            # selection exactly as it does in the on=None case.
            if "autofj_id" not in on:
                on.append("autofj_id")
        left = left[on]
        right = right[on]

        # do blocking
        if self.verbose:
            print_log("Start blocking")
        LL_blocked = self.blocker.block(left, left, "autofj_id")
        LR_blocked = self.blocker.block(left, right, "autofj_id")

        # remove equi-joins on LL (a left record paired with itself)
        LL_blocked = LL_blocked[
            LL_blocked["autofj_id_l"] != LL_blocked["autofj_id_r"]]

        # learn negative rules on L-L pairs and apply them to L-R pairs
        nr = NegativeRule(left, right, "autofj_id")
        nr.learn(LL_blocked)
        LR_blocked = nr.apply(LR_blocked)

        # create join function space
        jf_space = AutoFJJoinFunctionSpace(self.join_function_space,
                                           n_jobs=self.n_jobs)

        # compute distance
        if self.verbose:
            print_log("Start computing distances. Size of join function space: {}"
                      .format(len(jf_space.join_functions)))

        LL_distance, LR_distance = jf_space.compute_distance(left,
                                                             right,
                                                             LL_blocked,
                                                             LR_blocked)

        # run greedy algorithm
        if self.verbose:
            print_log("Start running greedy algorithm.")

        optimizer = AutoFJMulticolGreedyAlgorithm(
            LL_distance,
            LR_distance,
            precision_target=self.precision_target,
            candidate_thresholds=self.distance_threshold_space,
            candidate_column_weights=self.column_weight_space,
            n_jobs=self.n_jobs
        )

        self.selected_column_weights, self.selected_join_configs, LR_joins = \
            optimizer.run()

        if LR_joins is None:
            print("Warning: The precision target cannot be achieved.",
                  "Try a lower precision target or a larger space of join functions,",
                  "distance thresholds and column weights.")
            # Empty result with the same column layout as a successful join.
            LR_joins = pd.DataFrame(
                columns=[c + "_l" for c in left_table.columns] +
                        [c + "_r" for c in right_table.columns])
            return LR_joins

        # merge with original left and right tables; LR_joins is iterated as
        # (left internal id, right internal id) pairs
        left_idx = [l for l, r in LR_joins]
        right_idx = [r for l, r in LR_joins]
        L = left_table.iloc[left_idx].add_suffix("_l").reset_index(drop=True)
        R = right_table.iloc[right_idx].add_suffix("_r").reset_index(drop=True)
        result = pd.concat([L, R], axis=1).sort_values(by=id_column + "_r")
        return result
201 |
--------------------------------------------------------------------------------
/src/autofj/benchmark/ClericalAdministrativeRegion/right.csv:
--------------------------------------------------------------------------------
1 | id,title
2 | 0,Latin Patriarchate of Jerusalem
3 | 1,Baptist Union of Australia
4 | 2,Diocese of New Westminster
5 | 3,Roman Catholic Diocese of Sankt Gallen
6 | 4,Diocese of Gibraltar in Europe
7 | 5,Patriarch of Lisbon
8 | 6,Roman Catholic Diocese of Busan
9 | 7,Roman Catholic Diocese of Lafayette in Indiana
10 | 8,Episcopal see of Carthage
11 | 9,Roman Catholic Diocese of Kon Tum
12 | 10,Syro-Malabar Catholic Archeparchy of Changanassery
13 | 11,Syro-Malabar Catholic Major Archeparchy of Ernakulam-Angamaly
14 | 12,Diocese of Niassa
15 | 13,Eparchy of Gornji Karlovac
16 | 14,Syro-Malabar Catholic Eparchy of St. Thomas of Chicago
17 | 15,St. Thomas Syro-Malabar Catholic Eparchy of Chicago
18 | 16,Melkite Greek Catholic Archeparchy of Zahle and Forzol
19 | 17,Anglican Diocese of Eastern Newfoundland and Labrador
20 | 18,Anglican Diocese of Central Newfoundland
21 | 19,Anglican Diocese of Western Newfoundland
22 | 20,Anglican Diocese of Nova Scotia and Prince Edward Island
23 | 21,Diocese of Ripon
24 | 22,Anglican Diocese of Niagara
25 | 23,Anglican Diocese of Qu'Appelle
26 | 24,Ukrainian Catholic Eparchy of Saints Peter and Paul of Melbourne
27 | 25,Diocese of Jos
28 | 26,Diocese of Makurdi
29 | 27,Diocese of Yola
30 | 28,Syro-Malabar Catholic Archeparchy of Tellicherry
31 | 29,Roman Catholic Archdiocese of Valencia in Spain
32 | 30,Episcopal Church in Minnesota
33 | 31,Roman Catholic Archdiocese of Kingston in Canada
34 | 32,Apostolic Vicariate of Southern Arabia
35 | 33,Apostolic Vicariate of Iles Saint Pierre and Miquelon
36 | 34,Diocese of Namibia
37 | 35,Diocese of Natal
38 | 36,Syro-Malabar Catholic Archeparchy of Thrissur
39 | 37,Anglican Diocese of Grafton
40 | 38,Syro-Malabar Catholic Eparchy of Kanjirappally
41 | 39,Syro-Malabar Catholic Eparchy of Kothamangalam
42 | 40,Syro-Malabar Catholic Eparchy of Idukki
43 | 41,Syro-Malabar Catholic Eparchy of Belthangady
44 | 42,Romanian Catholic Eparchy of Cluj-Gherla
45 | 43,Syro-Malabar Catholic Eparchy of Rajkot
46 | 44,Syro-Malabar Catholic Eparchy of Sagar
47 | 45,Syro-Malabar Catholic Eparchy of Adilabad
48 | 46,Syro-Malabar Catholic Eparchy of Bijnor
49 | 47,Syro-Malabar Catholic Eparchy of Chanda
50 | 48,Syro-Malabar Catholic Eparchy of Gorakhpur
51 | 49,Syro-Malabar Catholic Eparchy of Kalyan
52 | 50,Syro-Malabar Catholic Eparchy of Irinjalakuda
53 | 51,Anglican Diocese of Bendigo
54 | 52,Anglican Diocese of North West Australia
55 | 53,Syro-Malabar Catholic Eparchy of Jagdalpur
56 | 54,Syro-Malabar Catholic Eparchy of Satna
57 | 55,Syro-Malabar Catholic Eparchy of Thamarassery
58 | 56,Syro-Malabar Catholic Eparchy of Thuckalay
59 | 57,Ukrainian Catholic Eparchy of the Holy Family of London
60 | 58,Apostolic Exarchate for Ukrainians in Great Britain
61 | 59,Melkite Greek Catholic Eparchy of Saint Michael Archangel in Sydney
62 | 60,Roman Catholic Diocese of Bathurst in Australia
63 | 61,Maronite Catholic Eparchy of Saint Maron of Sydney
64 | 62,Ukrainian Catholic Eparchy of Saint Vladimir the Great of Paris
65 | 63,"Apostolic Exarchate in France, Benelux and Switzerland for the Ukrainians"
66 | 64,Roman Catholic Archdiocese of Xalapa
67 | 65,Anglican Diocese of Keewatin
68 | 66,Anglican Diocese of British Columbia
69 | 67,Anglican Diocese of Ontario
70 | 68,Roman Catholic Archdiocese of Santiago de Guatemala
71 | 69,Apostolic Vicariate of Izabal
72 | 70,Roman Catholic Diocese of Victoria in Canada
73 | 71,Roman Catholic Diocese of Bathurst in Canada
74 | 72,Roman Catholic Diocese of Concordia in Argentina
75 | 73,Roman Catholic Archdiocese of La Plata in Argentina
76 | 74,Suburbicarian Diocese of Porto e Santa Rufina
77 | 75,Anglican Diocese of Pretoria
78 | 76,Anglican Diocese of Huron
79 | 77,Anglican Diocese of Arctic
80 | 78,Anglican Diocese of Rupert's Land
81 | 79,Anglican Diocese of Algoma
82 | 80,Archdiocese of Russian Orthodox churches in Western Europe
83 | 81,Ruthenian Catholic Archeparchy of Pittsburgh
84 | 82,Ruthenian Catholic Eparchy of Passaic
85 | 83,Ruthenian Catholic Eparchy of Parma
86 | 84,Byzantine Catholic Apostolic Exarchate of Serbia
87 | 85,Croatian Catholic Apostolic Exarchate of Serbia
88 | 86,Apostolic Exarchate of Serbia
89 | 87,Roman Catholic Archdiocese of Ho Chi Minh City
90 | 88,Roman Catholic Archdiocese of Mary Most Holy in Astana
91 | 89,Roman Catholic Diocese of Sora-Cassino-Aquino-Pontecorvo
92 | 90,Diocese of Novgorod
93 | 91,Archbishop of Novgorod
94 | 92,Diocese of Novgorod and Staraya Russa
95 | 93,"Roman Catholic Diocese of Santa Rosa, Argentina"
96 | 94,Roman Catholic Diocese of Port Harcourt
97 | 95,Anglican Diocese of San Joaquin
98 | 96,Roman Catholic Diocese of Bethlehem in South Africa
99 | 97,Roman Catholic Diocese of Pietersburg
100 | 98,Roman Catholic Diocese of Lwiza
101 | 99,Ukrainian Catholic Eparchy of Toronto
102 | 100,Diocese of Grahamstown
103 | 101,Roman Catholic Diocese of Aire and Dax
104 | 102,Roman Catholic Diocese of La Rochelle
105 | 103,Syro-Malabar Catholic Eparchy of Bhadravathi
106 | 104,Roman Catholic Diocese of Brugge
107 | 105,Roman Catholic Archdiocese of Dodoma
108 | 106,Chaldean Catholic Eparchy of Saint Peter the Apostle of San Diego
109 | 107,"Roman Catholic Archdiocese of Naxos, Tinos, Andros and Mykonos"
110 | 108,Diocese of Medak of the Church of South India
111 | 109,Roman Catholic Archdiocese of Toamasina
112 | 110,Roman Catholic Diocese of Alessandria
113 | 111,Italo-Albanese Eparchy of Piana degli Albanesi
114 | 112,Slovak Catholic Eparchy of Bratislava
115 | 113,Albanian Catholic Apostolic Administration of Southern Albania
116 | 114,Roman Catholic Diocese of Syros
117 | 115,Romanian Catholic Eparchy of Oradea Mare
118 | 116,Roman Catholic Archdiocese of Cuzco
119 | 117,Roman Catholic Archdiocese of Cartagena in Colombia
120 | 118,Roman Catholic Diocese of San Juan de Calama
121 | 119,Maronite Catholic Eparchy of Saint Maron of Montreal
122 | 120,Eparchy of Saint Maron of Montreal Maronites
123 | 121,Anglican Diocese of Zululand
124 | 122,Roman Catholic Diocese of Coari
125 | 123,Roman Catholic Diocese of Nicopoli
126 | 124,Anglican Diocese of Athabasca
127 | 125,Anglican Diocese of Saint Helena
128 | 126,Roman Catholic Diocese of Trincomalee
129 | 127,Diocese of Lucknow of the Church of North India
130 | 128,Roman Catholic Diocese of Jalandhar
131 | 129,Roman Catholic Diocese of Simla and Chandigarh
132 | 130,Diocese of Angola
133 | 131,Diocese of Lebombo
134 | 132,Anglican Diocese of Lesotho
135 | 133,Latin Catholic Diocese of Punalur
136 | 134,Diocese of the Highveld
137 | 135,Diocese of Swaziland
138 | 136,Diocese of St Mark the Evangelist
139 | 137,Diocese of Mpumalanga
140 | 138,Latin Catholic Archdiocese of Baghdad
141 | 139,Anglican Diocese of Fredericton
142 | 140,Anglican Diocese of Saskatchewan
143 | 141,Anglican Diocese of Caledonia
144 | 142,Anglican Diocese of Brandon
145 | 143,Anglican Diocese of Kootenay
146 | 144,Anglican Diocese of Yukon
147 | 145,Territorial Prelature of the Mission de France at Pontigny
148 | 146,Apostolic Vicariate of Anatolia
149 | 147,Diocese of Karimnagar of the Church of South India
150 | 148,Diocese of Coimbatore of the Church of South India
151 | 149,Diocese of Coimbatore
152 | 150,Ukrainian Catholic Archeparchy of Ivano-Frankivsk
153 | 151,Ruthenian Catholic Eparchy of Mukacheve
154 | 152,Ruthenian Catholic Apostolic Exarchate of Czech Republic
155 | 153,Trichy-Tanjore Diocese of the Church of South India
156 | 154,Roman Catholic Diocese of Yarmouth
157 | 155,Military Ordinariate of the South African Defence Force
158 | 156,Military Ordinariate of Bolivia
159 | 157,Military Ordinariate of Chile
160 | 158,Military Ordinariate of Colombia
161 | 159,Military Ordinariate of the Dominican Republic
162 | 160,Military Bishopric of Dominican Republic
163 | 161,Military Ordinariate of Ecuador
164 | 162,Military Ordinariate of El Salvador
165 | 163,Military Ordinariate of Paraguay
166 | 164,Military Ordinariate of Peru
167 | 165,Military Ordinariate of Italy
168 | 166,Syro-Malankara Catholic Eparchy of the United States of America and Canada
169 | 167,Syro-Malankara Catholic Apostolic Exarchate in the United States
170 | 168,Metropolis of Servia and Kozani
171 | 169,Chaldean Catholic Archeparchy of Amida
172 | 170,Syro-Malabar Catholic Eparchy of Ramanathapuram
173 | 171,Diocese of South Kerala of the Church of South India
174 | 172,"Metropolitanate of Zagreb, Ljubljana and all Italy"
175 | 173,Roman Catholic Diocese of Roskilde
176 | 174,Diocese of Madhya Kerala of the Church of South India
177 | 175,Melkite Greek Catholic Archeparchy of Beirut and Jbeil
178 | 176,Syro-Malabar Catholic Eparchy of Mandya
179 | 177,Diocese of Iran
180 | 178,Archbishopric of Belgrade and Karlovci
181 | 179,Maronite Catholic Eparchy of Our Lady of Lebanon of Paris
182 | 180,Melkite Greek Catholic Archeparchy of Akka
183 | 181,Sufes
184 | 182,Maronite Catholic Eparchy of Saint Maron of Brooklyn
185 | 183,Armenian Catholic Eparchy of Our Lady of Nareg in the United States of America and Canada
186 | 184,Maronite Catholic Eparchy of San Charbel in Buenos Aires
187 | 185,Roman Catholic Apostolic Vicariate of El Beni
188 | 186,Roman Catholic Vicariate Apostolic of El Beni
189 | 187,Melkite Greek Catholic Patriarchal Archeparchy of Jerusalem
190 | 188,Melkite Greek Catholic Archeparchy of Jerusalem
191 | 189,Maronite Catholic Archeparchy of Haifa and the Holy Land
192 |
--------------------------------------------------------------------------------
/src/autofj/benchmark/Artwork/right.csv:
--------------------------------------------------------------------------------
1 | id,title
2 | 0,Portlandia
3 | 1,La Danse (Bouguereau)
4 | 2,La Danse
5 | 3,The Return of Spring
6 | 4,The Night Watch (painting)
7 | 5,Guernica (Picasso)
8 | 6,The Chess Players (Eakins painting)
9 | 7,Alone in the World (Bouguereau)
10 | 8,Alone in the World
11 | 9,The Gilded Cage (Evelyn De Morgan painting)
12 | 10,The Virgin and Child with St. Anne (Leonardo da Vinci)
13 | 11,Adoration of the Magi (Leonardo da Vinci)
14 | 12,Polyptych of the Misericordia (Piero della Francesca)
15 | 13,"The Bride Stripped Bare by Her Bachelors, Even"
16 | 14,"Number 11, 1952 (painting)"
17 | 15,Bijin-ga
18 | 16,The Shepherdess (1889)
19 | 17,Portrait of Bindo Altoviti
20 | 18,La Fornarina
21 | 19,The Art of Painting (Vermeer)
22 | 20,Madonna of Foligno
23 | 21,Self-portrait with a friend
24 | 22,Madonna of the Book
25 | 23,Christ Falling on the Way to Calvary
26 | 24,Boy Bitten by a Lizard
27 | 25,The Little Street (Vermeer)
28 | 26,Neptune and Triton
29 | 27,Charity with Four Children
30 | 28,The Death of Sardanapalus
31 | 29,"Saint Jerome Writing (Caravaggio, Valletta)"
32 | 30,The Bohemian (Bouguereau)
33 | 31,The Bohemian
34 | 32,San Giorgio Maggiore at Dusk
35 | 33,Stone Flower (sculpture)
36 | 34,Sacred Cod
37 | 35,Farms near Auvers
38 | 36,The Artist's Studio
39 | 37,La maja vestida
40 | 38,Declaration of Independence (Trumbull)
41 | 39,The Roulin Family
42 | 40,Flowering Orchards
43 | 41,Sleeping Hermaphroditus
44 | 42,St. Michael Vanquishing Satan (Raphael)
45 | 43,Battle of Ostia (Raphael's painting)
46 | 44,"Adoration of the Magi (Bosch, Madrid)"
47 | 45,"Christ Carrying the Cross (Bosch, Vienna)"
48 | 46,The Hermit Saint
49 | 47,Blessed Ludovica Albertoni
50 | 48,Haystacks (Monet series)
51 | 49,The Monarch of the Glen (painting)
52 | 50,Corpus (Bernini)
53 | 51,28th Regiment at Quatre Bras (painting)
54 | 52,Portrait of a Princess (Pisanello)
55 | 53,Portrait of a princess (Pisanello)
56 | 54,Rouen Cathedral (Monet series)
57 | 55,April Love (painting)
58 | 56,The Open Window (Matisse)
59 | 57,"Queen Victoria Statue, Bristol"
60 | 58,Queen Victoria Statue
61 | 59,Boreas (painting)
62 | 60,Put Down Your Whip (painting)
63 | 61,St. Peter's baldachin
64 | 62,Cigarette (sculpture)
65 | 63,Gloria Victis (sculpture)
66 | 64,The Prodigal Son in the Brothel
67 | 65,Enigma of the Hour
68 | 66,Dippy
69 | 67,Large Interior Form
70 | 68,Saint Augustine (Botticelli)
71 | 69,Portrait of a Lady Known as Smeralda Brandini
72 | 70,Portrait of a Lady known as Smeralda Brandini
73 | 71,Non Violence
74 | 72,Young Knight in a Landscape
75 | 73,Butcher's Shop
76 | 74,Landscape with the Flight into Egypt
77 | 75,Landscape with the Flight into Egypt (Annibale Carracci)
78 | 76,Salting Madonna
79 | 77,The Judgement of Solomon (Giorgione)
80 | 78,Saint Augustine in His Cell (Botticelli)
81 | 79,Resurrection (Piero della Francesca)
82 | 80,Portrait of Adele Bloch-Bauer II
83 | 81,Manfred on the Jungfrau (Martin)
84 | 82,The Black Brunswicker (Millais)
85 | 83,Manfred on the Jungfrau (Madox Brown)
86 | 84,The Great Day of His Wrath
87 | 85,The Sortie Made by the Garrison of Gibraltar
88 | 86,Pinkie (painting)
89 | 87,The pioneer (painting)
90 | 88,Maman (sculpture)
91 | 89,The Goose Girl (Bouguereau)
92 | 90,Lincoln Monument
93 | 91,St. Jerome in the Wilderness (Leonardo)
94 | 92,The Bookworm (painting)
95 | 93,The Tribute Money
96 | 94,Frog Baby Fountain
97 | 95,Bathsheba at Her Bath
98 | 96,Walking on a mountain path in spring
99 | 97,Crucifixion with the Virgin and St John
100 | 98,The Crucifixion with the Virgin and St John (Hendrick ter Brugghen)
101 | 99,The Fortune Teller (de La Tour painting)
102 | 100,Aristotle with a Bust of Homer
103 | 101,The Needlewoman (painting)
104 | 102,George Washington (statue)
105 | 103,George Washington (1840 statue)
106 | 104,La Parisienne
107 | 105,Annunciation of Ustyug
108 | 106,Andromeda Chained to the Rocks
109 | 107,Garden at Sainte-Adresse
110 | 108,Insane Woman (La Monomane de l'envie)
111 | 109,Langlois Bridge at Arles
112 | 110,The Jester Don John of Austria
113 | 111,Prince Balthasar Charles with a Dwarf
114 | 112,Old Woman Frying Eggs
115 | 113,Convergence (Pollock)
116 | 114,"Portrait of a Young Woman (Vermeer, New York)"
117 | 115,Abraham Lincoln (1920 statue)
118 | 116,The Falconer (Simonds)
119 | 117,The Falconer (sculpture)
120 | 118,Traffic Light Tree
121 | 119,Iron Man (Minnesota statue)
122 | 120,Barrow (sculpture)
123 | 121,26 October 1993
124 | 122,"Adoration of the Magi (Rubens, Cambridge)"
125 | 123,Portrait of a Man in Red Chalk
126 | 124,Self-portrait (Leonardo da Vinci)
127 | 125,Portrait of a man in red chalk (Leonardo)
128 | 126,Bathsheba at Bath
129 | 127,La Parisienne (Hidalgo painting)
130 | 128,Farmhouses Among Trees
131 | 129,"Two Open Rectangles, Excentric, Variation VI"
132 | 130,Penitent Magdalene (Donatello)
133 | 131,Self-Portrait with Two Circles
134 | 132,Self-Portrait with Beret and Turned-Up Collar
135 | 133,"Portrait of a Young Woman (Botticelli, Frankfurt)"
136 | 134,View of Delft (Vermeer)
137 | 135,A Girl Asleep (Vermeer)
138 | 136,Portrait of Pope Julius II
139 | 137,Abraham Lincoln (1912 statue)
140 | 138,1.26
141 | 139,Madonna and Child Playing with the Veil
142 | 140,A Road at Saint-Remy with Female Figure
143 | 141,A Lane near Arles
144 | 142,A Young Tiger Playing with Its Mother
145 | 143,The Judgment of Paris (Rubens)
146 | 144,Battle of Vigo Bay (painting)
147 | 145,Christopher Columbus (Vittori)
148 | 146,George Washington (DeLue)
149 | 147,Edwin B. Hay (bust)
150 | 148,Eight Stone Lions
151 | 149,"Sundial, Boy with Spider"
152 | 150,George Washington (1785-1792 statue)
153 | 151,Francesca da Rimini and Paolo Malatesta Appraised by Dante and Virgil
154 | 152,R. D. Whitehead Monument
155 | 153,Lieutenant General George Washington (statue)
156 | 154,Aurora (di Suvero)
157 | 155,Alexander Pushkin (Bourganov)
158 | 156,"Philip Jaisohn (Washington, DC)"
159 | 157,Seated Woman
160 | 158,Still Life: Vase with Pink Roses
161 | 159,"Bottle, Glass, Fork"
162 | 160,On the wallaby track
163 | 161,The Last Supper (Ghirlandaio)
164 | 162,Almond Blossoms
165 | 163,Hospital in Arles
166 | 164,Saint Francis Receiving the Stigmata (Giotto)
167 | 165,Church Pew with Worshippers
168 | 166,Hermes (Sculpture)
169 | 167,The Great Day of Girona
170 | 168,A Young Man Being Introduced to the Seven Liberal Arts
171 | 169,Venus and the Three Graces Presenting Gifts to a Young Woman (Botticelli)
172 | 170,Saint Jerome in His Study (after van Eyck)
173 | 171,Portrait of the Duke of Wellington
174 | 172,"The Entombment (Titian, 1525)"
175 | 173,Captain Nathan Hale
176 | 174,Drunkenness of Noah
177 | 175,Martyrdom of Saint Lawrence
178 | 176,Bust of Thomas Baker
179 | 177,"Statue of Yuri Gagarin, Greenwich"
180 | 178,Two Busts of Cardinal Scipione Borghese
181 | 179,Lamentation of Christ (van der Weyden)
182 | 180,Head of a Woman (Leonardo da Vinci)
183 | 181,Memorial to Maria Raggi
184 | 182,Babe's Dream
185 | 183,"Equestrian statue of Charles I, Charing Cross"
186 | 184,The Virgin and Child with Two Angels (Andrea del Verrochio)
187 | 185,Madonna and Child (Lippi)
188 | 186,The Hope of a Condemned Man
189 | 187,Cafe Gurzuf
190 | 188,Salvator Mundi (Leonardo da Vinci)
191 | 189,Saint Sebastian (Bernini)
192 | 190,Bust of Francesco I d'Este
193 | 191,Bust of Camilla Barbadoni
194 | 192,Bust of Francesco Barberini
195 | 193,Bust of Cardinal Richelieu
196 | 194,Bust of Cardinal Richilieu
197 | 195,Saint Bibiana (Bernini)
198 | 196,Double Ascension
199 | 197,James Garfield Memorial
200 | 198,Robert Burns (Stevenson)
201 | 199,Fallen Firefighters Memorial (Wu)
202 | 200,General Thaddeus Kosciuszko
203 | 201,Wandering Rocks (Smith)
204 | 202,Fishing (Carracci)
205 | 203,Hunting (Carracci)
206 | 204,The Beggars (Bruegel)
207 | 205,Magdalene with the Smoking Flame
208 | 206,Pia de' Tolomei (Rossetti painting)
209 | 207,Thatched Cottages and Houses
210 | 208,"Statue of Margaret Thatcher, Palace of Westminster"
211 | 209,Adrianus Jacobus Zuyderland
212 | 210,Self-Portrait as the Allegory of Painting
213 | 211,Trio (Sugarman)
214 | 212,Queen Califias Magic Circle
215 | 213,Frontal from La Seu d'Urgell or of The Apostles
216 | 214,The Consecration of Saint Augustine
217 | 215,"South Wind, Clear Sky"
218 | 216,"The Elder Sister (Bouguereau, 1869)"
219 | 217,Bust of Pope Paul V
220 | 218,"Ritual wine server (guang), Indianapolis"
221 | 219,Ritual wine server
222 | 220,"Ritual wine server (guang), Indianapolis Museum of Art, 60.43"
223 | 221,Jupiter and Antiope (van Dyck)
224 | 222,Saint Peter and Saint Paul (El Greco)
225 | 223,In the Loge
226 | 224,The Descent from the Cross (David Folley)
227 | 225,"Equestrian statue of George IV, Trafalgar Square"
228 | 226,Footballer (Nolan)
229 | 227,Bust of Alessandro Peretti di Montalto
230 | 228,White on White
231 | 229,Apse from the Carthedral of Urgell
232 | 230,Middlebury to Her Soldiers
233 | 231,Statue of Pope Clement X
234 | 232,Les Orangers
235 | 233,John Harvard (statue)
236 | 234,Before the Race
237 | 235,The Potato Harvest
238 | 236,Allegory of the Element Earth
239 | 237,The Duel After the Masquerade
240 | 238,The Archdukes Albert and Isabella Visiting a Collector's Cabinet
241 | 239,Angel of the Resurrection (Tiffany Studios stained glass window)
242 | 240,Charing Cross Bridge (Monet series)
243 | 241,Wall Street (photograph)
244 | 242,(Untitled) Blue Lady
245 | 243,First Steps (painting)
246 | 244,Sumbanese woman's ceremonial skirt (Indianapolis Museum of Art)
247 |
--------------------------------------------------------------------------------