├── .gitignore ├── pandas_dedupe ├── __init__.py ├── utility_functions.py ├── link_dataframes.py ├── dedupe_dataframe.py └── gazetteer_dataframe.py ├── setup.py └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | */__pycache__/* 2 | .python-version 3 | *_learned_settings 4 | *_training.json 5 | -------------------------------------------------------------------------------- /pandas_dedupe/__init__.py: -------------------------------------------------------------------------------- 1 | from pandas_dedupe.dedupe_dataframe import dedupe_dataframe 2 | from pandas_dedupe.link_dataframes import link_dataframes 3 | from pandas_dedupe.gazetteer_dataframe import gazetteer_dataframe 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | # read the contents of your README file 4 | from os import path 5 | this_directory = path.abspath(path.dirname(__file__)) 6 | with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f: 7 | long_description = f.read() 8 | 9 | 10 | setup(name='pandas_dedupe', 11 | version='1.5.0', 12 | description='The Dedupe library made easy with Pandas.', 13 | url='https://github.com/Lyonk71/pandas-dedupe', 14 | author='Keith Lyons', 15 | author_email='lyonk71@gmail.com', 16 | license='MIT', 17 | packages=['pandas_dedupe'], 18 | install_requires=[ 19 | 'dedupe>=2.0.0', 20 | 'unidecode', 21 | 'pandas', 22 | ], 23 | zip_safe=False, 24 | 25 | #Enable pypi description 26 | long_description=long_description, 27 | long_description_content_type="text/markdown") 28 | -------------------------------------------------------------------------------- /pandas_dedupe/utility_functions.py: -------------------------------------------------------------------------------- 1 | from unidecode import unidecode 2 | import pandas as pd 3 | import numpy as np 4 | from ast import literal_eval 5 | 6 | def trim(x): 7 | x = x.split() 8 | x = ' '.join(x) 9 | return x 10 | 11 | 12 | def clean_punctuation(df): 13 | for i in df.columns: 14 | df[i] = df[i].astype(str) 15 | df = df.applymap(lambda x: x.lower()) 16 | for i in df.columns: 17 | df[i] = df[i].str.replace('[^\w\s\.\-\(\)\,\:\/\\\\]','') 18 | df = df.applymap(lambda x: trim(x)) 19 | df = df.applymap(lambda x: unidecode(x)) 20 | for i in df.columns: 21 | df[i] = df[i].replace({'nan': None, 'none': None, 'nat': None}) 22 | return df 23 | 24 | def select_fields(fields, field_properties): 25 | for i in field_properties: 26 | if type(i)==str: 27 | fields.append({'field': i, 'type': 'String'}) 28 | elif len(i)==2: 29 | fields.append({'field': i[0], 'type': i[1]}) 30 | elif len(i)==3: 31 | if i[2] == 'has missing': 32 | fields.append({'field': i[0], 'type': i[1], 'has missing': True}) 33 | elif i[2] == 'crf': 34 | fields.append({'field': i[0], 'type': i[1], 'crf': True}) 35 | else: 36 | raise Exception(i[2] + " is not a valid field property") 37 | 38 | 39 | def latlong_datatype(x): 40 | if x is None: 41 | return None 42 | else: 43 | try: 44 | x = literal_eval(x) 45 | k,v = x 46 | k = float(k) 47 | v = float(v) 48 | return k, v 49 | except: 50 | raise Exception("Make sure that LatLong columns are tuples arranged like ('lat', 'lon')") 51 | 52 | 53 | def specify_type(df, field_properties): 54 | for i in field_properties: 55 | if i[1] == 'LatLong': 56 | df[i[0]] = df[i[0]].apply(lambda x: latlong_datatype(x)) 57 | elif i[1] == 
'Price': 58 | try: 59 | df[i[0]] = df[i[0]].str.replace(",","") 60 | df[i[0]] = df[i[0]].replace({None: np.nan}) 61 | df[i[0]] = df[i[0]].astype(float) 62 | df[i[0]] = df[i[0]].replace({np.nan: None}) 63 | except: 64 | raise Exception('Make sure that Price columns can be converted to float.') 65 | 66 | -------------------------------------------------------------------------------- /pandas_dedupe/link_dataframes.py: -------------------------------------------------------------------------------- 1 | from pandas_dedupe.utility_functions import * 2 | 3 | import os 4 | import logging 5 | 6 | import dedupe 7 | 8 | 9 | import pandas as pd 10 | 11 | 12 | 13 | 14 | def link_dataframes(dfa, dfb, field_properties, config_name="link_dataframes", n_cores=None): 15 | 16 | config_name = config_name.replace(" ", "_") 17 | 18 | settings_file = config_name + '_learned_settings' 19 | training_file = config_name + '_training.json' 20 | 21 | print('Importing data ...') 22 | 23 | dfa = clean_punctuation(dfa) 24 | specify_type(dfa, field_properties) 25 | 26 | dfa['index_field'] = dfa.index 27 | dfa['index_field'] = dfa['index_field'].apply(lambda x: "dfa" + str(x)) 28 | dfa.set_index(['index_field'], inplace=True) 29 | 30 | data_1 = dfa.to_dict(orient='index') 31 | 32 | 33 | dfb = clean_punctuation(dfb) 34 | specify_type(dfb, field_properties) 35 | 36 | dfb['index_field'] = dfb.index 37 | dfb['index_field'] = dfb['index_field'].apply(lambda x: "dfb" + str(x)) 38 | dfb.set_index(['index_field'], inplace=True) 39 | 40 | 41 | data_2 = dfb.to_dict(orient='index') 42 | # --------------------------------------------------------------------------------- 43 | 44 | 45 | 46 | # ## Training 47 | 48 | 49 | if os.path.exists(settings_file): 50 | print('Reading from', settings_file) 51 | with open(settings_file, 'rb') as sf : 52 | linker = dedupe.StaticRecordLink(sf, num_cores=n_cores) 53 | 54 | else: 55 | # Define the fields the linker will pay attention to 56 | # 57 | # select_fields builds the dedupe field definitions from the 58 | # field_properties argument. 59 | 60 | fields = [] 61 | select_fields(fields, field_properties) 62 | 63 | 64 | 65 | # Create a new linker object and pass our data model to it. 66 | linker = dedupe.RecordLink(fields, num_cores=n_cores) 67 | # To train the linker, we feed it a sample of records. 68 | linker.prepare_training(data_1, data_2, sample_size=15000) 69 | 70 | # If we have training data saved from a previous run of the linker, 71 | # look for it and load it in. 72 | # __Note:__ if you want to train from scratch, delete the training_file 73 | if os.path.exists(training_file): 74 | print('Reading labeled examples from ', training_file) 75 | with open(training_file) as tf : 76 | linker.prepare_training(data_1, data_2, training_file=tf) 77 | 78 | # ## Active learning 79 | # Dedupe will find the next pair of records 80 | # it is least certain about and ask you to label them as matches 81 | # or not. 82 | # use 'y', 'n' and 'u' keys to flag duplicates 83 | # press 'f' when you are finished 84 | print('Starting active labeling...') 85 | 86 | dedupe.console_label(linker) 87 | linker.train() 88 | 89 | # When finished, save our training away to disk 90 | with open(training_file, 'w') as tf : 91 | linker.write_training(tf) 92 | 93 | # Save our weights and predicates to disk. If the settings file 94 | # exists, we will skip all the training and learning next time we run 95 | # this file.
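# (The learned settings are written in a binary format, hence the 'wb' mode
# below; the training examples above are stored as plain JSON.)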
96 | with open(settings_file, 'wb') as sf : 97 | linker.write_settings(sf) 98 | 99 | 100 | # ## Blocking 101 | 102 | # ## Clustering 103 | 104 | # linker.join returns the matching record pairs across the two 105 | # dataframes, together with a confidence score for each pair. 106 | # The third argument is the score threshold; with 0, no candidate 107 | # match is dropped on score alone. 108 | 109 | 110 | 111 | print('Clustering...') 112 | linked_records = linker.join(data_1, data_2, 0) 113 | 114 | print('# duplicate sets', len(linked_records)) 115 | 116 | 117 | #Convert linked records into dataframe 118 | df_linked_records = pd.DataFrame(linked_records) 119 | 120 | df_linked_records['dfa_link'] = df_linked_records[0].apply(lambda x: x[0]) 121 | df_linked_records['dfb_link'] = df_linked_records[0].apply(lambda x: x[1]) 122 | df_linked_records.rename(columns={1: 'confidence'}, inplace=True) 123 | df_linked_records.drop(columns=[0], inplace=True) 124 | df_linked_records['cluster id'] = df_linked_records.index 125 | 126 | 127 | #For both dfa & dfb, add cluster id & confidence score from linked_records 128 | dfa.index.rename('dfa_link', inplace=True) 129 | dfa = dfa.merge(df_linked_records, on='dfa_link', how='left') 130 | 131 | dfb.index.rename('dfb_link', inplace=True) 132 | dfb = dfb.merge(df_linked_records, on='dfb_link', how='left') 133 | 134 | #Concatenate results from dfa + dfb 135 | df_final = pd.concat([dfa, dfb], ignore_index=True, sort=True) 136 | df_final = df_final.sort_values(by=['cluster id']) 137 | df_final = df_final.drop(columns=['dfa_link','dfb_link']) 138 | 139 | return df_final 140 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pandas-dedupe 2 | 3 | The Dedupe library made easy with Pandas. 4 | 5 | # Installation 6 | 7 | ``` 8 | pip install pandas-dedupe 9 | ``` 10 | 11 | # Video Tutorials 12 | 13 | [Basic Deduplication](https://www.youtube.com/watch?v=lCFEzRaqoJA) 14 | 15 | # Basic Usage 16 | 17 | A training file and a settings file will be created while running Dedupe. 18 | Keeping these files will eliminate the need to retrain your model in the future. 19 | 20 | If you would like to retrain your model from scratch, just delete the settings and training files. 21 | 22 | ### Deduplication (dedupe_dataframe) 23 | `dedupe_dataframe` is for deduplicating a single dataframe that may contain multiple records referring to the same entity. 24 | 25 | ```python 26 | import pandas as pd 27 | import pandas_dedupe 28 | 29 | #load dataframe 30 | df = pd.read_csv('test_names.csv') 31 | 32 | #initiate deduplication 33 | df_final = pandas_dedupe.dedupe_dataframe(df,['first_name', 'last_name', 'middle_initial']) 34 | 35 | #send output to csv 36 | df_final.to_csv('deduplication_output.csv') 37 | ``` 38 | 39 | ### Gazetteer deduplication (gazetteer_dataframe) 40 | `gazetteer_dataframe` is for matching a messy dataset against a 'canonical dataset' (i.e. 
the gazette) 41 | 42 | ```python 43 | import pandas as pd 44 | import pandas_dedupe 45 | 46 | #load dataframe 47 | df_clean = pd.read_csv('gazette.csv') 48 | df_messy = pd.read_csv('test_names.csv') 49 | 50 | #initiate deduplication 51 | df_final = pandas_dedupe.gazetteer_dataframe(df_clean, df_messy, 'fullname', canonicalize=True) 52 | 53 | #send output to csv 54 | df_final.to_csv('gazetteer_deduplication_output.csv') 55 | ``` 56 | 57 | 58 | ### Matching / Record Linkage 59 | 60 | Use identical field names when linking dataframes. 61 | Record linkage should only be used on dataframes that have already been deduplicated. 62 | 63 | ```python 64 | import pandas as pd 65 | import pandas_dedupe 66 | 67 | #load dataframes 68 | dfa = pd.read_csv('file_a.csv') 69 | dfb = pd.read_csv('file_b.csv') 70 | 71 | #initiate matching 72 | df_final = pandas_dedupe.link_dataframes(dfa, dfb, ['field_1', 'field_2', 'field_3', 'field_4']) 73 | 74 | #send output to csv 75 | df_final.to_csv('linkage_output.csv') 76 | ``` 77 | 78 | # Advanced Usage 79 | 80 | ### Canonicalize Fields 81 | 82 | The canonicalize parameter will standardize the field values within a given cluster. The original fields are also kept. 83 | 84 | ```python 85 | pandas_dedupe.dedupe_dataframe(df,['first_name', 'last_name', 'payment_type'], canonicalize=True) 86 | ``` 87 | 88 | ### Update Threshold (dedupe_dataframe and gazetteer_dataframe only) 89 | 90 | Group records into clusters only if the cophenetic similarity of the cluster is greater than 91 | the threshold. 92 | 93 | ```python 94 | pandas_dedupe.dedupe_dataframe(df, ['first_name', 'last_name'], threshold=.7) 95 | ``` 96 | 97 | ### Update Existing Model (dedupe_dataframe and gazetteer_dataframe only) 98 | 99 | If `update_model=True`, the existing training file is loaded so you can keep labeling and update the model. 100 | 101 | ```python 102 | pandas_dedupe.dedupe_dataframe(df, ['first_name', 'last_name'], update_model=True) 103 | ``` 104 | 105 | ### Update Sample Size 106 | 107 | Specifies the sample size used for training as a float from 0 to 1. By default it is 30% (0.3) of the data. 108 | ```python 109 | pandas_dedupe.dedupe_dataframe(df, ['first_name', 'last_name'], sample_size=0.5) 110 | ``` 111 | 112 | ### Specifying Types 113 | 114 | If you'd like to specify dates, spatial data, etc., do so here. The structure must be like so: 115 | `('field', 'type', 'additional_parameter')`. The `additional_parameter` element can be omitted. 116 | The default type is `String`. 117 | 118 | See the full list of types [below](#types). 119 | 120 | ```python 121 | # Price Example 122 | pandas_dedupe.dedupe_dataframe(df,['first_name', 'last_name', ('salary', 'Price')]) 123 | 124 | # has missing Example 125 | pandas_dedupe.link_dataframes(dfa, dfb, ['SSN', ('bio_pgraph', 'Text'), ('salary', 'Price', 'has missing')]) 126 | 127 | # crf Example 128 | pandas_dedupe.dedupe_dataframe(df,[('first_name', 'String', 'crf'), 'last_name', ('m_initial', 'Exact')]) 129 | ``` 130 | 131 | # Types 132 | 133 | Dedupe supports a variety of datatypes; a full list with documentation can be found [here](https://docs.dedupe.io/en/latest/Variable-definition.html#). 134 | 135 | pandas-dedupe officially supports the following datatypes: 136 | 137 | - **String** - Standard string comparison using a string distance metric. This is the default type. 138 | - **Text** - Comparison for sentences or paragraphs of text. Uses a cosine similarity metric. 139 | - **Price** - For comparing positive, non-zero numerical values. 140 | - **DateTime** - For comparing dates.
141 | - **LatLong** - (39.990334, 70.012) will not match to (40.01, 69.98) using a string distance 142 | metric, even though the points are in a geographically similar location. The LatLong type resolves 143 | this by calculating the haversine distance between compared coordinates. LatLong requires 144 | the field to be in the format (Lat, Long). The value can be a string, a tuple containing two 145 | strings, a tuple containing two floats, or a tuple containing two integers. If the format 146 | cannot be processed, you will get a traceback. 147 | - **Exact** - Tests whether fields are an exact match. 148 | - **Exists** - Sometimes, the presence or absence of data can be useful in predicting a match. 149 | The Exists type tests whether both, one, or neither of the fields are null. 150 | 151 | Additional supported parameters are: 152 | 153 | - **has missing** - Can be used if one of your data fields contains null values. 154 | - **crf** - Use conditional random fields for comparisons rather than a distance metric. May be more 155 | accurate in some cases, but runs much slower. Works with String and ShortString types. 156 | 157 | # Contributors 158 | 159 | [Tyler Marrs](http://tylermarrs.com/) - Refactored code, added docstrings, added `threshold` parameter. 160 | 161 | [Tawni Marrs](https://github.com/tawnimarrs) - Refactored code, added docstrings. 162 | 163 | [ieriii](https://github.com/ieriii) - Added `update_model` parameter, updated codebase to use `Dedupe 2.0`, added support for multiprocessing, added `gazetteer_dataframe`. 164 | 165 | [Daniel Marczin](https://github.com/dim5) - Extensive updates to documentation to enhance readability. 166 | 167 | [Alexis-Evelyn](https://github.com/alexis-evelyn) - Fixed logger warning related to Pandas. 168 | 169 | [Niels Horn](https://github.com/nilq) - Cleaned up utility functions. 170 | 171 | # Credits 172 | 173 | Many thanks to the folks at [DataMade](https://datamade.us/) for making the [Dedupe library](https://github.com/dedupeio/dedupe) publicly available. People interested in a code-free implementation of the dedupe library can find a link here: [Dedupe.io](https://dedupe.io/pricing/). 174 | -------------------------------------------------------------------------------- /pandas_dedupe/dedupe_dataframe.py: -------------------------------------------------------------------------------- 1 | from pandas_dedupe.utility_functions import ( 2 | clean_punctuation, 3 | select_fields, 4 | specify_type 5 | ) 6 | 7 | import os 8 | import logging 9 | import math 10 | 11 | import dedupe 12 | import pandas as pd 13 | 14 | 15 | logging.getLogger().setLevel(logging.WARNING) 16 | 17 | 18 | def _active_learning(data, sample_size, deduper, training_file, settings_file): 19 | """Internal method that trains the deduper model using active learning. 20 | Parameters 21 | ---------- 22 | data : dict 23 | The dictionary form of the dataframe that dedupe requires. 24 | sample_size : float, default 0.3 25 | Specify the sample size used for training as a float from 0 to 1. 26 | By default it is 30% (0.3) of our data. 27 | deduper : a dedupe model instance 28 | training_file : str 29 | A path to a training file that will be loaded to keep training 30 | from. 31 | settings_file : str 32 | A path to a settings file that will be loaded if it exists. 33 | 34 | Returns 35 | ------- 36 | dedupe.Dedupe 37 | A trained dedupe model instance. 38 | """ 39 | # To train dedupe, we feed it a sample of records.
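# sample_size is a fraction of the data (0 to 1); convert it to an absolute
# record count, since prepare_training expects a number of records to sample.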
40 | sample_num = math.floor(len(data) * sample_size) 41 | deduper.prepare_training(data, sample_size=sample_num) 42 | 43 | print('Starting active labeling...') 44 | 45 | dedupe.console_label(deduper) 46 | 47 | # Using the examples we just labeled, train the deduper and learn 48 | # blocking predicates 49 | deduper.train() 50 | 51 | # When finished, save our training to disk 52 | with open(training_file, 'w') as tf: 53 | deduper.write_training(tf) 54 | 55 | # Save our weights and predicates to disk. 56 | with open(settings_file, 'wb') as sf: 57 | deduper.write_settings(sf) 58 | 59 | return deduper 60 | 61 | def _train(settings_file, training_file, data, field_properties, sample_size, update_model, n_cores): 62 | """Internal method that trains the deduper model from scratch or update 63 | an existing dedupe model. 64 | Parameters 65 | ---------- 66 | settings_file : str 67 | A path to a settings file that will be loaded if it exists. 68 | training_file : str 69 | A path to a training file that will be loaded to keep training 70 | from. 71 | data : dict 72 | The dictionary form of the dataframe that dedupe requires. 73 | field_properties : dict 74 | The mapping of fields to their respective data types. Please 75 | see the dedupe documentation for further details. 76 | sample_size : float, default 0.3 77 | Specify the sample size used for training as a float from 0 to 1. 78 | By default it is 30% (0.3) of our data. 79 | update_model : bool, default False 80 | If True, it allows user to update existing model by uploading 81 | training file. 82 | n_cores : int, default None 83 | Specify the number of cores to use during clustering. 84 | By default n_cores is equal to None (i.e. use multipressing equal to CPU count). 85 | Returns 86 | ------- 87 | dedupe.Dedupe 88 | A dedupe model instance. 89 | """ 90 | # Define the fields dedupe will pay attention to 91 | fields = [] 92 | select_fields(fields, field_properties) 93 | 94 | if update_model == False: 95 | 96 | # If a settings file already exists, we'll just load that and skip training 97 | if os.path.exists(settings_file): 98 | print('Reading from', settings_file) 99 | with open(settings_file, 'rb') as f: 100 | deduper = dedupe.StaticDedupe(f, num_cores=n_cores) 101 | 102 | #Create a new deduper object and pass our data model to it. 103 | else: 104 | # Initialise dedupe 105 | deduper = dedupe.Dedupe(fields, num_cores=n_cores) 106 | 107 | # Launch active learning 108 | deduper = _active_learning(data, sample_size, deduper, training_file, settings_file) 109 | 110 | else: 111 | # ## Training 112 | # Initialise dedupe 113 | deduper = dedupe.Dedupe(fields, num_cores=n_cores) 114 | 115 | # Import existing model 116 | print('Reading labeled examples from ', training_file) 117 | with open(training_file, 'rb') as f: 118 | deduper.prepare_training(data, training_file=f) 119 | 120 | # Launch active learning 121 | deduper = _active_learning(data, sample_size, deduper, training_file, settings_file) 122 | 123 | return deduper 124 | 125 | 126 | def _cluster(deduper, data, threshold, canonicalize): 127 | """Internal method that clusters the data. 128 | Parameters 129 | ---------- 130 | deduper : dedupe.Deduper 131 | A trained instance of dedupe. 132 | data : dict 133 | The dedupe formatted data dictionary. 134 | threshold : dedupe.Threshold 135 | The threshold used for clustering. 136 | canonicalize : bool or list, default False 137 | Option that provides the canonical records as additional columns. 
138 | Specifying a list of column names only canonicalizes those columns. 139 | Returns 140 | ------- 141 | pd.DataFrame 142 | A dataframe storing the clustering results. 143 | """ 144 | # ## Clustering 145 | print('Clustering...') 146 | clustered_dupes = deduper.partition(data, threshold) 147 | 148 | print('# duplicate sets', len(clustered_dupes)) 149 | 150 | # Convert data_d to string so that Price & LatLong won't get traceback 151 | # during dedupe.canonicalize() 152 | for i in data.values(): 153 | for key in i: 154 | if i[key] is None: 155 | pass 156 | else: 157 | i[key] = str(i[key]) 158 | 159 | df_data = [] 160 | # ## Writing Results 161 | cluster_id = 0 162 | for (cluster_id, cluster) in enumerate(clustered_dupes): 163 | id_set, scores = cluster 164 | cluster_d = [data[c] for c in id_set] 165 | 166 | canonical_rep = None 167 | if canonicalize: 168 | canonical_rep = dedupe.canonicalize(cluster_d) 169 | 170 | for record_id, score in zip(id_set, scores): 171 | tmp = { 172 | 'Id': record_id, 173 | 'cluster id': cluster_id, 174 | 'confidence': score, 175 | } 176 | 177 | if canonicalize: 178 | fields_to_canon = canonical_rep.keys() 179 | 180 | if isinstance(canonicalize, list): 181 | fields_to_canon = canonicalize 182 | 183 | for key in fields_to_canon: 184 | canon_key = 'canonical_' + key 185 | tmp[canon_key] = canonical_rep[key] 186 | 187 | df_data.append(tmp) 188 | 189 | clustered_df = pd.DataFrame(df_data) 190 | clustered_df = clustered_df.set_index('Id') 191 | 192 | return clustered_df 193 | 194 | 195 | def dedupe_dataframe(df, field_properties, canonicalize=False, 196 | config_name="dedupe_dataframe", update_model=False, threshold=0.4, 197 | sample_size=0.3, n_cores=None): 198 | """Deduplicates a dataframe given fields of interest. 199 | Parameters 200 | ---------- 201 | df : pd.DataFrame 202 | The dataframe to deduplicate. 203 | field_properties : list 204 | A list specifying what fields to use for deduplicating records. 205 | canonicalize : bool or list, default False 206 | Option that provides the canonical records as additional columns. 207 | Specifying a list of column names only canonicalizes those columns. 208 | config_name : str, default dedupe_dataframe 209 | The configuration file name. Note that this will be used as 210 | a prefix to save the settings and training files. 211 | update_model : bool, default False 212 | If True, it allows user to update existing model by uploading 213 | training file. 214 | threshold : float, default 0.4 215 | Only put together records into clusters if the cophenetic similarity of the cluster 216 | is greater than the threshold. 217 | sample_size : float, default 0.3 218 | Specify the sample size used for training as a float from 0 to 1. 219 | By default it is 30% (0.3) of our data. 220 | n_cores : int, default None 221 | Specify the number of cores to use during clustering. 222 | By default n_cores is equal to None (i.e. use multipressing equal to CPU count). 223 | 224 | Returns 225 | ------- 226 | pd.DataFrame 227 | A pandas dataframe that contains the cluster id and confidence 228 | score. Optionally, it will contain canonicalized columns for all 229 | attributes of the record. 
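Examples
--------
An illustrative sketch of typical usage, adapted from the README (the CSV
path and column names are placeholders)::

    import pandas as pd
    import pandas_dedupe

    df = pd.read_csv('test_names.csv')
    df_final = pandas_dedupe.dedupe_dataframe(
        df, ['first_name', 'last_name', 'middle_initial'])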
230 | """ 231 | # Import Data 232 | config_name = config_name.replace(" ", "_") 233 | 234 | settings_file = config_name + '_learned_settings' 235 | training_file = config_name + '_training.json' 236 | 237 | print('Importing data ...') 238 | 239 | df = clean_punctuation(df) 240 | 241 | specify_type(df, field_properties) 242 | 243 | df['dictionary'] = df.apply( 244 | lambda x: dict(zip(df.columns, x.tolist())), axis=1) 245 | data_d = dict(zip(df.index, df.dictionary)) 246 | 247 | # Train or load the model 248 | deduper = _train(settings_file, training_file, data_d, field_properties, 249 | sample_size, update_model, n_cores) 250 | 251 | # Cluster the records 252 | clustered_df = _cluster(deduper, data_d, threshold, canonicalize) 253 | results = df.join(clustered_df, how='left') 254 | results.drop(['dictionary'], axis=1, inplace=True) 255 | 256 | return results 257 | -------------------------------------------------------------------------------- /pandas_dedupe/gazetteer_dataframe.py: -------------------------------------------------------------------------------- 1 | from pandas_dedupe.utility_functions import ( 2 | clean_punctuation, 3 | select_fields, 4 | specify_type 5 | ) 6 | 7 | import os 8 | import io 9 | import logging 10 | import math 11 | 12 | import dedupe 13 | import pandas as pd 14 | 15 | 16 | logging.getLogger().setLevel(logging.WARNING) 17 | 18 | 19 | def _active_learning(clean_data, messy_data, sample_size, deduper, training_file, settings_file): 20 | """Internal method that trains the deduper model using active learning. 21 | Parameters 22 | ---------- 23 | clean_data : dict 24 | The dictionary form of the gazette that gazetteer_dedupe requires. 25 | messy_data : dict 26 | The dictionary form of the messy data that needs to be deduplicated 27 | (and canonicalized) 28 | sample_size : float, default 0.3 29 | Specify the sample size used for training as a float from 0 to 1. 30 | By default it is 30% (0.3) of our data. 31 | deduper : a gazetteer model instance 32 | training_file : str 33 | A path to a training file that will be loaded to keep training 34 | from. 35 | settings_file : str 36 | A path to a settings file that will be loaded if it exists. 37 | 38 | Returns 39 | ------- 40 | dedupe.Gazetteer 41 | A trained gazetteer model instance. 42 | """ 43 | # To train dedupe, we feed it a sample of records. 44 | sample_num = math.floor(len(messy_data) * sample_size) 45 | deduper.prepare_training(clean_data, messy_data, sample_size=sample_num) 46 | 47 | print('Starting active labeling...') 48 | 49 | dedupe.console_label(deduper) 50 | 51 | # Using the examples we just labeled, train the deduper and learn 52 | # blocking predicates 53 | deduper.train() 54 | 55 | # When finished, save our training to disk 56 | with open(training_file, 'w') as tf: 57 | deduper.write_training(tf) 58 | 59 | # Save our weights and predicates to disk. 60 | with open(settings_file, 'wb') as sf: 61 | deduper.write_settings(sf) 62 | 63 | return deduper 64 | 65 | def _train(settings_file, training_file, clean_data, messy_data, field_properties, sample_size, update_model, n_cores): 66 | """Internal method that trains the deduper model from scratch or update 67 | an existing dedupe model. 68 | Parameters 69 | ---------- 70 | settings_file : str 71 | A path to a settings file that will be loaded if it exists. 72 | training_file : str 73 | A path to a training file that will be loaded to keep training 74 | from. 75 | clean_data : dict 76 | The dictionary form of the gazette that gazetteer_dedupe requires. 
77 | messy_data : dict 78 | The dictionary form of the messy data that needs to be deduplicated 79 | (and canonicalized) 80 | field_properties : dict 81 | The mapping of fields to their respective data types. Please 82 | see the dedupe documentation for further details. 83 | sample_size : float, default 0.3 84 | Specify the sample size used for training as a float from 0 to 1. 85 | By default it is 30% (0.3) of our data. 86 | update_model : bool, default False 87 | If True, it allows user to update existing model by uploading 88 | training file. 89 | n_cores : int, default None 90 | Specify the number of cores to use during clustering. 91 | By default n_cores is equal to None (i.e. use multipressing equal to CPU count). 92 | Returns 93 | ------- 94 | dedupe.Gazetteer 95 | A gazetteer model instance. 96 | """ 97 | # Define the fields dedupe will pay attention to 98 | fields = [] 99 | select_fields(fields, [field_properties]) 100 | 101 | if update_model == False: 102 | 103 | # If a settings file already exists, we'll just load that and skip training 104 | if os.path.exists(settings_file): 105 | print('Reading from', settings_file) 106 | with open(settings_file, 'rb') as f: 107 | deduper = dedupe.StaticGazetteer(f, num_cores=n_cores) 108 | 109 | #Create a new deduper object and pass our data model to it. 110 | else: 111 | # Initialise dedupe 112 | deduper = dedupe.Gazetteer(fields, num_cores=n_cores) 113 | 114 | # Launch active learning 115 | deduper = _active_learning(clean_data, messy_data, sample_size, deduper, training_file, settings_file) 116 | 117 | else: 118 | # ## Training 119 | # Initialise dedupe 120 | deduper = dedupe.Gazetteer(fields, num_cores=n_cores) 121 | 122 | # Import existing model 123 | print('Reading labeled examples from ', training_file) 124 | with open(training_file, 'rb') as f: 125 | deduper.prepare_training(clean_data, messy_data, training_file=f) 126 | 127 | # Launch active learning 128 | deduper = _active_learning(clean_data, messy_data, sample_size, deduper, training_file, settings_file) 129 | 130 | return deduper 131 | 132 | 133 | def _cluster(deduper, clean_data, messy_data, threshold, canonicalize): 134 | """Internal method that clusters the data. 135 | Parameters 136 | ---------- 137 | deduper : dedupe.Gazetteer 138 | A trained instance of gazetteer dedupe. 139 | clean_data : dict 140 | The dictionary form of the gazette that gazetteer_dedupe requires. 141 | messy_data : dict 142 | The dictionary form of the messy data that needs to be deduplicated 143 | (and canonicalized) 144 | threshold : dedupe.Threshold 145 | The threshold used for clustering. 146 | canonicalize : bool or list, default False 147 | Option that provides the canonical records as additional columns. 148 | Specifying a list of column names only canonicalizes those columns. 149 | Returns 150 | ------- 151 | pd.DataFrame 152 | A dataframe storing the clustering results. 
153 | """ 154 | # ## Clustering 155 | print('Clustering...') 156 | deduper.index(clean_data) 157 | 158 | clustered_dupes = deduper.search(messy_data, threshold, n_matches=None, generator=False) 159 | print('# duplicate sets', len(clustered_dupes)) 160 | 161 | # Convert data_d to string so that Price & LatLong won't get traceback 162 | # during dedupe.canonicalize() 163 | for i in messy_data.values(): 164 | for key in i: 165 | if i[key] is None: 166 | pass 167 | else: 168 | i[key] = str(i[key]) 169 | 170 | df_data = [] 171 | # ## Writing Results 172 | for _, (messy_id, matches) in enumerate(clustered_dupes): 173 | for canon_id, scores in matches: 174 | 175 | tmp = { 176 | 'cluster id': canon_id, 177 | 'confidence': scores, 178 | 'record id': messy_id 179 | } 180 | df_data.append(tmp) 181 | 182 | # Add canonical name 183 | if canonicalize: 184 | clean_data_dict = pd.DataFrame.from_dict(clean_data).T.add_prefix('canonical_') 185 | clustered_df = (pd.DataFrame.from_dict(df_data) # Create cluster result dataframe 186 | .set_index('cluster id', drop=False) # Note: cluster id is the index of clean_data (i.e. gazette) 187 | .join(clean_data_dict, how='left') # join clustered results and gazette 188 | .set_index('record id') # Note: record id is the index of the messy_data 189 | ) 190 | else: 191 | clustered_df = (pd.DataFrame.from_dict(df_data) # Create clustered results dataframe 192 | .set_index('record id') # Note: record id is the index of messy_data 193 | ) 194 | 195 | # Drop duplicates (i.e. keep canonical name with max confidence) 196 | # Note: the reason for this is that gazetteer dedupe might assign the same obs to multiple clusters 197 | confidence_maxes = clustered_df.groupby([clustered_df.index])['confidence'].transform(max) # Calculate max confidence 198 | clustered_df = clustered_df.loc[clustered_df['confidence'] == confidence_maxes] # Keep rows with max confidence 199 | clustered_df = clustered_df.loc[~clustered_df.index.duplicated(keep='first')] # If same confidence keep the first obs 200 | 201 | return clustered_df 202 | 203 | 204 | def gazetteer_dataframe(clean_data, messy_data, field_properties, canonicalize=False, 205 | config_name="gazetteer_dataframe", update_model=False, threshold=0.3, 206 | sample_size=1, n_cores=None): 207 | """Deduplicates a dataframe given fields of interest. 208 | Parameters 209 | ---------- 210 | clean_data : pd.DataFrame 211 | The gazetteer dataframe. 212 | messy_data : pd.DataFrame 213 | The dataframe to deduplicate. 214 | field_properties : str 215 | A string specifying what fields to use for deduplicating records. 216 | canonicalize : bool or list, default False 217 | Option that provides the canonical records as additional columns. 218 | Specifying a list of column names only canonicalizes those columns. 219 | setting_file : str, default None. 220 | the default name of the setting file is dedupe_dataframe_settings if None is provided. 221 | training_file : str, default None 222 | the default name of the setting file is dedupe_dataframe_training.json if None is provided. 223 | Note: the name of the training file should include the .json extension. 224 | update_model : bool, default False 225 | If True, it allows user to update existing model by uploading 226 | training file. 227 | threshold : float, default 0.3 228 | only consider put together records into clusters if the cophenetic similarity of the cluster 229 | is greater than the threshold. 
230 | sample_size : float, default 0.3 231 | Specify the sample size used for training as a float from 0 to 1. 232 | By default it is 30% (0.3) of our data. 233 | n_cores : int, default None 234 | Specify the number of cores to use during clustering. 235 | By default n_cores is equal to None (i.e. use multipressing equal to CPU count). 236 | Returns 237 | ------- 238 | pd.DataFrame 239 | A pandas dataframe that contains the cluster id and confidence 240 | score. Optionally, it will contain canonicalized columns for all 241 | attributes of the record. 242 | """ 243 | # Import Data 244 | config_name = config_name.replace(" ", "_") 245 | 246 | settings_file = config_name + '_learned_settings' 247 | training_file = config_name + '_training.json' 248 | 249 | print('Importing data ...') 250 | assert type(clean_data)==pd.core.frame.DataFrame, 'Please provide a gazette in pandas dataframe format' 251 | assert len(clean_data.columns)==1, 'Please provide a gazetteer dataframe made of a single variable' 252 | assert type(field_properties) == str, 'field_properties must be in string (str) format' 253 | 254 | # Common column name 255 | common_name = clean_data.columns[0] 256 | 257 | # Canonical dataset (i.e. gazette) 258 | df_canonical = clean_punctuation(clean_data) 259 | df_canonical.rename(columns={field_properties: common_name}, inplace=True) 260 | specify_type(df_canonical, [common_name]) 261 | 262 | df_canonical['dictionary'] = df_canonical.apply( 263 | lambda x: dict(zip(df_canonical.columns, x.tolist())), axis=1) 264 | canonical = dict(zip(df_canonical.index, df_canonical.dictionary)) 265 | 266 | # Messy dataset 267 | df_messy = clean_punctuation(messy_data) 268 | df_messy.rename(columns={field_properties: common_name}, inplace=True) 269 | specify_type(df_messy, [common_name]) 270 | 271 | df_messy['dictionary'] = df_messy.apply( 272 | lambda x: dict(zip(df_messy.columns, x.tolist())), axis=1) 273 | messy = dict(zip(df_messy.index, df_messy.dictionary)) 274 | 275 | # Train or load the model 276 | deduper = _train(settings_file, training_file, canonical, messy, common_name, 277 | sample_size, update_model, n_cores) 278 | 279 | # Cluster the records 280 | clustered_df = _cluster(deduper, canonical, messy, threshold, canonicalize) 281 | results = messy_data.join(clustered_df, how='left') 282 | results.rename(columns={'canonical_'+str(common_name): 'canonical_'+str(field_properties)}, inplace=True) 283 | 284 | return results 285 | --------------------------------------------------------------------------------