├── .gitignore ├── pandas_dedupe ├── __init__.py ├── utility_functions.py ├── link_dataframes.py ├── dedupe_dataframe.py └── gazetteer_dataframe.py ├── setup.py └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | */__pycache__/* 2 | .python-version 3 | *_learned_settings 4 | *_training.json 5 | -------------------------------------------------------------------------------- /pandas_dedupe/__init__.py: -------------------------------------------------------------------------------- 1 | from pandas_dedupe.dedupe_dataframe import dedupe_dataframe 2 | from pandas_dedupe.link_dataframes import link_dataframes 3 | from pandas_dedupe.gazetteer_dataframe import gazetteer_dataframe 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | # read the contents of your README file 4 | from os import path 5 | this_directory = path.abspath(path.dirname(__file__)) 6 | with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f: 7 | long_description = f.read() 8 | 9 | 10 | setup(name='pandas_dedupe', 11 | version='1.5.0', 12 | description='The Dedupe library made easy with Pandas.', 13 | url='https://github.com/Lyonk71/pandas-dedupe', 14 | author='Keith Lyons', 15 | author_email='lyonk71@gmail.com', 16 | license='MIT', 17 | packages=['pandas_dedupe'], 18 | install_requires=[ 19 | 'dedupe>=2.0.0', 20 | 'unidecode', 21 | 'pandas', 22 | ], 23 | zip_safe=False, 24 | 25 | #Enable pypi description 26 | long_description=long_description, 27 | long_description_content_type="text/markdown") 28 | -------------------------------------------------------------------------------- /pandas_dedupe/utility_functions.py: -------------------------------------------------------------------------------- 1 | from unidecode import unidecode 2 | import pandas as pd 3 | import numpy as np 4 | from ast import literal_eval 5 | 6 | def trim(x): 7 | x = x.split() 8 | x = ' '.join(x) 9 | return x 10 | 11 | 12 | def clean_punctuation(df): 13 | for i in df.columns: 14 | df[i] = df[i].astype(str) 15 | df = df.applymap(lambda x: x.lower()) 16 | for i in df.columns: 17 | df[i] = df[i].str.replace('[^\w\s\.\-\(\)\,\:\/\\\\]','') 18 | df = df.applymap(lambda x: trim(x)) 19 | df = df.applymap(lambda x: unidecode(x)) 20 | for i in df.columns: 21 | df[i] = df[i].replace({'nan': None, 'none': None, 'nat': None}) 22 | return df 23 | 24 | def select_fields(fields, field_properties): 25 | for i in field_properties: 26 | if type(i)==str: 27 | fields.append({'field': i, 'type': 'String'}) 28 | elif len(i)==2: 29 | fields.append({'field': i[0], 'type': i[1]}) 30 | elif len(i)==3: 31 | if i[2] == 'has missing': 32 | fields.append({'field': i[0], 'type': i[1], 'has missing': True}) 33 | elif i[2] == 'crf': 34 | fields.append({'field': i[0], 'type': i[1], 'crf': True}) 35 | else: 36 | raise Exception(i[2] + " is not a valid field property") 37 | 38 | 39 | def latlong_datatype(x): 40 | if x is None: 41 | return None 42 | else: 43 | try: 44 | x = literal_eval(x) 45 | k,v = x 46 | k = float(k) 47 | v = float(v) 48 | return k, v 49 | except: 50 | raise Exception("Make sure that LatLong columns are tuples arranged like ('lat', 'lon')") 51 | 52 | 53 | def specify_type(df, field_properties): 54 | for i in field_properties: 55 | if i[1] == 'LatLong': 56 | df[i[0]] = df[i[0]].apply(lambda x: latlong_datatype(x)) 57 | elif i[1] == 
'Price': 58 | try: 59 | df[i[0]] = df[i[0]].str.replace(",","") 60 | df[i[0]] = df[i[0]].replace({None: np.nan}) 61 | df[i[0]] = df[i[0]].astype(float) 62 | df[i[0]] = df[i[0]].replace({np.nan: None}) 63 | except: 64 | raise Exception('Make sure that Price columns can be converted to float.') 65 | 66 | -------------------------------------------------------------------------------- /pandas_dedupe/link_dataframes.py: -------------------------------------------------------------------------------- 1 | from pandas_dedupe.utility_functions import * 2 | 3 | import os 4 | import logging 5 | 6 | import dedupe 7 | 8 | 9 | import pandas as pd 10 | 11 | 12 | 13 | 14 | def link_dataframes(dfa, dfb, field_properties, config_name="link_dataframes", n_cores=None): 15 | 16 | config_name = config_name.replace(" ", "_") 17 | 18 | settings_file = config_name + '_learned_settings' 19 | training_file = config_name + '_training.json' 20 | 21 | print('Importing data ...') 22 | 23 | dfa = clean_punctuation(dfa) 24 | specify_type(dfa, field_properties) 25 | 26 | dfa['index_field'] = dfa.index 27 | dfa['index_field'] = dfa['index_field'].apply(lambda x: "dfa" + str(x)) 28 | dfa.set_index(['index_field'], inplace=True) 29 | 30 | data_1 = dfa.to_dict(orient='index') 31 | 32 | 33 | dfb = clean_punctuation(dfb) 34 | specify_type(dfb, field_properties) 35 | 36 | dfb['index_field'] = dfb.index 37 | dfb['index_field'] = dfb['index_field'].apply(lambda x: "dfb" + str(x)) 38 | dfb.set_index(['index_field'], inplace=True) 39 | 40 | 41 | data_2 = dfb.to_dict(orient='index') 42 | # --------------------------------------------------------------------------------- 43 | 44 | 45 | 46 | # ## Training 47 | 48 | 49 | if os.path.exists(settings_file): 50 | print('Reading from', settings_file) 51 | with open(settings_file, 'rb') as sf : 52 | linker = dedupe.StaticRecordLink(sf, num_cores=n_cores) 53 | 54 | else: 55 | # Define the fields the linker will pay attention to 56 | # 57 | # select_fields builds the dedupe field definitions from the 58 | # field_properties argument. 59 | 60 | fields = [] 61 | select_fields(fields, field_properties) 62 | 63 | 64 | 65 | # Create a new linker object and pass our data model to it. 66 | linker = dedupe.RecordLink(fields, num_cores=n_cores) 67 | # To train the linker, we feed it a sample of records. 68 | linker.prepare_training(data_1, data_2, sample_size=15000) 69 | 70 | # If we have training data saved from a previous run of the linker, 71 | # look for it and load it in. 72 | # __Note:__ if you want to train from scratch, delete the training_file 73 | if os.path.exists(training_file): 74 | print('Reading labeled examples from ', training_file) 75 | with open(training_file) as tf : 76 | linker.prepare_training(data_1, data_2, training_file=tf) 77 | 78 | # ## Active learning 79 | # Dedupe will find the next pair of records 80 | # it is least certain about and ask you to label them as matches 81 | # or not. 82 | # use 'y', 'n' and 'u' keys to flag duplicates 83 | # press 'f' when you are finished 84 | print('Starting active labeling...') 85 | 86 | dedupe.console_label(linker) 87 | linker.train() 88 | 89 | # When finished, save our training away to disk 90 | with open(training_file, 'w') as tf : 91 | linker.write_training(tf) 92 | 93 | # Save our weights and predicates to disk. If the settings file 94 | # exists, we will skip all the training and learning next time we run 95 | # this file.
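# (The learned settings are written in a binary format, hence the 'wb' mode
# below; the training examples above are stored as plain JSON.)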
96 | with open(settings_file, 'wb') as sf : 97 | linker.write_settings(sf) 98 | 99 | 100 | # ## Blocking 101 | 102 | # ## Clustering 103 | 104 | # linker.join returns the matching record pairs across the two 105 | # dataframes, together with a confidence score for each pair. 106 | # The third argument is the score threshold; with 0, no candidate 107 | # match is dropped on score alone. 108 | 109 | 110 | 111 | print('Clustering...') 112 | linked_records = linker.join(data_1, data_2, 0) 113 | 114 | print('# duplicate sets', len(linked_records)) 115 | 116 | 117 | #Convert linked records into dataframe 118 | df_linked_records = pd.DataFrame(linked_records) 119 | 120 | df_linked_records['dfa_link'] = df_linked_records[0].apply(lambda x: x[0]) 121 | df_linked_records['dfb_link'] = df_linked_records[0].apply(lambda x: x[1]) 122 | df_linked_records.rename(columns={1: 'confidence'}, inplace=True) 123 | df_linked_records.drop(columns=[0], inplace=True) 124 | df_linked_records['cluster id'] = df_linked_records.index 125 | 126 | 127 | #For both dfa & dfb, add cluster id & confidence score from linked_records 128 | dfa.index.rename('dfa_link', inplace=True) 129 | dfa = dfa.merge(df_linked_records, on='dfa_link', how='left') 130 | 131 | dfb.index.rename('dfb_link', inplace=True) 132 | dfb = dfb.merge(df_linked_records, on='dfb_link', how='left') 133 | 134 | #Concatenate results from dfa + dfb 135 | df_final = pd.concat([dfa, dfb], ignore_index=True, sort=True) 136 | df_final = df_final.sort_values(by=['cluster id']) 137 | df_final = df_final.drop(columns=['dfa_link','dfb_link']) 138 | 139 | return df_final 140 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pandas-dedupe 2 | 3 | The Dedupe library made easy with Pandas. 4 | 5 | # Installation 6 | 7 | ``` 8 | pip install pandas-dedupe 9 | ``` 10 | 11 | # Video Tutorials 12 | 13 | [Basic Deduplication](https://www.youtube.com/watch?v=lCFEzRaqoJA) 14 | 15 | # Basic Usage 16 | 17 | A training file and a settings file will be created while running Dedupe. 18 | Keeping these files will eliminate the need to retrain your model in the future. 19 | 20 | If you would like to retrain your model from scratch, just delete the settings and training files. 21 | 22 | ### Deduplication (dedupe_dataframe) 23 | `dedupe_dataframe` is for deduplicating a single dataframe that may contain multiple records referring to the same entity. 24 | 25 | ```python 26 | import pandas as pd 27 | import pandas_dedupe 28 | 29 | #load dataframe 30 | df = pd.read_csv('test_names.csv') 31 | 32 | #initiate deduplication 33 | df_final = pandas_dedupe.dedupe_dataframe(df,['first_name', 'last_name', 'middle_initial']) 34 | 35 | #send output to csv 36 | df_final.to_csv('deduplication_output.csv') 37 | ``` 38 | 39 | ### Gazetteer deduplication (gazetteer_dataframe) 40 | `gazetteer_dataframe` is for matching a messy dataset against a 'canonical dataset' (i.e. 
the gazette) 41 | 42 | ```python 43 | import pandas as pd 44 | import pandas_dedupe 45 | 46 | #load dataframe 47 | df_clean = pd.read_csv('gazette.csv') 48 | df_messy = pd.read_csv('test_names.csv') 49 | 50 | #initiate deduplication 51 | df_final = pandas_dedupe.gazetteer_dataframe(df_clean, df_messy, 'fullname', canonicalize=True) 52 | 53 | #send output to csv 54 | df_final.to_csv('gazetteer_deduplication_output.csv') 55 | ``` 56 | 57 | 58 | ### Matching / Record Linkage 59 | 60 | Use identical field names when linking dataframes. 61 | Record linkage should only be used on dataframes that have already been deduplicated. 62 | 63 | ```python 64 | import pandas as pd 65 | import pandas_dedupe 66 | 67 | #load dataframes 68 | dfa = pd.read_csv('file_a.csv') 69 | dfb = pd.read_csv('file_b.csv') 70 | 71 | #initiate matching 72 | df_final = pandas_dedupe.link_dataframes(dfa, dfb, ['field_1', 'field_2', 'field_3', 'field_4']) 73 | 74 | #send output to csv 75 | df_final.to_csv('linkage_output.csv') 76 | ``` 77 | 78 | # Advanced Usage 79 | 80 | ### Canonicalize Fields 81 | 82 | The canonicalize parameter will standardize the field values within a given cluster. The original fields are also kept. 83 | 84 | ```python 85 | pandas_dedupe.dedupe_dataframe(df,['first_name', 'last_name', 'payment_type'], canonicalize=True) 86 | ``` 87 | 88 | ### Update Threshold (dedupe_dataframe and gazetteer_dataframe only) 89 | 90 | Group records into clusters only if the cophenetic similarity of the cluster is greater than 91 | the threshold. 92 | 93 | ```python 94 | pandas_dedupe.dedupe_dataframe(df, ['first_name', 'last_name'], threshold=.7) 95 | ``` 96 | 97 | ### Update Existing Model (dedupe_dataframe and gazetteer_dataframe only) 98 | 99 | If `update_model=True`, the existing training file is loaded so you can keep labeling and update the model. 100 | 101 | ```python 102 | pandas_dedupe.dedupe_dataframe(df, ['first_name', 'last_name'], update_model=True) 103 | ``` 104 | 105 | ### Update Sample Size 106 | 107 | Specifies the sample size used for training as a float from 0 to 1. By default it is 30% (0.3) of the data. 108 | ```python 109 | pandas_dedupe.dedupe_dataframe(df, ['first_name', 'last_name'], sample_size=0.5) 110 | ``` 111 | 112 | ### Specifying Types 113 | 114 | If you'd like to specify dates, spatial data, etc., do so here. The structure must be like so: 115 | `('field', 'type', 'additional_parameter')`. The `additional_parameter` element can be omitted. 116 | The default type is `String`. 117 | 118 | See the full list of types [below](#types). 119 | 120 | ```python 121 | # Price Example 122 | pandas_dedupe.dedupe_dataframe(df,['first_name', 'last_name', ('salary', 'Price')]) 123 | 124 | # has missing Example 125 | pandas_dedupe.link_dataframes(dfa, dfb, ['SSN', ('bio_pgraph', 'Text'), ('salary', 'Price', 'has missing')]) 126 | 127 | # crf Example 128 | pandas_dedupe.dedupe_dataframe(df,[('first_name', 'String', 'crf'), 'last_name', ('m_initial', 'Exact')]) 129 | ``` 130 | 131 | # Types 132 | 133 | Dedupe supports a variety of datatypes; a full list with documentation can be found [here](https://docs.dedupe.io/en/latest/Variable-definition.html#). 134 | 135 | pandas-dedupe officially supports the following datatypes: 136 | 137 | - **String** - Standard string comparison using a string distance metric. This is the default type. 138 | - **Text** - Comparison for sentences or paragraphs of text. Uses a cosine similarity metric. 139 | - **Price** - For comparing positive, non-zero numerical values. 140 | - **DateTime** - For comparing dates.
141 | - **LatLong** - (39.990334, 70.012) will not match to (40.01, 69.98) using a string distance 142 | metric, even though the points are in a geographically similar location. The LatLong type resolves 143 | this by calculating the haversine distance between compared coordinates. LatLong requires 144 | the field to be in the format (Lat, Long). The value can be a string, a tuple containing two 145 | strings, a tuple containing two floats, or a tuple containing two integers. If the format 146 | cannot be processed, you will get a traceback. 147 | - **Exact** - Tests whether fields are an exact match. 148 | - **Exists** - Sometimes, the presence or absence of data can be useful in predicting a match. 149 | The Exists type tests whether both, one, or neither of the fields are null. 150 | 151 | Additional supported parameters are: 152 | 153 | - **has missing** - Can be used if one of your data fields contains null values. 154 | - **crf** - Use conditional random fields for comparisons rather than a distance metric. May be more 155 | accurate in some cases, but runs much slower. Works with String and ShortString types. 156 | 157 | # Contributors 158 | 159 | [Tyler Marrs](http://tylermarrs.com/) - Refactored code, added docstrings, added `threshold` parameter. 160 | 161 | [Tawni Marrs](https://github.com/tawnimarrs) - Refactored code, added docstrings. 162 | 163 | [ieriii](https://github.com/ieriii) - Added `update_model` parameter, updated codebase to use `Dedupe 2.0`, added support for multiprocessing, added `gazetteer_dataframe`. 164 | 165 | [Daniel Marczin](https://github.com/dim5) - Extensive updates to documentation to enhance readability. 166 | 167 | [Alexis-Evelyn](https://github.com/alexis-evelyn) - Fixed logger warning related to Pandas. 168 | 169 | [Niels Horn](https://github.com/nilq) - Cleaned up utility functions. 170 | 171 | # Credits 172 | 173 | Many thanks to the folks at [DataMade](https://datamade.us/) for making the [Dedupe library](https://github.com/dedupeio/dedupe) publicly available. People interested in a code-free implementation of the dedupe library can find a link here: [Dedupe.io](https://dedupe.io/pricing/). 174 | -------------------------------------------------------------------------------- /pandas_dedupe/dedupe_dataframe.py: -------------------------------------------------------------------------------- 1 | from pandas_dedupe.utility_functions import ( 2 | clean_punctuation, 3 | select_fields, 4 | specify_type 5 | ) 6 | 7 | import os 8 | import logging 9 | import math 10 | 11 | import dedupe 12 | import pandas as pd 13 | 14 | 15 | logging.getLogger().setLevel(logging.WARNING) 16 | 17 | 18 | def _active_learning(data, sample_size, deduper, training_file, settings_file): 19 | """Internal method that trains the deduper model using active learning. 20 | Parameters 21 | ---------- 22 | data : dict 23 | The dictionary form of the dataframe that dedupe requires. 24 | sample_size : float, default 0.3 25 | Specify the sample size used for training as a float from 0 to 1. 26 | By default it is 30% (0.3) of our data. 27 | deduper : a dedupe model instance 28 | training_file : str 29 | A path to a training file that will be loaded to keep training 30 | from. 31 | settings_file : str 32 | A path to a settings file that will be loaded if it exists. 33 | 34 | Returns 35 | ------- 36 | dedupe.Dedupe 37 | A trained dedupe model instance. 38 | """ 39 | # To train dedupe, we feed it a sample of records.
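# sample_size is a fraction of the data (0 to 1); convert it to an absolute
# record count, since prepare_training expects a number of records to sample.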
40 | sample_num = math.floor(len(data) * sample_size) 41 | deduper.prepare_training(data, sample_size=sample_num) 42 | 43 | print('Starting active labeling...') 44 | 45 | dedupe.console_label(deduper) 46 | 47 | # Using the examples we just labeled, train the deduper and learn 48 | # blocking predicates 49 | deduper.train() 50 | 51 | # When finished, save our training to disk 52 | with open(training_file, 'w') as tf: 53 | deduper.write_training(tf) 54 | 55 | # Save our weights and predicates to disk. 56 | with open(settings_file, 'wb') as sf: 57 | deduper.write_settings(sf) 58 | 59 | return deduper 60 | 61 | def _train(settings_file, training_file, data, field_properties, sample_size, update_model, n_cores): 62 | """Internal method that trains the deduper model from scratch or update 63 | an existing dedupe model. 64 | Parameters 65 | ---------- 66 | settings_file : str 67 | A path to a settings file that will be loaded if it exists. 68 | training_file : str 69 | A path to a training file that will be loaded to keep training 70 | from. 71 | data : dict 72 | The dictionary form of the dataframe that dedupe requires. 73 | field_properties : dict 74 | The mapping of fields to their respective data types. Please 75 | see the dedupe documentation for further details. 76 | sample_size : float, default 0.3 77 | Specify the sample size used for training as a float from 0 to 1. 78 | By default it is 30% (0.3) of our data. 79 | update_model : bool, default False 80 | If True, it allows user to update existing model by uploading 81 | training file. 82 | n_cores : int, default None 83 | Specify the number of cores to use during clustering. 84 | By default n_cores is equal to None (i.e. use multipressing equal to CPU count). 85 | Returns 86 | ------- 87 | dedupe.Dedupe 88 | A dedupe model instance. 89 | """ 90 | # Define the fields dedupe will pay attention to 91 | fields = [] 92 | select_fields(fields, field_properties) 93 | 94 | if update_model == False: 95 | 96 | # If a settings file already exists, we'll just load that and skip training 97 | if os.path.exists(settings_file): 98 | print('Reading from', settings_file) 99 | with open(settings_file, 'rb') as f: 100 | deduper = dedupe.StaticDedupe(f, num_cores=n_cores) 101 | 102 | #Create a new deduper object and pass our data model to it. 103 | else: 104 | # Initialise dedupe 105 | deduper = dedupe.Dedupe(fields, num_cores=n_cores) 106 | 107 | # Launch active learning 108 | deduper = _active_learning(data, sample_size, deduper, training_file, settings_file) 109 | 110 | else: 111 | # ## Training 112 | # Initialise dedupe 113 | deduper = dedupe.Dedupe(fields, num_cores=n_cores) 114 | 115 | # Import existing model 116 | print('Reading labeled examples from ', training_file) 117 | with open(training_file, 'rb') as f: 118 | deduper.prepare_training(data, training_file=f) 119 | 120 | # Launch active learning 121 | deduper = _active_learning(data, sample_size, deduper, training_file, settings_file) 122 | 123 | return deduper 124 | 125 | 126 | def _cluster(deduper, data, threshold, canonicalize): 127 | """Internal method that clusters the data. 128 | Parameters 129 | ---------- 130 | deduper : dedupe.Deduper 131 | A trained instance of dedupe. 132 | data : dict 133 | The dedupe formatted data dictionary. 134 | threshold : dedupe.Threshold 135 | The threshold used for clustering. 136 | canonicalize : bool or list, default False 137 | Option that provides the canonical records as additional columns. 
138 | Specifying a list of column names only canonicalizes those columns. 139 | Returns 140 | ------- 141 | pd.DataFrame 142 | A dataframe storing the clustering results. 143 | """ 144 | # ## Clustering 145 | print('Clustering...') 146 | clustered_dupes = deduper.partition(data, threshold) 147 | 148 | print('# duplicate sets', len(clustered_dupes)) 149 | 150 | # Convert data_d to string so that Price & LatLong won't get traceback 151 | # during dedupe.canonicalize() 152 | for i in data.values(): 153 | for key in i: 154 | if i[key] is None: 155 | pass 156 | else: 157 | i[key] = str(i[key]) 158 | 159 | df_data = [] 160 | # ## Writing Results 161 | cluster_id = 0 162 | for (cluster_id, cluster) in enumerate(clustered_dupes): 163 | id_set, scores = cluster 164 | cluster_d = [data[c] for c in id_set] 165 | 166 | canonical_rep = None 167 | if canonicalize: 168 | canonical_rep = dedupe.canonicalize(cluster_d) 169 | 170 | for record_id, score in zip(id_set, scores): 171 | tmp = { 172 | 'Id': record_id, 173 | 'cluster id': cluster_id, 174 | 'confidence': score, 175 | } 176 | 177 | if canonicalize: 178 | fields_to_canon = canonical_rep.keys() 179 | 180 | if isinstance(canonicalize, list): 181 | fields_to_canon = canonicalize 182 | 183 | for key in fields_to_canon: 184 | canon_key = 'canonical_' + key 185 | tmp[canon_key] = canonical_rep[key] 186 | 187 | df_data.append(tmp) 188 | 189 | clustered_df = pd.DataFrame(df_data) 190 | clustered_df = clustered_df.set_index('Id') 191 | 192 | return clustered_df 193 | 194 | 195 | def dedupe_dataframe(df, field_properties, canonicalize=False, 196 | config_name="dedupe_dataframe", update_model=False, threshold=0.4, 197 | sample_size=0.3, n_cores=None): 198 | """Deduplicates a dataframe given fields of interest. 199 | Parameters 200 | ---------- 201 | df : pd.DataFrame 202 | The dataframe to deduplicate. 203 | field_properties : list 204 | A list specifying what fields to use for deduplicating records. 205 | canonicalize : bool or list, default False 206 | Option that provides the canonical records as additional columns. 207 | Specifying a list of column names only canonicalizes those columns. 208 | config_name : str, default dedupe_dataframe 209 | The configuration file name. Note that this will be used as 210 | a prefix to save the settings and training files. 211 | update_model : bool, default False 212 | If True, it allows user to update existing model by uploading 213 | training file. 214 | threshold : float, default 0.4 215 | Only put together records into clusters if the cophenetic similarity of the cluster 216 | is greater than the threshold. 217 | sample_size : float, default 0.3 218 | Specify the sample size used for training as a float from 0 to 1. 219 | By default it is 30% (0.3) of our data. 220 | n_cores : int, default None 221 | Specify the number of cores to use during clustering. 222 | By default n_cores is equal to None (i.e. use multipressing equal to CPU count). 223 | 224 | Returns 225 | ------- 226 | pd.DataFrame 227 | A pandas dataframe that contains the cluster id and confidence 228 | score. Optionally, it will contain canonicalized columns for all 229 | attributes of the record. 
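Examples
--------
An illustrative sketch of typical usage, adapted from the README (the CSV
path and column names are placeholders)::

    import pandas as pd
    import pandas_dedupe

    df = pd.read_csv('test_names.csv')
    df_final = pandas_dedupe.dedupe_dataframe(
        df, ['first_name', 'last_name', 'middle_initial'])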
230 | """ 231 | # Import Data 232 | config_name = config_name.replace(" ", "_") 233 | 234 | settings_file = config_name + '_learned_settings' 235 | training_file = config_name + '_training.json' 236 | 237 | print('Importing data ...') 238 | 239 | df = clean_punctuation(df) 240 | 241 | specify_type(df, field_properties) 242 | 243 | df['dictionary'] = df.apply( 244 | lambda x: dict(zip(df.columns, x.tolist())), axis=1) 245 | data_d = dict(zip(df.index, df.dictionary)) 246 | 247 | # Train or load the model 248 | deduper = _train(settings_file, training_file, data_d, field_properties, 249 | sample_size, update_model, n_cores) 250 | 251 | # Cluster the records 252 | clustered_df = _cluster(deduper, data_d, threshold, canonicalize) 253 | results = df.join(clustered_df, how='left') 254 | results.drop(['dictionary'], axis=1, inplace=True) 255 | 256 | return results 257 | -------------------------------------------------------------------------------- /pandas_dedupe/gazetteer_dataframe.py: -------------------------------------------------------------------------------- 1 | from pandas_dedupe.utility_functions import ( 2 | clean_punctuation, 3 | select_fields, 4 | specify_type 5 | ) 6 | 7 | import os 8 | import io 9 | import logging 10 | import math 11 | 12 | import dedupe 13 | import pandas as pd 14 | 15 | 16 | logging.getLogger().setLevel(logging.WARNING) 17 | 18 | 19 | def _active_learning(clean_data, messy_data, sample_size, deduper, training_file, settings_file): 20 | """Internal method that trains the deduper model using active learning. 21 | Parameters 22 | ---------- 23 | clean_data : dict 24 | The dictionary form of the gazette that gazetteer_dedupe requires. 25 | messy_data : dict 26 | The dictionary form of the messy data that needs to be deduplicated 27 | (and canonicalized) 28 | sample_size : float, default 0.3 29 | Specify the sample size used for training as a float from 0 to 1. 30 | By default it is 30% (0.3) of our data. 31 | deduper : a gazetteer model instance 32 | training_file : str 33 | A path to a training file that will be loaded to keep training 34 | from. 35 | settings_file : str 36 | A path to a settings file that will be loaded if it exists. 37 | 38 | Returns 39 | ------- 40 | dedupe.Gazetteer 41 | A trained gazetteer model instance. 42 | """ 43 | # To train dedupe, we feed it a sample of records. 44 | sample_num = math.floor(len(messy_data) * sample_size) 45 | deduper.prepare_training(clean_data, messy_data, sample_size=sample_num) 46 | 47 | print('Starting active labeling...') 48 | 49 | dedupe.console_label(deduper) 50 | 51 | # Using the examples we just labeled, train the deduper and learn 52 | # blocking predicates 53 | deduper.train() 54 | 55 | # When finished, save our training to disk 56 | with open(training_file, 'w') as tf: 57 | deduper.write_training(tf) 58 | 59 | # Save our weights and predicates to disk. 60 | with open(settings_file, 'wb') as sf: 61 | deduper.write_settings(sf) 62 | 63 | return deduper 64 | 65 | def _train(settings_file, training_file, clean_data, messy_data, field_properties, sample_size, update_model, n_cores): 66 | """Internal method that trains the deduper model from scratch or update 67 | an existing dedupe model. 68 | Parameters 69 | ---------- 70 | settings_file : str 71 | A path to a settings file that will be loaded if it exists. 72 | training_file : str 73 | A path to a training file that will be loaded to keep training 74 | from. 75 | clean_data : dict 76 | The dictionary form of the gazette that gazetteer_dedupe requires. 
77 | messy_data : dict 78 | The dictionary form of the messy data that needs to be deduplicated 79 | (and canonicalized) 80 | field_properties : dict 81 | The mapping of fields to their respective data types. Please 82 | see the dedupe documentation for further details. 83 | sample_size : float, default 0.3 84 | Specify the sample size used for training as a float from 0 to 1. 85 | By default it is 30% (0.3) of our data. 86 | update_model : bool, default False 87 | If True, it allows user to update existing model by uploading 88 | training file. 89 | n_cores : int, default None 90 | Specify the number of cores to use during clustering. 91 | By default n_cores is equal to None (i.e. use multipressing equal to CPU count). 92 | Returns 93 | ------- 94 | dedupe.Gazetteer 95 | A gazetteer model instance. 96 | """ 97 | # Define the fields dedupe will pay attention to 98 | fields = [] 99 | select_fields(fields, [field_properties]) 100 | 101 | if update_model == False: 102 | 103 | # If a settings file already exists, we'll just load that and skip training 104 | if os.path.exists(settings_file): 105 | print('Reading from', settings_file) 106 | with open(settings_file, 'rb') as f: 107 | deduper = dedupe.StaticGazetteer(f, num_cores=n_cores) 108 | 109 | #Create a new deduper object and pass our data model to it. 110 | else: 111 | # Initialise dedupe 112 | deduper = dedupe.Gazetteer(fields, num_cores=n_cores) 113 | 114 | # Launch active learning 115 | deduper = _active_learning(clean_data, messy_data, sample_size, deduper, training_file, settings_file) 116 | 117 | else: 118 | # ## Training 119 | # Initialise dedupe 120 | deduper = dedupe.Gazetteer(fields, num_cores=n_cores) 121 | 122 | # Import existing model 123 | print('Reading labeled examples from ', training_file) 124 | with open(training_file, 'rb') as f: 125 | deduper.prepare_training(clean_data, messy_data, training_file=f) 126 | 127 | # Launch active learning 128 | deduper = _active_learning(clean_data, messy_data, sample_size, deduper, training_file, settings_file) 129 | 130 | return deduper 131 | 132 | 133 | def _cluster(deduper, clean_data, messy_data, threshold, canonicalize): 134 | """Internal method that clusters the data. 135 | Parameters 136 | ---------- 137 | deduper : dedupe.Gazetteer 138 | A trained instance of gazetteer dedupe. 139 | clean_data : dict 140 | The dictionary form of the gazette that gazetteer_dedupe requires. 141 | messy_data : dict 142 | The dictionary form of the messy data that needs to be deduplicated 143 | (and canonicalized) 144 | threshold : dedupe.Threshold 145 | The threshold used for clustering. 146 | canonicalize : bool or list, default False 147 | Option that provides the canonical records as additional columns. 148 | Specifying a list of column names only canonicalizes those columns. 149 | Returns 150 | ------- 151 | pd.DataFrame 152 | A dataframe storing the clustering results. 
153 | """ 154 | # ## Clustering 155 | print('Clustering...') 156 | deduper.index(clean_data) 157 | 158 | clustered_dupes = deduper.search(messy_data, threshold, n_matches=None, generator=False) 159 | print('# duplicate sets', len(clustered_dupes)) 160 | 161 | # Convert data_d to string so that Price & LatLong won't get traceback 162 | # during dedupe.canonicalize() 163 | for i in messy_data.values(): 164 | for key in i: 165 | if i[key] is None: 166 | pass 167 | else: 168 | i[key] = str(i[key]) 169 | 170 | df_data = [] 171 | # ## Writing Results 172 | for _, (messy_id, matches) in enumerate(clustered_dupes): 173 | for canon_id, scores in matches: 174 | 175 | tmp = { 176 | 'cluster id': canon_id, 177 | 'confidence': scores, 178 | 'record id': messy_id 179 | } 180 | df_data.append(tmp) 181 | 182 | # Add canonical name 183 | if canonicalize: 184 | clean_data_dict = pd.DataFrame.from_dict(clean_data).T.add_prefix('canonical_') 185 | clustered_df = (pd.DataFrame.from_dict(df_data) # Create cluster result dataframe 186 | .set_index('cluster id', drop=False) # Note: cluster id is the index of clean_data (i.e. gazette) 187 | .join(clean_data_dict, how='left') # join clustered results and gazette 188 | .set_index('record id') # Note: record id is the index of the messy_data 189 | ) 190 | else: 191 | clustered_df = (pd.DataFrame.from_dict(df_data) # Create clustered results dataframe 192 | .set_index('record id') # Note: record id is the index of messy_data 193 | ) 194 | 195 | # Drop duplicates (i.e. keep canonical name with max confidence) 196 | # Note: the reason for this is that gazetteer dedupe might assign the same obs to multiple clusters 197 | confidence_maxes = clustered_df.groupby([clustered_df.index])['confidence'].transform(max) # Calculate max confidence 198 | clustered_df = clustered_df.loc[clustered_df['confidence'] == confidence_maxes] # Keep rows with max confidence 199 | clustered_df = clustered_df.loc[~clustered_df.index.duplicated(keep='first')] # If same confidence keep the first obs 200 | 201 | return clustered_df 202 | 203 | 204 | def gazetteer_dataframe(clean_data, messy_data, field_properties, canonicalize=False, 205 | config_name="gazetteer_dataframe", update_model=False, threshold=0.3, 206 | sample_size=1, n_cores=None): 207 | """Deduplicates a dataframe given fields of interest. 208 | Parameters 209 | ---------- 210 | clean_data : pd.DataFrame 211 | The gazetteer dataframe. 212 | messy_data : pd.DataFrame 213 | The dataframe to deduplicate. 214 | field_properties : str 215 | A string specifying what fields to use for deduplicating records. 216 | canonicalize : bool or list, default False 217 | Option that provides the canonical records as additional columns. 218 | Specifying a list of column names only canonicalizes those columns. 219 | setting_file : str, default None. 220 | the default name of the setting file is dedupe_dataframe_settings if None is provided. 221 | training_file : str, default None 222 | the default name of the setting file is dedupe_dataframe_training.json if None is provided. 223 | Note: the name of the training file should include the .json extension. 224 | update_model : bool, default False 225 | If True, it allows user to update existing model by uploading 226 | training file. 227 | threshold : float, default 0.3 228 | only consider put together records into clusters if the cophenetic similarity of the cluster 229 | is greater than the threshold. 
230 | sample_size : float, default 0.3 231 | Specify the sample size used for training as a float from 0 to 1. 232 | By default it is 30% (0.3) of our data. 233 | n_cores : int, default None 234 | Specify the number of cores to use during clustering. 235 | By default n_cores is equal to None (i.e. use multipressing equal to CPU count). 236 | Returns 237 | ------- 238 | pd.DataFrame 239 | A pandas dataframe that contains the cluster id and confidence 240 | score. Optionally, it will contain canonicalized columns for all 241 | attributes of the record. 242 | """ 243 | # Import Data 244 | config_name = config_name.replace(" ", "_") 245 | 246 | settings_file = config_name + '_learned_settings' 247 | training_file = config_name + '_training.json' 248 | 249 | print('Importing data ...') 250 | assert type(clean_data)==pd.core.frame.DataFrame, 'Please provide a gazette in pandas dataframe format' 251 | assert len(clean_data.columns)==1, 'Please provide a gazetteer dataframe made of a single variable' 252 | assert type(field_properties) == str, 'field_properties must be in string (str) format' 253 | 254 | # Common column name 255 | common_name = clean_data.columns[0] 256 | 257 | # Canonical dataset (i.e. gazette) 258 | df_canonical = clean_punctuation(clean_data) 259 | df_canonical.rename(columns={field_properties: common_name}, inplace=True) 260 | specify_type(df_canonical, [common_name]) 261 | 262 | df_canonical['dictionary'] = df_canonical.apply( 263 | lambda x: dict(zip(df_canonical.columns, x.tolist())), axis=1) 264 | canonical = dict(zip(df_canonical.index, df_canonical.dictionary)) 265 | 266 | # Messy dataset 267 | df_messy = clean_punctuation(messy_data) 268 | df_messy.rename(columns={field_properties: common_name}, inplace=True) 269 | specify_type(df_messy, [common_name]) 270 | 271 | df_messy['dictionary'] = df_messy.apply( 272 | lambda x: dict(zip(df_messy.columns, x.tolist())), axis=1) 273 | messy = dict(zip(df_messy.index, df_messy.dictionary)) 274 | 275 | # Train or load the model 276 | deduper = _train(settings_file, training_file, canonical, messy, common_name, 277 | sample_size, update_model, n_cores) 278 | 279 | # Cluster the records 280 | clustered_df = _cluster(deduper, canonical, messy, threshold, canonicalize) 281 | results = messy_data.join(clustered_df, how='left') 282 | results.rename(columns={'canonical_'+str(common_name): 'canonical_'+str(field_properties)}, inplace=True) 283 | 284 | return results 285 | --------------------------------------------------------------------------------