├── .gitignore ├── MANIFEST.in ├── README.md ├── d6tjoin ├── __init__.py ├── pre.py ├── smart_join.py ├── top1.py └── utils.py ├── docs ├── Makefile ├── make.bat ├── samples.py ├── shell-napoleon-html.sh ├── shell-napoleon-recreate.sh └── source │ ├── conf.py │ ├── d6tjoin.rst │ ├── index.rst │ ├── modules.rst │ └── setup.rst ├── examples-prejoin.ipynb ├── examples-tokencluster.ipynb ├── examples-top1.ipynb ├── requirements-dev.txt ├── requirements.txt ├── setup.py └── tests ├── __init__.py ├── test_pre_pd.py ├── test_smartjoin.py ├── test_top1.py └── tmp.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | .env 3 | temp/ 4 | fiddle* 5 | .pytest_cache/ 6 | tests/tmp-local.py 7 | tests/tmp*.py 8 | 9 | docs-examples/ 10 | 11 | # Byte-compiled / optimized / DLL files 12 | __pycache__/ 13 | *.py[cod] 14 | *$py.class 15 | 16 | # C extensions 17 | *.so 18 | 19 | # Distribution / packaging 20 | .Python 21 | build/ 22 | develop-eggs/ 23 | dist/ 24 | downloads/ 25 | eggs/ 26 | .eggs/ 27 | lib/ 28 | lib64/ 29 | parts/ 30 | sdist/ 31 | var/ 32 | wheels/ 33 | *.egg-info/ 34 | .installed.cfg 35 | *.egg 36 | MANIFEST 37 | 38 | # PyInstaller 39 | # Usually these files are written by a python script from a template 40 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 41 | *.manifest 42 | *.spec 43 | 44 | # Installer logs 45 | pip-log.txt 46 | pip-delete-this-directory.txt 47 | 48 | # Unit test / coverage reports 49 | htmlcov/ 50 | .tox/ 51 | .coverage 52 | .coverage.* 53 | .cache 54 | nosetests.xml 55 | coverage.xml 56 | *.cover 57 | .hypothesis/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | *.log 65 | .static_storage/ 66 | .media/ 67 | local_settings.py 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # celery beat schedule file 89 | celerybeat-schedule 90 | 91 | # SageMath parsed files 92 | *.sage.py 93 | 94 | # Environments 95 | .env 96 | .venv 97 | env/ 98 | venv/ 99 | ENV/ 100 | env.bak/ 101 | venv.bak/ 102 | 103 | # Spyder project settings 104 | .spyderproject 105 | .spyproject 106 | 107 | # Rope project settings 108 | .ropeproject 109 | 110 | # mkdocs documentation 111 | /site 112 | 113 | # mypy 114 | .mypy_cache/ 115 | 116 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Databolt Smart Join 2 | 3 | Easily join different datasets without writing custom code. Does best match joins on strings, dates and numbers. For example you can quickly join similar but not identical stock tickers, addresses, names and dates without manual processing. 4 | 5 | ## Installation 6 | 7 | **0.2.x is currently in beta. The github master is the latest dev version. The docs refer to <0.2.0** 8 | 9 | We recommend using the latest version from github `pip install git+https://github.com/d6t/d6tjoin.git` 10 | 11 | If you cannot install from github, use the latest published version `pip install d6tjoin`. 
To update, run `pip install d6tflow -U --no-deps` 12 | 13 | We recommend using [AffineGap](https://github.com/dedupeio/affinegap) which is not an official requirement, you can install using `pip install affinegap`. 14 | 15 | For the `jellyfish` library, make sure the C implementation is working else `d6tjoin` will be very slow. You can test by running `import jellyfish.cjellyfish` if the C version is installed. If you don't have a C compiler, you can `conda install -c conda-forge jellyfish`. 16 | 17 | ## Sample Use 18 | 19 | ``` 20 | 21 | import d6tjoin.top1 22 | import d6tjoin.utils 23 | import d6tjoin 24 | 25 | #************************ 26 | # pre join diagnostics 27 | #************************ 28 | 29 | # check join quality => none of the ids match 30 | 31 | d6tjoin.Prejoin([df1,df2],['id','date']).match_quality() 32 | 33 | key left key right all matched inner left right outer unmatched total unmatched left unmatched right 34 | 0 id id False 0 10 10 20 20 10 10 35 | 1 date date True 366 366 366 366 0 0 0 36 | 2 __all__ __all__ False 0 3660 3660 7320 7320 3660 3660 37 | 38 | #************************ 39 | # best match join on id 40 | #************************ 41 | 42 | result = d6tjoin.top1.MergeTop1(df1,df2,fuzzy_left_on=['id'],fuzzy_right_on=['id'], 43 | exact_left_on=['date'],exact_right_on=['date']).merge() 44 | 45 | result['merged'].head(2) 46 | 47 | date id val1 id_right val1_right val2 48 | 0 2010-01-01 e3e70682 0.020 3e7068 0.020 0.034 49 | 1 2010-01-01 f728b4fa 0.806 728b4f 0.806 0.849 50 | 51 | #************************ 52 | # debug best matches 53 | #************************ 54 | 55 | result['top1']['id'].head(2) 56 | 57 | date __top1left__ __top1right__ __top1diff__ __matchtype__ 58 | 10 2010-01-01 e3e70682 3e7068 2 top1 left 59 | 34 2010-01-01 e443df78 443df7 2 top1 left 60 | 61 | #************************ 62 | # customize similarity fct 63 | #************************ 64 | import affinegap 65 | 66 | result = d6tjoin.top1.MergeTop1(df1,df2,fuzzy_left_on=['id'],fuzzy_right_on=['id'], 67 | fun_diff=[affinegap.affineGapDistance]).merge() 68 | 69 | #************************ 70 | # token-based substring clusters and joins 71 | #************************ 72 | dftoken=d6tjoin.utils.splitcharTokenCount(df2['id']) 73 | 74 | word count 75 | 0 Equity 7 76 | 1 US 5 77 | 2 NA 2 78 | 3 PRIVATE 2 79 | 80 | 81 | d6tjoin.utils.unique_contains(df2['id'], dftoken['word'].values) 82 | >>> [('Equity', ['AAPL_US_Equity', 'AAP_US_Equity', 'AD_NA_Equity', 'AMZN_US_Equity', 'APRN_US_Equity', 'BBY_US_Equity', 'BMW_NA_Equity']), ('US', ['AAPL_US_Equity', 'AAP_US_Equity', 'AMZN_US_Equity', 'APRN_US_Equity', 'BBY_US_Equity']), ('NA', ['AD_NA_Equity', 'BMW_NA_Equity']), ('PRIVATE', ['PRIVATE_JLP', 'PRIVATE_NMG'])] 83 | 84 | import re 85 | splitchars="[^a-zA-Z0-9]+" 86 | def tokenmatch(s1,s2): 87 | return 3-len(set(re.split(splitchars,s1)) & set(re.split(splitchars,s2))) 88 | 89 | d6tjoin.top1.MergeTop1(df1,df2,fuzzy_left_on=['id'],fuzzy_right_on=['id'], fun_diff=[tokenmatch], top_limit=[2]).merge()['top1']['id'] 90 | 91 | __top1left__ __top1right__ __matchtype__ __top1diff__ 92 | 0 AAP AAP_US_Equity top1 left 2 93 | 1 AAPL AAPL_US_Equity top1 left 2 94 | 2 AMZN-AMZN AMZN_US_Equity top1 left 2 95 | 3 APRN APRN_US_Equity top1 left 2 96 | 4 JLP PRIVATE_JLP top1 left 2 97 | 5 NMG PRIVATE_NMG top1 left 2 98 | 99 | ``` 100 | 101 | ## Features include 102 | Enhances `pd.merge()` function with: 103 | * Pre join diagnostics to identify mismatched join keys 104 | * Best match joins that finds the top1 most similar 
value 105 | * Quickly join stock identifiers, addresses, names without manual processing 106 | * Ability to customize similarity functions, set max difference and other advanced features 107 | 108 | ## Documentation 109 | 110 | * [PreJoin examples notebook](https://github.com/d6t/d6tjoin/blob/master/examples-prejoin.ipynb) - Examples for diagnosing join problems 111 | * [MergeTop1 notebook](https://github.com/d6t/d6tjoin/blob/master/examples-top1.ipynb) - Best match join examples notebook 112 | * [Token substring join notebook](https://github.com/d6t/d6tjoin/blob/master/examples-tokencluster.ipynb) - Find common substrings and joins on token substrings 113 | * [Official docs](http://d6tjoin.readthedocs.io/en/latest/py-modindex.html) - Detailed documentation for modules, classes, functions 114 | 115 | ## Pro version 116 | 117 | Additional features: 118 | * Join >2 dataframes 119 | * Automatic Content-based similarity joins 120 | * Advanced join quality checks 121 | * Fast approximations for big data 122 | 123 | [Request demo](https://pipe.databolt.tech/gui/request-premium/) 124 | 125 | ## Faster Data Engineering 126 | 127 | Check out other d6t libraries to solve common data engineering problems, including 128 | * data ingest, quickly ingest raw data 129 | * fuzzy joins, quickly join data 130 | * data pipes, quickly share and distribute data 131 | 132 | https://github.com/d6t/d6t-python 133 | 134 | And we encourage you to join the Databolt blog to get updates and tips+tricks http://blog.databolt.tech -------------------------------------------------------------------------------- /d6tjoin/__init__.py: -------------------------------------------------------------------------------- 1 | # import d6tjoin.top1 2 | import d6tjoin.utils 3 | 4 | from d6tjoin.pre import Prejoin 5 | pd = Prejoin -------------------------------------------------------------------------------- /d6tjoin/pre.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | import itertools, warnings 3 | 4 | import pandas as pd 5 | pd.set_option('display.expand_frame_repr', False) 6 | import numpy as np 7 | 8 | from d6tstack.helpers import * 9 | from scipy.stats import mode 10 | 11 | 12 | # ****************************************** 13 | # utils 14 | # ****************************************** 15 | def head(dfs, nrows=1000): 16 | return [dfg.head(nrows) for dfg in dfs] 17 | 18 | # ****************************************** 19 | # prejoin stats class 20 | # ****************************************** 21 | 22 | class Prejoin(object): 23 | """ 24 | Analyze, slice & dice join keys and dataframes before joining. Useful for checking how good a join will be and quickly looking at unmatched join keys. 
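    For example (a sketch; `df1` and `df2` are placeholder dataframes)::

        # join keys share the same names across dataframes
        Prejoin([df1, df2], ['id', 'date']).match_quality()

        # join keys named differently, passed per dataframe (keys_bydf=True is the default)
        Prejoin([df1, df2], [['id_left', 'date'], ['id_right', 'date']]).match_quality()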
25 | 26 | Args: 27 | dfs (list): list of data frames to join 28 | keys (var): either list of strings `['a','b']` if join keys have the same names in all dataframes or list of lists if join keys are different across dataframes `[[leftkeys],[rightkeys]]`, eg `[['left1','left2'],['right1','right2']]` 29 | keys_bydf (bool): if False, specify multi-key join keys by join level eg `[['left1','right1'],['left2','right2']]` 30 | nrows (int): for `df.head(nrows)` 31 | print_only (bool): if False return results instead of printing 32 | """ 33 | 34 | def __init__(self, dfs, keys=None, keys_bydf=True, nrows=5, print_only=True): 35 | 36 | # inputs dfs 37 | self._init_dfs(dfs) 38 | 39 | if keys is not None: 40 | self.set_keys(keys, keys_bydf) 41 | else: 42 | self.keys = None; self.keysdf = None 43 | 44 | self.nrows = nrows 45 | self.print_only = print_only 46 | 47 | # df heads 48 | self.dfshead = [dfg.head(nrows) for idx, dfg in self._enumerate_dfs()] 49 | 50 | # init column scan 51 | self.columns_sniff() 52 | 53 | def _init_dfs(self, dfs): 54 | # check and save dfs 55 | if len(dfs)<2: 56 | raise ValueError('Need to pass at least 2 dataframes') 57 | 58 | if len(dfs)>2: 59 | raise NotImplementedError('Only handles 2 dataframes for now') 60 | 61 | self.dfs = dfs 62 | self.cfg_ndfs = len(dfs) 63 | 64 | def _enumerate_dfs(self): 65 | if self.keys is None: 66 | return enumerate(self.dfs) 67 | else: 68 | return [(idx, dfg[self.keysdf[idx]]) for idx, dfg in enumerate(self.dfs)] 69 | 70 | def set_keys(self, keys, keys_bydf=True): 71 | # check and save join keys 72 | self._check_keys(keys) 73 | keys, keysdf = self._prep_keys(keys, keys_bydf) 74 | self._check_keysdfs(keys, keysdf) 75 | # join keys 76 | self.cfg_njoins = len(keysdf[0]) 77 | self.keys = keys # keys by join level 78 | self.keysall = keys + [['__all__'] * len(self.dfs)] 79 | self.keysdf = keysdf # keys by df 80 | self.keysdfall = keysdf + [['__all__']] * len(self.dfs) 81 | self.uniques = [] # set of unique values for each join key individually 82 | self.keysets = [] # set of unique values for all join keys together __all__ 83 | 84 | return keys, keysdf 85 | 86 | def _check_keys(self, keys): 87 | if not keys or len(keys)<1: 88 | raise ValueError("Need to have join keys") 89 | # todo: no duplicate join keys passed 90 | 91 | def _check_keysdfs(self, keys, keysdf): 92 | if not all([len(k)==len(self.dfs) for k in keys]): 93 | raise ValueError("Need to provide join keys for all dataframes") 94 | 95 | for idf,dfg in enumerate(self.dfs): # check that keys present in dataframe 96 | missing = set(keysdf[idf]).difference(dfg.columns) 97 | if missing: 98 | raise KeyError(f'Columns missing in df#{idf}: {missing}') 99 | 100 | def _prep_keys(self, keys, keys_bydf): 101 | # deal with empty keys 102 | if not keys: 103 | return [], [] 104 | 105 | # get keys in correct format given user input 106 | if isinstance(keys[0], (str,)): 107 | keysdf = [keys]*len(self.dfs) 108 | keys = list(map(list, zip(*keysdf))) 109 | 110 | elif isinstance(keys[0], (list,)): 111 | keysdf = list(map(list, zip(*keys))) 112 | 113 | if keys_bydf: 114 | keys, keysdf = keysdf, keys 115 | 116 | else: 117 | raise ValueError("keys need to be either list of strings or list of lists") 118 | 119 | return keys, keysdf 120 | 121 | def _return(self, result): 122 | if self.print_only: 123 | print(result) 124 | else: 125 | return result 126 | 127 | def _returndict(self, result): 128 | if self.print_only: 129 | for idx,d in result.items(): 130 | print(f'dataframe #{idx}') 131 | print(d) 132 | else: 133 | 
return result 134 | 135 | def columns_sniff(self): 136 | # from d6tstack 137 | # todo: modularize d6tstack 138 | # tood: rewrite scipy mode function 139 | 140 | dfl_all = self.dfshead 141 | fname_list = range(len(self.dfs)) 142 | 143 | # process columns 144 | dfl_all_col = [df.columns.tolist() for df in dfl_all] 145 | col_files = dict(zip(fname_list, dfl_all_col)) 146 | col_common = list_common(list(col_files.values())) 147 | col_all = list_unique(list(col_files.values())) 148 | 149 | # find index in column list so can check order is correct 150 | df_col_present = {} 151 | for iFileName, iFileCol in col_files.items(): 152 | df_col_present[iFileName] = [iCol in iFileCol for iCol in col_all] 153 | 154 | df_col_present = pd.DataFrame(df_col_present, index=col_all).T 155 | df_col_present.index.names = ['file_path'] 156 | 157 | # find index in column list so can check order is correct 158 | df_col_idx = {} 159 | for iFileName, iFileCol in col_files.items(): 160 | df_col_idx[iFileName] = [iFileCol.index(iCol) if iCol in iFileCol else np.nan for iCol in col_all] 161 | df_col_idx = pd.DataFrame(df_col_idx, index=col_all).T 162 | 163 | # order columns by where they appear in file 164 | m=mode(df_col_idx,axis=0) 165 | df_col_pos = pd.DataFrame({'o':m[0][0],'c':m[1][0]},index=df_col_idx.columns) 166 | df_col_pos = df_col_pos.sort_values(['o','c']) 167 | df_col_pos['iscommon']=df_col_pos.index.isin(col_common) 168 | 169 | 170 | # reorder by position 171 | col_all = df_col_pos.index.values.tolist() 172 | col_common = df_col_pos[df_col_pos['iscommon']].index.values.tolist() 173 | col_unique = df_col_pos[~df_col_pos['iscommon']].index.values.tolist() 174 | df_col_present = df_col_present[col_all] 175 | df_col_idx = df_col_idx[col_all] 176 | 177 | sniff_results = {'files_columns': col_files, 'columns_all': col_all, 'columns_common': col_common, 178 | 'columns_unique': col_unique, 'is_all_equal': columns_all_equal(dfl_all_col), 179 | 'df_columns_present': df_col_present, 'df_columns_order': df_col_idx} 180 | 181 | self.sniff_results = sniff_results 182 | 183 | 184 | def _calc_keysets(self): 185 | 186 | self.keysets = [] # reset 187 | 188 | # find set of unique values for each join key 189 | for idx, dfg in enumerate(self.dfs): 190 | 191 | # keys individually 192 | uniquedict = OrderedDict() 193 | for key in self.keysdf[idx]: 194 | v = dfg[key].unique() 195 | uniquedict[key] = set(v[~pd.isnull(v)]) 196 | 197 | # keys _all__ 198 | dft = dfg[self.keysdf[idx]].drop_duplicates() 199 | uniquedict['__all__'] = {tuple(x) for x in dft.values} 200 | self.uniques.append(uniquedict) 201 | 202 | # perform set logic 203 | for keys in self.keysall: 204 | df_key = {} 205 | df_key['key left'] = keys[0] 206 | df_key['key right'] = keys[1] 207 | df_key['keyset left'] = self.uniques[0][df_key['key left']] 208 | df_key['keyset right'] = self.uniques[1][df_key['key right']] 209 | 210 | df_key['inner'] = df_key['keyset left'].intersection(df_key['keyset right']) 211 | df_key['outer'] = df_key['keyset left'].union(df_key['keyset right']) 212 | df_key['unmatched total'] = df_key['keyset left'].symmetric_difference(df_key['keyset right']) 213 | df_key['unmatched left'] = df_key['keyset left'].difference(df_key['keyset right']) 214 | df_key['unmatched right'] = df_key['keyset right'].difference(df_key['keyset left']) 215 | 216 | # check types are consistent 217 | vl = next(iter(df_key['keyset left'])) # take first element 218 | vr = next(iter(df_key['keyset right'])) # take first element 219 | 220 | df_key['value type'] = type(vl) 
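            # illustration with hypothetical values: if keyset left={'a','b','c'} and
            # keyset right={'b','c','d'}, the fields above are inner={'b','c'},
            # outer={'a','b','c','d'}, unmatched left={'a'}, unmatched right={'d'},
            # unmatched total={'a','d'}, value type=str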
221 | 222 | self.keysets.append(df_key) 223 | 224 | def head(self, nrows=None): 225 | """ 226 | .head() of input dataframes 227 | 228 | Args: 229 | keys_only (bool): only print join keys 230 | nrows (int): number of rows to show 231 | print (bool): print or return df 232 | 233 | """ 234 | if nrows is None: 235 | result = {idx: dfg for idx, dfg in enumerate(self.dfshead)} 236 | else: 237 | result = {idx: dfg.head(nrows) for idx, dfg in self._enumerate_dfs()} 238 | return self._returndict(result) 239 | 240 | def columns_common(self): 241 | return self._return(self.sniff_results['columns_common']) 242 | 243 | def columns_all(self): 244 | return self._return(self.sniff_results['columns_all']) 245 | 246 | def columns_ispresent(self, as_bool=False): 247 | # todo: maintain column order of first dataframe => take from d6tstack 248 | col_union = list(set().union(*[dfg.columns.tolist() for dfg in self.dfs])) 249 | dfr = dict(zip(range(self.cfg_ndfs),[dfg.columns.isin(col_union) for dfg in self.dfs])) 250 | dfr = pd.DataFrame(dfr,index=col_union).sort_index() 251 | if not as_bool: 252 | dfr = dfr.replace([True,False],['+','-']) 253 | return self._return(dfr) 254 | 255 | def describe(self, **kwargs): 256 | """ 257 | .describe() of input dataframes 258 | 259 | Args: 260 | kwargs (misc): to pass to .describe() 261 | 262 | """ 263 | result = {idx: dfg.describe(**kwargs) for idx, dfg in self._enumerate_dfs()} 264 | return self._returndict(result) 265 | 266 | def shape(self): 267 | """ 268 | .shape of input dataframes 269 | 270 | Args: 271 | kwargs (misc): to pass to .describe() 272 | 273 | """ 274 | result = {idx: dfg.shape for idx, dfg in self._enumerate_dfs()} 275 | return self._returndict(result) 276 | 277 | def describe_str(self, unique_count=False): 278 | """ 279 | Returns statistics on length of all strings and other objects in pandas dataframe. Statistics include mean, median, min, max. Optional unique count. 280 | 281 | Args: 282 | dfg (dataframe): pandas dataframe 283 | columns (:obj:`list`, optional): column names to analyze. 
If None analyze all 284 | unique_count (:obj:`bool`, optional): include count of unique values 285 | 286 | Returns: 287 | dataframe: string length statistics 288 | """ 289 | def _apply_strlen(dfg, unique_count=False): 290 | lenv = np.vectorize(len) 291 | alens = lenv(dfg.values) 292 | r = {'median':np.median(alens),'mean':np.mean(alens),'min':np.min(alens),'max':np.max(alens),'nrecords':dfg.shape[0]} 293 | if unique_count: 294 | r['uniques'] = len(dfg.unique()) 295 | return pd.Series(r) 296 | 297 | result = {} 298 | for idx, dfg in enumerate(self.dfs): 299 | if unique_count: 300 | cfg_col_sel = ['median','min','max','nrecords','uniques'] 301 | else: 302 | cfg_col_sel = ['median','min','max','nrecords'] 303 | dfo = dfg.select_dtypes(include=['object']).apply(lambda x: _apply_strlen(x.dropna(), unique_count)).T[cfg_col_sel] 304 | result[idx] = dfo 305 | return self._returndict(result) 306 | 307 | def describe_data(self, ignore_value_columns=False): 308 | result = {} 309 | for idx, dfg in enumerate(self.dfs): 310 | 311 | if ignore_value_columns: 312 | columns_sel = dfg.select_dtypes(include=['object']).columns 313 | else: 314 | columns_sel = dfg.columns 315 | 316 | nunique = dfg[columns_sel].apply(lambda x: x.dropna().unique().shape[0]).rename('unique') 317 | nrecords = dfg[columns_sel].apply(lambda x: x.dropna().shape[0]).rename('nrecords') 318 | nnan = dfg[columns_sel].isna().sum().rename('nan') 319 | dfr = pd.concat([nrecords,nunique,nnan],1) 320 | dfr['unique rate'] = dfr['unique']/dfr['nrecords'] 321 | result[idx] = dfr 322 | 323 | return self._returndict(result) 324 | 325 | def data_match(self, how=None, topn=1, ignore_value_columns=True, max_unique_pct=0.8, min_unique_count=1, min_match_rate=0.5): 326 | ''' 327 | todo: 328 | order matters, sequential inner or left joins (no right or outer joins) 329 | jaccard 1:2 => intersection for inner, same set for left 330 | 331 | ''' 332 | how = 'inner' if how is None else how 333 | 334 | if self.cfg_ndfs >2: 335 | warnings.warn('Upgrade to PRO version to join >2 dataframes') 336 | 337 | from d6tjoin.utils import _filter_group_min 338 | 339 | if ignore_value_columns: 340 | df_left, df_right = [dfg.select_dtypes(include=['object']) for _, dfg in self._enumerate_dfs()] 341 | print('ignored columns (value type)', 'left:',set(self.dfs[0].columns)-set(df_left.columns), 'right:', set(self.dfs[1].columns)-set(df_right.columns)) 342 | else: 343 | df_left, df_right = [dfg for _, dfg in self._enumerate_dfs()] 344 | 345 | def unique_dict(dfg): 346 | d = dict(zip(dfg.columns, [set(dfg[x].dropna().unique()) for x in dfg.columns])) 347 | d = {k: v for k, v in d.items() if (len(v) > min_unique_count) and (len(v)/dfg[k].shape[0] <= max_unique_pct)} 348 | return d 349 | 350 | # todo: add len(key) and sample=next(key) 351 | values_left = unique_dict(df_left) 352 | values_right = unique_dict(df_right) 353 | values_left_ignored = set(df_left.columns)-set(values_left.keys()) 354 | values_right_ignored = set(df_right.columns)-set(values_right.keys()) 355 | if values_left_ignored: print('ignored columns (unique count)', 'left:', values_left_ignored) 356 | if values_right_ignored: print('ignored columns (unique count)', 'right:', values_right_ignored) 357 | 358 | df_candidates = list(itertools.product(values_left.keys(), values_right.keys())) 359 | df_candidates = pd.DataFrame(df_candidates, columns=['__left__', '__right__']) 360 | 361 | def jaccard_similarity(s1, s2, how): 362 | intersection = len(s1.intersection(s2)) 363 | if how=='left': 364 | ratio = 
float(intersection / len(s1)) 365 | else: 366 | union = (len(s1) + len(s2)) - intersection 367 | ratio = float(intersection / union) 368 | return ratio 369 | 370 | def jaccard_caller(col_left, col_right): 371 | return jaccard_similarity(values_left[col_left], values_right[col_right], how) 372 | 373 | df_candidates['__similarity__'] = df_candidates.apply(lambda x: jaccard_caller(x['__left__'], x['__right__']), axis=1) 374 | df_candidates = df_candidates.dropna(subset=['__similarity__']) 375 | if df_candidates.empty: 376 | raise ValueError('Failed to compute meaningful similarity, might need to loosen parameters') 377 | df_candidates['__similarity__'] = -df_candidates['__similarity__'] 378 | df_diff = df_candidates.groupby('__left__',group_keys=False).apply(lambda x: _filter_group_min(x,'__similarity__',topn)).reset_index(drop=True) 379 | df_diff['__similarity__'] = -df_diff['__similarity__'] 380 | 381 | df_diff['__left-sample__'] = df_diff['__left__'].map(lambda x: next(iter(values_left[x]),None)) 382 | df_diff['__right-sample__'] = df_diff['__right__'].map(lambda x: next(iter(values_right[x]),None)) 383 | df_diff['__left-nunique__'] = df_diff['__left__'].map(lambda x: len(values_left[x])) 384 | df_diff['__right-nunique__'] = df_diff['__right__'].map(lambda x: len(values_right[x])) 385 | 386 | if min_match_rate is not None: 387 | df_diff = df_diff[df_diff['__similarity__']>min_match_rate] 388 | 389 | # todo: sort by left df columns and then by similarity descending 390 | 391 | return self._return(df_diff) 392 | 393 | def data_similarity(self, how=None, columns=None): 394 | # goal: which columns data is most "similar" 395 | # todo: run similarity function show median/min/max similarity across columns 396 | # similarity on all vs all values? 397 | # find the top1/n similarity for each value. median across all values 398 | # above is strings. for numbers and dates: 399 | # numbers: "same distribution" => distribution similarity 400 | # dates: "same distribution" => distribution similarity 401 | # distribution similarity: non-parametric. 
interquartile range similar 402 | # want to find join keys not join value columns 403 | # 404 | 405 | raise NotImplementedError() 406 | 407 | 408 | def match_quality(self, rerun=False): 409 | """ 410 | Show prejoin statistics 411 | 412 | Args: 413 | return_results (bool): Return results as df instead of printing 414 | 415 | """ 416 | 417 | if not self.keysets or rerun: 418 | self._calc_keysets() 419 | 420 | df_out = [] 421 | 422 | for key_set in self.keysets: 423 | df_key = {} 424 | for k in ['keyset left','keyset right','inner','outer','unmatched total','unmatched left','unmatched right']: 425 | df_key[k] = len(key_set[k]) 426 | for k in ['key left','key right']: 427 | df_key[k] = key_set[k] 428 | df_key['all matched'] = df_key['inner']==df_key['outer'] 429 | df_out.append(df_key) 430 | 431 | df_out = pd.DataFrame(df_out) 432 | df_out = df_out.rename(columns={'keyset left':'left','keyset right':'right'}) 433 | df_out = df_out[['key left','key right','all matched','inner','left','right','outer','unmatched total','unmatched left','unmatched right']] 434 | 435 | return self._return(df_out) 436 | 437 | def is_all_matched(self, key='__all__',rerun=False): 438 | 439 | if not self.keysets or rerun: 440 | self._calc_keysets() 441 | 442 | keymask = [key in e for e in self.keysall] 443 | if not (any(keymask)): 444 | raise ValueError('key ', self.cfg_show_key, ' not a join key in ', self.keys) 445 | ilevel = keymask.index(True) 446 | 447 | return (self.keysets[ilevel]['key left']==key or self.keysets[ilevel]['key right']==key) and len(self.keysets[ilevel]['unmatched total'])==0 448 | 449 | def _show_prep_df(self, idf, mode): 450 | """ 451 | PRIVATE. prepare data for self.show() functions 452 | 453 | Args: 454 | idf (int): which df in self.dfs 455 | mode (str): matched vs unmatched 456 | 457 | """ 458 | 459 | if idf==0: 460 | side='left' 461 | elif idf==1: 462 | side='right' 463 | else: 464 | raise ValueError('invalid idx') 465 | 466 | if self.cfg_show_keys_only: 467 | if self.cfg_show_key == '__all__': 468 | cfg_col_sel = self.keysdf[idf] 469 | else: 470 | cfg_col_sel = self.cfg_show_key 471 | else: 472 | cfg_col_sel = self.dfs[idf].columns 473 | 474 | # which set to return? 
475 | if mode=='matched': 476 | cfg_mode_sel = 'inner' 477 | elif mode=='unmatched': 478 | cfg_mode_sel = mode + ' ' + side 479 | else: 480 | raise ValueError('invalid mode', mode) 481 | 482 | keys = list(self.keysets[self.cfg_show_level][cfg_mode_sel]) 483 | if self.cfg_show_nrecords > 0: 484 | keys = keys[:self.cfg_show_nrecords] 485 | 486 | if self.cfg_show_key == '__all__' and self.cfg_njoins>1: 487 | dfg = self.dfs[idf].copy() 488 | dfg = self.dfs[idf].reset_index().set_index(self.keysdf[idf]) 489 | dfg = dfg.loc[keys] 490 | dfg = dfg.reset_index().sort_values('index')[cfg_col_sel].reset_index(drop=True) # reorder to original order 491 | elif self.cfg_show_key == '__all__' and self.cfg_njoins==1: 492 | dfg = self.dfs[idf] 493 | dfg = dfg.loc[dfg[self.keysdf[idf][0]].isin([e[0] for e in keys]), cfg_col_sel] 494 | else: 495 | dfg = self.dfs[idf] 496 | dfg = dfg.loc[dfg[self.cfg_show_key].isin(keys),cfg_col_sel] 497 | 498 | if self.cfg_show_nrows > 0: 499 | dfg = dfg.head(self.cfg_show_nrows) 500 | 501 | if self.cfg_show_print_only: 502 | print('%s %s for key %s' %(mode, side, self.cfg_show_key)) 503 | print(dfg) 504 | else: 505 | self.df_show_out[side] = dfg.copy() 506 | 507 | def _show(self, mode): 508 | if not self.keysets: 509 | raise RuntimeError('run .stats_prejoin() first') 510 | 511 | keymask = [self.cfg_show_key in e for e in self.keysall] 512 | if not (any(keymask)): 513 | raise ValueError('key ', self.cfg_show_key, ' not a join key in ', self.keys) 514 | self.cfg_show_level = keymask.index(True) 515 | 516 | for idf in range(self.cfg_ndfs): # run for all self.dfs 517 | if self.keysall[self.cfg_show_level][idf] == self.cfg_show_key: # check if key applies 518 | self._show_prep_df(idf, mode) 519 | 520 | def show_unmatched(self, key, nrecords=3, nrows=3, keys_only=False, print_only=False): 521 | """ 522 | Show unmatched records 523 | 524 | Args: 525 | key (str): join key 526 | nrecords (int): number of unmatched records 527 | nrows (int): number of rows 528 | keys_only (bool): show only join keys 529 | print_only (bool): if false return results instead of printing 530 | """ 531 | self.df_show_out = {} 532 | self.cfg_show_key = key 533 | self.cfg_show_nrecords = nrecords 534 | self.cfg_show_nrows = nrows 535 | self.cfg_show_keys_only = keys_only 536 | self.cfg_show_print_only = print_only 537 | 538 | self._show('unmatched') 539 | if not self.cfg_show_print_only: 540 | return self.df_show_out 541 | 542 | def show_matched(self, key, nrecords=3, nrows=3, keys_only=False, print_only=False): 543 | """ 544 | Show matched records 545 | 546 | Args: 547 | key (str): join key 548 | nrecords (int): number of unmatched records 549 | nrows (int): number of rows 550 | keys_only (bool): show only join keys 551 | print_only (bool): if false return results instead of printing 552 | """ 553 | self.df_show_out = {} 554 | self.cfg_show_key = key 555 | self.cfg_show_nrecords = nrecords 556 | self.cfg_show_nrows = nrows 557 | self.cfg_show_keys_only = keys_only 558 | self.cfg_show_print_only = print_only 559 | 560 | self._show('matched') 561 | if not self.cfg_show_print_only: 562 | return self.df_show_out 563 | 564 | def merge(self, **kwargs): 565 | """ 566 | Perform merge using keys 567 | 568 | Args: 569 | kwargs (misc): parameters to pass to `pd.merge()` 570 | """ 571 | if len(self.dfs) > 2: 572 | raise NotImplementedError('Only handles 2 dataframes for now') 573 | 574 | return self.dfs[0].merge(self.dfs[1], left_on=self.keysdf[0], right_on=self.keysdf[1], **kwargs) 575 | 576 | 
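The `Prejoin` diagnostics above are typically chained before the actual merge. A minimal sketch, assuming two dataframes `df1` and `df2` that are meant to join on `id` and `date` (the dataframes are placeholders, not part of the library):

```
import d6tjoin

# no keys yet: scan all string columns for pairs whose values overlap,
# a hint for which columns could serve as join keys
d6tjoin.Prejoin([df1, df2]).data_match()

# with join keys set, check how well the keys will match before merging
pj = d6tjoin.Prejoin([df1, df2], ['id', 'date'])
pj.match_quality()       # matched vs unmatched join key values per key
pj.show_unmatched('id')  # sample records whose id did not match
pj.describe_str()        # string-length stats, useful for spotting truncated ids

df_joined = pj.merge(how='inner')  # plain pd.merge() using the keys above
```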
-------------------------------------------------------------------------------- /d6tjoin/smart_join.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from collections import OrderedDict 4 | import itertools 5 | import warnings 6 | import jellyfish 7 | 8 | from d6tjoin.pre import Prejoin as BaseJoin 9 | 10 | 11 | # ****************************************** 12 | # helpers 13 | # ****************************************** 14 | def set_values(dfg, key): 15 | v = dfg[key].unique() 16 | return v[~pd.isnull(v)] 17 | 18 | 19 | def apply_gen_candidates_group(dfg): 20 | return pd.DataFrame(list(itertools.product(dfg['__top1left__'].values[0],dfg['__top1right__'].values[0])),columns=['__top1left__','__top1right__']) 21 | 22 | 23 | def apply_gen_candidates(set1, set2): 24 | df_candidates = list(itertools.product(set1, set2)) 25 | df_candidates = pd.DataFrame(df_candidates,columns=['__top1left__','__top1right__']) 26 | 27 | return df_candidates 28 | 29 | 30 | def diff_arithmetic(x,y): 31 | return abs(x - y) 32 | 33 | 34 | def diff_edit(a,b): 35 | return jellyfish.levenshtein_distance(a,b) 36 | 37 | 38 | def filter_group_minmax(dfg, col): 39 | """ 40 | 41 | Returns all rows equal to min in col 42 | 43 | """ 44 | return dfg[dfg[col] == dfg[col].min()] 45 | 46 | 47 | def prep_match_df(dfg): 48 | dfg = dfg[['__top1left__', '__top1right__', '__top1diff__', '__match type__']] 49 | return dfg 50 | 51 | # ****************************************** 52 | # fuzzy join 53 | # ****************************************** 54 | class FuzzyJoinTop1(BaseJoin): 55 | 56 | def __init__(self, dfs, exact_keys=[], fuzzy_keys=[], exact_how='inner', fuzzy_how = {}, keys_bydf=False, init_merge=False): 57 | 58 | """ 59 | 60 | Smart joiner for top 1 similarity joins. By setting fuzzy keys, it calculates similarity metrics for strings, numbers and dates to join on the closest matching entry. 61 | 62 | Args: 63 | dfs (list): list of dataframes 64 | exact_keys (list): list of join keys for exact joins. See notes for details 65 | fuzzy_keys (list): list of join keys for fuzzy joins. See notes for details 66 | exact_how (str): exact join mode same as `pd.merge(how='inner')` 67 | fuzzy_how (dict): specify fuzzy join options by merge level eg {0:{'top_limit':1}} 68 | keys_bydf (bool): if keys list is by dataframe (default) or join level. See notes for details 69 | 70 | Note: 71 | * specifying join keys: 72 | * if both dataframes have matching columns: `fuzzy_keys=['key1','key2']` 73 | * else: `fuzzy_keys=[['key1df1','key1df2'],['key2df1','key2df2']]` 74 | * by default you provide keys by join level eg `[['key1df1','key1df2'],['key2df1','key2df2']]` instead you can also provide keys by dataframe `[['key1df1','key2df1'],['key1df2','key2df2']], keys_bydf=True` 75 | * fuzzy_how: controls join options by join level 76 | * dict keys are join level eg with `fuzzy_keys=[['key1df1','key1df2'],['key2df1','key2df2']]` you set `fuzzy_how={0:{'top_nrecords':5},0:{'top_nrecords':5}}` 77 | * options are: 78 | * fun_diff: difference function or list of difference functions applied sequentially. 
Needs to be 0=similar and >0 dissimilar 79 | * top_limit: maximum difference, keep only canidates with difference <= top_limit 80 | * top_nrecords: keep only n top_nrecords, good for generating previews 81 | 82 | """ 83 | 84 | # inputs dfs 85 | self._init_dfs(dfs) 86 | 87 | # check and save join keys 88 | if not exact_keys and not fuzzy_keys: 89 | raise ValueError("Must provide at least one of exact_keys or fuzzy_keys") 90 | 91 | self.keys_exact, self.keysdf_exact = self._prep_keys(exact_keys, keys_bydf) 92 | if self.keys_exact: 93 | self._check_keysdfs(self.keys_exact, self.keysdf_exact) 94 | 95 | self.keys_fuzzy, self.keysdf_fuzzy = self._prep_keys(fuzzy_keys, keys_bydf) 96 | if self.keys_fuzzy: 97 | self._check_keysdfs(self.keys_fuzzy, self.keysdf_fuzzy) 98 | 99 | # todo: no duplicate join keys passed 100 | 101 | if not isinstance(exact_how, (str,)): 102 | raise NotImplementedError('exact_how can only be applied globally for now') 103 | elif exact_how not in ('left','right','inner','outer'): 104 | raise ValueError("Invalid how parameter, check documentation for valid values") 105 | 106 | self.cfg_njoins_exact = len(self.keysdf_exact[0]) if self.keysdf_exact else 0 107 | self.cfg_njoins_fuzzy = len(self.keysdf_fuzzy[0]) if self.keysdf_fuzzy else 0 108 | 109 | if self.cfg_njoins_fuzzy>1: 110 | # raise NotImplementedError('Currently supports only 1 fuzzy key') 111 | warnings.warn('Multi-key fuzzy joins are currently done globally for each key indivudally, not hierarchically for each unique fuzzy key value pair') 112 | 113 | self.exact_how = exact_how 114 | self.set_fuzzy_how_all(fuzzy_how) 115 | 116 | if init_merge: 117 | self.join() 118 | else: 119 | self.dfjoined = None 120 | 121 | self.table_fuzzy = {} 122 | 123 | 124 | def set_fuzzy_how(self, ilevel, fuzzy_how): 125 | self.fuzzy_how[ilevel] = fuzzy_how 126 | self._gen_fuzzy_how(ilevel) 127 | 128 | def set_fuzzy_how_all(self, fuzzy_how): 129 | if not isinstance(fuzzy_how, (dict,)): 130 | raise ValueError('fuzzy_how needs to be a dict') 131 | self.fuzzy_how = fuzzy_how 132 | self._gen_fuzzy_how_all() 133 | 134 | def _gen_fuzzy_how_all(self): 135 | 136 | for ilevel in range(self.cfg_njoins_fuzzy): 137 | self._gen_fuzzy_how(ilevel) 138 | 139 | def _gen_fuzzy_how(self, ilevel): 140 | 141 | # check if entry exists 142 | cfg_top1 = self.fuzzy_how.get(ilevel,{}) 143 | 144 | keyleft = self.keys_fuzzy[ilevel][0] 145 | keyright = self.keys_fuzzy[ilevel][1] 146 | 147 | typeleft = self.dfs[0][keyleft].dtype 148 | typeright = self.dfs[1][keyright].dtype 149 | 150 | if 'type' not in cfg_top1: 151 | if typeleft == 'int64' or typeleft == 'float64' or typeleft == 'datetime64[ns]': 152 | cfg_top1['type'] = 'number' 153 | elif typeleft == 'object' and type(self.dfs[0][keyleft].values[~self.dfs[0][keyleft].isnull()][0])==str: 154 | cfg_top1['type'] = 'string' 155 | else: 156 | raise ValueError('Unrecognized data type for top match, need to pass fun_diff in arguments') 157 | 158 | # make defaults if no settings provided 159 | if 'fun_diff' not in cfg_top1: 160 | 161 | if cfg_top1['type'] == 'number': 162 | cfg_top1['fun_diff'] = pd.merge_asof 163 | elif cfg_top1['type'] == 'string': 164 | cfg_top1['fun_diff'] = diff_edit 165 | else: 166 | raise ValueError('Unrecognized data type for top match, need to pass fun_diff in arguments') 167 | else: 168 | is_valid = callable(cfg_top1['fun_diff']) or (type(cfg_top1['fun_diff']) == list and all([callable(f) for f in cfg_top1['fun_diff']])) 169 | if not is_valid: 170 | raise ValueError("'fun_diff' needs to be a 
function or a list of functions") 171 | 172 | if not type(cfg_top1['fun_diff']) == list: 173 | cfg_top1['fun_diff'] = [cfg_top1['fun_diff']] 174 | 175 | if 'top_limit' not in cfg_top1: 176 | cfg_top1['top_limit'] = None 177 | 178 | if 'top_nrecords' not in cfg_top1: 179 | cfg_top1['top_nrecords'] = None 180 | 181 | cfg_top1['dir'] = 'left' 182 | 183 | # save config 184 | # check if entry exists 185 | self.fuzzy_how[ilevel] = cfg_top1 186 | 187 | def preview_fuzzy(self, ilevel, top_nrecords=5): 188 | if top_nrecords>0: 189 | return self._gen_match_top1(ilevel, top_nrecords) 190 | else: 191 | return self._gen_match_top1(ilevel) 192 | 193 | def _gen_match_top1_left_number(self, cfg_group_left, cfg_group_right, keyleft, keyright, top_nrecords): 194 | if len(cfg_group_left) > 0: 195 | 196 | # unique values 197 | if top_nrecords is None: 198 | # df_keys_left = pd.DataFrame(self.dfs[0].groupby(cfg_group_left)[keyleft].unique()) 199 | df_keys_left = self.dfs[0].groupby(cfg_group_left)[keyleft].apply(lambda x: pd.Series(x.unique())) 200 | df_keys_left.index = df_keys_left.index.droplevel(1) 201 | df_keys_left = pd.DataFrame(df_keys_left) 202 | else: 203 | # df_keys_left = pd.DataFrame(self.dfs[0].groupby(cfg_group_left)[keyleft].unique()[:top_nrecords]) 204 | df_keys_left = self.dfs[0].groupby(cfg_group_left)[keyleft].apply(lambda x: pd.Series(x.unique()[:top_nrecords])) 205 | df_keys_left.index = df_keys_left.index.droplevel(1) 206 | df_keys_left = pd.DataFrame(df_keys_left) 207 | df_keys_right = self.dfs[1].groupby(cfg_group_right)[keyright].apply(lambda x: pd.Series(x.unique())) 208 | df_keys_right.index = df_keys_right.index.droplevel(1) 209 | df_keys_right = pd.DataFrame(df_keys_right) 210 | # df_keys_right = pd.DataFrame(self.dfs[1].groupby(cfg_group_right)[keyright].unique()) 211 | 212 | # sort 213 | df_keys_left = df_keys_left.sort_values(keyleft).reset_index().rename(columns={keyleft:'__top1left__'}) 214 | df_keys_right = df_keys_right.sort_values(keyright).reset_index().rename(columns={keyright:'__top1right__'}) 215 | 216 | df_match = pd.merge_asof(df_keys_left, df_keys_right, left_on='__top1left__', right_on='__top1right__', left_by=cfg_group_left, right_by=cfg_group_right, direction='nearest') 217 | else: 218 | # uniques 219 | values_left = set_values(self.dfs[0], keyleft) 220 | values_right = set_values(self.dfs[1], keyright) 221 | 222 | if top_nrecords: 223 | values_left = values_left[:top_nrecords] 224 | 225 | df_keys_left = pd.DataFrame({'__top1left__':values_left}).sort_values('__top1left__') 226 | df_keys_right = pd.DataFrame({'__top1right__':values_right}).sort_values('__top1right__') 227 | 228 | df_match = pd.merge_asof(df_keys_left, df_keys_right, left_on='__top1left__', right_on='__top1right__', direction='nearest') 229 | 230 | df_match['__top1diff__'] = (df_match['__top1left__']-df_match['__top1right__']).abs() 231 | 232 | return df_match 233 | 234 | def _gen_match_top1(self, ilevel, top_nrecords=None): 235 | """ 236 | 237 | Generates match table between two sets 238 | 239 | Args: 240 | keyssets (dict): values for join keys ['key left', 'key right', 'keyset left', 'keyset right', 'inner', 'outer', 'unmatched total', 'unmatched left', 'unmatched right'] 241 | mode (str, list): global string or list for each join. 
Possible values: ['exact inner', 'exact left', 'exact right', 'exact outer', 'top1 left', 'top1 right', 'top1 bidir all', 'top1 bidir unmatched'] 242 | is_lower_better (bool): True = difference, False = Similarity 243 | 244 | """ 245 | 246 | cfg_top1 = self.fuzzy_how[ilevel] 247 | fun_diff = cfg_top1['fun_diff'] 248 | top_limit = cfg_top1['top_limit'] 249 | if not top_nrecords: 250 | top_nrecords = cfg_top1['top_nrecords'] 251 | 252 | keyleft = self.keys_fuzzy[ilevel][0] 253 | keyright = self.keys_fuzzy[ilevel][1] 254 | 255 | #****************************************** 256 | # table LEFT 257 | #****************************************** 258 | if cfg_top1['dir']=='left': 259 | 260 | # exact keys for groupby 261 | cfg_group_left = self.keysdf_exact[0] if self.keysdf_exact else [] 262 | cfg_group_right = self.keysdf_exact[1] if self.keysdf_exact else [] 263 | 264 | if cfg_top1['type'] == 'string' or (cfg_top1['type'] == 'number' and cfg_top1['fun_diff'] != [pd.merge_asof]): 265 | 266 | if len(cfg_group_left)>0: 267 | # generate candidates if exact matches are present (= blocking index) 268 | 269 | if top_nrecords is None: 270 | df_keys_left = pd.DataFrame(self.dfs[0].groupby(cfg_group_left)[keyleft].unique()) 271 | else: 272 | df_keys_left = pd.DataFrame(self.dfs[0].groupby(cfg_group_left)[keyleft].unique()[:top_nrecords]) 273 | df_keys_right = pd.DataFrame(self.dfs[1].groupby(cfg_group_right)[keyright].unique()) 274 | df_keysets_groups = df_keys_left.merge(df_keys_right,left_index=True, right_index=True) 275 | df_keysets_groups.columns = ['__top1left__','__top1right__'] 276 | dfg = df_keysets_groups.reset_index().groupby(cfg_group_left).apply(apply_gen_candidates_group) 277 | dfg = dfg.reset_index(-1,drop=True).reset_index() 278 | dfg = dfg.dropna() 279 | 280 | else: 281 | # generate candidates if NO exact matches 282 | values_left = set_values(self.dfs[0],keyleft) 283 | values_right = set_values(self.dfs[1],keyright) 284 | 285 | if top_nrecords is None: 286 | dfg = apply_gen_candidates(values_left,values_right) 287 | else: 288 | dfg = apply_gen_candidates(values_left[:top_nrecords], values_right) 289 | 290 | 291 | # find exact matches and remove from candidates 292 | # todo: use set logic before generating candidates 293 | idxSelExact = dfg['__top1left__']==dfg['__top1right__'] 294 | df_match_exact = dfg[idxSelExact].copy() 295 | df_match_exact['__match type__'] = 'exact' 296 | df_match_exact['__top1diff__'] = 0 297 | 298 | idxSel = dfg['__top1left__'].isin(df_match_exact['__top1left__']) 299 | dfg = dfg[~idxSel] 300 | 301 | for fun_diff in cfg_top1['fun_diff']: 302 | dfg['__top1diff__'] = dfg.apply(lambda x: fun_diff(x['__top1left__'], x['__top1right__']), axis=1) 303 | 304 | # filtering 305 | if not top_limit is None: 306 | dfg = dfg[dfg['__top1diff__'] <= top_limit] 307 | 308 | # get top 1 309 | dfg = dfg.groupby('__top1left__',group_keys=False).apply(lambda x: filter_group_minmax(x,'__top1diff__')) 310 | 311 | # return results 312 | dfg['__match type__'] = 'top1 left' 313 | df_match = pd.concat([dfg,df_match_exact]) 314 | 315 | elif cfg_top1['type'] == 'number' and cfg_top1['fun_diff'] == [pd.merge_asof]: 316 | df_match = self._gen_match_top1_left_number(cfg_group_left, cfg_group_right, keyleft, keyright, top_nrecords).copy() 317 | 318 | # filtering 319 | if not top_limit is None: 320 | df_match = df_match[df_match['__top1diff__'] <= top_limit] 321 | 322 | df_match['__match type__'] = 'top1 left' 323 | df_match.loc[df_match['__top1left__'] == df_match['__top1right__'], '__match type__'] 
= 'exact' 324 | else: 325 | raise ValueError('Dev error: cfg_top1["type/fun_diff"]') 326 | 327 | 328 | #****************************************** 329 | # table RIGHT 330 | #****************************************** 331 | elif cfg_top1['dir']=='right' or cfg_top1['dir'] == 'inner': 332 | raise NotImplementedError('Only use left join for now') 333 | else: 334 | raise ValueError("wrong 'how' parameter for top1 join, check documentation") 335 | 336 | return {'key left':keyleft, 'key right':keyright, 337 | 'table':df_match,'has duplicates':df_match.groupby('__top1left__').size().max()>1} 338 | 339 | def run_match_top1_all(self, cfg_top1=None): 340 | 341 | for ilevel in range(self.cfg_njoins_fuzzy): 342 | self.table_fuzzy[ilevel] = self._gen_match_top1(ilevel) 343 | 344 | def join(self, is_keep_debug=False): 345 | if self.cfg_njoins_fuzzy==0: 346 | self.dfjoined = self.dfs[0].merge(self.dfs[1], left_on=self.keysdf_exact[0], right_on=self.keysdf_exact[1], how=self.exact_how) 347 | else: 348 | 349 | self.run_match_top1_all() 350 | 351 | cfg_group_left = self.keysdf_exact[0] if self.keysdf_exact else [] 352 | cfg_group_right = self.keysdf_exact[1] if self.keysdf_exact else [] 353 | self.dfjoined = self.dfs[0] 354 | for ilevel in range(self.cfg_njoins_fuzzy): 355 | keyleft = self.keys_fuzzy[ilevel][0] 356 | keyright = self.keys_fuzzy[ilevel][1] 357 | dft = self.table_fuzzy[ilevel]['table'].copy() 358 | dft.columns = [s + keyleft if s.startswith('__') else s for s in dft.columns] 359 | self.dfjoined = self.dfjoined.merge(dft, left_on=cfg_group_left+[keyleft], right_on=cfg_group_left+['__top1left__'+keyleft]) 360 | pass 361 | 362 | cfg_keys_left = cfg_group_left+['__top1right__'+k for k in self.keysdf_fuzzy[0]] 363 | cfg_keys_right = cfg_group_right+[k for k in self.keysdf_fuzzy[1]] 364 | 365 | self.dfjoined = self.dfjoined.merge(self.dfs[1], left_on = cfg_keys_left, right_on = cfg_keys_right, suffixes=['','__right__']) 366 | 367 | if not is_keep_debug: 368 | self.dfjoined = self.dfjoined[self.dfjoined.columns[~self.dfjoined.columns.str.startswith('__')]] 369 | 370 | return self.dfjoined 371 | 372 | 373 | -------------------------------------------------------------------------------- /d6tjoin/top1.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from collections import OrderedDict 4 | import itertools 5 | import warnings 6 | import jellyfish 7 | 8 | # ****************************************** 9 | # helpers 10 | # ****************************************** 11 | 12 | 13 | from d6tjoin.utils import _applyFunMulticore, _filter_group_min, _set_values 14 | 15 | class MergeTop1Diff(object): 16 | """ 17 | 18 | Top1 minimum difference join. Mostly used for strings. Helper for `MergeTop1`. 
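    Example (a sketch; `df1` and `df2` are placeholder dataframes sharing a fuzzy `id` column)::

        import jellyfish
        result = MergeTop1Diff(df1, df2, fuzzy_left_on='id', fuzzy_right_on='id',
                               fun_diff=jellyfish.levenshtein_distance).merge()
        result['merged']  # joined dataframe
        result['top1']    # best-match table with __top1diff__ distances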
19 | 20 | """ 21 | 22 | def __init__(self, df1, df2, fuzzy_left_on, fuzzy_right_on, fun_diff=None, exact_left_on=None, exact_right_on=None, 23 | top_limit=None, topn=1, fun_preapply = None, fun_postapply = None, is_keep_debug=False, use_multicore=True): 24 | 25 | # check exact keys 26 | if not exact_left_on: 27 | exact_left_on = [] 28 | if not exact_right_on: 29 | exact_right_on = [] 30 | 31 | if not isinstance(fuzzy_left_on, (str,)) or not isinstance(fuzzy_right_on, (str,)): 32 | raise ValueError('fuzzy_on needs to be a string') 33 | 34 | if len(exact_left_on) != len(exact_right_on): 35 | raise ValueError('Need to pass same number of exact keys') 36 | if not isinstance(exact_left_on, (list)) or not isinstance(exact_right_on, (list)): 37 | raise ValueError('Exact keys need to be a list') 38 | 39 | if not callable(fun_diff): 40 | raise ValueError('fun_diff needs to a function') 41 | 42 | if (fun_preapply and fun_postapply) and (not callable(fun_preapply) or not callable(fun_postapply)): 43 | raise ValueError('fun_preapply and fun_postapply needs to a function') 44 | 45 | # use blocking index? 46 | if not exact_left_on and not exact_right_on: 47 | self.cfg_is_block = False 48 | elif exact_left_on and exact_right_on: 49 | self.cfg_is_block = True 50 | else: 51 | raise ValueError('Need to pass exact keys for both or neither dataframe') 52 | 53 | # store data 54 | self.dfs = [df1,df2] 55 | 56 | # store config 57 | self.cfg_fuzzy_left_on = fuzzy_left_on 58 | self.cfg_fuzzy_right_on = fuzzy_right_on 59 | self.cfg_exact_left_on = exact_left_on 60 | self.cfg_exact_right_on = exact_right_on 61 | self.cfg_fun_diff = fun_diff 62 | self.cfg_fun_preapply = fun_preapply 63 | self.cfg_fun_postapply = fun_postapply 64 | self.cfg_top_limit = top_limit 65 | self.cfg_is_keep_debug = is_keep_debug 66 | self.cfg_topn = topn 67 | self.cfg_use_multicore = use_multicore 68 | 69 | def _allpairs_candidates(self): 70 | values_left = _set_values(self.dfs[0], self.cfg_fuzzy_left_on) 71 | values_right = _set_values(self.dfs[1], self.cfg_fuzzy_right_on) 72 | 73 | if self.cfg_topn>1: 74 | values_left_exact = set() 75 | values_left_fuzzy = values_left 76 | else: 77 | values_left_exact = values_left.intersection(values_right) 78 | values_left_fuzzy = values_left.difference(values_right) 79 | 80 | # pre apply a function 81 | if self.cfg_fun_preapply: 82 | values_left_fuzzy = [self.cfg_fun_preapply(v) for v in values_left_fuzzy] 83 | values_right = [self.cfg_fun_preapply(v) for v in values_right] 84 | 85 | df_candidates_fuzzy = list(itertools.product(values_left_fuzzy, values_right)) 86 | df_candidates_fuzzy = pd.DataFrame(df_candidates_fuzzy,columns=['__top1left__','__top1right__']) 87 | df_candidates_fuzzy['__matchtype__'] = 'top1 left' 88 | 89 | df_candidates_exact = pd.DataFrame({'__top1left__': list(values_left_exact)}) 90 | df_candidates_exact['__top1right__'] = df_candidates_exact['__top1left__'] 91 | df_candidates_exact['__matchtype__'] = 'exact' 92 | 93 | df_candidates = df_candidates_exact.append(df_candidates_fuzzy, ignore_index=True) 94 | 95 | return df_candidates 96 | 97 | def _top1_diff_noblock(self): 98 | df_candidates = self._allpairs_candidates() 99 | 100 | idxSel = df_candidates['__matchtype__'] != 'exact' 101 | if self.cfg_use_multicore: 102 | df_candidates.loc[idxSel, '__top1diff__'] = _applyFunMulticore(df_candidates.loc[idxSel,'__top1left__'].values, df_candidates.loc[idxSel,'__top1right__'].values,self.cfg_fun_diff) 103 | else: 104 | df_candidates.loc[idxSel,'__top1diff__'] = 
df_candidates[idxSel].apply(lambda x: self.cfg_fun_diff(x['__top1left__'], x['__top1right__']), axis=1) 105 | 106 | df_candidates.loc[~idxSel, '__top1diff__'] = 0 107 | has_duplicates = False 108 | 109 | if self.cfg_fun_postapply: 110 | df_candidates['__top1left__']=df_candidates['__top1left__'].apply(self.cfg_fun_postapply,1) 111 | df_candidates['__top1right__']=df_candidates['__top1right__'].apply(self.cfg_fun_postapply,1) 112 | 113 | df_diff = df_candidates.groupby('__top1left__',group_keys=False).apply(lambda x: _filter_group_min(x,'__top1diff__',self.cfg_topn)) 114 | if self.cfg_top_limit is not None: 115 | df_diff = df_diff[df_diff['__top1diff__']<=self.cfg_top_limit] 116 | has_duplicates = df_diff.groupby('__top1left__').size().max()>1 117 | if has_duplicates: 118 | warnings.warn('Top1 join for %s has duplicates' %self.cfg_fuzzy_left_on) 119 | 120 | return df_diff, has_duplicates 121 | 122 | 123 | def _merge_top1_diff_noblock(self): 124 | df_diff, has_duplicates = self._top1_diff_noblock() 125 | dfjoin = self.dfs[0].merge(df_diff, left_on=self.cfg_fuzzy_left_on, right_on='__top1left__') 126 | dfjoin = dfjoin.merge(self.dfs[1], left_on='__top1right__', right_on=self.cfg_fuzzy_right_on, suffixes=['','__right__']) 127 | 128 | if not self.cfg_is_keep_debug: 129 | dfjoin = dfjoin[dfjoin.columns[~dfjoin.columns.str.startswith('__')]] 130 | 131 | return {'merged':dfjoin, 'top1':df_diff, 'duplicates':has_duplicates} 132 | 133 | 134 | def _top1_diff_withblock(self): 135 | 136 | def apply_gen_candidates_group(dfg): 137 | return pd.DataFrame(list(itertools.product(dfg['__top1left__'].values[0],dfg['__top1right__'].values[0])),columns=['__top1left__','__top1right__']) 138 | 139 | # find key unique values 140 | keysleft = self.dfs[0][self.cfg_exact_left_on+[self.cfg_fuzzy_left_on]].drop_duplicates().dropna() 141 | keysright = self.dfs[1][self.cfg_exact_right_on+[self.cfg_fuzzy_right_on]].drop_duplicates().dropna() 142 | keysleft = {tuple(x) for x in keysleft.values} 143 | keysright = {tuple(x) for x in keysright.values} 144 | values_left_exact = keysleft.intersection(keysright) 145 | values_left_fuzzy = keysleft.difference(keysright) 146 | 147 | df_keys_left_exact = pd.DataFrame(list(values_left_exact)) 148 | if not df_keys_left_exact.empty: 149 | df_keys_left_exact.columns = self.cfg_exact_left_on+['__top1left__'] 150 | df_keys_left_exact['__top1right__']=df_keys_left_exact['__top1left__'] 151 | df_keys_left_exact['__matchtype__'] = 'exact' 152 | 153 | df_keys_left_fuzzy = pd.DataFrame(list(values_left_fuzzy)) 154 | if not df_keys_left_fuzzy.empty: 155 | df_keys_left_fuzzy.columns = self.cfg_exact_left_on+[self.cfg_fuzzy_left_on] 156 | 157 | # fuzzy pair candidates 158 | df_keys_left = pd.DataFrame(df_keys_left_fuzzy.groupby(self.cfg_exact_left_on)[self.cfg_fuzzy_left_on].unique()) 159 | df_keys_right = pd.DataFrame(self.dfs[1].groupby(self.cfg_exact_right_on)[self.cfg_fuzzy_right_on].unique()) 160 | df_keysets_groups = df_keys_left.merge(df_keys_right, left_index=True, right_index=True) 161 | df_keysets_groups.columns = ['__top1left__', '__top1right__'] 162 | df_keysets_groups = df_keysets_groups.reset_index().groupby(self.cfg_exact_left_on).apply(apply_gen_candidates_group) 163 | df_keysets_groups = df_keysets_groups.reset_index(-1, drop=True).reset_index() 164 | df_keysets_groups = df_keysets_groups.dropna() 165 | 166 | df_candidates = df_keysets_groups[['__top1left__', '__top1right__']].drop_duplicates() 167 | if self.cfg_use_multicore: 168 | df_candidates['__top1diff__'] = 
_applyFunMulticore(df_candidates['__top1left__'].values, df_candidates['__top1right__'].values, self.cfg_fun_diff) 169 | else: 170 | df_candidates['__top1diff__'] = df_candidates.apply(lambda x: self.cfg_fun_diff(x['__top1left__'], x['__top1right__']), axis=1) 171 | df_candidates['__matchtype__'] = 'top1 left' 172 | 173 | # calculate difference 174 | df_diff = df_keysets_groups.merge(df_candidates, on=['__top1left__', '__top1right__']) 175 | 176 | df_diff = df_diff.append(df_keys_left_exact) 177 | df_diff['__top1diff__']=df_diff['__top1diff__'].fillna(0) # exact keys 178 | df_diff = df_diff.groupby(self.cfg_exact_left_on+['__top1left__'],group_keys=False).apply(lambda x: _filter_group_min(x,'__top1diff__')) 179 | if self.cfg_top_limit is not None: 180 | df_diff = df_diff[df_diff['__top1diff__']<=self.cfg_top_limit] 181 | has_duplicates = df_diff.groupby(self.cfg_exact_left_on+['__top1left__']).size().max()>1 182 | 183 | return df_diff, has_duplicates 184 | 185 | 186 | def _merge_top1_diff_withblock(self): 187 | 188 | df_diff, has_duplicates = self._top1_diff_withblock() 189 | 190 | dfjoin = self.dfs[0].merge(df_diff, left_on=self.cfg_exact_left_on+[self.cfg_fuzzy_left_on], right_on=self.cfg_exact_left_on+['__top1left__']) 191 | # todo: add exact join keys 192 | dfjoin = dfjoin.merge(self.dfs[1], left_on=self.cfg_exact_left_on+['__top1right__'], right_on=self.cfg_exact_right_on+[self.cfg_fuzzy_right_on], suffixes=['','__right__']) 193 | 194 | if not self.cfg_is_keep_debug: 195 | dfjoin = dfjoin[dfjoin.columns[~dfjoin.columns.str.startswith('__')]] 196 | 197 | return {'merged':dfjoin, 'top1':df_diff, 'duplicates':has_duplicates} 198 | 199 | def top1_diff(self): 200 | if self.cfg_is_block: 201 | return self._top1_diff_withblock() 202 | else: 203 | return self._top1_diff_noblock() 204 | 205 | def merge(self): 206 | 207 | if not self.cfg_exact_left_on and not self.cfg_exact_right_on: 208 | return self._merge_top1_diff_noblock() 209 | elif self.cfg_exact_left_on and self.cfg_exact_right_on: 210 | return self._merge_top1_diff_withblock() 211 | else: 212 | raise ValueError('Need to pass exact keys for both or neither dataframe') 213 | 214 | 215 | class MergeTop1Number(object): 216 | """ 217 | 218 | Top1 minimum difference join for numbers. Helper for `MergeTop1`. 219 | 220 | """ 221 | 222 | def __init__(self, df1, df2, fuzzy_left_on, fuzzy_right_on, exact_left_on=None, exact_right_on=None, 223 | direction='nearest', top_limit=None, is_keep_debug=False): 224 | 225 | # check exact keys 226 | if not exact_left_on: 227 | exact_left_on = [] 228 | if not exact_right_on: 229 | exact_right_on = [] 230 | 231 | if len(exact_left_on) != len(exact_right_on): 232 | raise ValueError('Need to pass same number of exact keys') 233 | if not isinstance(exact_left_on, (list)) or not isinstance(exact_right_on, (list)): 234 | raise ValueError('Exact keys need to be a list') 235 | 236 | # use blocking index? 
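        # exact keys, if given, act as a blocking index: the nearest match is
        # searched only among rows sharing the same exact key values
        # (merge_asof left_by/right_by) instead of across the whole dataframe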
237 | if not exact_left_on and not exact_right_on: 238 | self.cfg_is_block = False 239 | elif exact_left_on and exact_right_on: 240 | self.cfg_is_block = True 241 | else: 242 | raise ValueError('Need to pass exact keys for both or neither dataframe') 243 | 244 | # store data 245 | self.dfs = [df1,df2] 246 | 247 | # store config 248 | self.cfg_fuzzy_left_on = fuzzy_left_on 249 | self.cfg_fuzzy_right_on = fuzzy_right_on 250 | self.cfg_exact_left_on = exact_left_on 251 | self.cfg_exact_right_on = exact_right_on 252 | self.cfg_direction = direction 253 | self.cfg_top_limit = top_limit 254 | self.cfg_is_keep_debug = is_keep_debug 255 | 256 | def _top1_diff_withblock(self): 257 | 258 | # unique values 259 | df_keys_left = self.dfs[0].groupby(self.cfg_exact_left_on)[self.cfg_fuzzy_left_on].apply(lambda x: pd.Series(x.unique())) 260 | df_keys_left.index = df_keys_left.index.droplevel(-1) 261 | df_keys_left = pd.DataFrame(df_keys_left) 262 | df_keys_right = self.dfs[1].groupby(self.cfg_exact_right_on)[self.cfg_fuzzy_right_on].apply(lambda x: pd.Series(x.unique())) 263 | df_keys_right.index = df_keys_right.index.droplevel(-1) 264 | df_keys_right = pd.DataFrame(df_keys_right) 265 | 266 | # todo: global consolidation like with MergeTop1Diff 267 | 268 | # sort 269 | df_keys_left = df_keys_left.sort_values(self.cfg_fuzzy_left_on).reset_index().rename(columns={self.cfg_fuzzy_left_on:'__top1left__'}) 270 | df_keys_right = df_keys_right.sort_values(self.cfg_fuzzy_right_on).reset_index().rename(columns={self.cfg_fuzzy_right_on:'__top1right__'}) 271 | 272 | # merge 273 | df_diff = pd.merge_asof(df_keys_left, df_keys_right, left_on='__top1left__', right_on='__top1right__', left_by=self.cfg_exact_left_on, right_by=self.cfg_exact_right_on, direction=self.cfg_direction) 274 | df_diff['__top1diff__'] = (df_diff['__top1left__']-df_diff['__top1right__']).abs() 275 | df_diff['__matchtype__'] = 'top1 left' 276 | df_diff.loc[df_diff['__top1left__'] == df_diff['__top1right__'], '__matchtype__'] = 'exact' 277 | if self.cfg_top_limit is not None: 278 | df_diff = df_diff[df_diff['__top1diff__']<=self.cfg_top_limit] 279 | 280 | return df_diff 281 | 282 | def _top1_diff_noblock(self): 283 | # uniques 284 | values_left = _set_values(self.dfs[0], self.cfg_fuzzy_left_on) 285 | values_right = _set_values(self.dfs[1], self.cfg_fuzzy_right_on) 286 | 287 | # sort 288 | df_keys_left = pd.DataFrame({'__top1left__':list(values_left)}).sort_values('__top1left__') 289 | df_keys_right = pd.DataFrame({'__top1right__':list(values_right)}).sort_values('__top1right__') 290 | 291 | # merge 292 | df_diff = pd.merge_asof(df_keys_left, df_keys_right, left_on='__top1left__', right_on='__top1right__', direction=self.cfg_direction) 293 | df_diff['__top1diff__'] = (df_diff['__top1left__']-df_diff['__top1right__']).abs() 294 | df_diff['__matchtype__'] = 'top1 left' 295 | df_diff.loc[df_diff['__top1left__'] == df_diff['__top1right__'], '__matchtype__'] = 'exact' 296 | 297 | return df_diff 298 | 299 | def top1_diff(self): 300 | if self.cfg_is_block: 301 | return self._top1_diff_withblock() 302 | else: 303 | return self._top1_diff_noblock() 304 | 305 | def merge(self): 306 | df_diff = self.top1_diff() 307 | 308 | dfjoin = self.dfs[0].merge(df_diff, left_on=self.cfg_exact_left_on+[self.cfg_fuzzy_left_on], right_on=self.cfg_exact_left_on+['__top1left__']) 309 | dfjoin = dfjoin.merge(self.dfs[1], left_on=self.cfg_exact_left_on+['__top1right__'], right_on=self.cfg_exact_right_on+[self.cfg_fuzzy_right_on], suffixes=['','__right__']) 310 | 311 | if not 
self.cfg_is_keep_debug:
312 | dfjoin = dfjoin[dfjoin.columns[~dfjoin.columns.str.startswith('__')]]
313 |
314 | return {'merged': dfjoin, 'top1': df_diff, 'duplicates': None}
315 |
316 | class MergeTop1(object):
317 | """
318 |
319 | Left best match join. It applies a difference function to find the key pair with the smallest difference to the join key.
320 |
321 | Args:
322 | df1 (dataframe): left dataframe onto which the right dataframe is joined
323 | df2 (dataframe): right dataframe
324 | fuzzy_left_on (list): join keys for similarity match, left dataframe
325 | fuzzy_right_on (list): join keys for similarity match, right dataframe
326 | exact_left_on (list, default None): join keys for exact match, left dataframe
327 | exact_right_on (list, default None): join keys for exact match, right dataframe
328 | fun_diff (list, default None): list of difference functions to be applied for each fuzzy key
329 | top_limit (list, default None): list of values to cap similarity matches
330 | is_keep_debug (bool): keep diagnostics columns, good for debugging
331 |
332 | Note:
333 | * fun_diff: applies the difference function to find the best match with minimum distance
334 | * By default it is determined automatically depending on whether the key is a string or a date/number
335 | * Use `None` to keep the default for a key, for example [None, lambda x, y: x-y]
336 | * Functions in the list are applied in the same order as the fuzzy join keys
337 | * Needs to be a difference function, so lower is better. For similarity functions like Jaccard, higher is better, so you need to adjust for that
338 | * top_limit: keeps only matches whose difference is at or below that value. For example, if two strings differ by 3 but top_limit is 2, that match will be ignored
339 | * for dates you can use `pd.offsets.Day(1)` or similar
340 |
341 | """
342 |
343 | def __init__(self, df1, df2, fuzzy_left_on=None, fuzzy_right_on=None, exact_left_on=None, exact_right_on=None,
344 | fun_diff = None, top_limit=None, is_keep_debug=False, use_multicore=True):
345 |
346 |
347 | # todo: pass custom merge asof param
348 | # todo: pass list of fundiff
349 |
350 |
351 | # check fuzzy keys
352 | if not fuzzy_left_on or not fuzzy_right_on:
353 | raise ValueError('Need to pass fuzzy left and right keys')
354 | if len(fuzzy_left_on) != len(fuzzy_right_on):
355 | raise ValueError('Need to pass same number of fuzzy left and right keys')
356 | self.cfg_njoins_fuzzy = len(fuzzy_left_on)
357 |
358 | # check exact keys
359 | if not exact_left_on:
360 | exact_left_on = []
361 | if not exact_right_on:
362 | exact_right_on = []
363 |
364 | if len(exact_left_on) != len(exact_right_on):
365 | raise ValueError('Need to pass same number of exact keys')
366 | if not isinstance(exact_left_on, (list)) or not isinstance(exact_right_on, (list)):
367 | raise ValueError('Exact keys need to be a list')
368 |
369 |
370 | # use blocking index?
371 | if not exact_left_on and not exact_right_on:
372 | self.cfg_is_block = False
373 | elif exact_left_on and exact_right_on:
374 | self.cfg_is_block = True
375 | else:
376 | raise ValueError('Need to pass exact keys for both or neither dataframe')
377 |
378 | # check custom params
379 | if not top_limit:
380 | top_limit = [None,]*self.cfg_njoins_fuzzy
381 | if not fun_diff:
382 | fun_diff = [None,]*self.cfg_njoins_fuzzy
383 | elif len(fun_diff)!=len(fuzzy_left_on):
384 | raise ValueError('fun_diff needs to be the same length as fuzzy_left_on. Use None in list to use default')
385 | if not isinstance(top_limit, (list,)) or not len(top_limit)==self.cfg_njoins_fuzzy:
386 | raise NotImplementedError('top_limit needs to be a list with entries for each fuzzy join key')
387 | if not isinstance(fun_diff, (list,)) or not len(fun_diff)==self.cfg_njoins_fuzzy:
388 | raise NotImplementedError('fun_diff needs to be a list with entries for each fuzzy join key')
389 |
390 | # store data
391 | self.dfs = [df1,df2]
392 |
393 | # store config
394 | self.cfg_fuzzy_left_on = fuzzy_left_on
395 | self.cfg_fuzzy_right_on = fuzzy_right_on
396 | # todo: exact keys by fuzzy key? or just global?
397 | self.cfg_exact_left_on = exact_left_on
398 | self.cfg_exact_right_on = exact_right_on
399 | self.cfg_top_limit = top_limit
400 | self.cfg_fun_diff = fun_diff
401 | self.cfg_is_keep_debug = is_keep_debug
402 | self.cfg_use_multicore = use_multicore
403 |
404 | def merge(self):
405 | """
406 |
407 | Executes the merge
408 |
409 | Returns:
410 | dict: key 'merged' has the merged dataframe, key 'top1' has the best matches for each fuzzy_left_on key. See example notebooks for details
411 |
412 | """
413 | df_diff_bylevel = OrderedDict()
414 |
415 | self.dfjoined = self.dfs[0].copy()
416 | cfg_exact_left_on = self.cfg_exact_left_on
417 | cfg_exact_right_on = self.cfg_exact_right_on
418 |
419 |
420 | for ilevel, ikey in enumerate(self.cfg_fuzzy_left_on):
421 | keyleft = ikey
422 | keyright = self.cfg_fuzzy_right_on[ilevel]
423 | typeleft = self.dfs[0][keyleft].dtype
424 |
425 | if self.cfg_fun_diff[ilevel]:
426 | df_diff_bylevel[ikey] = MergeTop1Diff(self.dfjoined, self.dfs[1], keyleft, keyright, self.cfg_fun_diff[ilevel], cfg_exact_left_on, cfg_exact_right_on, top_limit=self.cfg_top_limit[ilevel], use_multicore=self.cfg_use_multicore).top1_diff()[0]
427 | else:
428 | if typeleft == 'int64' or typeleft == 'float64' or typeleft == 'datetime64[ns]':
429 | df_diff_bylevel[ikey] = MergeTop1Number(self.dfjoined, self.dfs[1], keyleft, keyright, cfg_exact_left_on, cfg_exact_right_on, top_limit=self.cfg_top_limit[ilevel]).top1_diff()
430 | elif typeleft == 'object' and type(self.dfs[0][keyleft].values[0])==str:
431 | df_diff_bylevel[ikey] = MergeTop1Diff(self.dfjoined, self.dfs[1], keyleft, keyright, jellyfish.levenshtein_distance, cfg_exact_left_on, cfg_exact_right_on, top_limit=self.cfg_top_limit[ilevel], use_multicore=self.cfg_use_multicore).top1_diff()[0]
432 | # todo: handle duplicates
433 | else:
434 | raise ValueError('Unrecognized data type for top match, need to pass fun_diff in arguments')
435 |
436 | self.dfjoined = self.dfjoined.merge(df_diff_bylevel[ikey], left_on=cfg_exact_left_on+[keyleft], right_on=cfg_exact_left_on+['__top1left__'], suffixes=['',keyleft])
437 | cfg_col_rename = ['__top1left__','__top1right__','__top1diff__','__matchtype__']
438 | self.dfjoined = self.dfjoined.rename(columns=dict((k,k+keyleft) for k in cfg_col_rename))
439 | cfg_exact_left_on += ['__top1right__%s'%keyleft,]
440 | cfg_exact_right_on += [keyright,]
441 |
442 | self.dfjoined = self.dfjoined.merge(self.dfs[1], left_on=cfg_exact_left_on, right_on=cfg_exact_right_on, suffixes=['','_right'])
443 |
444 | if not self.cfg_is_keep_debug:
445 | self.dfjoined = self.dfjoined[self.dfjoined.columns[~self.dfjoined.columns.str.startswith('__')]]
446 |
447 | return {'merged': self.dfjoined, 'top1': df_diff_bylevel, 'duplicates': None}
448 |
449 | '''
450 | multikey: want to merge the left match onto the right df
451 | don't do the numbers (non-key) join until the very end
452 | '''
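To make the `fun_diff` / `top_limit` conventions in the `MergeTop1` docstring above more concrete, here is a minimal sketch with hypothetical ids and column names (assuming `d6tjoin` and `jellyfish` are installed). Each list has one entry per fuzzy join key, in the same order; `None` keeps the default difference function and `top_limit` drops candidate pairs whose difference exceeds the cap:

```
import pandas as pd
import d6tjoin.top1

# hypothetical data: same entities, slightly different id conventions
df1 = pd.DataFrame({'date': pd.to_datetime(['2019-01-31', '2019-01-31']),
                    'id': ['ACME-1', 'BETA-2'], 'val1': [1, 2]})
df2 = pd.DataFrame({'date': pd.to_datetime(['2019-01-31', '2019-01-31']),
                    'id': ['ACME', 'BETA'], 'val2': [10, 20]})

result = d6tjoin.top1.MergeTop1(
    df1, df2,
    fuzzy_left_on=['id'], fuzzy_right_on=['id'],       # best-match key
    exact_left_on=['date'], exact_right_on=['date'],   # must match exactly
    fun_diff=[None],   # None keeps the default (Levenshtein distance for strings)
    top_limit=[2],     # ignore candidate pairs that differ by more than 2
).merge()

result['merged']      # joined dataframe
result['top1']['id']  # best-match pairs used for the 'id' key
```

Because the difference functions are distance-like (lower is better), a similarity score such as Jaccard would have to be flipped before being passed, for example `lambda a, b: 1 - jaccard_similarity(a, b)`, where `jaccard_similarity` stands in for whichever implementation you use.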
-------------------------------------------------------------------------------- /d6tjoin/utils.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import pandas as pd 4 | pd.set_option('display.expand_frame_repr', False) 5 | import numpy as np 6 | 7 | # ****************************************** 8 | # helpers 9 | # ****************************************** 10 | def _set_values_series(dfs): 11 | return set(dfs[~pd.isnull(dfs)]) 12 | 13 | def _set_values(dfg, key): 14 | return _set_values_series(dfg[key]) 15 | 16 | def _filter_group_min(dfg, col, topn=1): 17 | """ 18 | 19 | Returns all rows equal to min in col 20 | 21 | """ 22 | if topn==1: 23 | return dfg[dfg[col] == dfg[col].min()] 24 | else: 25 | return dfg[dfg[col].isin(np.sort(dfg[col].unique())[:topn])] 26 | 27 | from joblib import Parallel, delayed 28 | import multiprocessing 29 | def _applyFunMulticore(values1, values2, func): 30 | retLst = Parallel(n_jobs=multiprocessing.cpu_count())(delayed(func)(p[0],p[1]) for p in zip(values1,values2)) 31 | return retLst 32 | 33 | 34 | # ****************************************** 35 | # tfidf 36 | # ****************************************** 37 | import re 38 | import collections 39 | from joblib import Parallel, delayed 40 | import multiprocessing 41 | import itertools 42 | import warnings 43 | 44 | def tokenCount(dfs, fun, mincount=2, minlength=1): 45 | """ 46 | Tokenize a series of strings and count occurance of string tokens 47 | 48 | Args: 49 | dfs (pd.series): pd.series of values 50 | fun (function): tokenize function 51 | mincount (int): discard tokens with count less than mincount 52 | minlength (int): discard tokens with string length less than minlength 53 | 54 | Returns: 55 | dataframe: count of tokens 56 | 57 | """ 58 | assert len(dfs.shape)==1 59 | dfs=dfs.dropna().unique() 60 | 61 | if dfs.shape[0]>1000: 62 | words = Parallel(n_jobs=multiprocessing.cpu_count())(delayed(fun)(s) for s in dfs) 63 | else: 64 | words = [fun(s) for s in dfs] 65 | words = list(itertools.chain.from_iterable(words)) 66 | df_count = [t for t in collections.Counter(words).most_common() if t[1]>=mincount and len(t[0])>=minlength] 67 | df_count = pd.DataFrame(df_count, columns=['word','count']) 68 | return df_count 69 | 70 | def splitcharTokenCount(dfs, splitchars="[^a-zA-Z0-9]+", mincount=2, minlength=1): #"[ -_|]+" 71 | """ 72 | Tokenize a series of strings by splitting strings on a set of characters. Then count occurance of tokens in series. 73 | 74 | Args: 75 | dfs (pd.series): pd.series of values 76 | splitchars (str): regex by which to split string into tokens. For example `"[^a-zA-Z0-9]+"` for anything not alpha-numeric or `"[ -_|]+"` for common ID tokens. 77 | mincount (int): discard tokens with count less than mincount 78 | minlength (int): discard tokens with string length less than minlength 79 | 80 | Returns: 81 | dataframe: count of tokens 82 | 83 | """ 84 | def funsplit(s): 85 | return re.split(splitchars,s) 86 | return tokenCount(dfs, funsplit, mincount, minlength) 87 | 88 | def ncharTokenCount(dfs, nchars=None, overlapping=False, mincount=2, minlength=1): 89 | """ 90 | Tokenize a series of strings by splitting strings into tokens of `nchars` length. Then count occurance of tokens in series. 
91 | 92 | Args: 93 | dfs (pd.series): pd.series of values 94 | nchars (int): number of characters in each token 95 | overlapping (bool): make overlapping tokens 96 | mincount (int): discard tokens with count less than mincount 97 | minlength (int): discard tokens with string length less than minlength 98 | 99 | Returns: 100 | dataframe: count of tokens 101 | 102 | """ 103 | if not nchars: 104 | smax = dfs.str.len().max() 105 | smin = dfs.str.len().min() 106 | if smax-smin>2: 107 | warnings.warn('Tokenize works best if strings have similar length') 108 | nchars = dfs.str.len().max()//4 109 | 110 | if overlapping: 111 | def funtokenize(s): 112 | return [s[i:i+nchars] for i in range(0, len(s)-nchars+1)] 113 | else: 114 | def funtokenize(s): 115 | return [s[i:i+nchars] for i in range(0, len(s), nchars)] 116 | return tokenCount(dfs, funtokenize, mincount, minlength) 117 | 118 | 119 | def unique_contains(dfs, strlist): 120 | """ 121 | Find values which contain a set of substrings 122 | 123 | Args: 124 | dfs (pd.series): pd.series of values 125 | strlist (list): substrings to find 126 | 127 | Returns: 128 | list: unique values which contain substring 129 | 130 | """ 131 | assert len(dfs.shape)==1 132 | dfs=np.unique(dfs) 133 | outlist = [(x, [s for s in dfs if x in s]) for x in strlist] 134 | return outlist 135 | 136 | import collections 137 | 138 | def typeSeries(dfs): 139 | """ 140 | Find type of a pandas series 141 | 142 | Args: 143 | dfs (pd.series): pd.series of values 144 | 145 | Returns: 146 | str: type 147 | 148 | """ 149 | c = collections.Counter([type(x) for x in dfs.values]) 150 | cnt = c.most_common() 151 | if len(cnt)>1: 152 | return 'mixed' 153 | else: 154 | return cnt[0][0] 155 | 156 | def typeDataFrame(df): 157 | """ 158 | Find type of a pandas dataframe columns 159 | 160 | Args: 161 | df (pd.dataframe): pandas dataframe 162 | 163 | Returns: 164 | dict: column, type 165 | 166 | """ 167 | return dict(zip(df.columns,[typeSeries(df[s]) for s in df])) 168 | 169 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = python -msphinx 7 | SPHINXPROJ = d6tjoin 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=python -msphinx 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | set SPHINXPROJ=d6t-lib 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The Sphinx module was not found. 
Make sure you have Sphinx installed, 20 | echo.then set the SPHINXBUILD environment variable to point to the full 21 | echo.path of the 'sphinx-build' executable. Alternatively you may add the 22 | echo.Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/samples.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import itertools 4 | from faker import Faker 5 | import importlib 6 | 7 | import d6tjoin.top1 8 | import d6tjoin.utils 9 | 10 | importlib.reload(d6tjoin.top1) 11 | 12 | # ******************************************************* 13 | # generate sample time series data with id and value 14 | # ******************************************************* 15 | nobs = 10 16 | f1 = Faker() 17 | f1.seed(0) 18 | uuid1 = [str(f1.uuid4()).split('-')[0] for _ in range(nobs)] 19 | dates1 = pd.date_range('1/1/2010','1/1/2011') 20 | 21 | df1 = pd.DataFrame(list(itertools.product(dates1,uuid1)),columns=['date','id']) 22 | df1['val1']=np.round(np.random.sample(df1.shape[0]),3) 23 | 24 | # create mismatch 25 | df2 = df1.copy() 26 | df2['id'] = df1['id'].str[1:-1] 27 | df2['val2']=np.round(np.random.sample(df2.shape[0]),3) 28 | 29 | d6tjoin.utils.PreJoin([df1,df2],['id','date']).stats_prejoin() 30 | 31 | result = d6tjoin.top1.MergeTop1(df1.head(),df2,fuzzy_left_on=['id'],fuzzy_right_on=['id'],exact_left_on=['date'],exact_right_on=['date']).merge() 32 | 33 | print(result['top1']['id'].head(2)) 34 | 35 | print(result['merged'].head(2)) 36 | -------------------------------------------------------------------------------- /docs/shell-napoleon-html.sh: -------------------------------------------------------------------------------- 1 | make html 2 | -------------------------------------------------------------------------------- /docs/shell-napoleon-recreate.sh: -------------------------------------------------------------------------------- 1 | #rm ./source/* 2 | #cp ./source-bak/* ./source/ 3 | sphinx-apidoc -f -o ./source .. 4 | make clean 5 | make html 6 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # d6t-lib documentation build configuration file, created by 5 | # sphinx-quickstart on Tue Nov 28 11:32:56 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
19 | # 20 | import os 21 | import sys 22 | 23 | sys.path.insert(0, os.path.abspath('.')) 24 | sys.path.insert(0, os.path.dirname(os.path.abspath('.'))) # todo: why is this not working? 25 | sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath('.')))) 26 | sys.path.insert(0, os.path.join(os.path.dirname((os.path.abspath('.'))), "d6tjoin")) 27 | 28 | # -- General configuration ------------------------------------------------ 29 | 30 | # If your documentation needs a minimal Sphinx version, state it here. 31 | # 32 | # needs_sphinx = '1.0' 33 | 34 | # Add any Sphinx extension module names here, as strings. They can be 35 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 36 | # ones. 37 | extensions = ['sphinx.ext.autodoc', 38 | 'sphinx.ext.todo', 39 | 'sphinx.ext.viewcode', 40 | 'sphinx.ext.githubpages', 41 | 'sphinx.ext.napoleon'] 42 | 43 | # Add any paths that contain templates here, relative to this directory. 44 | templates_path = ['_templates'] 45 | 46 | # The suffix(es) of source filenames. 47 | # You can specify multiple suffix as a list of string: 48 | # 49 | # source_suffix = ['.rst', '.md'] 50 | source_suffix = '.rst' 51 | 52 | # The master toctree document. 53 | master_doc = 'index' 54 | 55 | # General information about the project. 56 | project = 'd6tjoin' 57 | copyright = '2017, databolt' 58 | author = 'databolt' 59 | 60 | # The version info for the project you're documenting, acts as replacement for 61 | # |version| and |release|, also used in various other places throughout the 62 | # built documents. 63 | # 64 | # The short X.Y version. 65 | version = '0.1' 66 | # The full version, including alpha/beta/rc tags. 67 | release = '0.1' 68 | 69 | # The language for content autogenerated by Sphinx. Refer to documentation 70 | # for a list of supported languages. 71 | # 72 | # This is also used if you do content translation via gettext catalogs. 73 | # Usually you set "language" from the command line for these cases. 74 | language = None 75 | 76 | # List of patterns, relative to source directory, that match files and 77 | # directories to ignore when looking for source files. 78 | # This patterns also effect to html_static_path and html_extra_path 79 | exclude_patterns = [] 80 | 81 | # The name of the Pygments (syntax highlighting) style to use. 82 | pygments_style = 'sphinx' 83 | 84 | # If true, `todo` and `todoList` produce output, else they produce nothing. 85 | todo_include_todos = True 86 | 87 | # -- Options for HTML output ---------------------------------------------- 88 | 89 | # The theme to use for HTML and HTML Help pages. See the documentation for 90 | # a list of builtin themes. 91 | # 92 | html_theme = 'sphinx_rtd_theme' # 'alabaster' 93 | 94 | # Theme options are theme-specific and customize the look and feel of a theme 95 | # further. For a list of options available for each theme, see the 96 | # documentation. 97 | # 98 | # html_theme_options = {} 99 | 100 | # Add any paths that contain custom static files (such as style sheets) here, 101 | # relative to this directory. They are copied after the builtin static files, 102 | # so a file named "default.css" will overwrite the builtin "default.css". 103 | html_static_path = ['_static'] 104 | 105 | # Custom sidebar templates, must be a dictionary that maps document names 106 | # to template names. 
107 | # 108 | # This is required for the alabaster theme 109 | # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars 110 | # html_sidebars = { 111 | # '**': [ 112 | # 'about.html', 113 | # 'navigation.html', 114 | # 'relations.html', # needs 'show_related': True theme option to display 115 | # 'searchbox.html', 116 | # 'donate.html', 117 | # ] 118 | # } 119 | 120 | 121 | # -- Options for HTMLHelp output ------------------------------------------ 122 | 123 | # Output file base name for HTML help builder. 124 | htmlhelp_basename = 'd6tjoin-doc' 125 | 126 | # -- Options for LaTeX output --------------------------------------------- 127 | 128 | latex_elements = { 129 | # The paper size ('letterpaper' or 'a4paper'). 130 | # 131 | # 'papersize': 'letterpaper', 132 | 133 | # The font size ('10pt', '11pt' or '12pt'). 134 | # 135 | # 'pointsize': '10pt', 136 | 137 | # Additional stuff for the LaTeX preamble. 138 | # 139 | # 'preamble': '', 140 | 141 | # Latex figure (float) alignment 142 | # 143 | # 'figure_align': 'htbp', 144 | } 145 | 146 | # Grouping the document tree into LaTeX files. List of tuples 147 | # (source start file, target name, title, 148 | # author, documentclass [howto, manual, or own class]). 149 | latex_documents = [ 150 | (master_doc, 'd6tjoin.tex', 'd6tjoin Documentation', 151 | 'nn', 'manual'), 152 | ] 153 | 154 | # -- Options for manual page output --------------------------------------- 155 | 156 | # One entry per manual page. List of tuples 157 | # (source start file, name, description, authors, manual section). 158 | man_pages = [ 159 | (master_doc, 'd6tjoin', 'd6tjoin Documentation', 160 | [author], 1) 161 | ] 162 | 163 | # -- Options for Texinfo output ------------------------------------------- 164 | 165 | # Grouping the document tree into Texinfo files. List of tuples 166 | # (source start file, target name, title, author, 167 | # dir menu entry, description, category) 168 | texinfo_documents = [ 169 | (master_doc, 'd6tjoin', 'd6tjoin Documentation', 170 | author, 'd6tjoin', 'Databolt python library - Accelerate data engineering', 171 | 'Miscellaneous'), 172 | ] 173 | -------------------------------------------------------------------------------- /docs/source/d6tjoin.rst: -------------------------------------------------------------------------------- 1 | d6tjoin package 2 | =============== 3 | 4 | Submodules 5 | ---------- 6 | 7 | d6tjoin\.top1 module 8 | -------------------- 9 | 10 | .. automodule:: d6tjoin.top1 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | d6tjoin\.utils module 16 | --------------------- 17 | 18 | .. automodule:: d6tjoin.utils 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | 24 | Module contents 25 | --------------- 26 | 27 | .. automodule:: d6tjoin 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. d6tjoin documentation master file, created by 2 | sphinx-quickstart on Tue Nov 28 11:32:56 2017. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to d6tjoin documentation! 7 | ============================================== 8 | 9 | Documentation for using the databolt python Smart Join Combine library. 
10 | 11 | Library Docs 12 | ================== 13 | 14 | * :ref:`modindex` 15 | 16 | Search 17 | ================== 18 | 19 | * :ref:`search` 20 | -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | d6tjoin 2 | ======= 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | d6tjoin 8 | setup 9 | tests 10 | -------------------------------------------------------------------------------- /docs/source/setup.rst: -------------------------------------------------------------------------------- 1 | setup module 2 | ============ 3 | 4 | .. automodule:: setup 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /examples-prejoin.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Engineering in Python with databolt - Identify and analyze join problems (d6tjoin.Prejoin)\n", 8 | "\n", 9 | "## Introduction\n", 10 | "\n", 11 | "Joining datasets is a common data engineering operation. However, often there are problems merging datasets from different sources because of mismatched identifiers, date conventions etc. \n", 12 | "\n", 13 | "** `d6tjoin.Prejoin` module allows you to test for join accuracy and quickly identify and analyze join problems. **\n", 14 | "\n", 15 | "Here are some examples which show you how to:\n", 16 | "* do join quality analysis prior to attempting a join\n", 17 | "* detect and analyze a string-based identifiers mismatch\n", 18 | "* detect and analyze a date mismatch" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "## Generate sample data\n", 26 | "\n", 27 | "Let's generate some random respresentative data:\n", 28 | "* identifier (string)\n", 29 | "* date (np.datetime)\n", 30 | "* values (flaot)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 1, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "import pandas as pd\n", 40 | "import numpy as np\n", 41 | "import uuid\n", 42 | "import itertools\n", 43 | "import importlib\n", 44 | "\n", 45 | "import d6tjoin\n", 46 | "\n", 47 | "# ******************************************\n", 48 | "# generate sample data\n", 49 | "# ******************************************\n", 50 | "nobs = 10\n", 51 | "uuid1 = [str(uuid.uuid4()) for _ in range(nobs)]\n", 52 | "dates1 = pd.date_range('1/1/2010','1/1/2011')\n", 53 | "\n", 54 | "df1 = pd.DataFrame(list(itertools.product(uuid1,dates1)),columns=['id','date'])\n", 55 | "df1['v']=np.random.sample(df1.shape[0])" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 2, 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "text/html": [ 66 | "
\n", 67 | "\n", 80 | "\n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | "
iddatev
026e41c83-630e-47c5-a410-83fd7865e8262010-01-010.589946
126e41c83-630e-47c5-a410-83fd7865e8262010-01-020.367214
366049676df-998a-4322-9121-84dac8b7547f2010-01-010.570425
367049676df-998a-4322-9121-84dac8b7547f2010-01-020.524693
732ad14d610-3a0b-4d87-8a29-236c9b6e817e2010-01-010.681610
733ad14d610-3a0b-4d87-8a29-236c9b6e817e2010-01-020.236658
\n", 128 | "
" 129 | ], 130 | "text/plain": [ 131 | " id date v\n", 132 | "0 26e41c83-630e-47c5-a410-83fd7865e826 2010-01-01 0.589946\n", 133 | "1 26e41c83-630e-47c5-a410-83fd7865e826 2010-01-02 0.367214\n", 134 | "366 049676df-998a-4322-9121-84dac8b7547f 2010-01-01 0.570425\n", 135 | "367 049676df-998a-4322-9121-84dac8b7547f 2010-01-02 0.524693\n", 136 | "732 ad14d610-3a0b-4d87-8a29-236c9b6e817e 2010-01-01 0.681610\n", 137 | "733 ad14d610-3a0b-4d87-8a29-236c9b6e817e 2010-01-02 0.236658" 138 | ] 139 | }, 140 | "execution_count": 2, 141 | "metadata": {}, 142 | "output_type": "execute_result" 143 | } 144 | ], 145 | "source": [ 146 | "df1.groupby(['id']).head(2).head(6)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "## Use Case: assert 100% join accuracy for data integrity checks \n", 154 | "\n", 155 | "In data enginerring QA you want to test that data is joined correctly. This is particularly useful for detecting potential data problems in production." 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 3, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "df2 = df1.copy()\n", 165 | "\n", 166 | "j = d6tjoin.Prejoin([df1,df2],['id','date'])\n", 167 | "assert j.is_all_matched() # succeeds\n", 168 | "assert j.is_all_matched('id') # succeeds\n", 169 | "assert j.is_all_matched('date') # succeeds\n" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "## Use Case: detect and analyze id mismatch \n", 177 | "\n", 178 | "When joining data from different sources, eg different vendors, often your ids don't match and then you need to manually analyze the situation. With databolt this becomes much easier." 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "### 100% id mismatch\n", 186 | "\n", 187 | "Let's look at an example where say vendor 1 uses a different id convention than vendor 2 and none of the ids match." 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 4, 193 | "metadata": {}, 194 | "outputs": [ 195 | { 196 | "name": "stdout", 197 | "output_type": "stream", 198 | "text": [ 199 | "assert fails!\n" 200 | ] 201 | } 202 | ], 203 | "source": [ 204 | "# create mismatch\n", 205 | "df2['id'] = df1['id'].str[1:-1]\n", 206 | "\n", 207 | "j = d6tjoin.Prejoin([df1,df2],['id','date'])\n", 208 | "\n", 209 | "try:\n", 210 | " assert j.is_all_matched() # fails\n", 211 | "except:\n", 212 | " print('assert fails!')" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "The QA check shows there's a problem, lets analyze the issue with `Prejoin.match_quality()`. We can immediately see that none of the ids match." 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 5, 225 | "metadata": {}, 226 | "outputs": [ 227 | { 228 | "name": "stdout", 229 | "output_type": "stream", 230 | "text": [ 231 | " key left key right all matched inner left right outer unmatched total unmatched left unmatched right\n", 232 | "0 id id False 0 10 10 20 20 10 10\n", 233 | "1 date date True 366 366 366 366 0 0 0\n", 234 | "2 __all__ __all__ False 0 3660 3660 7320 7320 3660 3660\n" 235 | ] 236 | } 237 | ], 238 | "source": [ 239 | "j.match_quality()" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": {}, 245 | "source": [ 246 | "Let's look at some of the mismatched records with `Prejoin.show_unmatched()`. 
Looks like there might be a length problem." 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 6, 252 | "metadata": {}, 253 | "outputs": [ 254 | { 255 | "name": "stdout", 256 | "output_type": "stream", 257 | "text": [ 258 | " id date v\n", 259 | "1098 b5c945ce-fdf6-4774-bbae-ff5f5787a7eb 2010-01-01 0.194907\n", 260 | "1099 b5c945ce-fdf6-4774-bbae-ff5f5787a7eb 2010-01-02 0.558549\n", 261 | "1100 b5c945ce-fdf6-4774-bbae-ff5f5787a7eb 2010-01-03 0.316138\n", 262 | " id date v\n", 263 | "0 6e41c83-630e-47c5-a410-83fd7865e82 2010-01-01 0.589946\n", 264 | "1 6e41c83-630e-47c5-a410-83fd7865e82 2010-01-02 0.367214\n", 265 | "2 6e41c83-630e-47c5-a410-83fd7865e82 2010-01-03 0.290587\n" 266 | ] 267 | } 268 | ], 269 | "source": [ 270 | "print(j.show_unmatched('id')['left'])\n", 271 | "print(j.show_unmatched('id')['right'])" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "We can show string length statistics using `d6tjoin.Prejoin().describe_str()` which confirms that the id string lenghts are different." 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 7, 284 | "metadata": { 285 | "scrolled": true 286 | }, 287 | "outputs": [ 288 | { 289 | "name": "stdout", 290 | "output_type": "stream", 291 | "text": [ 292 | "dataframe #0\n", 293 | " median min max nrecords\n", 294 | "id 36.0 36.0 36.0 3660.0\n", 295 | "dataframe #1\n", 296 | " median min max nrecords\n", 297 | "id 34.0 34.0 34.0 3660.0\n", 298 | "None\n" 299 | ] 300 | } 301 | ], 302 | "source": [ 303 | "print(j.describe_str())\n" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": [ 310 | "### Partial id mismatch\n", 311 | "\n", 312 | "Let's look at another example where there is a partial mismatch. In this case let's say vendor 2 only has a certain percentage of ids covered." 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 8, 318 | "metadata": { 319 | "scrolled": true 320 | }, 321 | "outputs": [ 322 | { 323 | "name": "stdout", 324 | "output_type": "stream", 325 | "text": [ 326 | "assert fails!\n" 327 | ] 328 | } 329 | ], 330 | "source": [ 331 | "# create partial mismatch\n", 332 | "uuid_sel = np.array(uuid1)[np.random.choice(nobs, nobs//5, replace=False)].tolist()\n", 333 | "df2 = df1[~df1['id'].isin(uuid_sel)]\n", 334 | "\n", 335 | "j = d6tjoin.Prejoin([df1,df2],['id','date'])\n", 336 | "\n", 337 | "try:\n", 338 | " assert j.is_all_matched() # fails\n", 339 | "except:\n", 340 | " print('assert fails!')" 341 | ] 342 | }, 343 | { 344 | "cell_type": "markdown", 345 | "metadata": {}, 346 | "source": [ 347 | "Again we've quickly identified a problem. This would typically cause you to do manual and tedious manual QA work but with `Prejoin().match_quality()` you can quickly see how many ids were mismatched." 
348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 9, 353 | "metadata": { 354 | "scrolled": true 355 | }, 356 | "outputs": [ 357 | { 358 | "name": "stdout", 359 | "output_type": "stream", 360 | "text": [ 361 | " key left key right all matched inner left right outer unmatched total unmatched left unmatched right\n", 362 | "0 id id False 8 10 8 10 2 2 0\n", 363 | "1 date date True 366 366 366 366 0 0 0\n", 364 | "2 __all__ __all__ False 2928 3660 2928 3660 732 732 0\n" 365 | ] 366 | } 367 | ], 368 | "source": [ 369 | "j.match_quality()" 370 | ] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "metadata": {}, 375 | "source": [ 376 | "## Use Case: detect and analyze date mismatch \n", 377 | "\n", 378 | "Dates are another common sources of frustration for data engineers working with time series data. Dates come in a variety of different formats and conventions. Let's use databolt to analyze a date mismatch situation." 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 10, 384 | "metadata": {}, 385 | "outputs": [], 386 | "source": [ 387 | "dates2 = pd.bdate_range('1/1/2010','1/1/2011') # business instead of calendar dates\n", 388 | "df2 = pd.DataFrame(list(itertools.product(uuid1,dates2)),columns=['id','date'])\n", 389 | "df2['v']=np.random.sample(df2.shape[0])" 390 | ] 391 | }, 392 | { 393 | "cell_type": "markdown", 394 | "metadata": {}, 395 | "source": [ 396 | "To highlight some different functionality for `Prejoin().match_quality()`. The QA test for all matches fails." 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": 11, 402 | "metadata": {}, 403 | "outputs": [ 404 | { 405 | "name": "stdout", 406 | "output_type": "stream", 407 | "text": [ 408 | " key left key right all matched inner left right outer unmatched total unmatched left unmatched right\n", 409 | "0 id id True 10 10 10 10 0 0 0\n", 410 | "1 date date False 261 366 261 366 105 105 0\n", 411 | "2 __all__ __all__ False 2610 3660 2610 3660 1050 1050 0\n", 412 | "assert fails!\n" 413 | ] 414 | } 415 | ], 416 | "source": [ 417 | "j = d6tjoin.Prejoin([df1,df2],['id','date'])\n", 418 | "dfr = j.match_quality()\n", 419 | "try:\n", 420 | " assert dfr['all matched'].all() # fails\n", 421 | "except:\n", 422 | " print('assert fails!')" 423 | ] 424 | }, 425 | { 426 | "cell_type": "markdown", 427 | "metadata": {}, 428 | "source": [ 429 | "We can look at the dataframe to see 105 dates are not matched." 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": 12, 435 | "metadata": {}, 436 | "outputs": [], 437 | "source": [ 438 | "dfr" 439 | ] 440 | }, 441 | { 442 | "cell_type": "markdown", 443 | "metadata": {}, 444 | "source": [ 445 | "We can look at mismatched records using `Prejoin.show_unmatched()`. Here we will return all mismatched records into a dataframe you can analyze." 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": 13, 451 | "metadata": {}, 452 | "outputs": [], 453 | "source": [ 454 | "dft = j.show_unmatched('date',keys_only=False,nrecords=-1,nrows=-1)['left']" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": 14, 460 | "metadata": {}, 461 | "outputs": [ 462 | { 463 | "data": { 464 | "text/html": [ 465 | "
\n", 466 | "\n", 479 | "\n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | "
iddatev
126e41c83-630e-47c5-a410-83fd7865e8262010-01-020.367214
226e41c83-630e-47c5-a410-83fd7865e8262010-01-030.290587
826e41c83-630e-47c5-a410-83fd7865e8262010-01-090.663732
926e41c83-630e-47c5-a410-83fd7865e8262010-01-100.210751
1526e41c83-630e-47c5-a410-83fd7865e8262010-01-160.889254
\n", 521 | "
" 522 | ], 523 | "text/plain": [ 524 | " id date v\n", 525 | "1 26e41c83-630e-47c5-a410-83fd7865e826 2010-01-02 0.367214\n", 526 | "2 26e41c83-630e-47c5-a410-83fd7865e826 2010-01-03 0.290587\n", 527 | "8 26e41c83-630e-47c5-a410-83fd7865e826 2010-01-09 0.663732\n", 528 | "9 26e41c83-630e-47c5-a410-83fd7865e826 2010-01-10 0.210751\n", 529 | "15 26e41c83-630e-47c5-a410-83fd7865e826 2010-01-16 0.889254" 530 | ] 531 | }, 532 | "execution_count": 14, 533 | "metadata": {}, 534 | "output_type": "execute_result" 535 | } 536 | ], 537 | "source": [ 538 | "dft.head()" 539 | ] 540 | }, 541 | { 542 | "cell_type": "markdown", 543 | "metadata": {}, 544 | "source": [ 545 | "Looking at the weekdays of the mismatched entries, you can see they are all weekends. " 546 | ] 547 | }, 548 | { 549 | "cell_type": "code", 550 | "execution_count": 15, 551 | "metadata": {}, 552 | "outputs": [ 553 | { 554 | "data": { 555 | "text/plain": [ 556 | "array([5, 6])" 557 | ] 558 | }, 559 | "execution_count": 15, 560 | "metadata": {}, 561 | "output_type": "execute_result" 562 | } 563 | ], 564 | "source": [ 565 | "dft['date_wkday']=dft['date'].dt.weekday\n", 566 | "dft['date_wkday'].unique()" 567 | ] 568 | }, 569 | { 570 | "cell_type": "markdown", 571 | "metadata": {}, 572 | "source": [ 573 | "## Conclusion\n", 574 | "\n", 575 | "Joining datasets from different sources can be a big time waster for data engineers! With databolt you can quickly do join QA and analyze problems without doing manual tedious work." 576 | ] 577 | }, 578 | { 579 | "cell_type": "code", 580 | "execution_count": null, 581 | "metadata": {}, 582 | "outputs": [], 583 | "source": [] 584 | } 585 | ], 586 | "metadata": { 587 | "kernelspec": { 588 | "display_name": "Python 3", 589 | "language": "python", 590 | "name": "python3" 591 | }, 592 | "language_info": { 593 | "codemirror_mode": { 594 | "name": "ipython", 595 | "version": 3 596 | }, 597 | "file_extension": ".py", 598 | "mimetype": "text/x-python", 599 | "name": "python", 600 | "nbconvert_exporter": "python", 601 | "pygments_lexer": "ipython3", 602 | "version": "3.7.6" 603 | } 604 | }, 605 | "nbformat": 4, 606 | "nbformat_minor": 2 607 | } 608 | -------------------------------------------------------------------------------- /examples-tokencluster.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Engineering in Python with databolt - Find Token Clusters for Fuzzy Merging Identifiers (d6tlib/d6tjoin.utils)\n", 8 | "\n", 9 | "## Introduction\n", 10 | "\n", 11 | "Identifiers such as securities IDs often come in different conventions which makes joining them difficult. Normal joins don't work and fuzzy joins often get tripped up by commonly occuring tokens. \n", 12 | "\n", 13 | "In this notebook we will show how to use `d6tstack.utils.tokenCount` to find clusters of tokens and match on tokens." 
14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 1, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "import d6tjoin.utils\n", 23 | "import d6tjoin.top1\n", 24 | "import pandas as pd\n", 25 | "pd.set_option('display.expand_frame_repr', False)\n", 26 | "import numpy as np" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "# data is tickers from two different vendors which try to join\n", 36 | "df1 = pd.DataFrame({'id':[\"AAP\",\"AAPL\",\"APRN\",\"AMZN-AMZN\",\"BBW\",\"NMG\",\"JLP\"]})\n", 37 | "df2 = pd.DataFrame({'id':[\"AAP_US_Equity\",\"AAPL_US_Equity\",\"AMZN_US_Equity\",\"APRN_US_Equity\",\"AD_NA_Equity\",\"BBY_US_Equity\",\"BMW_NA_Equity\",\"PRIVATE_NMG\",\"PRIVATE_JLP\"]})\n" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 3, 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "name": "stdout", 47 | "output_type": "stream", 48 | "text": [ 49 | " key left key right all matched inner left right outer unmatched total unmatched left unmatched right\n", 50 | "0 id id False 0 7 9 16 16 7 9\n", 51 | "1 __all__ __all__ False 0 7 9 16 16 7 9\n" 52 | ] 53 | } 54 | ], 55 | "source": [ 56 | "# d6tjoin.Prejoin() shows none of the ids match\n", 57 | "\n", 58 | "d6tjoin.Prejoin([df1,df2],['id']).match_quality()" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 4, 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "name": "stdout", 68 | "output_type": "stream", 69 | "text": [ 70 | " key left key right all matched inner left right outer unmatched total unmatched left unmatched right\n", 71 | "0 id_cleaned id_cleaned False 4 7 8 11 7 3 4\n", 72 | "1 __all__ __all__ False 4 7 8 11 7 3 4\n" 73 | ] 74 | } 75 | ], 76 | "source": [ 77 | "# attempt to join manually, better but still missing a few\n", 78 | "\n", 79 | "df1['id_cleaned'] = df1['id'].str.split('-').str[0]\n", 80 | "df2['id_cleaned'] = df2['id'].str.split('_').str[0]\n", 81 | "\n", 82 | "d6tjoin.Prejoin([df1,df2],['id_cleaned']).match_quality()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "## Fuzzy joins get confused by tokens\n", 90 | "\n", 91 | "Fuzzy joins to the rescue? Unfortunately, the presence of commonly occuring string tokens is messing with the string similarity functions." 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 5, 97 | "metadata": {}, 98 | "outputs": [ 99 | { 100 | "name": "stderr", 101 | "output_type": "stream", 102 | "text": [ 103 | "/Users/haijing/Haijing/ds_project/d6t/d6tjoin/d6tjoin/top1.py:118: UserWarning: Top1 join for id has duplicates\n", 104 | " warnings.warn('Top1 join for %s has duplicates' %self.cfg_fuzzy_left_on)\n" 105 | ] 106 | }, 107 | { 108 | "data": { 109 | "text/html": [ 110 | "
\n", 111 | "\n", 124 | "\n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | "
__top1left____top1right____matchtype____top1diff__
40AAPPRIVATE_JLPtop1 left9
58AAPLPRIVATE_JLPtop1 left9
27AMZN-AMZNPRIVATE_NMGtop1 left10
30AMZN-AMZNAD_NA_Equitytop1 left10
34AMZN-AMZNAMZN_US_Equitytop1 left10
9APRNPRIVATE_NMGtop1 left9
0BBWPRIVATE_NMGtop1 left11
1BBWBBY_US_Equitytop1 left11
4BBWPRIVATE_JLPtop1 left11
5BBWBMW_NA_Equitytop1 left11
22JLPPRIVATE_JLPtop1 left8
45NMGPRIVATE_NMGtop1 left8
\n", 221 | "
" 222 | ], 223 | "text/plain": [ 224 | " __top1left__ __top1right__ __matchtype__ __top1diff__\n", 225 | "40 AAP PRIVATE_JLP top1 left 9\n", 226 | "58 AAPL PRIVATE_JLP top1 left 9\n", 227 | "27 AMZN-AMZN PRIVATE_NMG top1 left 10\n", 228 | "30 AMZN-AMZN AD_NA_Equity top1 left 10\n", 229 | "34 AMZN-AMZN AMZN_US_Equity top1 left 10\n", 230 | "9 APRN PRIVATE_NMG top1 left 9\n", 231 | "0 BBW PRIVATE_NMG top1 left 11\n", 232 | "1 BBW BBY_US_Equity top1 left 11\n", 233 | "4 BBW PRIVATE_JLP top1 left 11\n", 234 | "5 BBW BMW_NA_Equity top1 left 11\n", 235 | "22 JLP PRIVATE_JLP top1 left 8\n", 236 | "45 NMG PRIVATE_NMG top1 left 8" 237 | ] 238 | }, 239 | "execution_count": 5, 240 | "metadata": {}, 241 | "output_type": "execute_result" 242 | } 243 | ], 244 | "source": [ 245 | "# attempt a fuzzy join using edit distance => not looking good\n", 246 | "d6tjoin.top1.MergeTop1(df1,df2,fuzzy_left_on=['id'],fuzzy_right_on=['id']).merge()['top1']['id']" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 6, 252 | "metadata": {}, 253 | "outputs": [ 254 | { 255 | "name": "stderr", 256 | "output_type": "stream", 257 | "text": [ 258 | "/Users/haijing/Haijing/ds_project/d6t/d6tjoin/d6tjoin/top1.py:118: UserWarning: Top1 join for id has duplicates\n", 259 | " warnings.warn('Top1 join for %s has duplicates' %self.cfg_fuzzy_left_on)\n" 260 | ] 261 | }, 262 | { 263 | "data": { 264 | "text/html": [ 265 | "
\n", 266 | "\n", 279 | "\n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | "
__top1left____top1right____matchtype____top1diff__
42AAPAAP_US_Equitytop1 left13.000
56AAPLAAPL_US_Equitytop1 left14.000
34AMZN-AMZNAMZN_US_Equitytop1 left64.625
17APRNAPRN_US_Equitytop1 left14.000
1BBWBBY_US_Equitytop1 left23.000
5BBWBMW_NA_Equitytop1 left23.000
24JLPAAP_US_Equitytop1 left33.000
50NMGBMW_NA_Equitytop1 left33.000
\n", 348 | "
" 349 | ], 350 | "text/plain": [ 351 | " __top1left__ __top1right__ __matchtype__ __top1diff__\n", 352 | "42 AAP AAP_US_Equity top1 left 13.000\n", 353 | "56 AAPL AAPL_US_Equity top1 left 14.000\n", 354 | "34 AMZN-AMZN AMZN_US_Equity top1 left 64.625\n", 355 | "17 APRN APRN_US_Equity top1 left 14.000\n", 356 | "1 BBW BBY_US_Equity top1 left 23.000\n", 357 | "5 BBW BMW_NA_Equity top1 left 23.000\n", 358 | "24 JLP AAP_US_Equity top1 left 33.000\n", 359 | "50 NMG BMW_NA_Equity top1 left 33.000" 360 | ] 361 | }, 362 | "execution_count": 6, 363 | "metadata": {}, 364 | "output_type": "execute_result" 365 | } 366 | ], 367 | "source": [ 368 | "# attempt a fuzzy join using affine gap distance => not looking good\n", 369 | "import affinegap\n", 370 | "d6tjoin.top1.MergeTop1(df1,df2,fuzzy_left_on=['id'],fuzzy_right_on=['id'], fun_diff=[affinegap.affineGapDistance]).merge()['top1']['id']" 371 | ] 372 | }, 373 | { 374 | "cell_type": "markdown", 375 | "metadata": {}, 376 | "source": [ 377 | "## Token-based clustering\n", 378 | "\n", 379 | "With `d6tjoin.utils.splitcharTokenCount` you can quickly split the ids into tokens to find commonly occuring substrings. You can then use that knowledge to join the data." 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 7, 385 | "metadata": {}, 386 | "outputs": [ 387 | { 388 | "name": "stdout", 389 | "output_type": "stream", 390 | "text": [ 391 | "*** token counts ***\n", 392 | " word count\n", 393 | "0 Equity 7\n", 394 | "1 US 5\n", 395 | "2 NA 2\n", 396 | "3 PRIVATE 2\n", 397 | "\n", 398 | " *** token occurance ***\n", 399 | "[('Equity', ['AAPL_US_Equity', 'AAP_US_Equity', 'AD_NA_Equity', 'AMZN_US_Equity', 'APRN_US_Equity', 'BBY_US_Equity', 'BMW_NA_Equity']), ('US', ['AAPL_US_Equity', 'AAP_US_Equity', 'AMZN_US_Equity', 'APRN_US_Equity', 'BBY_US_Equity']), ('NA', ['AD_NA_Equity', 'BMW_NA_Equity']), ('PRIVATE', ['PRIVATE_JLP', 'PRIVATE_NMG'])]\n" 400 | ] 401 | } 402 | ], 403 | "source": [ 404 | "dftoken=d6tjoin.utils.splitcharTokenCount(df2['id'])\n", 405 | "print('*** token counts ***')\n", 406 | "print(dftoken)\n", 407 | "print('\\n *** token occurance ***')\n", 408 | "print(d6tjoin.utils.unique_contains(df2['id'], dftoken['word'].values))\n" 409 | ] 410 | }, 411 | { 412 | "cell_type": "markdown", 413 | "metadata": {}, 414 | "source": [ 415 | "## Token-based joins\n", 416 | "\n", 417 | "Based on the analysis above, we want to join pairs which have at least 1 common token. It's easy to define a function which computes that and pass that to `d6tjoin.top1.MergeTop1()` to get a good join." 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": 8, 423 | "metadata": {}, 424 | "outputs": [ 425 | { 426 | "data": { 427 | "text/html": [ 428 | "
\n", 429 | "\n", 442 | "\n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | "
__top1left____top1right____matchtype____top1diff__
42AAPAAP_US_Equitytop1 left2
56AAPLAAPL_US_Equitytop1 left2
34AMZN-AMZNAMZN_US_Equitytop1 left2
17APRNAPRN_US_Equitytop1 left2
22JLPPRIVATE_JLPtop1 left2
45NMGPRIVATE_NMGtop1 left2
\n", 497 | "
" 498 | ], 499 | "text/plain": [ 500 | " __top1left__ __top1right__ __matchtype__ __top1diff__\n", 501 | "42 AAP AAP_US_Equity top1 left 2\n", 502 | "56 AAPL AAPL_US_Equity top1 left 2\n", 503 | "34 AMZN-AMZN AMZN_US_Equity top1 left 2\n", 504 | "17 APRN APRN_US_Equity top1 left 2\n", 505 | "22 JLP PRIVATE_JLP top1 left 2\n", 506 | "45 NMG PRIVATE_NMG top1 left 2" 507 | ] 508 | }, 509 | "execution_count": 8, 510 | "metadata": {}, 511 | "output_type": "execute_result" 512 | } 513 | ], 514 | "source": [ 515 | "import re\n", 516 | "splitchars=\"[^a-zA-Z0-9]+\"\n", 517 | "def tokenmatch(s1,s2):\n", 518 | " s1=set(re.split(splitchars,s1))\n", 519 | " s2=set(re.split(splitchars,s2))\n", 520 | " return 3-len(s1 & s2)\n", 521 | "\n", 522 | "d6tjoin.top1.MergeTop1(df1,df2,fuzzy_left_on=['id'],fuzzy_right_on=['id'], fun_diff=[tokenmatch], top_limit=[2]).merge()['top1']['id']\n" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": 9, 528 | "metadata": {}, 529 | "outputs": [ 530 | { 531 | "name": "stderr", 532 | "output_type": "stream", 533 | "text": [ 534 | "/Users/haijing/Haijing/ds_project/d6t/d6tjoin/d6tjoin/top1.py:118: UserWarning: Top1 join for id has duplicates\n", 535 | " warnings.warn('Top1 join for %s has duplicates' %self.cfg_fuzzy_left_on)\n" 536 | ] 537 | }, 538 | { 539 | "data": { 540 | "text/html": [ 541 | "
\n", 542 | "\n", 555 | "\n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | "
__top1left____top1right____matchtype____top1diff__
42AAPAAP_US_Equitytop1 left2
56AAPLAAPL_US_Equitytop1 left2
34AMZN-AMZNAMZN_US_Equitytop1 left2
17APRNAPRN_US_Equitytop1 left2
0BBWPRIVATE_NMGtop1 left3
1BBWBBY_US_Equitytop1 left3
2BBWAAPL_US_Equitytop1 left3
3BBWAD_NA_Equitytop1 left3
4BBWPRIVATE_JLPtop1 left3
5BBWBMW_NA_Equitytop1 left3
6BBWAAP_US_Equitytop1 left3
7BBWAMZN_US_Equitytop1 left3
8BBWAPRN_US_Equitytop1 left3
22JLPPRIVATE_JLPtop1 left2
45NMGPRIVATE_NMGtop1 left2
\n", 673 | "
" 674 | ], 675 | "text/plain": [ 676 | " __top1left__ __top1right__ __matchtype__ __top1diff__\n", 677 | "42 AAP AAP_US_Equity top1 left 2\n", 678 | "56 AAPL AAPL_US_Equity top1 left 2\n", 679 | "34 AMZN-AMZN AMZN_US_Equity top1 left 2\n", 680 | "17 APRN APRN_US_Equity top1 left 2\n", 681 | "0 BBW PRIVATE_NMG top1 left 3\n", 682 | "1 BBW BBY_US_Equity top1 left 3\n", 683 | "2 BBW AAPL_US_Equity top1 left 3\n", 684 | "3 BBW AD_NA_Equity top1 left 3\n", 685 | "4 BBW PRIVATE_JLP top1 left 3\n", 686 | "5 BBW BMW_NA_Equity top1 left 3\n", 687 | "6 BBW AAP_US_Equity top1 left 3\n", 688 | "7 BBW AMZN_US_Equity top1 left 3\n", 689 | "8 BBW APRN_US_Equity top1 left 3\n", 690 | "22 JLP PRIVATE_JLP top1 left 2\n", 691 | "45 NMG PRIVATE_NMG top1 left 2" 692 | ] 693 | }, 694 | "execution_count": 9, 695 | "metadata": {}, 696 | "output_type": "execute_result" 697 | } 698 | ], 699 | "source": [ 700 | "# note that we applied top_limit=[2], meaning strings should have at most 2 tokens mismatched, to exclude bad matches for BBW\n", 701 | "d6tjoin.top1.MergeTop1(df1,df2,fuzzy_left_on=['id'],fuzzy_right_on=['id'], fun_diff=[tokenmatch]).merge()['top1']['id']\n" 702 | ] 703 | }, 704 | { 705 | "cell_type": "code", 706 | "execution_count": null, 707 | "metadata": {}, 708 | "outputs": [], 709 | "source": [] 710 | } 711 | ], 712 | "metadata": { 713 | "kernelspec": { 714 | "display_name": "Python 3", 715 | "language": "python", 716 | "name": "python3" 717 | }, 718 | "language_info": { 719 | "codemirror_mode": { 720 | "name": "ipython", 721 | "version": 3 722 | }, 723 | "file_extension": ".py", 724 | "mimetype": "text/x-python", 725 | "name": "python", 726 | "nbconvert_exporter": "python", 727 | "pygments_lexer": "ipython3", 728 | "version": "3.7.6" 729 | } 730 | }, 731 | "nbformat": 4, 732 | "nbformat_minor": 2 733 | } 734 | -------------------------------------------------------------------------------- /examples-top1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Engineering in Python with databolt - Fuzzy Joins (d6tlib/d6tjoin.utils)\n", 8 | "\n", 9 | "## Introduction\n", 10 | "\n", 11 | "Joining datasets is a common data engineering operation. However, often there are problems merging datasets from different sources because of mismatched identifiers, date conventions etc. \n", 12 | "\n", 13 | "** `d6tjoin.top1` module allows you to quickly join datasets even if they don't perfectly match. **\n", 14 | "Easily join different datasets without writing custom code. Does fuzzy top1 similarity joins for strings, dates and numbers, for example you can quickly join similar but not identical stock tickers, addresses, names without manual processing. It will find the top 1 matched entry from the right dataframe to join onto the left dataframe.\n", 15 | "\n", 16 | "Here are some examples which show you how to:\n", 17 | "1. join on mismatched identifiers\n", 18 | "2. join on calendar vs business dates\n", 19 | "3. join on both mismatched dates and identifiers" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 1, 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "data": { 29 | "text/html": [ 30 | "
\n", 31 | "\n", 44 | "\n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | "
      date          id          v
0     2010-01-01    e3e70682    0.393
1     2010-01-01    f728b4fa    0.837
2     2010-01-01    eb1167b3    0.389
3     2010-01-01    f7c1bd87    0.555
4     2010-01-01    e443df78    0.886
\n", 86 | "
" 87 | ], 88 | "text/plain": [ 89 | " date id v\n", 90 | "0 2010-01-01 e3e70682 0.393\n", 91 | "1 2010-01-01 f728b4fa 0.837\n", 92 | "2 2010-01-01 eb1167b3 0.389\n", 93 | "3 2010-01-01 f7c1bd87 0.555\n", 94 | "4 2010-01-01 e443df78 0.886" 95 | ] 96 | }, 97 | "execution_count": 1, 98 | "metadata": {}, 99 | "output_type": "execute_result" 100 | } 101 | ], 102 | "source": [ 103 | "import pandas as pd\n", 104 | "import numpy as np\n", 105 | "import itertools\n", 106 | "from faker import Faker\n", 107 | "import importlib\n", 108 | "\n", 109 | "import d6tjoin.top1\n", 110 | "importlib.reload(d6tjoin.top1)\n", 111 | "import d6tjoin.utils\n", 112 | "\n", 113 | "# *******************************************************\n", 114 | "# generate sample time series data with id and value\n", 115 | "# *******************************************************\n", 116 | "nobs = 10\n", 117 | "f1 = Faker()\n", 118 | "Faker.seed(0)\n", 119 | "uuid1 = [str(f1.uuid4()).split('-')[0] for _ in range(nobs)]\n", 120 | "dates1 = pd.date_range('1/1/2010','1/1/2011')\n", 121 | "\n", 122 | "df1 = pd.DataFrame(list(itertools.product(dates1,uuid1)),columns=['date','id'])\n", 123 | "df1['v']=np.round(np.random.sample(df1.shape[0]),3)\n", 124 | "df1.head()" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "# Example 1: join datasets on misalgined ids\n", 132 | "\n", 133 | "When joining data from different sources, eg different vendors, often your ids don't match perfect and then you need to manually analyze the situation. With databolt this becomes much easier.\n", 134 | "\n", 135 | "Let's create another dataset where the `id` is slightly different." 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 2, 141 | "metadata": {}, 142 | "outputs": [ 143 | { 144 | "data": { 145 | "text/html": [ 146 | "
\n", 147 | "\n", 160 | "\n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | "
      date          id        v
0     2010-01-01    3e7068    0.393
1     2010-01-01    728b4f    0.837
2     2010-01-01    b1167b    0.389
3     2010-01-01    7c1bd8    0.555
4     2010-01-01    443df7    0.886
\n", 202 | "
" 203 | ], 204 | "text/plain": [ 205 | " date id v\n", 206 | "0 2010-01-01 3e7068 0.393\n", 207 | "1 2010-01-01 728b4f 0.837\n", 208 | "2 2010-01-01 b1167b 0.389\n", 209 | "3 2010-01-01 7c1bd8 0.555\n", 210 | "4 2010-01-01 443df7 0.886" 211 | ] 212 | }, 213 | "execution_count": 2, 214 | "metadata": {}, 215 | "output_type": "execute_result" 216 | } 217 | ], 218 | "source": [ 219 | "# create mismatch\n", 220 | "df2 = df1.copy()\n", 221 | "df2['id'] = df1['id'].str[1:-1]\n", 222 | "df2.head()" 223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "metadata": {}, 228 | "source": [ 229 | "`d6tjoin.Prejoin.match_quality()` shows you there is none of `id` match so a normal join won't work well." 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 3, 235 | "metadata": {}, 236 | "outputs": [ 237 | { 238 | "name": "stdout", 239 | "output_type": "stream", 240 | "text": [ 241 | " key left key right all matched inner left right outer unmatched total unmatched left unmatched right\n", 242 | "0 id id False 0 10 10 20 20 10 10\n", 243 | "1 date date True 366 366 366 366 0 0 0\n", 244 | "2 __all__ __all__ False 0 3660 3660 7320 7320 3660 3660\n" 245 | ] 246 | } 247 | ], 248 | "source": [ 249 | "d6tjoin.Prejoin([df1,df2],['id','date']).match_quality()" 250 | ] 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "metadata": {}, 255 | "source": [ 256 | "Using `d6tjoin.top1.MergeTop1()` you can quickly merge this dataset without having to do any manual processing. It will find the closest matching id using the Levenstein string similarity metric. We want to look at the closest id by date so we will pass in date as an exact match key." 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 4, 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "result = d6tjoin.top1.MergeTop1(df1.head(),df2,fuzzy_left_on=['id'],fuzzy_right_on=['id'],exact_left_on=['date'],exact_right_on=['date']).merge()" 266 | ] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "metadata": {}, 271 | "source": [ 272 | "Lets check what matches it found. Looking at the top1 match table, it shows the closest string with only 2 character difference in id, meaning it found the correct substring. " 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 5, 278 | "metadata": {}, 279 | "outputs": [ 280 | { 281 | "data": { 282 | "text/html": [ 283 | "
\n", 284 | "\n", 297 | "\n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | "
      date          __top1left__    __top1right__    __top1diff__    __matchtype__
10    2010-01-01    e3e70682        3e7068           2               top1 left
34    2010-01-01    e443df78        443df7           2               top1 left
42    2010-01-01    eb1167b3        b1167b           2               top1 left
21    2010-01-01    f728b4fa        728b4f           2               top1 left
3     2010-01-01    f7c1bd87        7c1bd8           2               top1 left
\n", 351 | "
" 352 | ], 353 | "text/plain": [ 354 | " date __top1left__ __top1right__ __top1diff__ __matchtype__\n", 355 | "10 2010-01-01 e3e70682 3e7068 2 top1 left\n", 356 | "34 2010-01-01 e443df78 443df7 2 top1 left\n", 357 | "42 2010-01-01 eb1167b3 b1167b 2 top1 left\n", 358 | "21 2010-01-01 f728b4fa 728b4f 2 top1 left\n", 359 | "3 2010-01-01 f7c1bd87 7c1bd8 2 top1 left" 360 | ] 361 | }, 362 | "execution_count": 5, 363 | "metadata": {}, 364 | "output_type": "execute_result" 365 | } 366 | ], 367 | "source": [ 368 | "result['top1']['id']" 369 | ] 370 | }, 371 | { 372 | "cell_type": "markdown", 373 | "metadata": {}, 374 | "source": [ 375 | "Since the match results look good, you can use the merged dataset." 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": 6, 381 | "metadata": {}, 382 | "outputs": [ 383 | { 384 | "data": { 385 | "text/html": [ 386 | "
\n", 387 | "\n", 400 | "\n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | "
      date          id          v        id_right    v_right
0     2010-01-01    e3e70682    0.393    3e7068      0.393
1     2010-01-01    f728b4fa    0.837    728b4f      0.837
2     2010-01-01    eb1167b3    0.389    b1167b      0.389
3     2010-01-01    f7c1bd87    0.555    7c1bd8      0.555
4     2010-01-01    e443df78    0.886    443df7      0.886
\n", 454 | "
" 455 | ], 456 | "text/plain": [ 457 | " date id v id_right v_right\n", 458 | "0 2010-01-01 e3e70682 0.393 3e7068 0.393\n", 459 | "1 2010-01-01 f728b4fa 0.837 728b4f 0.837\n", 460 | "2 2010-01-01 eb1167b3 0.389 b1167b 0.389\n", 461 | "3 2010-01-01 f7c1bd87 0.555 7c1bd8 0.555\n", 462 | "4 2010-01-01 e443df78 0.886 443df7 0.886" 463 | ] 464 | }, 465 | "execution_count": 6, 466 | "metadata": {}, 467 | "output_type": "execute_result" 468 | } 469 | ], 470 | "source": [ 471 | "result['merged'].head()" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": 7, 477 | "metadata": {}, 478 | "outputs": [], 479 | "source": [ 480 | "assert not result['duplicates']" 481 | ] 482 | }, 483 | { 484 | "cell_type": "markdown", 485 | "metadata": {}, 486 | "source": [ 487 | "# Example 2: join 2 datasets with misalgined dates\n", 488 | "\n", 489 | "As another example, instead of the ids not matching, lets look at an example where the dates don't match. We will look at calendar vs business month end dates." 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": 8, 495 | "metadata": {}, 496 | "outputs": [], 497 | "source": [ 498 | "dates2 = pd.bdate_range('1/1/2010','1/1/2011') # business instead of calendar dates\n", 499 | "df2 = pd.DataFrame(list(itertools.product(dates2,uuid1)),columns=['date','id'])\n", 500 | "df2['v']=np.round(np.random.sample(df2.shape[0]),3)" 501 | ] 502 | }, 503 | { 504 | "cell_type": "markdown", 505 | "metadata": {}, 506 | "source": [ 507 | "`d6tjoin.Prejoin()` shows some but not all of the dates match. All the ids match." 508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": 9, 513 | "metadata": {}, 514 | "outputs": [ 515 | { 516 | "name": "stdout", 517 | "output_type": "stream", 518 | "text": [ 519 | " key left key right all matched inner left right outer unmatched total unmatched left unmatched right\n", 520 | "0 id id True 10 10 10 10 0 0 0\n", 521 | "1 date date False 261 366 261 366 105 105 0\n", 522 | "2 __all__ __all__ False 2610 3660 2610 3660 1050 1050 0\n" 523 | ] 524 | } 525 | ], 526 | "source": [ 527 | "d6tjoin.Prejoin([df1,df2],['id','date']).match_quality()" 528 | ] 529 | }, 530 | { 531 | "cell_type": "markdown", 532 | "metadata": {}, 533 | "source": [ 534 | "So we want to do a fuzzy match on dates but have the id match perfectly." 535 | ] 536 | }, 537 | { 538 | "cell_type": "code", 539 | "execution_count": 10, 540 | "metadata": {}, 541 | "outputs": [], 542 | "source": [ 543 | "result = d6tjoin.top1.MergeTop1(df1,df2,fuzzy_left_on=['date'],fuzzy_right_on=['date'],exact_left_on=['id'],exact_right_on=['id']).merge()" 544 | ] 545 | }, 546 | { 547 | "cell_type": "markdown", 548 | "metadata": {}, 549 | "source": [ 550 | "Again lets check if the fuzzy matches are correct. If either matches or is off by a day most, looks good!" 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": 11, 556 | "metadata": {}, 557 | "outputs": [ 558 | { 559 | "data": { 560 | "text/html": [ 561 | "
\n", 562 | "\n", 575 | "\n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | "
     id          __top1left__    __top1right__    __top1diff__    __matchtype__
0    1846d424    2010-01-01      2010-01-01       0 days          exact
1    eb1167b3    2010-01-01      2010-01-01       0 days          exact
2    e443df78    2010-01-01      2010-01-01       0 days          exact
\n", 613 | "
" 614 | ], 615 | "text/plain": [ 616 | " id __top1left__ __top1right__ __top1diff__ __matchtype__\n", 617 | "0 1846d424 2010-01-01 2010-01-01 0 days exact\n", 618 | "1 eb1167b3 2010-01-01 2010-01-01 0 days exact\n", 619 | "2 e443df78 2010-01-01 2010-01-01 0 days exact" 620 | ] 621 | }, 622 | "execution_count": 11, 623 | "metadata": {}, 624 | "output_type": "execute_result" 625 | } 626 | ], 627 | "source": [ 628 | "result['top1']['date'].head(3)" 629 | ] 630 | }, 631 | { 632 | "cell_type": "code", 633 | "execution_count": 12, 634 | "metadata": {}, 635 | "outputs": [ 636 | { 637 | "data": { 638 | "text/html": [ 639 | "
\n", 640 | "\n", 653 | "\n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | "
        id          __top1left__    __top1right__    __top1diff__    __matchtype__
3657    1846d424    2011-01-01      2010-12-31       1 days          top1 left
3658    f7c1bd87    2011-01-01      2010-12-31       1 days          top1 left
3659    fcbd04c3    2011-01-01      2010-12-31       1 days          top1 left
\n", 691 | "
" 692 | ], 693 | "text/plain": [ 694 | " id __top1left__ __top1right__ __top1diff__ __matchtype__\n", 695 | "3657 1846d424 2011-01-01 2010-12-31 1 days top1 left\n", 696 | "3658 f7c1bd87 2011-01-01 2010-12-31 1 days top1 left\n", 697 | "3659 fcbd04c3 2011-01-01 2010-12-31 1 days top1 left" 698 | ] 699 | }, 700 | "execution_count": 12, 701 | "metadata": {}, 702 | "output_type": "execute_result" 703 | } 704 | ], 705 | "source": [ 706 | "result['top1']['date'].tail(3)" 707 | ] 708 | }, 709 | { 710 | "cell_type": "code", 711 | "execution_count": 13, 712 | "metadata": {}, 713 | "outputs": [ 714 | { 715 | "data": { 716 | "text/plain": [ 717 | "Timedelta('1 days 00:00:00')" 718 | ] 719 | }, 720 | "execution_count": 13, 721 | "metadata": {}, 722 | "output_type": "execute_result" 723 | } 724 | ], 725 | "source": [ 726 | "result['top1']['date']['__top1diff__'].max()" 727 | ] 728 | }, 729 | { 730 | "cell_type": "markdown", 731 | "metadata": {}, 732 | "source": [ 733 | "Again with very little effort we were able to join this dataset together." 734 | ] 735 | }, 736 | { 737 | "cell_type": "code", 738 | "execution_count": 14, 739 | "metadata": {}, 740 | "outputs": [ 741 | { 742 | "data": { 743 | "text/html": [ 744 | "
\n", 745 | "\n", 758 | "\n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | "
     date          id          v        date_right    v_right
0    2010-01-01    e3e70682    0.393    2010-01-01    0.110
1    2010-01-02    e3e70682    0.537    2010-01-01    0.110
2    2010-01-01    f728b4fa    0.837    2010-01-01    0.197
3    2010-01-02    f728b4fa    0.517    2010-01-01    0.197
4    2010-01-01    eb1167b3    0.389    2010-01-01    0.385
\n", 812 | "
" 813 | ], 814 | "text/plain": [ 815 | " date id v date_right v_right\n", 816 | "0 2010-01-01 e3e70682 0.393 2010-01-01 0.110\n", 817 | "1 2010-01-02 e3e70682 0.537 2010-01-01 0.110\n", 818 | "2 2010-01-01 f728b4fa 0.837 2010-01-01 0.197\n", 819 | "3 2010-01-02 f728b4fa 0.517 2010-01-01 0.197\n", 820 | "4 2010-01-01 eb1167b3 0.389 2010-01-01 0.385" 821 | ] 822 | }, 823 | "execution_count": 14, 824 | "metadata": {}, 825 | "output_type": "execute_result" 826 | } 827 | ], 828 | "source": [ 829 | "result['merged'].head()" 830 | ] 831 | }, 832 | { 833 | "cell_type": "markdown", 834 | "metadata": {}, 835 | "source": [ 836 | "# Example 3: join 2 datasets with misalgined dates AND ids\n", 837 | "\n", 838 | "In the final example, we combine the above cases. None of the ids match and some of the dates are mismatched. As before with little manual effort we are able to correctly merge the dataset." 839 | ] 840 | }, 841 | { 842 | "cell_type": "code", 843 | "execution_count": 15, 844 | "metadata": {}, 845 | "outputs": [], 846 | "source": [ 847 | "dates2 = pd.bdate_range('1/1/2010','1/1/2011') # business instead of calendar dates\n", 848 | "df2 = pd.DataFrame(list(itertools.product(dates2,uuid1)),columns=['date','id'])\n", 849 | "df2['v']=np.round(np.random.sample(df2.shape[0]),3)\n", 850 | "df2['id'] = df2['id'].str[1:-1]" 851 | ] 852 | }, 853 | { 854 | "cell_type": "code", 855 | "execution_count": 16, 856 | "metadata": {}, 857 | "outputs": [ 858 | { 859 | "name": "stdout", 860 | "output_type": "stream", 861 | "text": [ 862 | " key left key right all matched inner left right outer unmatched total unmatched left unmatched right\n", 863 | "0 id id False 0 10 10 20 20 10 10\n", 864 | "1 date date False 261 366 261 366 105 105 0\n", 865 | "2 __all__ __all__ False 0 3660 2610 6270 6270 3660 2610\n" 866 | ] 867 | } 868 | ], 869 | "source": [ 870 | "d6tjoin.Prejoin([df1,df2],['id','date']).match_quality()" 871 | ] 872 | }, 873 | { 874 | "cell_type": "code", 875 | "execution_count": 17, 876 | "metadata": {}, 877 | "outputs": [], 878 | "source": [ 879 | "result = d6tjoin.top1.MergeTop1(df1,df2,['date','id'],['date','id']).merge()" 880 | ] 881 | }, 882 | { 883 | "cell_type": "code", 884 | "execution_count": 18, 885 | "metadata": {}, 886 | "outputs": [ 887 | { 888 | "data": { 889 | "text/html": [ 890 | "
\n", 891 | "\n", 904 | "\n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | "
     date          id          v        date_right    id_right    v_right
0    2010-01-01    e3e70682    0.393    2010-01-01    3e7068      0.693
1    2010-01-02    e3e70682    0.537    2010-01-01    3e7068      0.693
2    2010-01-01    f728b4fa    0.837    2010-01-01    728b4f      0.463
3    2010-01-02    f728b4fa    0.517    2010-01-01    728b4f      0.463
4    2010-01-01    eb1167b3    0.389    2010-01-01    b1167b      0.227
\n", 964 | "
" 965 | ], 966 | "text/plain": [ 967 | " date id v date_right id_right v_right\n", 968 | "0 2010-01-01 e3e70682 0.393 2010-01-01 3e7068 0.693\n", 969 | "1 2010-01-02 e3e70682 0.537 2010-01-01 3e7068 0.693\n", 970 | "2 2010-01-01 f728b4fa 0.837 2010-01-01 728b4f 0.463\n", 971 | "3 2010-01-02 f728b4fa 0.517 2010-01-01 728b4f 0.463\n", 972 | "4 2010-01-01 eb1167b3 0.389 2010-01-01 b1167b 0.227" 973 | ] 974 | }, 975 | "execution_count": 18, 976 | "metadata": {}, 977 | "output_type": "execute_result" 978 | } 979 | ], 980 | "source": [ 981 | "result['merged'].head()" 982 | ] 983 | }, 984 | { 985 | "cell_type": "code", 986 | "execution_count": 19, 987 | "metadata": { 988 | "scrolled": true 989 | }, 990 | "outputs": [ 991 | { 992 | "data": { 993 | "text/html": [ 994 | "
\n", 995 | "\n", 1008 | "\n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | "
       __top1left__    __top1right__    __top1diff__    __matchtype__
361    2010-12-28      2010-12-28       0 days          exact
362    2010-12-29      2010-12-29       0 days          exact
363    2010-12-30      2010-12-30       0 days          exact
364    2010-12-31      2010-12-31       0 days          exact
365    2011-01-01      2010-12-31       1 days          top1 left
\n", 1056 | "
" 1057 | ], 1058 | "text/plain": [ 1059 | " __top1left__ __top1right__ __top1diff__ __matchtype__\n", 1060 | "361 2010-12-28 2010-12-28 0 days exact\n", 1061 | "362 2010-12-29 2010-12-29 0 days exact\n", 1062 | "363 2010-12-30 2010-12-30 0 days exact\n", 1063 | "364 2010-12-31 2010-12-31 0 days exact\n", 1064 | "365 2011-01-01 2010-12-31 1 days top1 left" 1065 | ] 1066 | }, 1067 | "execution_count": 19, 1068 | "metadata": {}, 1069 | "output_type": "execute_result" 1070 | } 1071 | ], 1072 | "source": [ 1073 | "result['top1']['date'].tail()" 1074 | ] 1075 | }, 1076 | { 1077 | "cell_type": "code", 1078 | "execution_count": 20, 1079 | "metadata": {}, 1080 | "outputs": [ 1081 | { 1082 | "data": { 1083 | "text/html": [ 1084 | "
\n", 1085 | "\n", 1098 | "\n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " \n", 1127 | " \n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1131 | " \n", 1132 | " \n", 1133 | " \n", 1134 | " \n", 1135 | " \n", 1136 | " \n", 1137 | " \n", 1138 | " \n", 1139 | " \n", 1140 | " \n", 1141 | " \n", 1142 | " \n", 1143 | " \n", 1144 | " \n", 1145 | " \n", 1146 | " \n", 1147 | " \n", 1148 | " \n", 1149 | " \n", 1150 | " \n", 1151 | "
         __top1right__date    __top1left__    __top1right__    __top1diff__    __matchtype__
9396     2010-01-01           1846d424        846d42           2               top1 left
3915     2010-01-01           23a7711a        3a7711           2               top1 left
20619    2010-01-01           259f4329        59f432           2               top1 left
12528    2010-01-01           b4862b21        4862b2           2               top1 left
13050    2010-01-01           e3e70682        3e7068           2               top1 left
\n", 1152 | "
" 1153 | ], 1154 | "text/plain": [ 1155 | " __top1right__date __top1left__ __top1right__ __top1diff__ __matchtype__\n", 1156 | "9396 2010-01-01 1846d424 846d42 2 top1 left\n", 1157 | "3915 2010-01-01 23a7711a 3a7711 2 top1 left\n", 1158 | "20619 2010-01-01 259f4329 59f432 2 top1 left\n", 1159 | "12528 2010-01-01 b4862b21 4862b2 2 top1 left\n", 1160 | "13050 2010-01-01 e3e70682 3e7068 2 top1 left" 1161 | ] 1162 | }, 1163 | "execution_count": 20, 1164 | "metadata": {}, 1165 | "output_type": "execute_result" 1166 | } 1167 | ], 1168 | "source": [ 1169 | "result['top1']['id'].head()" 1170 | ] 1171 | }, 1172 | { 1173 | "cell_type": "markdown", 1174 | "metadata": { 1175 | "collapsed": true 1176 | }, 1177 | "source": [ 1178 | "# Advanced Usage Options" 1179 | ] 1180 | }, 1181 | { 1182 | "cell_type": "markdown", 1183 | "metadata": {}, 1184 | "source": [ 1185 | "## Passing a difference limit\n", 1186 | "By default every record in the left dataframe will be matched with a record in the right dataframe. Sometimes the difference is too large though to be considered a match. You can control this by passing the `top_limit` parameter." 1187 | ] 1188 | }, 1189 | { 1190 | "cell_type": "code", 1191 | "execution_count": 21, 1192 | "metadata": {}, 1193 | "outputs": [], 1194 | "source": [ 1195 | "dates2 = pd.bdate_range('1/1/2010','1/1/2011') # business instead of calendar dates\n", 1196 | "df2 = pd.DataFrame(list(itertools.product(dates2,uuid1[:-2])),columns=['date','id'])\n", 1197 | "df2['v']=np.random.sample(df2.shape[0])\n", 1198 | "df2['id'] = df2['id'].str[1:-1]" 1199 | ] 1200 | }, 1201 | { 1202 | "cell_type": "code", 1203 | "execution_count": 22, 1204 | "metadata": {}, 1205 | "outputs": [ 1206 | { 1207 | "data": { 1208 | "text/html": [ 1209 | "
\n", 1210 | "\n", 1223 | "\n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | " \n", 1251 | " \n", 1252 | " \n", 1253 | " \n", 1254 | " \n", 1255 | " \n", 1256 | " \n", 1257 | " \n", 1258 | " \n", 1259 | " \n", 1260 | " \n", 1261 | " \n", 1262 | " \n", 1263 | " \n", 1264 | " \n", 1265 | " \n", 1266 | " \n", 1267 | " \n", 1268 | " \n", 1269 | " \n", 1270 | " \n", 1271 | " \n", 1272 | " \n", 1273 | " \n", 1274 | " \n", 1275 | " \n", 1276 | "
         __top1right__date    __top1left__    __top1right__    __top1diff__    __matchtype__
7830     2010-01-01           1846d424        846d42           2               top1 left
3393     2010-01-01           23a7711a        3a7711           2               top1 left
16182    2010-01-01           259f4329        846d42           6               top1 left
8874     2010-01-01           b4862b21        b1167b           5               top1 left
9918     2010-01-01           b4862b21        846d42           5               top1 left
\n", 1277 | "
" 1278 | ], 1279 | "text/plain": [ 1280 | " __top1right__date __top1left__ __top1right__ __top1diff__ __matchtype__\n", 1281 | "7830 2010-01-01 1846d424 846d42 2 top1 left\n", 1282 | "3393 2010-01-01 23a7711a 3a7711 2 top1 left\n", 1283 | "16182 2010-01-01 259f4329 846d42 6 top1 left\n", 1284 | "8874 2010-01-01 b4862b21 b1167b 5 top1 left\n", 1285 | "9918 2010-01-01 b4862b21 846d42 5 top1 left" 1286 | ] 1287 | }, 1288 | "execution_count": 22, 1289 | "metadata": {}, 1290 | "output_type": "execute_result" 1291 | } 1292 | ], 1293 | "source": [ 1294 | "result = d6tjoin.top1.MergeTop1(df1,df2,['date','id'],['date','id']).merge()\n", 1295 | "result['top1']['id'].head()" 1296 | ] 1297 | }, 1298 | { 1299 | "cell_type": "markdown", 1300 | "metadata": {}, 1301 | "source": [ 1302 | "We have some correct matches but also some bad matches with `__top1diff__`>2. We will restrict `top_limit` to be at most 2." 1303 | ] 1304 | }, 1305 | { 1306 | "cell_type": "code", 1307 | "execution_count": 23, 1308 | "metadata": {}, 1309 | "outputs": [], 1310 | "source": [ 1311 | "result = d6tjoin.top1.MergeTop1(df1,df2,['date','id'],['date','id'], top_limit=[None,2]).merge()" 1312 | ] 1313 | }, 1314 | { 1315 | "cell_type": "code", 1316 | "execution_count": 24, 1317 | "metadata": {}, 1318 | "outputs": [ 1319 | { 1320 | "data": { 1321 | "text/html": [ 1322 | "
\n", 1323 | "\n", 1336 | "\n", 1337 | " \n", 1338 | " \n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | " \n", 1349 | " \n", 1350 | " \n", 1351 | " \n", 1352 | " \n", 1353 | " \n", 1354 | " \n", 1355 | " \n", 1356 | " \n", 1357 | " \n", 1358 | " \n", 1359 | " \n", 1360 | " \n", 1361 | " \n", 1362 | " \n", 1363 | " \n", 1364 | " \n", 1365 | " \n", 1366 | " \n", 1367 | " \n", 1368 | " \n", 1369 | " \n", 1370 | " \n", 1371 | " \n", 1372 | " \n", 1373 | " \n", 1374 | " \n", 1375 | " \n", 1376 | " \n", 1377 | " \n", 1378 | " \n", 1379 | " \n", 1380 | " \n", 1381 | " \n", 1382 | " \n", 1383 | " \n", 1384 | " \n", 1385 | " \n", 1386 | " \n", 1387 | " \n", 1388 | " \n", 1389 | "
         __top1right__date    __top1left__    __top1right__    __top1diff__    __matchtype__
7830     2010-01-01           1846d424        846d42           2               top1 left
3393     2010-01-01           23a7711a        3a7711           2               top1 left
10440    2010-01-01           e3e70682        3e7068           2               top1 left
5220     2010-01-01           e443df78        443df7           2               top1 left
17226    2010-01-01           eb1167b3        b1167b           2               top1 left
\n", 1390 | "
" 1391 | ], 1392 | "text/plain": [ 1393 | " __top1right__date __top1left__ __top1right__ __top1diff__ __matchtype__\n", 1394 | "7830 2010-01-01 1846d424 846d42 2 top1 left\n", 1395 | "3393 2010-01-01 23a7711a 3a7711 2 top1 left\n", 1396 | "10440 2010-01-01 e3e70682 3e7068 2 top1 left\n", 1397 | "5220 2010-01-01 e443df78 443df7 2 top1 left\n", 1398 | "17226 2010-01-01 eb1167b3 b1167b 2 top1 left" 1399 | ] 1400 | }, 1401 | "execution_count": 24, 1402 | "metadata": {}, 1403 | "output_type": "execute_result" 1404 | } 1405 | ], 1406 | "source": [ 1407 | "result['top1']['id'].head()" 1408 | ] 1409 | }, 1410 | { 1411 | "cell_type": "markdown", 1412 | "metadata": {}, 1413 | "source": [ 1414 | "## Passing a custom difference function\n", 1415 | "By default string matches are done using Levenstein edit distance. You can pass a custom function using `fun_diff`. For example lets pass Hamming distance." 1416 | ] 1417 | }, 1418 | { 1419 | "cell_type": "code", 1420 | "execution_count": 25, 1421 | "metadata": {}, 1422 | "outputs": [], 1423 | "source": [ 1424 | "import jellyfish\n", 1425 | "result = d6tjoin.top1.MergeTop1(df1,df2,['date','id'],['date','id'], fun_diff=[None,jellyfish.hamming_distance]).merge()" 1426 | ] 1427 | }, 1428 | { 1429 | "cell_type": "code", 1430 | "execution_count": 26, 1431 | "metadata": {}, 1432 | "outputs": [ 1433 | { 1434 | "data": { 1435 | "text/html": [ 1436 | "
\n", 1437 | "\n", 1450 | "\n", 1451 | " \n", 1452 | " \n", 1453 | " \n", 1454 | " \n", 1455 | " \n", 1456 | " \n", 1457 | " \n", 1458 | " \n", 1459 | " \n", 1460 | " \n", 1461 | " \n", 1462 | " \n", 1463 | " \n", 1464 | " \n", 1465 | " \n", 1466 | " \n", 1467 | " \n", 1468 | " \n", 1469 | " \n", 1470 | " \n", 1471 | " \n", 1472 | " \n", 1473 | " \n", 1474 | " \n", 1475 | " \n", 1476 | " \n", 1477 | " \n", 1478 | " \n", 1479 | " \n", 1480 | " \n", 1481 | " \n", 1482 | " \n", 1483 | " \n", 1484 | " \n", 1485 | " \n", 1486 | " \n", 1487 | " \n", 1488 | " \n", 1489 | " \n", 1490 | " \n", 1491 | " \n", 1492 | " \n", 1493 | " \n", 1494 | " \n", 1495 | " \n", 1496 | " \n", 1497 | " \n", 1498 | " \n", 1499 | " \n", 1500 | " \n", 1501 | " \n", 1502 | " \n", 1503 | "
         __top1right__date    __top1left__    __top1right__    __top1diff__    __matchtype__
6786     2010-01-01           1846d424        b1167b           7               top1 left
7047     2010-01-01           1846d424        7c1bd8           7               top1 left
3393     2010-01-01           23a7711a        3a7711           6               top1 left
14877    2010-01-01           259f4329        728b4f           7               top1 left
16182    2010-01-01           259f4329        846d42           7               top1 left
\n", 1504 | "
" 1505 | ], 1506 | "text/plain": [ 1507 | " __top1right__date __top1left__ __top1right__ __top1diff__ __matchtype__\n", 1508 | "6786 2010-01-01 1846d424 b1167b 7 top1 left\n", 1509 | "7047 2010-01-01 1846d424 7c1bd8 7 top1 left\n", 1510 | "3393 2010-01-01 23a7711a 3a7711 6 top1 left\n", 1511 | "14877 2010-01-01 259f4329 728b4f 7 top1 left\n", 1512 | "16182 2010-01-01 259f4329 846d42 7 top1 left" 1513 | ] 1514 | }, 1515 | "execution_count": 26, 1516 | "metadata": {}, 1517 | "output_type": "execute_result" 1518 | } 1519 | ], 1520 | "source": [ 1521 | "result['top1']['id'].head()" 1522 | ] 1523 | }, 1524 | { 1525 | "cell_type": "code", 1526 | "execution_count": null, 1527 | "metadata": {}, 1528 | "outputs": [], 1529 | "source": [] 1530 | } 1531 | ], 1532 | "metadata": { 1533 | "kernelspec": { 1534 | "display_name": "Python 3", 1535 | "language": "python", 1536 | "name": "python3" 1537 | }, 1538 | "language_info": { 1539 | "codemirror_mode": { 1540 | "name": "ipython", 1541 | "version": 3 1542 | }, 1543 | "file_extension": ".py", 1544 | "mimetype": "text/x-python", 1545 | "name": "python", 1546 | "nbconvert_exporter": "python", 1547 | "pygments_lexer": "ipython3", 1548 | "version": "3.7.6" 1549 | } 1550 | }, 1551 | "nbformat": 4, 1552 | "nbformat_minor": 2 1553 | } 1554 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | sphinx 3 | sphinxcontrib-napoleon 4 | sphinx_rtd_theme 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pandas 3 | jellyfish 4 | d6tstack 5 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name='d6tjoin', 5 | version='0.2.1', 6 | packages=['d6tjoin'], 7 | url='https://github.com/d6t/d6tjoin', 8 | license='MIT', 9 | author='DataBolt Team', 10 | author_email='support@databolt.tech', 11 | description='Easily join python pandas dataframes', 12 | long_description='Easily join python pandas dataframes' 13 | 'See https://github.com/d6t/d6tjoin for details', 14 | install_requires=[ 15 | 'numpy', 16 | 'pandas', 17 | 'jellyfish', 18 | 'joblib', 19 | 'd6tstack', 20 | 'affinegap' 21 | ], 22 | include_package_data=True, 23 | python_requires='>=3.6' 24 | ) 25 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/d6t/d6tjoin/9618b129601aa0b4a9247d7001da8c2220d36d9c/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_pre_pd.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | import pytest 5 | 6 | import d6tjoin 7 | 8 | def fake_2dfs_identical(): 9 | df = pd.DataFrame({'a':range(10)}) 10 | df['b'] = ['b']*5+['bb']*5 11 | return [df, df.copy()] 12 | 13 | def fake_2dfs_1missing(): 14 | df = pd.DataFrame({'a':range(10)}) 15 | df['b'] = ['b']*5+['bb']*5 16 | return [df, df.copy().drop(['b'],1)] 17 | 18 | def test_internals(): 19 | dfs = fake_2dfs_identical() 20 | 21 | pdj = d6tjoin.Prejoin(dfs, print_only=False) 22 | assert 
pdj.keys is None and pdj.keysdf is None 23 | assert all([dfg.shape==dfs[0].shape for dfg in pdj.dfs]) 24 | assert all([dfg.shape==(pdj.nrows, dfs[0].shape[1]) for dfg in pdj.dfshead]) 25 | dfc = pdj.head() 26 | assert all([dfg.head().equals(dfc[idx]) for idx,dfg in enumerate(dfs)]) 27 | dfc = pdj.head(10) 28 | assert all([dfg.head(10).equals(dfc[idx]) for idx,dfg in enumerate(dfs)]) 29 | 30 | # single keys param 31 | cfg_keys = ['b'] 32 | pdj = d6tjoin.Prejoin(dfs,keys=cfg_keys) 33 | assert pdj.keys == [['b','b']] and pdj.keysdf == [['b'],['b']] 34 | assert all([dfg.shape==dfs[0].shape for dfg in pdj.dfs]) 35 | assert all([dfg.shape==(pdj.nrows, len(cfg_keys)) for dfg in pdj.dfshead]) 36 | 37 | dfs[1] = dfs[1].rename(columns={'b': 'c'}) 38 | with pytest.raises(KeyError, match='Columns missing'): 39 | pdj = d6tjoin.Prejoin(dfs, keys=['b']) 40 | 41 | # different keys for dfs 42 | pdj = d6tjoin.Prejoin(dfs,keys=[['b'],['c']]) 43 | assert pdj.keys == [['b','c']] and pdj.keysdf == [['b'],['c']] 44 | assert all([dfg.shape==dfs[0].shape for dfg in pdj.dfs]) 45 | assert all([dfg.shape==(pdj.nrows, 1) for dfg in pdj.dfshead]) 46 | pdj = d6tjoin.Prejoin(dfs,keys=[['b','c']], keys_bydf=False) 47 | assert pdj.keys == [['b','c']] and pdj.keysdf == [['b'],['c']] 48 | 49 | # multi keys param 50 | dfs[0]['b1']=dfs[0]['b'];dfs[1]['c1']=dfs[1]['c']; 51 | pdj = d6tjoin.Prejoin(dfs,keys=[['b','b1'],['c','c1']]) 52 | assert pdj.keys == [['b','c'],['b1','c1']] and pdj.keysdf == [['b','b1'],['c','c1']] 53 | assert all([dfg.shape==dfs[0].shape for dfg in pdj.dfs]) 54 | assert all([dfg.shape==(pdj.nrows, 2) for dfg in pdj.dfshead]) 55 | 56 | # joins with keys specified 57 | dfs = fake_2dfs_identical() 58 | pdj = d6tjoin.Prejoin(dfs,keys=['b'], print_only=False) 59 | assert pdj.columns_common()==['b'] 60 | assert pdj.columns_all()==['b'] 61 | 62 | dfs[1] = dfs[1].rename(columns={'b': 'c'}) 63 | pdj = d6tjoin.Prejoin(dfs,keys=[['b'],['c']], print_only=False) 64 | assert pdj.columns_all()==['b','c'] 65 | 66 | 67 | def test_pre_columns(): 68 | dfs = fake_2dfs_identical() 69 | pdj = d6tjoin.Prejoin(dfs,print_only=False) 70 | assert pdj.columns_common()==['a','b'] 71 | assert pdj.columns_all()==['a','b'] 72 | 73 | pdj.describe() 74 | assert pdj.shape() == {0: (10, 2), 1: (10, 2)} 75 | 76 | dfs = fake_2dfs_1missing() 77 | pdj = d6tjoin.Prejoin(dfs,print_only=False) 78 | assert pdj.columns_common()==['a'] 79 | assert pdj.columns_all()==['a','b'] 80 | 81 | def test_pre_describe(): 82 | # describe_str 83 | chk = {'b': {'median': 1.5, 'min': 1.0, 'max': 2.0, 'nrecords': 10.0}} 84 | dfs = fake_2dfs_identical() 85 | pdj = d6tjoin.Prejoin(dfs,print_only=False) 86 | assert pdj.describe_str()[0].to_dict(orient='index')==chk 87 | pdj = d6tjoin.Prejoin(dfs,keys=['b'],print_only=False) 88 | assert pdj.describe_str()[0].to_dict(orient='index')==chk 89 | 90 | # describe_str 91 | chk = {'a': {'nrecords': 10, 'unique': 10, 'nan': 0, 'unique rate': 1.0}, 92 | 'b': {'nrecords': 10, 'unique': 2, 'nan': 0, 'unique rate': 0.2}} 93 | pdj = d6tjoin.Prejoin(dfs,print_only=False) 94 | assert pdj.describe_data()[0].to_dict(orient='index')==chk 95 | pdj = d6tjoin.Prejoin(dfs,keys=['b'],print_only=False) 96 | assert pdj.describe_data()[0].to_dict(orient='index')==chk 97 | 98 | def test_pre_data_match(): 99 | dfs = fake_2dfs_identical() 100 | pdj = d6tjoin.Prejoin(dfs,print_only=False) 101 | 102 | dfc = {'__left__': {0: 'b'}, 103 | '__right__': {0: 'b'}, 104 | '__similarity__': {0: 1.0}, 105 | '__left-sample__': {0: 'bb'}, 106 | '__right-sample__': 
{0: 'bb'}, 107 | '__left-nunique__': {0: 2}, 108 | '__right-nunique__': {0: 2}} 109 | 110 | assert pd.DataFrame(dfc).equals(pdj.data_match()) 111 | 112 | dfc = {0: {'__left__': 'a', 113 | '__right__': 'a', 114 | '__similarity__': 1.0, 115 | '__left-sample__': 0, 116 | '__right-sample__': 0, 117 | '__left-nunique__': 10, 118 | '__right-nunique__': 10}, 119 | 1: {'__left__': 'b', 120 | '__right__': 'b', 121 | '__similarity__': 1.0, 122 | '__left-sample__': 'bb', 123 | '__right-sample__': 'bb', 124 | '__left-nunique__': 2, 125 | '__right-nunique__': 2}} 126 | 127 | assert dfc==pdj.data_match(ignore_value_columns=False, max_unique_pct=1.0).to_dict(orient='index') 128 | 129 | 130 | 131 | 132 | 133 | 134 | -------------------------------------------------------------------------------- /tests/test_smartjoin.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import pandas as pd 3 | import numpy as np 4 | 5 | # fuzzy join 6 | from faker import Faker 7 | import importlib 8 | 9 | import d6tjoin.smart_join 10 | importlib.reload(d6tjoin.smart_join) 11 | cfg_num = 10 12 | cfg_num_unmatched = 2 13 | cfg_num_matched = cfg_num-cfg_num_unmatched 14 | 15 | # d6t 16 | from d6tjoin.utils import df_str_summary, BaseJoin, PreJoin 17 | 18 | # ****************************************** 19 | # helpers 20 | # ****************************************** 21 | def gen_multikey_simple(): 22 | fake = Faker() 23 | fake.seed(1) 24 | 25 | pool_names = [fake.name() for _ in range(cfg_num)] 26 | pool_dates = pd.date_range('1/1/2018', periods=cfg_num) 27 | 28 | # case multikey 29 | df1 = pd.DataFrame({'key': pool_names[:-cfg_num_unmatched], 'date': pool_dates[:-cfg_num_unmatched]}) 30 | df2 = pd.DataFrame({'key': pool_names[cfg_num_unmatched:], 'date': pool_dates[cfg_num_unmatched:]}) 31 | df1['val1'] = range(df1.shape[0]) 32 | df2['val2'] = range(df2.shape[0]) 33 | 34 | return df1, df2 35 | 36 | def gen_multikey_complex(unmatched_date=True): 37 | 38 | fake = Faker() 39 | fake.seed(1) 40 | 41 | pool_names = [fake.name() for _ in range(cfg_num)] 42 | cfg_num_per_group = 4 43 | pool_date1 = pd.date_range('1/1/2010', periods=cfg_num_per_group, freq='1M') 44 | if unmatched_date: 45 | pool_date2 = pd.bdate_range('1/1/2010', periods=cfg_num_per_group, freq='1BM') 46 | else: 47 | pool_date2 = pool_date1 48 | 49 | def gen_df(cfg_pool_rates, cfg_offset=0): 50 | dfg = [] 51 | for i in range(cfg_num_per_group): 52 | dft = pd.DataFrame({'key': np.roll(pool_names, i + cfg_offset)[:cfg_num_per_group]}) 53 | dft['date'] = cfg_pool_rates[i] 54 | dft['value'] = np.random.randn(dft.shape[0]) 55 | dfg.append(dft) 56 | return pd.concat(dfg) 57 | 58 | df1 = gen_df(pool_date1) 59 | df2 = gen_df(pool_date2, 2) 60 | 61 | return df1, df2 62 | 63 | 64 | # ****************************************** 65 | # utils 66 | # ****************************************** 67 | 68 | def test_df_str_summary(): 69 | df = pd.DataFrame({'a': ['a', 'aa'] * 2}) 70 | df['b'] = ['aa', 'aaa'] * 2 71 | 72 | dft = df_str_summary(df) 73 | assert np.all(dft.values == np.array([[ 1.5, 1.5, 1. , 2. , 4. ], 74 | [ 2.5, 2.5, 2. , 3. , 4. ]])) 75 | dft = df_str_summary(df,['a']) 76 | assert np.all(dft.values == np.array([1.5, 1.5, 1. , 2. , 4.])) 77 | 78 | dft = df_str_summary(df,unique_count=True) 79 | assert np.all(dft.values == np.array([[ 1.5, 1.5, 1. , 2. , 4. , 2. ], 80 | [ 2.5, 2.5, 2. , 3. , 4. , 2. 
]])) 81 | 82 | 83 | def test_basejoin(): 84 | df1 = pd.DataFrame({'a': range(3), 'b': range(3)}) 85 | df2 = pd.DataFrame({'a': range(3), 'b': range(3)}) 86 | 87 | with pytest.raises(ValueError) as e: 88 | j = PreJoin([df1], ['a']) 89 | with pytest.raises(NotImplementedError) as e: 90 | j = PreJoin([df1,df2,df1], ['a']) 91 | 92 | j1 = PreJoin([df1,df2], ['a','b']) 93 | j2 = PreJoin([df1,df2], [['a','b'],['a','b']], keys_bydf=True) 94 | j3 = PreJoin([df1,df2], [['a','a'],['b','b']]) 95 | assert j1.keys == [['a', 'a'], ['b', 'b']] 96 | assert j1.keys == j2.keys 97 | assert j2.keys == j3.keys 98 | assert j1.keysdf == [['a', 'b'], ['a', 'b']] 99 | assert j1.keysdf == j2.keysdf 100 | assert j3.keysdf == j2.keysdf 101 | 102 | df2 = pd.DataFrame({'a': range(3), 'c': range(3)}) 103 | 104 | with pytest.raises(KeyError) as e: 105 | j1 = PreJoin([df1,df2], ['a','c']) 106 | 107 | j2 = PreJoin([df1,df2], [['a','b'],['a','c']], keys_bydf=True) 108 | j3 = PreJoin([df1,df2], [['a','a'],['b','c']]) 109 | assert j2.keys == [['a', 'a'], ['b', 'c']] 110 | assert j3.keys == j2.keys 111 | assert j2.keysdf == [['a', 'b'], ['a', 'c']] 112 | assert j3.keysdf == j2.keysdf 113 | 114 | # ****************************************** 115 | # prejoin 116 | # ****************************************** 117 | def test_prejoin(): 118 | df1 = pd.DataFrame({'a': range(3), 'b': range(3)}) 119 | df2 = pd.DataFrame({'a': range(3), 'c': range(3)}) 120 | 121 | j = PreJoin([df1,df2],['a']) 122 | dfr = j.stats_prejoin(print_only=False) 123 | results = dfr.to_dict() 124 | check = {'all matched': {0: True, 1: True}, 125 | 'inner': {0: 3, 1: 3}, 126 | 'key left': {0: 'a', 1: '__all__'}, 127 | 'key right': {0: 'a', 1: '__all__'}, 128 | 'left': {0: 3, 1: 3}, 129 | 'outer': {0: 3, 1: 3}, 130 | 'right': {0: 3, 1: 3}, 131 | 'unmatched left': {0: 0, 1: 0}, 132 | 'unmatched right': {0: 0, 1: 0}, 133 | 'unmatched total': {0: 0, 1: 0}} 134 | assert results == check 135 | assert j.is_all_matched() 136 | assert j.is_all_matched('a') 137 | 138 | df2 = pd.DataFrame({'a': range(3,6), 'c': range(3)}) 139 | 140 | j = PreJoin([df1,df2],['a']) 141 | dfr = j.stats_prejoin(print_only=False) 142 | assert (~dfr['all matched']).all() 143 | assert not j.is_all_matched() 144 | assert not j.is_all_matched('a') 145 | 146 | df2 = pd.DataFrame({'b': range(3,6), 'a': range(3), 'v':range(3)}) 147 | cfg_keys = ['a', 'b'] 148 | j = PreJoin([df1,df2],cfg_keys) 149 | dfr = j.stats_prejoin(print_only=False) 150 | assert dfr['all matched'].tolist()==[True, False, False] 151 | assert not j.is_all_matched() 152 | assert j.is_all_matched('a') 153 | assert not j.is_all_matched('b') 154 | 155 | # test show_input 156 | dfr = j.show_input(1,keys_only=False) 157 | assert dfr[0].equals(df1.head(1)) 158 | assert dfr[1].equals(df2.head(1)) 159 | dfr = j.show_input(-1,keys_only=True) 160 | assert dfr[0][cfg_keys].equals(df1[cfg_keys]) 161 | assert dfr[1][cfg_keys].equals(df2[cfg_keys]) 162 | 163 | # test show_unmatched 164 | j.show_unmatched('b',print_only=True) # just make sure print_only runs without errors 165 | dfr = j.show_unmatched('b',nrecords=-1) 166 | assert dfr['left'].equals(df1['b']) 167 | assert dfr['right'].equals(df2['b']) 168 | dfr = j.show_matched('a',nrecords=-1) 169 | assert dfr['left'].equals(df1['a']) 170 | assert dfr['right'].equals(df2['a']) 171 | dfr = j.show_unmatched('__all__',nrecords=-1) 172 | assert dfr['left'].equals(df1[cfg_keys]) 173 | assert dfr['right'].equals(df2[cfg_keys]) 174 | dfr = j.show_matched('__all__') 175 | assert dfr['left'].empty 176 | 
assert dfr['right'].empty 177 | 178 | dfr = j.show_unmatched('b',nrecords=1) 179 | assert dfr['left'].equals(df1['b'].head(1)) 180 | assert dfr['right'].equals(df2['b'].head(1)) 181 | 182 | dfr = j.show_unmatched('b',keys_only=False,nrecords=-1) 183 | assert dfr['left'].equals(df1) 184 | assert dfr['right'].equals(df2) 185 | 186 | dfr = j.show_unmatched('a') 187 | assert dfr['left'].empty 188 | assert dfr['right'].empty 189 | dfr = j.show_matched('b') 190 | assert dfr['left'].empty 191 | assert dfr['right'].empty 192 | 193 | # test show_unmatched 194 | j = PreJoin([df1,df2],['a']) 195 | with pytest.raises(RuntimeError) as e: 196 | j.show_unmatched('a', print_only=True) 197 | j.stats_prejoin() 198 | dfr = j.show_matched('__all__',nrecords=-1) 199 | assert dfr['left'].equals(df1[['a']]) 200 | assert dfr['right'].equals(df2[['a']]) 201 | dfr = j.show_unmatched('__all__',nrecords=-1) 202 | assert dfr['left'].empty 203 | assert dfr['right'].empty 204 | 205 | 206 | # ****************************************** 207 | # fuzzy join 208 | # ****************************************** 209 | def test_fakedata_singlekey_string(): 210 | 211 | fake = Faker() 212 | fake.seed(1) 213 | 214 | pool_names = [fake.name() for _ in range(cfg_num)] 215 | pool_names_unmatched_left = pool_names[:cfg_num_unmatched] 216 | 217 | # case single key unmatched 218 | df1=pd.DataFrame({'key':pool_names[:-cfg_num_unmatched]}) 219 | df2=pd.DataFrame({'key':pool_names[cfg_num_unmatched:]}) 220 | df1['val1']=range(df1.shape[0]) 221 | df2['val2']=range(df2.shape[0]) 222 | 223 | 224 | with pytest.raises(ValueError) as e_info: 225 | d6tjoin.smart_join.FuzzyJoinTop1([df1, df2], [], []) 226 | with pytest.raises(KeyError) as e_info: 227 | d6tjoin.smart_join.FuzzyJoinTop1([df1,df2], fuzzy_keys=['unmatched']) 228 | 229 | importlib.reload(d6tjoin.smart_join) 230 | sj = d6tjoin.smart_join.FuzzyJoinTop1([df1,df2],fuzzy_keys=['key']) 231 | assert sj.keysdf_fuzzy == [['key']]*2 232 | assert sj.keysdf_exact == [] 233 | 234 | import jellyfish 235 | def diff_edit(a, b): 236 | return jellyfish.levenshtein_distance(a, b) 237 | def diff_hamming(a, b): 238 | return jellyfish.hamming_distance(a, b) 239 | 240 | sj = d6tjoin.smart_join.FuzzyJoinTop1([df1,df2],fuzzy_keys=['key']) 241 | dfr = sj._gen_match_top1(0)['table'].copy() 242 | assert sj._gen_match_top1(0)['has duplicates'] 243 | assert set(dfr.loc[dfr['__top1diff__']>0,'__top1left__'].unique()) == set(pool_names_unmatched_left) 244 | assert dfr.loc[dfr['__top1diff__']>0,'__top1right__'].values.tolist() == ['Teresa James', 'Rachel Davis', 'Teresa James'] 245 | dfr['__top1diff__check'] = dfr.apply(lambda x: diff_edit(x['__top1left__'],x['__top1right__']),1) 246 | assert (dfr['__top1diff__']==dfr['__top1diff__check']).all() 247 | 248 | sj.set_fuzzy_how(0,{'fun_diff':[diff_hamming,diff_edit]}) 249 | dfr = sj._gen_match_top1(0)['table'].copy() 250 | assert dfr.loc[dfr['__top1diff__']>0,'__top1right__'].values.tolist() == ['Teresa James', 'Amanda Johnson'] 251 | assert not sj._gen_match_top1(0)['has duplicates'] 252 | 253 | 254 | sj = d6tjoin.smart_join.FuzzyJoinTop1([df1,df2],fuzzy_keys=['key']) 255 | dfr1 = sj._gen_match_top1(0)['table'] 256 | # assert df1.shape[0] == dfr1.shape[0] # todo: deal with duplicates 257 | dfr2 = sj.join(True) 258 | assert np.array_equal(dfr1['__top1diff__'].sort_values().values, dfr2['__top1diff__key'].sort_values().values) 259 | 260 | def test_fakedata_singlekey_number(): 261 | pool_dates = pd.date_range('1/1/2018',periods=cfg_num) 262 | 263 | # case single key date 264 | 
df1=pd.DataFrame({'date':pool_dates[:-cfg_num_unmatched]}) 265 | df2=pd.DataFrame({'date':pool_dates[cfg_num_unmatched:]}) 266 | 267 | sj = d6tjoin.smart_join.FuzzyJoinTop1([df1,df2],fuzzy_keys=['date']) 268 | dfr = sj._gen_match_top1_left_number([],[],'date','date',None) 269 | 270 | df_check = pd.DataFrame({'__top1left__':pool_dates[:-cfg_num_unmatched],'__top1right__':[pool_dates[cfg_num_unmatched]]*cfg_num_unmatched+pool_dates[cfg_num_unmatched:-cfg_num_unmatched].tolist()}) 271 | df_check['__top1diff__'] = (df_check['__top1left__'] - df_check['__top1right__']).abs() 272 | 273 | assert dfr.equals(df_check) 274 | 275 | # apply top_nrecords 276 | sj = d6tjoin.smart_join.FuzzyJoinTop1([df1,df2],fuzzy_keys=['date'],fuzzy_how={0:{'top_limit':1}}) 277 | dfr = sj._gen_match_top1_left_number([],[],'date','date',None) 278 | 279 | df_check = pd.DataFrame({'__top1left__':pool_dates[:-cfg_num_unmatched],'__top1right__':[pool_dates[cfg_num_unmatched]]*cfg_num_unmatched+pool_dates[cfg_num_unmatched:-cfg_num_unmatched].tolist()}) 280 | df_check['__top1diff__'] = (df_check['__top1left__'] - df_check['__top1right__']).abs() 281 | 282 | assert dfr.equals(df_check) 283 | 284 | # case single key date, with exact keys 285 | pool_dates2 = pd.date_range('12/31/2017',periods=cfg_num) 286 | df1=pd.DataFrame({'grp':['a']*cfg_num_matched+['b']*cfg_num_matched,'date':pool_dates[:-cfg_num_unmatched].tolist()+pool_dates2[:-cfg_num_unmatched].tolist()}) 287 | df2=pd.DataFrame({'grp':['a']*cfg_num_matched+['b']*cfg_num_matched,'date2':pool_dates[cfg_num_unmatched:].tolist()+pool_dates2[cfg_num_unmatched:].tolist()}) 288 | sj = d6tjoin.smart_join.FuzzyJoinTop1([df1,df2],exact_keys=['grp'],fuzzy_keys=[['date', 'date2']]) 289 | dfr = sj._gen_match_top1_left_number(['grp'],['grp'],'date','date2',None) 290 | 291 | dfc0 = pd.merge_asof(df1.sort_values('date'), df2.sort_values('date2'), left_on='date', right_on='date2', by='grp', direction='nearest') 292 | dfc = dfc0.rename(columns={'date':'__top1left__','date2':'__top1right__'}) 293 | dfc['__top1diff__'] = (dfc['__top1left__'] - dfc['__top1right__']).abs() 294 | dfc = dfc[dfr.columns.tolist()] 295 | 296 | assert dfr.equals(dfc) 297 | 298 | dfc['__match type__'] = 'exact' 299 | dfc.loc[dfc['__top1diff__'].dt.days>0,'__match type__'] = 'top1 left' 300 | 301 | assert sj._gen_match_top1(0)['table'].equals(dfc) 302 | assert sj.join().sort_values(['date','grp']).reset_index(drop=True).equals(dfc0) 303 | 304 | 305 | def fakedata_multikey(): 306 | 307 | df1, df2 = gen_multikey_simple() 308 | 309 | cfg_group_left=['date'] 310 | cfg_group_right=cfg_group_left 311 | keyleft='key' 312 | keyright=keyleft 313 | 314 | ''' 315 | from d6tjoin.smart_join import apply_gen_candidates_group 316 | df_keys_left = pd.DataFrame(df1.groupby(cfg_group_left)[keyleft].unique()) 317 | df_keys_right = pd.DataFrame(df2.groupby(cfg_group_right)[keyright].unique()) 318 | df_keysets_groups = df_keys_left.merge(df_keys_right, left_index=True, right_index=True) 319 | df_keysets_groups.columns = ['__top1left__', '__top1right__'] 320 | dfg = df_keysets_groups.reset_index().groupby(cfg_group_left).apply(apply_gen_candidates_group) 321 | dfg = dfg.reset_index(-1, drop=True).reset_index() 322 | ''' 323 | with pytest.raises(NotImplementedError) as e_info: 324 | d6tjoin.smart_join.FuzzyJoinTop1([df1,df2], fuzzy_keys=['key','date']) 325 | 326 | 327 | ''' 328 | df1 329 | df2 330 | 331 | 332 | tests fuzzy string, exact keys 333 | tests fuzzy number int+float 334 | tests with nans 335 | groupby unique deal with nans 336 
| 337 | merge just the keys together [often date, key = 1 row...] 338 | => as soon as have >1 fuzzy key need to specify if hierarchical 339 | // does it increase the compute complexity? have to do the same all pairs compute for every date!! 340 | => do global match, from there find the closest ones by date 341 | 342 | explain: warnings.warn('Multi-key fuzzy joins are currently done globally for each key indivudally, not hierarchically for each unique fuzzy key value pair') 343 | tests for factor data id vs date, id matching 344 | 345 | ''' 346 | # with pytest.raises(ValueError) as e_info: 347 | # d6tjoin.smart_join.FuzzyJoinTop1([df1,df2], fuzzy_keys=['key','key'], fuzzy_how=[]) 348 | # 349 | # importlib.reload(d6tjoin.smart_join) 350 | # sj = d6tjoin.smart_join.FuzzyJoinTop1([df1,df2],fuzzy_keys=['key','date']) 351 | # dfr = sj.join(True) 352 | # assert df1.shape[0] == dfr.shape[0] 353 | 354 | # fakedata_multikey() 355 | 356 | 357 | def test_fakedata_multikey_iddate(): 358 | import uuid 359 | import itertools 360 | 361 | nobs = 10 362 | uuid1 = [str(uuid.uuid4()) for _ in range(nobs)] 363 | dates1 = pd.date_range('1/1/2010','1/1/2011') 364 | 365 | dates2 = pd.bdate_range('1/1/2010', '1/1/2011') # business instead of calendar dates 366 | 367 | df1 = pd.DataFrame(list(itertools.product(uuid1, dates1)), columns=['id', 'date']) 368 | df1['v'] = np.random.sample(df1.shape[0]) 369 | 370 | df2 = pd.DataFrame(list(itertools.product(uuid1, dates2)), columns=['id', 'date']) 371 | df2['v'] = np.random.sample(df2.shape[0]) 372 | 373 | sj = d6tjoin.smart_join.FuzzyJoinTop1([df1, df2], exact_keys=['id'], fuzzy_keys=['date']) 374 | dft = sj.preview_fuzzy(0) 375 | 376 | 377 | df2 = df1.copy() 378 | df2['id'] = df1['id'].str[1:-1] 379 | 380 | sj = d6tjoin.smart_join.FuzzyJoinTop1([df1, df2], exact_keys=['date'], fuzzy_keys=['id']) 381 | dft = sj.preview_fuzzy(0) 382 | dft.shape 383 | dft = sj._gen_match_top1(0) 384 | dft['table'].shape 385 | 386 | print('a') 387 | 388 | 389 | def fiddle(): 390 | cfg_path_folder_base = '/mnt/data/data.raw/travelclick/' 391 | from d6tstack.read_excel_adv import read_excel_advanced 392 | cfg_path = cfg_path_folder_base+'predict/STR Rolling Weekly Since 9-11-01 to 4-14-18 values weekly.xlsx' 393 | df_str=read_excel_advanced(cfg_path, header_xls_start="A7", header_xls_end="D7",remove_blank_cols=True,remove_blank_rows=True) 394 | df_str['STAY_WEEK'] = df_str['Date']-pd.DateOffset(days=6) 395 | df_str.head() 396 | 397 | df_alltier2 = pd.read_excel(cfg_path_folder_base + 'predict/travelcity-revpar-unsorted.xlsx') 398 | sj = d6tjoin.smart_join.FuzzyJoinTop1([df_alltier2,df_str],fuzzy_keys=['STAY_WEEK']) 399 | sj._gen_match_top1(0) 400 | 401 | # fiddle() 402 | 403 | # test_fakedata_multikey_iddate() -------------------------------------------------------------------------------- /tests/test_top1.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | pd.set_option('display.expand_frame_repr', False) 4 | import importlib 5 | import d6tjoin.top1 6 | import jellyfish 7 | from faker import Faker 8 | 9 | import tests.test_smartjoin 10 | 11 | def gen_df2_str(): 12 | l1 = ['a', 'b'] 13 | l2 = [l1[0], 'ba', 'cd'] 14 | df1 = pd.DataFrame({'id':l1*4}) 15 | df2 = pd.DataFrame({'id':l2*4}) 16 | df1['v1']=range(df1.shape[0]) 17 | df2['v2']=range(df2.shape[0]) 18 | return df1, df2 19 | 20 | def gen_df2_num(): 21 | l1 = [1,2] 22 | l2 = [l1[0],1.1,1.2] 23 | df1 = pd.DataFrame({'id': l1 * 4}) 24 | df2 = 
pd.DataFrame({'id': l2 * 4}) 25 | return df1, df2 26 | 27 | 28 | def test_top1_gen_candidates(): 29 | 30 | def helper(df1, df2): 31 | 32 | dfr = d6tjoin.top1.MergeTop1Diff(df1, df2,'id','id',jellyfish.levenshtein_distance)._allpairs_candidates() 33 | assert dfr.shape==(4, 3) 34 | assert (dfr['__top1left__'].values[0]==df1['id'].values[0]) 35 | assert np.all(dfr['__top1left__'].values[1:]==df1['id'].values[1]) 36 | assert (dfr['__top1right__'].values[0]==df1['id'].values[0]) 37 | assert (dfr['__top1right__']==df2['id'].values[1]).sum()==1 38 | assert (dfr['__top1right__']==df2['id'].values[2]).sum()==1 39 | assert (dfr['__matchtype__']=='exact').sum()==1 40 | assert (dfr['__matchtype__']=='top1 left').sum()==3 41 | 42 | df1, df2 = gen_df2_str() 43 | helper(df1, df2) 44 | 45 | df1, df2 = gen_df2_num() 46 | helper(df1, df2) 47 | 48 | 49 | def test_top1_str(): 50 | 51 | df1, df2 = gen_df2_str() 52 | 53 | r = d6tjoin.top1.MergeTop1Diff(df1, df2,'id','id',jellyfish.levenshtein_distance).merge() 54 | dfr = r['top1'] 55 | assert dfr['__top1diff__'].min()==0 56 | assert dfr['__top1diff__'].max()==1 57 | assert dfr.shape==(3, 4) 58 | dfr = r['merged'] 59 | assert dfr.shape==(48, 4) 60 | assert np.all(dfr.groupby('id').size().values==np.array([16, 32])) 61 | 62 | df1, df2 = tests.test_smartjoin.gen_multikey_complex(unmatched_date=False) 63 | r = d6tjoin.top1.MergeTop1Diff(df1, df2,'key','key',jellyfish.levenshtein_distance,['date'],['date']).merge() 64 | dfr = r['merged'] 65 | assert dfr.shape==(18, 5) 66 | assert np.all(dfr.groupby(['date','key']).size().values==np.array([1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])) 67 | 68 | df1.head() 69 | df1.merge(df2, on=['date','key']).head() 70 | dfr.head() 71 | 72 | def test_top1_num(): 73 | 74 | df1, df2 = tests.test_smartjoin.gen_multikey_complex(unmatched_date=True) 75 | r = d6tjoin.top1.MergeTop1Number(df1, df2,'date','date',is_keep_debug=True).merge() 76 | dfr = r['top1'] 77 | assert dfr.shape==(4, 4) 78 | assert np.all(dfr.groupby('__matchtype__').size().values==np.array([2, 2])) 79 | assert dfr['__top1diff__'].dt.days.max()==2 80 | assert dfr['__top1diff__'].dt.days.min()==0 81 | 82 | df1, df2 = tests.test_smartjoin.gen_multikey_complex(unmatched_date=True) 83 | r = d6tjoin.top1.MergeTop1Number(df1, df2,'date','date',['key'],['key']).merge() 84 | dfr = r['merged'] 85 | dfr.sort_values(['date','key']) 86 | r['top1'].sort_values(['__top1left__','key']) 87 | df1.sort_values(['key','date']) 88 | df2.sort_values(['key','date']) 89 | r['top1'] 90 | 91 | def test_top1_multi(): 92 | 93 | df1, df2 = tests.test_smartjoin.gen_multikey_complex(unmatched_date=True) 94 | df2['key'] = 'Mr. 
'+df1['key'] 95 | 96 | r = d6tjoin.top1.MergeTop1(df1, df2,['date','key'],['date','key']).merge() 97 | 98 | 99 | assert True 100 | 101 | 102 | def test_top1_examples(): 103 | import uuid 104 | import itertools 105 | 106 | # ****************************************** 107 | # generate sample data 108 | # ****************************************** 109 | nobs = 10 110 | # todo: set uuid seed 111 | # todo: only pick first 2 blocks 112 | f1 = Faker() 113 | f1.seed(0) 114 | uuid1 = [str(f1.uuid4()).split('-')[0] for _ in range(nobs)] 115 | dates1 = pd.date_range('1/1/2010', '1/1/2011') 116 | dates2 = pd.bdate_range('1/1/2010', '1/1/2011') # business instead of calendar dates 117 | 118 | df1 = pd.DataFrame(list(itertools.product(uuid1, dates1)), columns=['id', 'date']) 119 | df1['v'] = np.random.sample(df1.shape[0]) 120 | df2 = df1.copy() 121 | df2['id'] = df1['id'].str[1:-1] 122 | 123 | # r = d6tjoin.top1.MergeTop1Number(df1, df2, 'id', 'id', ['date'], ['date']).merge() 124 | # assert raises ValueError => should check it's a number to do number join 125 | 126 | # r = d6tjoin.top1.MergeTop1Diff(df1, df2, 'id', 'id', jellyfish.levenshtein_distance, ['date'], ['date']).merge() 127 | # assert min()==2 128 | # assert diff no duplicates 129 | # assert diff found == substring 130 | # assert only 100 candidates (not 366*100) 131 | 132 | # r = d6tjoin.top1.MergeTop1(df1, df2, ['id'], ['id'], ['date'], ['date']).merge() 133 | # assert merged==merged 134 | # assert diff==diff 135 | 136 | # dates2 = pd.bdate_range('1/1/2010', '1/1/2011') # business instead of calendar dates 137 | # df2 = pd.DataFrame(list(itertools.product(uuid1, dates2)), columns=['id', 'date']) 138 | # df2['v'] = np.random.sample(df2.shape[0]) 139 | # r = d6tjoin.top1.MergeTop1(df1, df2, ['date'], ['date'], ['id'], ['id']).merge() 140 | # # why cause error? 
141 | # r = d6tjoin.top1.MergeTop1(df1.head(), df2, ['date'], ['date'], ['id'], ['id']).merge() 142 | 143 | df2 = pd.DataFrame(list(itertools.product(uuid1, dates2)), columns=['id', 'date']) 144 | df2['v'] = np.random.sample(df2.shape[0]) 145 | df2['id'] = df1['id'].str[1:-1] 146 | 147 | result = d6tjoin.top1.MergeTop1(df1, df2, ['date', 'id'], ['date', 'id']).merge() 148 | result['merged'] 149 | # o=d6tjoin.top1.MergeTop1(df1, df2, ['date', 'id'], ['date', 'id']) 150 | # o.cfg_exact_left_on 151 | result = d6tjoin.top1.MergeTop1(df1, df2, ['date', 'id'], ['date', 'id']).merge() 152 | 153 | d6tjoin.utils.PreJoin([df1, df2], ['id', 'date']).stats_prejoin(print_only=False) 154 | 155 | assert True 156 | 157 | 158 | def fiddle_set(): 159 | 160 | import pandas as pd 161 | import numpy as np 162 | import importlib 163 | import d6tjoin.top1 164 | 165 | import ciseau 166 | import scipy.spatial.distance 167 | 168 | df_db = pd.read_csv('~/database.csv',index_col=0) 169 | 170 | def diff_jaccard(a, b): 171 | # pad with empty str to make equal length 172 | a = np.pad(a, (0, max(0, len(b) - len(a))), 'constant', constant_values=(0, 0)) 173 | b = np.pad(b, (0, max(0, len(a) - len(b))), 'constant', constant_values=(0, 0)) 174 | return scipy.spatial.distance.jaccard(a, b) 175 | 176 | def strsplit(t): 177 | return [s for s in [s.replace(" ", "") for s in ciseau.tokenize(t)] if s not in ['.', ',', '-', ';', '(', ')']] 178 | 179 | importlib.reload(d6tjoin.top1) 180 | j = d6tjoin.top1.MergeTop1Diff(df_db.head(),df_db,'description','description',fun_diff=diff_jaccard,topn=2,fun_preapply=strsplit,fun_postapply=lambda x: ' '.join(x)) 181 | j.merge()['merged'] 182 | 183 | 184 | def test_multicore(): 185 | nobs = 10 186 | f1 = Faker() 187 | f1.seed(0) 188 | uuid1 = [str(f1.uuid4()).split('-')[0] for _ in range(nobs)] 189 | 190 | df1 = pd.DataFrame(uuid1, columns=['id']) 191 | df1['val1'] = np.round(np.random.sample(df1.shape[0]), 3) 192 | 193 | # create mismatch 194 | df2 = df1.copy() 195 | df2['id'] = df1['id'].str[1:-1] 196 | df2['val2'] = np.round(np.random.sample(df2.shape[0]), 3) 197 | 198 | 199 | m = d6tjoin.top1.MergeTop1Diff(df1,df2,'id','id',fun_diff=jellyfish.levenshtein_distance) 200 | df_candidates = m._allpairs_candidates() 201 | 202 | idxSel = df_candidates['__matchtype__'] != 'exact' 203 | dfd2 = df_candidates.copy() 204 | dfd2.loc[idxSel,'__top1diff__'] = d6tjoin.top1._applyFunMulticore(df_candidates.loc[idxSel,'__top1left__'].values, df_candidates.loc[idxSel,'__top1right__'].values,jellyfish.levenshtein_distance) 205 | 206 | dfd1 = df_candidates.copy() 207 | dfd1.loc[idxSel, '__top1diff__'] = df_candidates[idxSel].apply(lambda x: jellyfish.levenshtein_distance(x['__top1left__'], x['__top1right__']), axis=1) 208 | assert dfd2.equals(dfd1) 209 | 210 | assert True 211 | 212 | ''' 213 | multicore in caller class 214 | pass multicore on 215 | make ifelse multicore for every apply diff 216 | 217 | default yes?
218 | part of requirements 219 | 220 | update setup.py requirements 221 | 222 | 223 | ''' 224 | 225 | 226 | test_top1_gen_candidates() -------------------------------------------------------------------------------- /tests/tmp.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import importlib 3 | 4 | import d6tjoin 5 | import d6tjoin.utils 6 | importlib.reload(d6tjoin.utils) 7 | 8 | df1=pd.DataFrame({'v':list(range(10))*2,'g':['a']*10+['b']*10}) 9 | df2=df1.copy() 10 | 11 | j = d6tjoin.PreJoin([df1,df2]) 12 | j.str_describe() 13 | j.data_describe() 14 | j.columns_common() 15 | j.columns_ispresent() 16 | j.data_match() 17 | 18 | j = d6tjoin.PreJoin([df1,df2], print_only=False) 19 | r = j.data_match() 20 | dfc = {'__left__': {0: 'g', 1: 'v'}, 21 | '__right__': {0: 'g', 1: 'v'}, 22 | '__similarity__': {0: 1.0, 1: 1.0}} 23 | dfc = pd.DataFrame(dfc) 24 | assert r.equals(dfc) 25 | print(r) 26 | 27 | quit() 28 | 29 | df1=pd.DataFrame({'a':range(3),'b':range(3)}) 30 | df2=pd.DataFrame({'a':range(3),'c':range(3)}) 31 | df2=pd.DataFrame({'a':range(3),'b':range(3,6)}) 32 | df2=pd.DataFrame({'a':range(3,6),'c':range(3)}) 33 | 34 | 35 | j = d6tjoin.utils.BaseJoin([df1,df2],['a']) 36 | 37 | j = d6tjoin.utils.BaseJoin([df1,df2],['a','b']) 38 | j.keys 39 | dfr = j.stats_prejoin(return_results=True) 40 | dfr 41 | (~dfr['all matched']).all() 42 | 43 | j = d6tjoin.utils.BaseJoin([df1,df2],['a']) 44 | j.stats_prejoin(return_results=True).to_dict() 45 | 46 | --------------------------------------------------------------------------------
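A note near the top of this section (in tests/test_smartjoin.py) observes that multi-key fuzzy joins are currently matched globally for each fuzzy key individually rather than hierarchically per exact-key group, and asks what the hierarchical variant would cost. The sketch below illustrates that variant under stated assumptions: a single exact key (e.g. date), Levenshtein distance as the similarity function, and a hypothetical helper name `top1_per_group` that is not part of d6tjoin.

```
import pandas as pd
import jellyfish

def top1_per_group(df1, df2, exact_key, fuzzy_key):
    # hypothetical helper: restrict the all-pairs candidate set to rows that
    # share the exact key, then keep the closest right-hand value per left value
    rows = []
    for val, g1 in df1.groupby(exact_key):
        g2 = df2[df2[exact_key] == val]
        if g2.empty:
            continue
        for left in g1[fuzzy_key].unique():
            right = min(g2[fuzzy_key].unique(),
                        key=lambda r: jellyfish.levenshtein_distance(left, r))
            rows.append({exact_key: val,
                         '__top1left__': left,
                         '__top1right__': right,
                         '__top1diff__': jellyfish.levenshtein_distance(left, right)})
    return pd.DataFrame(rows)
```

As the note points out, this repeats the all-pairs comparison for every exact-key value (every date), which is why the current behavior is to match globally per key and let the exact keys disambiguate afterwards.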
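The TODO notes at the end of tests/test_top1.py ask for the candidate-diff apply to run multicore. A minimal sketch of what such a helper could look like, assuming a module-level diff function so it can be pickled by multiprocessing; `apply_diff_multicore` is an illustrative name, not d6tjoin's actual `_applyFunMulticore`.

```
import multiprocessing
import jellyfish

def _diff_pair(pair):
    # worker: compute the string distance for one (left, right) candidate pair
    left, right = pair
    return jellyfish.levenshtein_distance(left, right)

def apply_diff_multicore(left_values, right_values, processes=None):
    # fan the candidate pairs out over a process pool; pool.map preserves order,
    # so the result lines up with the candidate rows it came from
    pairs = list(zip(left_values, right_values))
    with multiprocessing.Pool(processes=processes) as pool:
        return pool.map(_diff_pair, pairs)

if __name__ == '__main__':
    print(apply_diff_multicore(['e3e70682', 'f728b4fa'], ['3e7068', '728b4f']))
```

Passing an arbitrary fun_diff through would require it to be picklable (e.g. a top-level function or a functools.partial), so a serial fallback branch around each apply is still useful.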