├── .gitignore
├── MANIFEST.in
├── README.md
├── d6tjoin
│   ├── __init__.py
│   ├── pre.py
│   ├── smart_join.py
│   ├── top1.py
│   └── utils.py
├── docs
│   ├── Makefile
│   ├── make.bat
│   ├── samples.py
│   ├── shell-napoleon-html.sh
│   ├── shell-napoleon-recreate.sh
│   └── source
│       ├── conf.py
│       ├── d6tjoin.rst
│       ├── index.rst
│       ├── modules.rst
│       └── setup.rst
├── examples-prejoin.ipynb
├── examples-tokencluster.ipynb
├── examples-top1.ipynb
├── requirements-dev.txt
├── requirements.txt
├── setup.py
└── tests
    ├── __init__.py
    ├── test_pre_pd.py
    ├── test_smartjoin.py
    ├── test_top1.py
    └── tmp.py
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | .env
3 | temp/
4 | fiddle*
5 | .pytest_cache/
6 | tests/tmp-local.py
7 | tests/tmp*.py
8 |
9 | docs-examples/
10 |
11 | # Byte-compiled / optimized / DLL files
12 | __pycache__/
13 | *.py[cod]
14 | *$py.class
15 |
16 | # C extensions
17 | *.so
18 |
19 | # Distribution / packaging
20 | .Python
21 | build/
22 | develop-eggs/
23 | dist/
24 | downloads/
25 | eggs/
26 | .eggs/
27 | lib/
28 | lib64/
29 | parts/
30 | sdist/
31 | var/
32 | wheels/
33 | *.egg-info/
34 | .installed.cfg
35 | *.egg
36 | MANIFEST
37 |
38 | # PyInstaller
39 | # Usually these files are written by a python script from a template
40 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
41 | *.manifest
42 | *.spec
43 |
44 | # Installer logs
45 | pip-log.txt
46 | pip-delete-this-directory.txt
47 |
48 | # Unit test / coverage reports
49 | htmlcov/
50 | .tox/
51 | .coverage
52 | .coverage.*
53 | .cache
54 | nosetests.xml
55 | coverage.xml
56 | *.cover
57 | .hypothesis/
58 |
59 | # Translations
60 | *.mo
61 | *.pot
62 |
63 | # Django stuff:
64 | *.log
65 | .static_storage/
66 | .media/
67 | local_settings.py
68 |
69 | # Flask stuff:
70 | instance/
71 | .webassets-cache
72 |
73 | # Scrapy stuff:
74 | .scrapy
75 |
76 | # Sphinx documentation
77 | docs/_build/
78 |
79 | # PyBuilder
80 | target/
81 |
82 | # Jupyter Notebook
83 | .ipynb_checkpoints
84 |
85 | # pyenv
86 | .python-version
87 |
88 | # celery beat schedule file
89 | celerybeat-schedule
90 |
91 | # SageMath parsed files
92 | *.sage.py
93 |
94 | # Environments
95 | .env
96 | .venv
97 | env/
98 | venv/
99 | ENV/
100 | env.bak/
101 | venv.bak/
102 |
103 | # Spyder project settings
104 | .spyderproject
105 | .spyproject
106 |
107 | # Rope project settings
108 | .ropeproject
109 |
110 | # mkdocs documentation
111 | /site
112 |
113 | # mypy
114 | .mypy_cache/
115 |
116 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md
2 | include LICENSE
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Databolt Smart Join
2 |
3 | Easily join different datasets without writing custom code. Does best-match joins on strings, dates and numbers. For example, you can quickly join similar but not identical stock tickers, addresses, names and dates without manual processing.
4 |
5 | ## Installation
6 |
7 | **0.2.x is currently in beta. The GitHub master branch is the latest development version. The docs refer to versions <0.2.0.**
8 | 
9 | We recommend using the latest version from GitHub: `pip install git+https://github.com/d6t/d6tjoin.git`
10 | 
11 | If you cannot install from GitHub, use the latest published version: `pip install d6tjoin`. To update, run `pip install d6tjoin -U --no-deps`
12 |
13 | We recommend [AffineGap](https://github.com/dedupeio/affinegap), which is not an official requirement; you can install it with `pip install affinegap`.
14 |
15 | For the `jellyfish` library, make sure the C implementation is working, otherwise `d6tjoin` will be very slow. You can check whether the C version is installed by running `import jellyfish.cjellyfish`. If you don't have a C compiler, you can run `conda install -c conda-forge jellyfish`.
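
A quick way to check (a minimal sketch, not part of the library):

```
# succeeds only if the fast C extension of jellyfish was built
try:
    import jellyfish.cjellyfish
    print('C implementation available - string joins will be fast')
except ImportError:
    print('pure-Python jellyfish detected - string joins will be slow')
```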
16 |
17 | ## Sample Use
18 |
19 | ```
20 |
21 | import d6tjoin.top1
22 | import d6tjoin.utils
23 | import d6tjoin
24 |
25 | #************************
26 | # pre join diagnostics
27 | #************************
28 |
29 | # check join quality => none of the ids match
30 |
31 | d6tjoin.Prejoin([df1,df2],['id','date']).match_quality()
32 |
33 | key left key right all matched inner left right outer unmatched total unmatched left unmatched right
34 | 0 id id False 0 10 10 20 20 10 10
35 | 1 date date True 366 366 366 366 0 0 0
36 | 2 __all__ __all__ False 0 3660 3660 7320 7320 3660 3660
37 |
38 | #************************
39 | # best match join on id
40 | #************************
41 |
42 | result = d6tjoin.top1.MergeTop1(df1,df2,fuzzy_left_on=['id'],fuzzy_right_on=['id'],
43 | exact_left_on=['date'],exact_right_on=['date']).merge()
44 |
45 | result['merged'].head(2)
46 |
47 | date id val1 id_right val1_right val2
48 | 0 2010-01-01 e3e70682 0.020 3e7068 0.020 0.034
49 | 1 2010-01-01 f728b4fa 0.806 728b4f 0.806 0.849
50 |
51 | #************************
52 | # debug best matches
53 | #************************
54 |
55 | result['top1']['id'].head(2)
56 |
57 | date __top1left__ __top1right__ __top1diff__ __matchtype__
58 | 10 2010-01-01 e3e70682 3e7068 2 top1 left
59 | 34 2010-01-01 e443df78 443df7 2 top1 left
60 |
61 | #************************
62 | # customize similarity fct
63 | #************************
64 | import affinegap
65 |
66 | result = d6tjoin.top1.MergeTop1(df1,df2,fuzzy_left_on=['id'],fuzzy_right_on=['id'],
67 | fun_diff=[affinegap.affineGapDistance]).merge()
68 |
69 | #************************
70 | # token-based substring clusters and joins
71 | #************************
72 | dftoken=d6tjoin.utils.splitcharTokenCount(df2['id'])
73 |
74 | word count
75 | 0 Equity 7
76 | 1 US 5
77 | 2 NA 2
78 | 3 PRIVATE 2
79 |
80 |
81 | d6tjoin.utils.unique_contains(df2['id'], dftoken['word'].values)
82 | >>> [('Equity', ['AAPL_US_Equity', 'AAP_US_Equity', 'AD_NA_Equity', 'AMZN_US_Equity', 'APRN_US_Equity', 'BBY_US_Equity', 'BMW_NA_Equity']), ('US', ['AAPL_US_Equity', 'AAP_US_Equity', 'AMZN_US_Equity', 'APRN_US_Equity', 'BBY_US_Equity']), ('NA', ['AD_NA_Equity', 'BMW_NA_Equity']), ('PRIVATE', ['PRIVATE_JLP', 'PRIVATE_NMG'])]
83 |
84 | import re
85 | splitchars="[^a-zA-Z0-9]+"
86 | def tokenmatch(s1,s2):
87 | return 3-len(set(re.split(splitchars,s1)) & set(re.split(splitchars,s2)))
88 |
89 | d6tjoin.top1.MergeTop1(df1,df2,fuzzy_left_on=['id'],fuzzy_right_on=['id'], fun_diff=[tokenmatch], top_limit=[2]).merge()['top1']['id']
90 |
91 | __top1left__ __top1right__ __matchtype__ __top1diff__
92 | 0 AAP AAP_US_Equity top1 left 2
93 | 1 AAPL AAPL_US_Equity top1 left 2
94 | 2 AMZN-AMZN AMZN_US_Equity top1 left 2
95 | 3 APRN APRN_US_Equity top1 left 2
96 | 4 JLP PRIVATE_JLP top1 left 2
97 | 5 NMG PRIVATE_NMG top1 left 2
98 |
99 | ```
100 |
101 | ## Features
102 | Enhances the `pd.merge()` function with:
103 | * Pre-join diagnostics to identify mismatched join keys (see the sketch below)
104 | * Best match joins that find the top1 most similar value
105 | * Quickly join stock identifiers, addresses and names without manual processing
106 | * Ability to customize similarity functions, set a maximum difference and other advanced features
107 |
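As a minimal sketch of the pre-join diagnostics (assuming `df1` and `df2` as in the sample above):

```
pre = d6tjoin.Prejoin([df1, df2], ['id', 'date'])
pre.match_quality()             # summary table of matched vs unmatched join keys
dfs = pre.show_unmatched('id')  # sample records whose 'id' did not match, per side
```
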
108 | ## Documentation
109 |
110 | * [PreJoin examples notebook](https://github.com/d6t/d6tjoin/blob/master/examples-prejoin.ipynb) - Examples for diagnosing join problems
111 | * [MergeTop1 notebook](https://github.com/d6t/d6tjoin/blob/master/examples-top1.ipynb) - Best match join examples notebook
112 | * [Token substring join notebook](https://github.com/d6t/d6tjoin/blob/master/examples-tokencluster.ipynb) - Find common substrings and join on token substrings
113 | * [Official docs](http://d6tjoin.readthedocs.io/en/latest/py-modindex.html) - Detailed documentation for modules, classes, functions
114 |
115 | ## Pro version
116 |
117 | Additional features:
118 | * Join >2 dataframes
119 | * Automatic Content-based similarity joins
120 | * Advanced join quality checks
121 | * Fast approximations for big data
122 |
123 | [Request demo](https://pipe.databolt.tech/gui/request-premium/)
124 |
125 | ## Faster Data Engineering
126 |
127 | Check out other d6t libraries to solve common data engineering problems, including
128 | * data ingest, quickly ingest raw data
129 | * fuzzy joins, quickly join data
130 | * data pipes, quickly share and distribute data
131 |
132 | https://github.com/d6t/d6t-python
133 |
134 | We also encourage you to follow the Databolt blog for updates and tips + tricks: http://blog.databolt.tech
--------------------------------------------------------------------------------
/d6tjoin/__init__.py:
--------------------------------------------------------------------------------
1 | # import d6tjoin.top1
2 | import d6tjoin.utils
3 |
4 | from d6tjoin.pre import Prejoin
5 | pd = Prejoin  # short alias: d6tjoin.pd refers to the Prejoin class
--------------------------------------------------------------------------------
/d6tjoin/pre.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 | import itertools, warnings
3 |
4 | import pandas as pd
5 | pd.set_option('display.expand_frame_repr', False)
6 | import numpy as np
7 |
8 | from d6tstack.helpers import *
9 | from scipy.stats import mode
10 |
11 |
12 | # ******************************************
13 | # utils
14 | # ******************************************
15 | def head(dfs, nrows=1000):
16 | return [dfg.head(nrows) for dfg in dfs]
17 |
18 | # ******************************************
19 | # prejoin stats class
20 | # ******************************************
21 |
22 | class Prejoin(object):
23 | """
24 | Analyze, slice & dice join keys and dataframes before joining. Useful for checking how good a join will be and quickly looking at unmatched join keys.
25 |
26 | Args:
27 | dfs (list): list of data frames to join
28 | keys (var): either list of strings `['a','b']` if join keys have the same names in all dataframes or list of lists if join keys are different across dataframes `[[leftkeys],[rightkeys]]`, eg `[['left1','left2'],['right1','right2']]`
29 | keys_bydf (bool): if False, specify multi-key join keys by join level eg `[['left1','right1'],['left2','right2']]`
30 | nrows (int): for `df.head(nrows)`
31 | print_only (bool): if False return results instead of printing
32 | """
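# A minimal usage sketch (hypothetical column names, not executed):
#   Prejoin([df1, df2], keys=['id', 'date'])                                             # same key names in both dataframes
#   Prejoin([df1, df2], keys=[['id_l', 'date_l'], ['id_r', 'date_r']])                   # keys listed per dataframe (default)
#   Prejoin([df1, df2], keys=[['id_l', 'id_r'], ['date_l', 'date_r']], keys_bydf=False)  # keys listed per join level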
33 |
34 | def __init__(self, dfs, keys=None, keys_bydf=True, nrows=5, print_only=True):
35 |
36 | # inputs dfs
37 | self._init_dfs(dfs)
38 |
39 | if keys is not None:
40 | self.set_keys(keys, keys_bydf)
41 | else:
42 | self.keys = None; self.keysdf = None
43 |
44 | self.nrows = nrows
45 | self.print_only = print_only
46 |
47 | # df heads
48 | self.dfshead = [dfg.head(nrows) for idx, dfg in self._enumerate_dfs()]
49 |
50 | # init column scan
51 | self.columns_sniff()
52 |
53 | def _init_dfs(self, dfs):
54 | # check and save dfs
55 | if len(dfs)<2:
56 | raise ValueError('Need to pass at least 2 dataframes')
57 |
58 | if len(dfs)>2:
59 | raise NotImplementedError('Only handles 2 dataframes for now')
60 |
61 | self.dfs = dfs
62 | self.cfg_ndfs = len(dfs)
63 |
64 | def _enumerate_dfs(self):
65 | if self.keys is None:
66 | return enumerate(self.dfs)
67 | else:
68 | return [(idx, dfg[self.keysdf[idx]]) for idx, dfg in enumerate(self.dfs)]
69 |
70 | def set_keys(self, keys, keys_bydf=True):
71 | # check and save join keys
72 | self._check_keys(keys)
73 | keys, keysdf = self._prep_keys(keys, keys_bydf)
74 | self._check_keysdfs(keys, keysdf)
75 | # join keys
76 | self.cfg_njoins = len(keysdf[0])
77 | self.keys = keys # keys by join level
78 | self.keysall = keys + [['__all__'] * len(self.dfs)]
79 | self.keysdf = keysdf # keys by df
80 | self.keysdfall = keysdf + [['__all__']] * len(self.dfs)
81 | self.uniques = [] # set of unique values for each join key individually
82 | self.keysets = [] # set of unique values for all join keys together __all__
83 |
84 | return keys, keysdf
85 |
86 | def _check_keys(self, keys):
87 | if not keys or len(keys)<1:
88 | raise ValueError("Need to have join keys")
89 | # todo: no duplicate join keys passed
90 |
91 | def _check_keysdfs(self, keys, keysdf):
92 | if not all([len(k)==len(self.dfs) for k in keys]):
93 | raise ValueError("Need to provide join keys for all dataframes")
94 |
95 | for idf,dfg in enumerate(self.dfs): # check that keys present in dataframe
96 | missing = set(keysdf[idf]).difference(dfg.columns)
97 | if missing:
98 | raise KeyError(f'Columns missing in df#{idf}: {missing}')
99 |
100 | def _prep_keys(self, keys, keys_bydf):
101 | # deal with empty keys
102 | if not keys:
103 | return [], []
104 |
105 | # get keys in correct format given user input
106 | if isinstance(keys[0], (str,)):
107 | keysdf = [keys]*len(self.dfs)
108 | keys = list(map(list, zip(*keysdf)))
109 |
110 | elif isinstance(keys[0], (list,)):
111 | keysdf = list(map(list, zip(*keys)))
112 |
113 | if keys_bydf:
114 | keys, keysdf = keysdf, keys
115 |
116 | else:
117 | raise ValueError("keys need to be either list of strings or list of lists")
118 |
119 | return keys, keysdf
120 |
121 | def _return(self, result):
122 | if self.print_only:
123 | print(result)
124 | else:
125 | return result
126 |
127 | def _returndict(self, result):
128 | if self.print_only:
129 | for idx,d in result.items():
130 | print(f'dataframe #{idx}')
131 | print(d)
132 | else:
133 | return result
134 |
135 | def columns_sniff(self):
136 | # from d6tstack
137 | # todo: modularize d6tstack
138 | # todo: rewrite scipy mode function
139 |
140 | dfl_all = self.dfshead
141 | fname_list = range(len(self.dfs))
142 |
143 | # process columns
144 | dfl_all_col = [df.columns.tolist() for df in dfl_all]
145 | col_files = dict(zip(fname_list, dfl_all_col))
146 | col_common = list_common(list(col_files.values()))
147 | col_all = list_unique(list(col_files.values()))
148 |
149 | # find index in column list so can check order is correct
150 | df_col_present = {}
151 | for iFileName, iFileCol in col_files.items():
152 | df_col_present[iFileName] = [iCol in iFileCol for iCol in col_all]
153 |
154 | df_col_present = pd.DataFrame(df_col_present, index=col_all).T
155 | df_col_present.index.names = ['file_path']
156 |
157 | # find index in column list so can check order is correct
158 | df_col_idx = {}
159 | for iFileName, iFileCol in col_files.items():
160 | df_col_idx[iFileName] = [iFileCol.index(iCol) if iCol in iFileCol else np.nan for iCol in col_all]
161 | df_col_idx = pd.DataFrame(df_col_idx, index=col_all).T
162 |
163 | # order columns by where they appear in file
164 | m=mode(df_col_idx,axis=0)
165 | df_col_pos = pd.DataFrame({'o':m[0][0],'c':m[1][0]},index=df_col_idx.columns)
166 | df_col_pos = df_col_pos.sort_values(['o','c'])
167 | df_col_pos['iscommon']=df_col_pos.index.isin(col_common)
168 |
169 |
170 | # reorder by position
171 | col_all = df_col_pos.index.values.tolist()
172 | col_common = df_col_pos[df_col_pos['iscommon']].index.values.tolist()
173 | col_unique = df_col_pos[~df_col_pos['iscommon']].index.values.tolist()
174 | df_col_present = df_col_present[col_all]
175 | df_col_idx = df_col_idx[col_all]
176 |
177 | sniff_results = {'files_columns': col_files, 'columns_all': col_all, 'columns_common': col_common,
178 | 'columns_unique': col_unique, 'is_all_equal': columns_all_equal(dfl_all_col),
179 | 'df_columns_present': df_col_present, 'df_columns_order': df_col_idx}
180 |
181 | self.sniff_results = sniff_results
182 |
183 |
184 | def _calc_keysets(self):
185 |
186 | self.keysets = [] # reset
187 |
188 | # find set of unique values for each join key
189 | for idx, dfg in enumerate(self.dfs):
190 |
191 | # keys individually
192 | uniquedict = OrderedDict()
193 | for key in self.keysdf[idx]:
194 | v = dfg[key].unique()
195 | uniquedict[key] = set(v[~pd.isnull(v)])
196 |
197 | # keys _all__
198 | dft = dfg[self.keysdf[idx]].drop_duplicates()
199 | uniquedict['__all__'] = {tuple(x) for x in dft.values}
200 | self.uniques.append(uniquedict)
201 |
202 | # perform set logic
203 | for keys in self.keysall:
204 | df_key = {}
205 | df_key['key left'] = keys[0]
206 | df_key['key right'] = keys[1]
207 | df_key['keyset left'] = self.uniques[0][df_key['key left']]
208 | df_key['keyset right'] = self.uniques[1][df_key['key right']]
209 |
210 | df_key['inner'] = df_key['keyset left'].intersection(df_key['keyset right'])
211 | df_key['outer'] = df_key['keyset left'].union(df_key['keyset right'])
212 | df_key['unmatched total'] = df_key['keyset left'].symmetric_difference(df_key['keyset right'])
213 | df_key['unmatched left'] = df_key['keyset left'].difference(df_key['keyset right'])
214 | df_key['unmatched right'] = df_key['keyset right'].difference(df_key['keyset left'])
215 |
216 | # check types are consistent
217 | vl = next(iter(df_key['keyset left'])) # take first element
218 | vr = next(iter(df_key['keyset right'])) # take first element
219 |
220 | df_key['value type'] = type(vl)
221 |
222 | self.keysets.append(df_key)
223 |
224 | def head(self, nrows=None):
225 | """
226 | .head() of input dataframes
227 |
228 | Args:
229 | nrows (int): number of rows to show; if None, uses the nrows
230 | passed to the constructor. Dataframes are restricted to the
231 | join key columns when join keys are set
232 |
233 | """
234 | if nrows is None:
235 | result = {idx: dfg for idx, dfg in enumerate(self.dfshead)}
236 | else:
237 | result = {idx: dfg.head(nrows) for idx, dfg in self._enumerate_dfs()}
238 | return self._returndict(result)
239 |
240 | def columns_common(self):
241 | return self._return(self.sniff_results['columns_common'])
242 |
243 | def columns_all(self):
244 | return self._return(self.sniff_results['columns_all'])
245 |
246 | def columns_ispresent(self, as_bool=False):
247 | # todo: maintain column order of first dataframe => take from d6tstack
248 | col_union = list(set().union(*[dfg.columns.tolist() for dfg in self.dfs]))
249 | dfr = dict(zip(range(self.cfg_ndfs),[dfg.columns.isin(col_union) for dfg in self.dfs]))
250 | dfr = pd.DataFrame(dfr,index=col_union).sort_index()
251 | if not as_bool:
252 | dfr = dfr.replace([True,False],['+','-'])
253 | return self._return(dfr)
254 |
255 | def describe(self, **kwargs):
256 | """
257 | .describe() of input dataframes
258 |
259 | Args:
260 | kwargs (misc): to pass to .describe()
261 |
262 | """
263 | result = {idx: dfg.describe(**kwargs) for idx, dfg in self._enumerate_dfs()}
264 | return self._returndict(result)
265 |
266 | def shape(self):
267 | """
268 | .shape of input dataframes
269 |
270 | Returns:
271 | dict: `.shape` of each input dataframe, keyed by dataframe index
272 |
273 | """
274 | result = {idx: dfg.shape for idx, dfg in self._enumerate_dfs()}
275 | return self._returndict(result)
276 |
277 | def describe_str(self, unique_count=False):
278 | """
279 | Returns statistics on the length of all strings and other objects in the input dataframes. Statistics include median, mean, min, max and record count, with an optional unique count.
280 | 
281 | Args:
282 | unique_count (:obj:`bool`, optional): include count of unique
283 | values for each column. Only object (string) columns
284 | are analyzed
285 | 
286 | Returns:
287 | dict: dataframe of string length statistics for each input dataframe
288 | """
289 | def _apply_strlen(dfg, unique_count=False):
290 | lenv = np.vectorize(len)
291 | alens = lenv(dfg.values)
292 | r = {'median':np.median(alens),'mean':np.mean(alens),'min':np.min(alens),'max':np.max(alens),'nrecords':dfg.shape[0]}
293 | if unique_count:
294 | r['uniques'] = len(dfg.unique())
295 | return pd.Series(r)
296 |
297 | result = {}
298 | for idx, dfg in enumerate(self.dfs):
299 | if unique_count:
300 | cfg_col_sel = ['median','min','max','nrecords','uniques']
301 | else:
302 | cfg_col_sel = ['median','min','max','nrecords']
303 | dfo = dfg.select_dtypes(include=['object']).apply(lambda x: _apply_strlen(x.dropna(), unique_count)).T[cfg_col_sel]
304 | result[idx] = dfo
305 | return self._returndict(result)
306 |
307 | def describe_data(self, ignore_value_columns=False):
308 | result = {}
309 | for idx, dfg in enumerate(self.dfs):
310 |
311 | if ignore_value_columns:
312 | columns_sel = dfg.select_dtypes(include=['object']).columns
313 | else:
314 | columns_sel = dfg.columns
315 |
316 | nunique = dfg[columns_sel].apply(lambda x: x.dropna().unique().shape[0]).rename('unique')
317 | nrecords = dfg[columns_sel].apply(lambda x: x.dropna().shape[0]).rename('nrecords')
318 | nnan = dfg[columns_sel].isna().sum().rename('nan')
319 | dfr = pd.concat([nrecords, nunique, nnan], axis=1)
320 | dfr['unique rate'] = dfr['unique']/dfr['nrecords']
321 | result[idx] = dfr
322 |
323 | return self._returndict(result)
324 |
325 | def data_match(self, how=None, topn=1, ignore_value_columns=True, max_unique_pct=0.8, min_unique_count=1, min_match_rate=0.5):
326 | '''
327 | todo:
328 | order matters, sequential inner or left joins (no right or outer joins)
329 | jaccard 1:2 => intersection for inner, same set for left
330 |
331 | '''
332 | how = 'inner' if how is None else how
333 |
334 | if self.cfg_ndfs >2:
335 | warnings.warn('Upgrade to PRO version to join >2 dataframes')
336 |
337 | from d6tjoin.utils import _filter_group_min
338 |
339 | if ignore_value_columns:
340 | df_left, df_right = [dfg.select_dtypes(include=['object']) for _, dfg in self._enumerate_dfs()]
341 | print('ignored columns (value type)', 'left:',set(self.dfs[0].columns)-set(df_left.columns), 'right:', set(self.dfs[1].columns)-set(df_right.columns))
342 | else:
343 | df_left, df_right = [dfg for _, dfg in self._enumerate_dfs()]
344 |
345 | def unique_dict(dfg):
346 | d = dict(zip(dfg.columns, [set(dfg[x].dropna().unique()) for x in dfg.columns]))
347 | d = {k: v for k, v in d.items() if (len(v) > min_unique_count) and (len(v)/dfg[k].shape[0] <= max_unique_pct)}
348 | return d
349 |
350 | # todo: add len(key) and sample=next(key)
351 | values_left = unique_dict(df_left)
352 | values_right = unique_dict(df_right)
353 | values_left_ignored = set(df_left.columns)-set(values_left.keys())
354 | values_right_ignored = set(df_right.columns)-set(values_right.keys())
355 | if values_left_ignored: print('ignored columns (unique count)', 'left:', values_left_ignored)
356 | if values_right_ignored: print('ignored columns (unique count)', 'right:', values_right_ignored)
357 |
358 | df_candidates = list(itertools.product(values_left.keys(), values_right.keys()))
359 | df_candidates = pd.DataFrame(df_candidates, columns=['__left__', '__right__'])
360 |
361 | def jaccard_similarity(s1, s2, how):
362 | intersection = len(s1.intersection(s2))
363 | if how=='left':
364 | ratio = float(intersection / len(s1))
365 | else:
366 | union = (len(s1) + len(s2)) - intersection
367 | ratio = float(intersection / union)
368 | return ratio
369 |
370 | def jaccard_caller(col_left, col_right):
371 | return jaccard_similarity(values_left[col_left], values_right[col_right], how)
372 |
373 | df_candidates['__similarity__'] = df_candidates.apply(lambda x: jaccard_caller(x['__left__'], x['__right__']), axis=1)
374 | df_candidates = df_candidates.dropna(subset=['__similarity__'])
375 | if df_candidates.empty:
376 | raise ValueError('Failed to compute meaningful similarity, might need to loosen parameters')
377 | df_candidates['__similarity__'] = -df_candidates['__similarity__']
378 | df_diff = df_candidates.groupby('__left__',group_keys=False).apply(lambda x: _filter_group_min(x,'__similarity__',topn)).reset_index(drop=True)
379 | df_diff['__similarity__'] = -df_diff['__similarity__']
380 |
381 | df_diff['__left-sample__'] = df_diff['__left__'].map(lambda x: next(iter(values_left[x]),None))
382 | df_diff['__right-sample__'] = df_diff['__right__'].map(lambda x: next(iter(values_right[x]),None))
383 | df_diff['__left-nunique__'] = df_diff['__left__'].map(lambda x: len(values_left[x]))
384 | df_diff['__right-nunique__'] = df_diff['__right__'].map(lambda x: len(values_right[x]))
385 |
386 | if min_match_rate is not None:
387 | df_diff = df_diff[df_diff['__similarity__']>min_match_rate]
388 |
389 | # todo: sort by left df columns and then by similarity descending
390 |
391 | return self._return(df_diff)
392 |
393 | def data_similarity(self, how=None, columns=None):
394 | # goal: which columns data is most "similar"
395 | # todo: run similarity function show median/min/max similarity across columns
396 | # similarity on all vs all values?
397 | # find the top1/n similarity for each value. median across all values
398 | # above is strings. for numbers and dates:
399 | # numbers: "same distribution" => distribution similarity
400 | # dates: "same distribution" => distribution similarity
401 | # distribution similarity: non-parametric. interquartile range similar
402 | # want to find join keys not join value columns
403 | #
404 |
405 | raise NotImplementedError()
406 |
407 |
408 | def match_quality(self, rerun=False):
409 | """
410 | Show prejoin statistics
411 |
412 | Args:
413 | rerun (bool): recompute key sets even if already calculated
414 |
415 | """
416 |
417 | if not self.keysets or rerun:
418 | self._calc_keysets()
419 |
420 | df_out = []
421 |
422 | for key_set in self.keysets:
423 | df_key = {}
424 | for k in ['keyset left','keyset right','inner','outer','unmatched total','unmatched left','unmatched right']:
425 | df_key[k] = len(key_set[k])
426 | for k in ['key left','key right']:
427 | df_key[k] = key_set[k]
428 | df_key['all matched'] = df_key['inner']==df_key['outer']
429 | df_out.append(df_key)
430 |
431 | df_out = pd.DataFrame(df_out)
432 | df_out = df_out.rename(columns={'keyset left':'left','keyset right':'right'})
433 | df_out = df_out[['key left','key right','all matched','inner','left','right','outer','unmatched total','unmatched left','unmatched right']]
434 |
435 | return self._return(df_out)
436 |
437 | def is_all_matched(self, key='__all__',rerun=False):
438 |
439 | if not self.keysets or rerun:
440 | self._calc_keysets()
441 |
442 | keymask = [key in e for e in self.keysall]
443 | if not any(keymask):
444 | raise ValueError(f"key {key} not a join key in {self.keys}")
445 | ilevel = keymask.index(True)
446 |
447 | return (self.keysets[ilevel]['key left']==key or self.keysets[ilevel]['key right']==key) and len(self.keysets[ilevel]['unmatched total'])==0
448 |
449 | def _show_prep_df(self, idf, mode):
450 | """
451 | PRIVATE. prepare data for self.show() functions
452 |
453 | Args:
454 | idf (int): which df in self.dfs
455 | mode (str): matched vs unmatched
456 |
457 | """
458 |
459 | if idf==0:
460 | side='left'
461 | elif idf==1:
462 | side='right'
463 | else:
464 | raise ValueError('invalid idx')
465 |
466 | if self.cfg_show_keys_only:
467 | if self.cfg_show_key == '__all__':
468 | cfg_col_sel = self.keysdf[idf]
469 | else:
470 | cfg_col_sel = self.cfg_show_key
471 | else:
472 | cfg_col_sel = self.dfs[idf].columns
473 |
474 | # which set to return?
475 | if mode=='matched':
476 | cfg_mode_sel = 'inner'
477 | elif mode=='unmatched':
478 | cfg_mode_sel = mode + ' ' + side
479 | else:
480 | raise ValueError('invalid mode', mode)
481 |
482 | keys = list(self.keysets[self.cfg_show_level][cfg_mode_sel])
483 | if self.cfg_show_nrecords > 0:
484 | keys = keys[:self.cfg_show_nrecords]
485 |
486 | if self.cfg_show_key == '__all__' and self.cfg_njoins>1:
487 | dfg = self.dfs[idf].copy()
488 | dfg = self.dfs[idf].reset_index().set_index(self.keysdf[idf])
489 | dfg = dfg.loc[keys]
490 | dfg = dfg.reset_index().sort_values('index')[cfg_col_sel].reset_index(drop=True) # reorder to original order
491 | elif self.cfg_show_key == '__all__' and self.cfg_njoins==1:
492 | dfg = self.dfs[idf]
493 | dfg = dfg.loc[dfg[self.keysdf[idf][0]].isin([e[0] for e in keys]), cfg_col_sel]
494 | else:
495 | dfg = self.dfs[idf]
496 | dfg = dfg.loc[dfg[self.cfg_show_key].isin(keys),cfg_col_sel]
497 |
498 | if self.cfg_show_nrows > 0:
499 | dfg = dfg.head(self.cfg_show_nrows)
500 |
501 | if self.cfg_show_print_only:
502 | print('%s %s for key %s' %(mode, side, self.cfg_show_key))
503 | print(dfg)
504 | else:
505 | self.df_show_out[side] = dfg.copy()
506 |
507 | def _show(self, mode):
508 | if not self.keysets:
509 | raise RuntimeError('run .match_quality() first')
510 |
511 | keymask = [self.cfg_show_key in e for e in self.keysall]
512 | if not any(keymask):
513 | raise ValueError(f"key {self.cfg_show_key} not a join key in {self.keys}")
514 | self.cfg_show_level = keymask.index(True)
515 |
516 | for idf in range(self.cfg_ndfs): # run for all self.dfs
517 | if self.keysall[self.cfg_show_level][idf] == self.cfg_show_key: # check if key applies
518 | self._show_prep_df(idf, mode)
519 |
520 | def show_unmatched(self, key, nrecords=3, nrows=3, keys_only=False, print_only=False):
521 | """
522 | Show unmatched records
523 |
524 | Args:
525 | key (str): join key
526 | nrecords (int): number of unmatched records
527 | nrows (int): number of rows
528 | keys_only (bool): show only join keys
529 | print_only (bool): if false return results instead of printing
530 | """
531 | self.df_show_out = {}
532 | self.cfg_show_key = key
533 | self.cfg_show_nrecords = nrecords
534 | self.cfg_show_nrows = nrows
535 | self.cfg_show_keys_only = keys_only
536 | self.cfg_show_print_only = print_only
537 |
538 | self._show('unmatched')
539 | if not self.cfg_show_print_only:
540 | return self.df_show_out
541 |
542 | def show_matched(self, key, nrecords=3, nrows=3, keys_only=False, print_only=False):
543 | """
544 | Show matched records
545 |
546 | Args:
547 | key (str): join key
548 | nrecords (int): number of matched records
549 | nrows (int): number of rows
550 | keys_only (bool): show only join keys
551 | print_only (bool): if false return results instead of printing
552 | """
553 | self.df_show_out = {}
554 | self.cfg_show_key = key
555 | self.cfg_show_nrecords = nrecords
556 | self.cfg_show_nrows = nrows
557 | self.cfg_show_keys_only = keys_only
558 | self.cfg_show_print_only = print_only
559 |
560 | self._show('matched')
561 | if not self.cfg_show_print_only:
562 | return self.df_show_out
563 |
564 | def merge(self, **kwargs):
565 | """
566 | Perform merge using keys
567 |
568 | Args:
569 | kwargs (misc): parameters to pass to `pd.merge()`
570 | """
571 | if len(self.dfs) > 2:
572 | raise NotImplementedError('Only handles 2 dataframes for now')
573 |
574 | return self.dfs[0].merge(self.dfs[1], left_on=self.keysdf[0], right_on=self.keysdf[1], **kwargs)
575 |
576 |
--------------------------------------------------------------------------------
/d6tjoin/smart_join.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | from collections import OrderedDict
4 | import itertools
5 | import warnings
6 | import jellyfish
7 |
8 | from d6tjoin.pre import Prejoin as BaseJoin
9 |
10 |
11 | # ******************************************
12 | # helpers
13 | # ******************************************
14 | def set_values(dfg, key):
15 | v = dfg[key].unique()
16 | return v[~pd.isnull(v)]
17 |
18 |
19 | def apply_gen_candidates_group(dfg):
20 | return pd.DataFrame(list(itertools.product(dfg['__top1left__'].values[0],dfg['__top1right__'].values[0])),columns=['__top1left__','__top1right__'])
21 |
22 |
23 | def apply_gen_candidates(set1, set2):
24 | df_candidates = list(itertools.product(set1, set2))
25 | df_candidates = pd.DataFrame(df_candidates,columns=['__top1left__','__top1right__'])
26 |
27 | return df_candidates
28 |
29 |
30 | def diff_arithmetic(x,y):
31 | return abs(x - y)
32 |
33 |
34 | def diff_edit(a,b):
35 | return jellyfish.levenshtein_distance(a,b)
36 |
37 |
38 | def filter_group_minmax(dfg, col):
39 | """
40 |
41 | Returns all rows equal to min in col
42 |
43 | """
44 | return dfg[dfg[col] == dfg[col].min()]
45 |
46 |
47 | def prep_match_df(dfg):
48 | dfg = dfg[['__top1left__', '__top1right__', '__top1diff__', '__match type__']]
49 | return dfg
50 |
51 | # ******************************************
52 | # fuzzy join
53 | # ******************************************
54 | class FuzzyJoinTop1(BaseJoin):
55 |
56 | def __init__(self, dfs, exact_keys=[], fuzzy_keys=[], exact_how='inner', fuzzy_how = {}, keys_bydf=False, init_merge=False):
57 |
58 | """
59 |
60 | Smart joiner for top 1 similarity joins. By setting fuzzy keys, it calculates similarity metrics for strings, numbers and dates to join on the closest matching entry.
61 |
62 | Args:
63 | dfs (list): list of dataframes
64 | exact_keys (list): list of join keys for exact joins. See notes for details
65 | fuzzy_keys (list): list of join keys for fuzzy joins. See notes for details
66 | exact_how (str): exact join mode same as `pd.merge(how='inner')`
67 | fuzzy_how (dict): specify fuzzy join options by merge level eg {0:{'top_limit':1}}
68 | keys_bydf (bool): if True, keys are listed by dataframe; if False (default), by join level. See notes for details
69 |
70 | Note:
71 | * specifying join keys:
72 | * if both dataframes have matching columns: `fuzzy_keys=['key1','key2']`
73 | * else: `fuzzy_keys=[['key1df1','key1df2'],['key2df1','key2df2']]`
74 | * by default you provide keys by join level eg `[['key1df1','key1df2'],['key2df1','key2df2']]` instead you can also provide keys by dataframe `[['key1df1','key2df1'],['key1df2','key2df2']], keys_bydf=True`
75 | * fuzzy_how: controls join options by join level
76 | * dict keys are join level eg with `fuzzy_keys=[['key1df1','key1df2'],['key2df1','key2df2']]` you set `fuzzy_how={0:{'top_nrecords':5},1:{'top_nrecords':5}}`
77 | * options are:
78 | * fun_diff: difference function or list of difference functions applied sequentially. Needs to be 0=similar and >0 dissimilar
79 | * top_limit: maximum difference, keep only candidates with difference <= top_limit
80 | * top_nrecords: keep only n top_nrecords, good for generating previews
81 |
82 | """
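# A minimal usage sketch (hypothetical dataframes and columns, not executed):
#   sj = FuzzyJoinTop1([df1, df2], exact_keys=['date'], fuzzy_keys=['id'],
#                      fuzzy_how={0: {'top_limit': 3}})  # keep candidates with difference <= 3
#   dfjoined = sj.join()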
83 |
84 | # inputs dfs
85 | self._init_dfs(dfs)
86 |
87 | # check and save join keys
88 | if not exact_keys and not fuzzy_keys:
89 | raise ValueError("Must provide at least one of exact_keys or fuzzy_keys")
90 |
91 | self.keys_exact, self.keysdf_exact = self._prep_keys(exact_keys, keys_bydf)
92 | if self.keys_exact:
93 | self._check_keysdfs(self.keys_exact, self.keysdf_exact)
94 |
95 | self.keys_fuzzy, self.keysdf_fuzzy = self._prep_keys(fuzzy_keys, keys_bydf)
96 | if self.keys_fuzzy:
97 | self._check_keysdfs(self.keys_fuzzy, self.keysdf_fuzzy)
98 |
99 | # todo: no duplicate join keys passed
100 |
101 | if not isinstance(exact_how, (str,)):
102 | raise NotImplementedError('exact_how can only be applied globally for now')
103 | elif exact_how not in ('left','right','inner','outer'):
104 | raise ValueError("Invalid how parameter, check documentation for valid values")
105 |
106 | self.cfg_njoins_exact = len(self.keysdf_exact[0]) if self.keysdf_exact else 0
107 | self.cfg_njoins_fuzzy = len(self.keysdf_fuzzy[0]) if self.keysdf_fuzzy else 0
108 |
109 | if self.cfg_njoins_fuzzy>1:
110 | # raise NotImplementedError('Currently supports only 1 fuzzy key')
111 | warnings.warn('Multi-key fuzzy joins are currently done globally for each key individually, not hierarchically for each unique fuzzy key value pair')
112 |
113 | self.exact_how = exact_how
114 | self.set_fuzzy_how_all(fuzzy_how)
115 |
116 | if init_merge:
117 | self.join()
118 | else:
119 | self.dfjoined = None
120 |
121 | self.table_fuzzy = {}
122 |
123 |
124 | def set_fuzzy_how(self, ilevel, fuzzy_how):
125 | self.fuzzy_how[ilevel] = fuzzy_how
126 | self._gen_fuzzy_how(ilevel)
127 |
128 | def set_fuzzy_how_all(self, fuzzy_how):
129 | if not isinstance(fuzzy_how, (dict,)):
130 | raise ValueError('fuzzy_how needs to be a dict')
131 | self.fuzzy_how = fuzzy_how
132 | self._gen_fuzzy_how_all()
133 |
134 | def _gen_fuzzy_how_all(self):
135 |
136 | for ilevel in range(self.cfg_njoins_fuzzy):
137 | self._gen_fuzzy_how(ilevel)
138 |
139 | def _gen_fuzzy_how(self, ilevel):
140 |
141 | # check if entry exists
142 | cfg_top1 = self.fuzzy_how.get(ilevel,{})
143 |
144 | keyleft = self.keys_fuzzy[ilevel][0]
145 | keyright = self.keys_fuzzy[ilevel][1]
146 |
147 | typeleft = self.dfs[0][keyleft].dtype
148 | typeright = self.dfs[1][keyright].dtype
149 |
150 | if 'type' not in cfg_top1:
151 | if typeleft == 'int64' or typeleft == 'float64' or typeleft == 'datetime64[ns]':
152 | cfg_top1['type'] = 'number'
153 | elif typeleft == 'object' and type(self.dfs[0][keyleft].values[~self.dfs[0][keyleft].isnull()][0])==str:
154 | cfg_top1['type'] = 'string'
155 | else:
156 | raise ValueError('Unrecognized data type for top match, need to pass fun_diff in arguments')
157 |
158 | # make defaults if no settings provided
159 | if 'fun_diff' not in cfg_top1:
160 |
161 | if cfg_top1['type'] == 'number':
162 | cfg_top1['fun_diff'] = pd.merge_asof
163 | elif cfg_top1['type'] == 'string':
164 | cfg_top1['fun_diff'] = diff_edit
165 | else:
166 | raise ValueError('Unrecognized data type for top match, need to pass fun_diff in arguments')
167 | else:
168 | is_valid = callable(cfg_top1['fun_diff']) or (type(cfg_top1['fun_diff']) == list and all([callable(f) for f in cfg_top1['fun_diff']]))
169 | if not is_valid:
170 | raise ValueError("'fun_diff' needs to be a function or a list of functions")
171 |
172 | if not type(cfg_top1['fun_diff']) == list:
173 | cfg_top1['fun_diff'] = [cfg_top1['fun_diff']]
174 |
175 | if 'top_limit' not in cfg_top1:
176 | cfg_top1['top_limit'] = None
177 |
178 | if 'top_nrecords' not in cfg_top1:
179 | cfg_top1['top_nrecords'] = None
180 |
181 | cfg_top1['dir'] = 'left'
182 |
183 | # save config
184 | # check if entry exists
185 | self.fuzzy_how[ilevel] = cfg_top1
186 |
187 | def preview_fuzzy(self, ilevel, top_nrecords=5):
188 | if top_nrecords>0:
189 | return self._gen_match_top1(ilevel, top_nrecords)
190 | else:
191 | return self._gen_match_top1(ilevel)
192 |
193 | def _gen_match_top1_left_number(self, cfg_group_left, cfg_group_right, keyleft, keyright, top_nrecords):
194 | if len(cfg_group_left) > 0:
195 |
196 | # unique values
197 | if top_nrecords is None:
198 | # df_keys_left = pd.DataFrame(self.dfs[0].groupby(cfg_group_left)[keyleft].unique())
199 | df_keys_left = self.dfs[0].groupby(cfg_group_left)[keyleft].apply(lambda x: pd.Series(x.unique()))
200 | df_keys_left.index = df_keys_left.index.droplevel(1)
201 | df_keys_left = pd.DataFrame(df_keys_left)
202 | else:
203 | # df_keys_left = pd.DataFrame(self.dfs[0].groupby(cfg_group_left)[keyleft].unique()[:top_nrecords])
204 | df_keys_left = self.dfs[0].groupby(cfg_group_left)[keyleft].apply(lambda x: pd.Series(x.unique()[:top_nrecords]))
205 | df_keys_left.index = df_keys_left.index.droplevel(1)
206 | df_keys_left = pd.DataFrame(df_keys_left)
207 | df_keys_right = self.dfs[1].groupby(cfg_group_right)[keyright].apply(lambda x: pd.Series(x.unique()))
208 | df_keys_right.index = df_keys_right.index.droplevel(1)
209 | df_keys_right = pd.DataFrame(df_keys_right)
210 | # df_keys_right = pd.DataFrame(self.dfs[1].groupby(cfg_group_right)[keyright].unique())
211 |
212 | # sort
213 | df_keys_left = df_keys_left.sort_values(keyleft).reset_index().rename(columns={keyleft:'__top1left__'})
214 | df_keys_right = df_keys_right.sort_values(keyright).reset_index().rename(columns={keyright:'__top1right__'})
215 |
216 | df_match = pd.merge_asof(df_keys_left, df_keys_right, left_on='__top1left__', right_on='__top1right__', left_by=cfg_group_left, right_by=cfg_group_right, direction='nearest')
217 | else:
218 | # uniques
219 | values_left = set_values(self.dfs[0], keyleft)
220 | values_right = set_values(self.dfs[1], keyright)
221 |
222 | if top_nrecords:
223 | values_left = values_left[:top_nrecords]
224 |
225 | df_keys_left = pd.DataFrame({'__top1left__':values_left}).sort_values('__top1left__')
226 | df_keys_right = pd.DataFrame({'__top1right__':values_right}).sort_values('__top1right__')
227 |
228 | df_match = pd.merge_asof(df_keys_left, df_keys_right, left_on='__top1left__', right_on='__top1right__', direction='nearest')
229 |
230 | df_match['__top1diff__'] = (df_match['__top1left__']-df_match['__top1right__']).abs()
231 |
232 | return df_match
233 |
234 | def _gen_match_top1(self, ilevel, top_nrecords=None):
235 | """
236 |
237 | Generates match table between two sets
238 |
239 | Args:
240 | keyssets (dict): values for join keys ['key left', 'key right', 'keyset left', 'keyset right', 'inner', 'outer', 'unmatched total', 'unmatched left', 'unmatched right']
241 | mode (str, list): global string or list for each join. Possible values: ['exact inner', 'exact left', 'exact right', 'exact outer', 'top1 left', 'top1 right', 'top1 bidir all', 'top1 bidir unmatched']
242 | is_lower_better (bool): True = difference, False = Similarity
243 |
244 | """
245 |
246 | cfg_top1 = self.fuzzy_how[ilevel]
247 | fun_diff = cfg_top1['fun_diff']
248 | top_limit = cfg_top1['top_limit']
249 | if not top_nrecords:
250 | top_nrecords = cfg_top1['top_nrecords']
251 |
252 | keyleft = self.keys_fuzzy[ilevel][0]
253 | keyright = self.keys_fuzzy[ilevel][1]
254 |
255 | #******************************************
256 | # table LEFT
257 | #******************************************
258 | if cfg_top1['dir']=='left':
259 |
260 | # exact keys for groupby
261 | cfg_group_left = self.keysdf_exact[0] if self.keysdf_exact else []
262 | cfg_group_right = self.keysdf_exact[1] if self.keysdf_exact else []
263 |
264 | if cfg_top1['type'] == 'string' or (cfg_top1['type'] == 'number' and cfg_top1['fun_diff'] != [pd.merge_asof]):
265 |
266 | if len(cfg_group_left)>0:
267 | # generate candidates if exact matches are present (= blocking index)
268 |
269 | if top_nrecords is None:
270 | df_keys_left = pd.DataFrame(self.dfs[0].groupby(cfg_group_left)[keyleft].unique())
271 | else:
272 | df_keys_left = pd.DataFrame(self.dfs[0].groupby(cfg_group_left)[keyleft].unique()[:top_nrecords])
273 | df_keys_right = pd.DataFrame(self.dfs[1].groupby(cfg_group_right)[keyright].unique())
274 | df_keysets_groups = df_keys_left.merge(df_keys_right,left_index=True, right_index=True)
275 | df_keysets_groups.columns = ['__top1left__','__top1right__']
276 | dfg = df_keysets_groups.reset_index().groupby(cfg_group_left).apply(apply_gen_candidates_group)
277 | dfg = dfg.reset_index(-1,drop=True).reset_index()
278 | dfg = dfg.dropna()
279 |
280 | else:
281 | # generate candidates if NO exact matches
282 | values_left = set_values(self.dfs[0],keyleft)
283 | values_right = set_values(self.dfs[1],keyright)
284 |
285 | if top_nrecords is None:
286 | dfg = apply_gen_candidates(values_left,values_right)
287 | else:
288 | dfg = apply_gen_candidates(values_left[:top_nrecords], values_right)
289 |
290 |
291 | # find exact matches and remove from candidates
292 | # todo: use set logic before generating candidates
293 | idxSelExact = dfg['__top1left__']==dfg['__top1right__']
294 | df_match_exact = dfg[idxSelExact].copy()
295 | df_match_exact['__match type__'] = 'exact'
296 | df_match_exact['__top1diff__'] = 0
297 |
298 | idxSel = dfg['__top1left__'].isin(df_match_exact['__top1left__'])
299 | dfg = dfg[~idxSel]
300 |
301 | for fun_diff in cfg_top1['fun_diff']:
302 | dfg['__top1diff__'] = dfg.apply(lambda x: fun_diff(x['__top1left__'], x['__top1right__']), axis=1)
303 |
304 | # filtering
305 | if top_limit is not None:
306 | dfg = dfg[dfg['__top1diff__'] <= top_limit]
307 |
308 | # get top 1
309 | dfg = dfg.groupby('__top1left__',group_keys=False).apply(lambda x: filter_group_minmax(x,'__top1diff__'))
310 |
311 | # return results
312 | dfg['__match type__'] = 'top1 left'
313 | df_match = pd.concat([dfg,df_match_exact])
314 |
315 | elif cfg_top1['type'] == 'number' and cfg_top1['fun_diff'] == [pd.merge_asof]:
316 | df_match = self._gen_match_top1_left_number(cfg_group_left, cfg_group_right, keyleft, keyright, top_nrecords).copy()
317 |
318 | # filtering
319 | if top_limit is not None:
320 | df_match = df_match[df_match['__top1diff__'] <= top_limit]
321 |
322 | df_match['__match type__'] = 'top1 left'
323 | df_match.loc[df_match['__top1left__'] == df_match['__top1right__'], '__match type__'] = 'exact'
324 | else:
325 | raise ValueError('Dev error: cfg_top1["type/fun_diff"]')
326 |
327 |
328 | #******************************************
329 | # table RIGHT
330 | #******************************************
331 | elif cfg_top1['dir']=='right' or cfg_top1['dir'] == 'inner':
332 | raise NotImplementedError('Only use left join for now')
333 | else:
334 | raise ValueError("wrong 'how' parameter for top1 join, check documentation")
335 |
336 | return {'key left':keyleft, 'key right':keyright,
337 | 'table':df_match,'has duplicates':df_match.groupby('__top1left__').size().max()>1}
338 |
339 | def run_match_top1_all(self, cfg_top1=None):
340 |
341 | for ilevel in range(self.cfg_njoins_fuzzy):
342 | self.table_fuzzy[ilevel] = self._gen_match_top1(ilevel)
343 |
344 | def join(self, is_keep_debug=False):
345 | if self.cfg_njoins_fuzzy==0:
346 | self.dfjoined = self.dfs[0].merge(self.dfs[1], left_on=self.keysdf_exact[0], right_on=self.keysdf_exact[1], how=self.exact_how)
347 | else:
348 |
349 | self.run_match_top1_all()
350 |
351 | cfg_group_left = self.keysdf_exact[0] if self.keysdf_exact else []
352 | cfg_group_right = self.keysdf_exact[1] if self.keysdf_exact else []
353 | self.dfjoined = self.dfs[0]
354 | for ilevel in range(self.cfg_njoins_fuzzy):
355 | keyleft = self.keys_fuzzy[ilevel][0]
356 | keyright = self.keys_fuzzy[ilevel][1]
357 | dft = self.table_fuzzy[ilevel]['table'].copy()
358 | dft.columns = [s + keyleft if s.startswith('__') else s for s in dft.columns]
359 | self.dfjoined = self.dfjoined.merge(dft, left_on=cfg_group_left+[keyleft], right_on=cfg_group_left+['__top1left__'+keyleft])
360 | pass
361 |
362 | cfg_keys_left = cfg_group_left+['__top1right__'+k for k in self.keysdf_fuzzy[0]]
363 | cfg_keys_right = cfg_group_right+[k for k in self.keysdf_fuzzy[1]]
364 |
365 | self.dfjoined = self.dfjoined.merge(self.dfs[1], left_on = cfg_keys_left, right_on = cfg_keys_right, suffixes=['','__right__'])
366 |
367 | if not is_keep_debug:
368 | self.dfjoined = self.dfjoined[self.dfjoined.columns[~self.dfjoined.columns.str.startswith('__')]]
369 |
370 | return self.dfjoined
371 |
372 |
373 |
--------------------------------------------------------------------------------
/d6tjoin/top1.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | from collections import OrderedDict
4 | import itertools
5 | import warnings
6 | import jellyfish
7 |
8 | # ******************************************
9 | # helpers
10 | # ******************************************
11 |
12 |
13 | from d6tjoin.utils import _applyFunMulticore, _filter_group_min, _set_values
14 |
15 | class MergeTop1Diff(object):
16 | """
17 |
18 | Top1 minimum difference join. Mostly used for strings. Helper for `MergeTop1`.
19 |
20 | """
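# A minimal usage sketch (hypothetical dataframes, not executed):
#   m = MergeTop1Diff(df1, df2, fuzzy_left_on='id', fuzzy_right_on='id',
#                     fun_diff=jellyfish.levenshtein_distance,
#                     exact_left_on=['date'], exact_right_on=['date'])
#   out = m.merge()   # dict with keys 'merged', 'top1', 'duplicates'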
21 |
22 | def __init__(self, df1, df2, fuzzy_left_on, fuzzy_right_on, fun_diff=None, exact_left_on=None, exact_right_on=None,
23 | top_limit=None, topn=1, fun_preapply = None, fun_postapply = None, is_keep_debug=False, use_multicore=True):
24 |
25 | # check exact keys
26 | if not exact_left_on:
27 | exact_left_on = []
28 | if not exact_right_on:
29 | exact_right_on = []
30 |
31 | if not isinstance(fuzzy_left_on, (str,)) or not isinstance(fuzzy_right_on, (str,)):
32 | raise ValueError('fuzzy_on needs to be a string')
33 |
34 | if len(exact_left_on) != len(exact_right_on):
35 | raise ValueError('Need to pass same number of exact keys')
36 | if not isinstance(exact_left_on, (list)) or not isinstance(exact_right_on, (list)):
37 | raise ValueError('Exact keys need to be a list')
38 |
39 | if not callable(fun_diff):
40 | raise ValueError('fun_diff needs to be a function')
41 |
42 | if (fun_preapply and not callable(fun_preapply)) or (fun_postapply and not callable(fun_postapply)):
43 | raise ValueError('fun_preapply and fun_postapply need to be functions')
44 |
45 | # use blocking index?
46 | if not exact_left_on and not exact_right_on:
47 | self.cfg_is_block = False
48 | elif exact_left_on and exact_right_on:
49 | self.cfg_is_block = True
50 | else:
51 | raise ValueError('Need to pass exact keys for both or neither dataframe')
52 |
53 | # store data
54 | self.dfs = [df1,df2]
55 |
56 | # store config
57 | self.cfg_fuzzy_left_on = fuzzy_left_on
58 | self.cfg_fuzzy_right_on = fuzzy_right_on
59 | self.cfg_exact_left_on = exact_left_on
60 | self.cfg_exact_right_on = exact_right_on
61 | self.cfg_fun_diff = fun_diff
62 | self.cfg_fun_preapply = fun_preapply
63 | self.cfg_fun_postapply = fun_postapply
64 | self.cfg_top_limit = top_limit
65 | self.cfg_is_keep_debug = is_keep_debug
66 | self.cfg_topn = topn
67 | self.cfg_use_multicore = use_multicore
68 |
69 | def _allpairs_candidates(self):
70 | values_left = _set_values(self.dfs[0], self.cfg_fuzzy_left_on)
71 | values_right = _set_values(self.dfs[1], self.cfg_fuzzy_right_on)
72 |
73 | if self.cfg_topn>1:
74 | values_left_exact = set()
75 | values_left_fuzzy = values_left
76 | else:
77 | values_left_exact = values_left.intersection(values_right)
78 | values_left_fuzzy = values_left.difference(values_right)
79 |
80 | # pre apply a function
81 | if self.cfg_fun_preapply:
82 | values_left_fuzzy = [self.cfg_fun_preapply(v) for v in values_left_fuzzy]
83 | values_right = [self.cfg_fun_preapply(v) for v in values_right]
84 |
85 | df_candidates_fuzzy = list(itertools.product(values_left_fuzzy, values_right))
86 | df_candidates_fuzzy = pd.DataFrame(df_candidates_fuzzy,columns=['__top1left__','__top1right__'])
87 | df_candidates_fuzzy['__matchtype__'] = 'top1 left'
88 |
89 | df_candidates_exact = pd.DataFrame({'__top1left__': list(values_left_exact)})
90 | df_candidates_exact['__top1right__'] = df_candidates_exact['__top1left__']
91 | df_candidates_exact['__matchtype__'] = 'exact'
92 |
93 | df_candidates = pd.concat([df_candidates_exact, df_candidates_fuzzy], ignore_index=True)
94 |
95 | return df_candidates
96 |
97 | def _top1_diff_noblock(self):
98 | df_candidates = self._allpairs_candidates()
99 |
100 | idxSel = df_candidates['__matchtype__'] != 'exact'
101 | if self.cfg_use_multicore:
102 | df_candidates.loc[idxSel, '__top1diff__'] = _applyFunMulticore(df_candidates.loc[idxSel,'__top1left__'].values, df_candidates.loc[idxSel,'__top1right__'].values,self.cfg_fun_diff)
103 | else:
104 | df_candidates.loc[idxSel,'__top1diff__'] = df_candidates[idxSel].apply(lambda x: self.cfg_fun_diff(x['__top1left__'], x['__top1right__']), axis=1)
105 |
106 | df_candidates.loc[~idxSel, '__top1diff__'] = 0
107 | has_duplicates = False
108 |
109 | if self.cfg_fun_postapply:
110 | df_candidates['__top1left__'] = df_candidates['__top1left__'].apply(self.cfg_fun_postapply)
111 | df_candidates['__top1right__'] = df_candidates['__top1right__'].apply(self.cfg_fun_postapply)
112 |
113 | df_diff = df_candidates.groupby('__top1left__',group_keys=False).apply(lambda x: _filter_group_min(x,'__top1diff__',self.cfg_topn))
114 | if self.cfg_top_limit is not None:
115 | df_diff = df_diff[df_diff['__top1diff__']<=self.cfg_top_limit]
116 | has_duplicates = df_diff.groupby('__top1left__').size().max()>1
117 | if has_duplicates:
118 | warnings.warn('Top1 join for %s has duplicates' %self.cfg_fuzzy_left_on)
119 |
120 | return df_diff, has_duplicates
121 |
122 |
123 | def _merge_top1_diff_noblock(self):
124 | df_diff, has_duplicates = self._top1_diff_noblock()
125 | dfjoin = self.dfs[0].merge(df_diff, left_on=self.cfg_fuzzy_left_on, right_on='__top1left__')
126 | dfjoin = dfjoin.merge(self.dfs[1], left_on='__top1right__', right_on=self.cfg_fuzzy_right_on, suffixes=['','__right__'])
127 |
128 | if not self.cfg_is_keep_debug:
129 | dfjoin = dfjoin[dfjoin.columns[~dfjoin.columns.str.startswith('__')]]
130 |
131 | return {'merged':dfjoin, 'top1':df_diff, 'duplicates':has_duplicates}
132 |
133 |
134 | def _top1_diff_withblock(self):
135 |
136 | def apply_gen_candidates_group(dfg):
137 | return pd.DataFrame(list(itertools.product(dfg['__top1left__'].values[0],dfg['__top1right__'].values[0])),columns=['__top1left__','__top1right__'])
138 |
139 | # find key unique values
140 | keysleft = self.dfs[0][self.cfg_exact_left_on+[self.cfg_fuzzy_left_on]].drop_duplicates().dropna()
141 | keysright = self.dfs[1][self.cfg_exact_right_on+[self.cfg_fuzzy_right_on]].drop_duplicates().dropna()
142 | keysleft = {tuple(x) for x in keysleft.values}
143 | keysright = {tuple(x) for x in keysright.values}
144 | values_left_exact = keysleft.intersection(keysright)
145 | values_left_fuzzy = keysleft.difference(keysright)
146 |
147 | df_keys_left_exact = pd.DataFrame(list(values_left_exact))
148 | if not df_keys_left_exact.empty:
149 | df_keys_left_exact.columns = self.cfg_exact_left_on+['__top1left__']
150 | df_keys_left_exact['__top1right__']=df_keys_left_exact['__top1left__']
151 | df_keys_left_exact['__matchtype__'] = 'exact'
152 |
153 | df_keys_left_fuzzy = pd.DataFrame(list(values_left_fuzzy))
154 | if not df_keys_left_fuzzy.empty:
155 | df_keys_left_fuzzy.columns = self.cfg_exact_left_on+[self.cfg_fuzzy_left_on]
156 |
157 | # fuzzy pair candidates
158 | df_keys_left = pd.DataFrame(df_keys_left_fuzzy.groupby(self.cfg_exact_left_on)[self.cfg_fuzzy_left_on].unique())
159 | df_keys_right = pd.DataFrame(self.dfs[1].groupby(self.cfg_exact_right_on)[self.cfg_fuzzy_right_on].unique())
160 | df_keysets_groups = df_keys_left.merge(df_keys_right, left_index=True, right_index=True)
161 | df_keysets_groups.columns = ['__top1left__', '__top1right__']
162 | df_keysets_groups = df_keysets_groups.reset_index().groupby(self.cfg_exact_left_on).apply(apply_gen_candidates_group)
163 | df_keysets_groups = df_keysets_groups.reset_index(-1, drop=True).reset_index()
164 | df_keysets_groups = df_keysets_groups.dropna()
165 |
166 | df_candidates = df_keysets_groups[['__top1left__', '__top1right__']].drop_duplicates()
167 | if self.cfg_use_multicore:
168 | df_candidates['__top1diff__'] = _applyFunMulticore(df_candidates['__top1left__'].values, df_candidates['__top1right__'].values, self.cfg_fun_diff)
169 | else:
170 | df_candidates['__top1diff__'] = df_candidates.apply(lambda x: self.cfg_fun_diff(x['__top1left__'], x['__top1right__']), axis=1)
171 | df_candidates['__matchtype__'] = 'top1 left'
172 |
173 | # calculate difference
174 | df_diff = df_keysets_groups.merge(df_candidates, on=['__top1left__', '__top1right__'])
175 |
176 | df_diff = pd.concat([df_diff, df_keys_left_exact])
177 | df_diff['__top1diff__']=df_diff['__top1diff__'].fillna(0) # exact keys
178 | df_diff = df_diff.groupby(self.cfg_exact_left_on+['__top1left__'],group_keys=False).apply(lambda x: _filter_group_min(x,'__top1diff__'))
179 | if self.cfg_top_limit is not None:
180 | df_diff = df_diff[df_diff['__top1diff__']<=self.cfg_top_limit]
181 | has_duplicates = df_diff.groupby(self.cfg_exact_left_on+['__top1left__']).size().max()>1
182 |
183 | return df_diff, has_duplicates
184 |
185 |
186 | def _merge_top1_diff_withblock(self):
187 |
188 | df_diff, has_duplicates = self._top1_diff_withblock()
189 |
190 | dfjoin = self.dfs[0].merge(df_diff, left_on=self.cfg_exact_left_on+[self.cfg_fuzzy_left_on], right_on=self.cfg_exact_left_on+['__top1left__'])
191 | # todo: add exact join keys
192 | dfjoin = dfjoin.merge(self.dfs[1], left_on=self.cfg_exact_left_on+['__top1right__'], right_on=self.cfg_exact_right_on+[self.cfg_fuzzy_right_on], suffixes=['','__right__'])
193 |
194 | if not self.cfg_is_keep_debug:
195 | dfjoin = dfjoin[dfjoin.columns[~dfjoin.columns.str.startswith('__')]]
196 |
197 | return {'merged':dfjoin, 'top1':df_diff, 'duplicates':has_duplicates}
198 |
199 | def top1_diff(self):
200 | if self.cfg_is_block:
201 | return self._top1_diff_withblock()
202 | else:
203 | return self._top1_diff_noblock()
204 |
205 | def merge(self):
206 |
207 | if not self.cfg_exact_left_on and not self.cfg_exact_right_on:
208 | return self._merge_top1_diff_noblock()
209 | elif self.cfg_exact_left_on and self.cfg_exact_right_on:
210 | return self._merge_top1_diff_withblock()
211 | else:
212 | raise ValueError('Need to pass exact keys for both or neither dataframe')
213 |
214 |
215 | class MergeTop1Number(object):
216 | """
217 |
218 | Top1 minimum difference join for numbers. Helper for `MergeTop1`.
219 |
220 | """
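# A minimal usage sketch (hypothetical dataframes, not executed):
#   m = MergeTop1Number(df1, df2, fuzzy_left_on='date', fuzzy_right_on='date',
#                       exact_left_on=['id'], exact_right_on=['id'])
#   out = m.merge()   # dict with keys 'merged', 'top1', 'duplicates'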
221 |
222 | def __init__(self, df1, df2, fuzzy_left_on, fuzzy_right_on, exact_left_on=None, exact_right_on=None,
223 | direction='nearest', top_limit=None, is_keep_debug=False):
224 |
225 | # check exact keys
226 | if not exact_left_on:
227 | exact_left_on = []
228 | if not exact_right_on:
229 | exact_right_on = []
230 |
231 | if len(exact_left_on) != len(exact_right_on):
232 | raise ValueError('Need to pass same number of exact keys')
233 | if not isinstance(exact_left_on, (list)) or not isinstance(exact_right_on, (list)):
234 | raise ValueError('Exact keys need to be a list')
235 |
236 | # use blocking index?
237 | if not exact_left_on and not exact_right_on:
238 | self.cfg_is_block = False
239 | elif exact_left_on and exact_right_on:
240 | self.cfg_is_block = True
241 | else:
242 | raise ValueError('Need to pass exact keys for both or neither dataframe')
243 |
244 | # store data
245 | self.dfs = [df1,df2]
246 |
247 | # store config
248 | self.cfg_fuzzy_left_on = fuzzy_left_on
249 | self.cfg_fuzzy_right_on = fuzzy_right_on
250 | self.cfg_exact_left_on = exact_left_on
251 | self.cfg_exact_right_on = exact_right_on
252 | self.cfg_direction = direction
253 | self.cfg_top_limit = top_limit
254 | self.cfg_is_keep_debug = is_keep_debug
255 |
256 | def _top1_diff_withblock(self):
257 |
258 | # unique values
259 | df_keys_left = self.dfs[0].groupby(self.cfg_exact_left_on)[self.cfg_fuzzy_left_on].apply(lambda x: pd.Series(x.unique()))
260 | df_keys_left.index = df_keys_left.index.droplevel(-1)
261 | df_keys_left = pd.DataFrame(df_keys_left)
262 | df_keys_right = self.dfs[1].groupby(self.cfg_exact_right_on)[self.cfg_fuzzy_right_on].apply(lambda x: pd.Series(x.unique()))
263 | df_keys_right.index = df_keys_right.index.droplevel(-1)
264 | df_keys_right = pd.DataFrame(df_keys_right)
265 |
266 | # todo: global consolidation like with MergeTop1Diff
267 |
268 | # sort
269 | df_keys_left = df_keys_left.sort_values(self.cfg_fuzzy_left_on).reset_index().rename(columns={self.cfg_fuzzy_left_on:'__top1left__'})
270 | df_keys_right = df_keys_right.sort_values(self.cfg_fuzzy_right_on).reset_index().rename(columns={self.cfg_fuzzy_right_on:'__top1right__'})
271 |
272 | # merge
273 | df_diff = pd.merge_asof(df_keys_left, df_keys_right, left_on='__top1left__', right_on='__top1right__', left_by=self.cfg_exact_left_on, right_by=self.cfg_exact_right_on, direction=self.cfg_direction)
274 | df_diff['__top1diff__'] = (df_diff['__top1left__']-df_diff['__top1right__']).abs()
275 | df_diff['__matchtype__'] = 'top1 left'
276 | df_diff.loc[df_diff['__top1left__'] == df_diff['__top1right__'], '__matchtype__'] = 'exact'
277 | if self.cfg_top_limit is not None:
278 | df_diff = df_diff[df_diff['__top1diff__']<=self.cfg_top_limit]
279 |
280 | return df_diff
281 |
282 | def _top1_diff_noblock(self):
283 | # uniques
284 | values_left = _set_values(self.dfs[0], self.cfg_fuzzy_left_on)
285 | values_right = _set_values(self.dfs[1], self.cfg_fuzzy_right_on)
286 |
287 | # sort
288 | df_keys_left = pd.DataFrame({'__top1left__':list(values_left)}).sort_values('__top1left__')
289 | df_keys_right = pd.DataFrame({'__top1right__':list(values_right)}).sort_values('__top1right__')
290 |
291 | # merge
292 | df_diff = pd.merge_asof(df_keys_left, df_keys_right, left_on='__top1left__', right_on='__top1right__', direction=self.cfg_direction)
293 | df_diff['__top1diff__'] = (df_diff['__top1left__']-df_diff['__top1right__']).abs()
294 | df_diff['__matchtype__'] = 'top1 left'
295 | df_diff.loc[df_diff['__top1left__'] == df_diff['__top1right__'], '__matchtype__'] = 'exact'
296 |
297 | return df_diff
298 |
299 | def top1_diff(self):
300 | if self.cfg_is_block:
301 | return self._top1_diff_withblock()
302 | else:
303 | return self._top1_diff_noblock()
304 |
305 | def merge(self):
306 | df_diff = self.top1_diff()
307 |
308 | dfjoin = self.dfs[0].merge(df_diff, left_on=self.cfg_exact_left_on+[self.cfg_fuzzy_left_on], right_on=self.cfg_exact_left_on+['__top1left__'])
309 | dfjoin = dfjoin.merge(self.dfs[1], left_on=self.cfg_exact_left_on+['__top1right__'], right_on=self.cfg_exact_right_on+[self.cfg_fuzzy_right_on], suffixes=['','__right__'])
310 |
311 | if not self.cfg_is_keep_debug:
312 | dfjoin = dfjoin[dfjoin.columns[~dfjoin.columns.str.startswith('__')]]
313 |
314 | return {'merged': dfjoin, 'top1': df_diff, 'duplicates': None}
315 |
316 | class MergeTop1(object):
317 | """
318 |
319 | Left best match join. It applies a difference function to find the key pair with the smallest difference to the join key.
320 |
321 | Args:
322 | df1 (dataframe): left dataframe onto which the right dataframe is joined
323 | df2 (dataframe): right dataframe
324 | fuzzy_left_on (list): join keys for similarity match, left dataframe
325 | fuzzy_right_on (list): join keys for similarity match, right dataframe
326 | exact_left_on (list, default None): join keys for exact match, left dataframe
327 | exact_right_on (list, default None): join keys for exact match, right dataframe
328 | fun_diff (list, default None): list of difference functions to be applied for each fuzzy key
329 | top_limit (list, default None): list of values to cap similarity matches
330 | is_keep_debug (bool): keep diagnostics columns, good for debugging
331 |
332 | Note:
333 | * fun_diff: applies the difference function to find the best match with minimum distance
334 | * By default it is determined automatically depending on whether the key is a string or a date/number
335 | * Use `None` to keep the default, for example [None, lambda x, y: x-y]
336 | * Functions in the list are applied in the same order as the fuzzy join keys
337 | * Needs to be a difference function, so lower is better. For similarity measures like Jaccard, higher is better, so you need to adjust for that
338 | * top_limit: only keeps matches with a difference at or below that value. For example, if two strings differ by 3 but top_limit is 2, that match is ignored
339 | * for dates you can use `pd.offsets.Day(1)` or similar
340 |
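    Example:
        A minimal usage sketch (the dataframes and column names below are illustrative, not taken from the library)::

            import pandas as pd
            import d6tjoin.top1

            df1 = pd.DataFrame({'id': ['AAPL', 'MSFT'], 'val1': [1, 2]})
            df2 = pd.DataFrame({'id': ['AAPL US', 'MSFT US'], 'val2': [10, 20]})

            # best-match join on 'id' using the default string difference (levenshtein)
            result = d6tjoin.top1.MergeTop1(df1, df2, fuzzy_left_on=['id'], fuzzy_right_on=['id']).merge()
            result['merged']      # merged dataframe
            result['top1']['id']  # best matches for the 'id' fuzzy key
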
341 | """
342 |
343 | def __init__(self, df1, df2, fuzzy_left_on=None, fuzzy_right_on=None, exact_left_on=None, exact_right_on=None,
344 | fun_diff = None, top_limit=None, is_keep_debug=False, use_multicore=True):
345 |
346 |
347 | # todo: pass custom merge asof param
348 | # todo: pass list of fundiff
349 |
350 |
351 | # check fuzzy keys
352 | if not fuzzy_left_on or not fuzzy_right_on:
353 | raise ValueError('Need to pass fuzzy left and right keys')
354 | if len(fuzzy_left_on) != len(fuzzy_right_on):
355 | raise ValueError('Need to pass same number of fuzzy left and right keys')
356 | self.cfg_njoins_fuzzy = len(fuzzy_left_on)
357 |
358 | # check exact keys
359 | if not exact_left_on:
360 | exact_left_on = []
361 | if not exact_right_on:
362 | exact_right_on = []
363 |
364 | if len(exact_left_on) != len(exact_right_on):
365 | raise ValueError('Need to pass same number of exact keys')
366 | if not isinstance(exact_left_on, (list)) or not isinstance(exact_right_on, (list)):
367 | raise ValueError('Exact keys need to be a list')
368 |
369 |
370 | # use blocking index?
371 | if not exact_left_on and not exact_right_on:
372 | self.cfg_is_block = False
373 | elif exact_left_on and exact_right_on:
374 | self.cfg_is_block = True
375 | else:
376 | raise ValueError('Need to pass exact keys for both or neither dataframe')
377 |
378 | # check custom params
379 | if not top_limit:
380 | top_limit = [None,]*self.cfg_njoins_fuzzy
381 | if not fun_diff:
382 | fun_diff = [None,]*self.cfg_njoins_fuzzy
383 | elif len(fun_diff)!=len(fuzzy_left_on):
384 | raise ValueError('fun_diff needs to be the same length as fuzzy_left_on. Use None in the list to use the default')
385 | if not isinstance(top_limit, (list,)) or not len(top_limit)==self.cfg_njoins_fuzzy:
386 | raise NotImplementedError('top_limit needs to be a list with an entry for each fuzzy join key')
387 | if not isinstance(fun_diff, (list,)) or not len(fun_diff)==self.cfg_njoins_fuzzy:
388 | raise NotImplementedError('fun_diff needs to be a list with an entry for each fuzzy join key')
389 |
390 | # store data
391 | self.dfs = [df1,df2]
392 |
393 | # store config
394 | self.cfg_fuzzy_left_on = fuzzy_left_on
395 | self.cfg_fuzzy_right_on = fuzzy_right_on
396 | # todo: exact keys by fuzzy key? or just global?
397 | self.cfg_exact_left_on = exact_left_on
398 | self.cfg_exact_right_on = exact_right_on
399 | self.cfg_top_limit = top_limit
400 | self.cfg_fun_diff = fun_diff
401 | self.cfg_is_keep_debug = is_keep_debug
402 | self.cfg_use_multicore = use_multicore
403 |
404 | def merge(self):
405 | """
406 |
407 | Executes merge
408 |
409 | Returns:
410 | dict: key 'merged' has the merged dataframe, key 'top1' has the best matches by fuzzy_left_on. See example notebooks for details
411 |
412 | """
413 | df_diff_bylevel = OrderedDict()
414 |
415 | self.dfjoined = self.dfs[0].copy()
416 | cfg_exact_left_on = self.cfg_exact_left_on
417 | cfg_exact_right_on = self.cfg_exact_right_on
418 |
419 | a=1
420 | for ilevel, ikey in enumerate(self.cfg_fuzzy_left_on):
421 | keyleft = ikey
422 | keyright = self.cfg_fuzzy_right_on[ilevel]
423 | typeleft = self.dfs[0][keyleft].dtype
424 |
425 | if self.cfg_fun_diff[ilevel]:
426 | df_diff_bylevel[ikey] = MergeTop1Diff(self.dfjoined, self.dfs[1], keyleft, keyright, self.cfg_fun_diff[ilevel], cfg_exact_left_on, cfg_exact_right_on, top_limit=self.cfg_top_limit[ilevel], use_multicore=self.cfg_use_multicore).top1_diff()[0]
427 | else:
428 | if typeleft == 'int64' or typeleft == 'float64' or typeleft == 'datetime64[ns]':
429 | df_diff_bylevel[ikey] = MergeTop1Number(self.dfjoined, self.dfs[1], keyleft, keyright, cfg_exact_left_on, cfg_exact_right_on, top_limit=self.cfg_top_limit[ilevel]).top1_diff()
430 | elif typeleft == 'object' and type(self.dfs[0][keyleft].values[0])==str:
431 | df_diff_bylevel[ikey] = MergeTop1Diff(self.dfjoined, self.dfs[1], keyleft, keyright, jellyfish.levenshtein_distance, cfg_exact_left_on, cfg_exact_right_on, top_limit=self.cfg_top_limit[ilevel], use_multicore=self.cfg_use_multicore).top1_diff()[0]
432 | # todo: handle duplicates
433 | else:
434 | raise ValueError('Unrecognized data type for top match, need to pass fun_diff in arguments')
435 |
436 | self.dfjoined = self.dfjoined.merge(df_diff_bylevel[ikey], left_on=cfg_exact_left_on+[keyleft], right_on=cfg_exact_left_on+['__top1left__'], suffixes=['',keyleft])
437 | cfg_col_rename = ['__top1left__','__top1right__','__top1diff__','__matchtype__']
438 | self.dfjoined = self.dfjoined.rename(columns=dict((k,k+keyleft) for k in cfg_col_rename))
439 | cfg_exact_left_on += ['__top1right__%s'%keyleft,]
440 | cfg_exact_right_on += [keyright,]
441 |
442 | self.dfjoined = self.dfjoined.merge(self.dfs[1], left_on=cfg_exact_left_on, right_on=cfg_exact_right_on, suffixes=['','_right'])
443 |
444 | if not self.cfg_is_keep_debug:
445 | self.dfjoined = self.dfjoined[self.dfjoined.columns[~self.dfjoined.columns.str.startswith('__')]]
446 |
447 | return {'merged': self.dfjoined, 'top1': df_diff_bylevel, 'duplicates': None}
448 |
449 | '''
450 | multikey: want to merge left match onto right df
451 | don't do the numbers (non-key) join until the very end
452 | '''
--------------------------------------------------------------------------------
/d6tjoin/utils.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 |
3 | import pandas as pd
4 | pd.set_option('display.expand_frame_repr', False)
5 | import numpy as np
6 |
7 | # ******************************************
8 | # helpers
9 | # ******************************************
10 | def _set_values_series(dfs):
11 | return set(dfs[~pd.isnull(dfs)])
12 |
13 | def _set_values(dfg, key):
14 | return _set_values_series(dfg[key])
15 |
16 | def _filter_group_min(dfg, col, topn=1):
17 | """
18 |
19 | Returns all rows equal to min in col
20 |
21 | """
22 | if topn==1:
23 | return dfg[dfg[col] == dfg[col].min()]
24 | else:
25 | return dfg[dfg[col].isin(np.sort(dfg[col].unique())[:topn])]
26 |
27 | from joblib import Parallel, delayed
28 | import multiprocessing
29 | def _applyFunMulticore(values1, values2, func):
30 | retLst = Parallel(n_jobs=multiprocessing.cpu_count())(delayed(func)(p[0],p[1]) for p in zip(values1,values2))
31 | return retLst
32 |
33 |
34 | # ******************************************
35 | # tfidf
36 | # ******************************************
37 | import re
38 | import collections
39 | from joblib import Parallel, delayed
40 | import multiprocessing
41 | import itertools
42 | import warnings
43 |
44 | def tokenCount(dfs, fun, mincount=2, minlength=1):
45 | """
46 | Tokenize a series of strings and count occurrence of string tokens
47 |
48 | Args:
49 | dfs (pd.series): pd.series of values
50 | fun (function): tokenize function
51 | mincount (int): discard tokens with count less than mincount
52 | minlength (int): discard tokens with string length less than minlength
53 |
54 | Returns:
55 | dataframe: count of tokens
56 |
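    Example:
        A small sketch with a custom tokenizer (series values made up for illustration)::

            import pandas as pd
            import d6tjoin.utils

            s = pd.Series(['AAP US', 'AAPL US'])
            d6tjoin.utils.tokenCount(s, lambda x: x.split(' '), mincount=2)
            # returns a dataframe with columns ['word', 'count'], here a single row: 'US', 2
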
57 | """
58 | assert len(dfs.shape)==1
59 | dfs=dfs.dropna().unique()
60 |
61 | if dfs.shape[0]>1000:
62 | words = Parallel(n_jobs=multiprocessing.cpu_count())(delayed(fun)(s) for s in dfs)
63 | else:
64 | words = [fun(s) for s in dfs]
65 | words = list(itertools.chain.from_iterable(words))
66 | df_count = [t for t in collections.Counter(words).most_common() if t[1]>=mincount and len(t[0])>=minlength]
67 | df_count = pd.DataFrame(df_count, columns=['word','count'])
68 | return df_count
69 |
70 | def splitcharTokenCount(dfs, splitchars="[^a-zA-Z0-9]+", mincount=2, minlength=1): #"[ -_|]+"
71 | """
72 | Tokenize a series of strings by splitting strings on a set of characters. Then count occurrence of tokens in series.
73 |
74 | Args:
75 | dfs (pd.series): pd.series of values
76 | splitchars (str): regex by which to split string into tokens. For example `"[^a-zA-Z0-9]+"` for anything not alpha-numeric or `"[ -_|]+"` for common ID tokens.
77 | mincount (int): discard tokens with count less than mincount
78 | minlength (int): discard tokens with string length less than minlength
79 |
80 | Returns:
81 | dataframe: count of tokens
82 |
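    Example:
        A small sketch (illustrative ids, not from the library docs)::

            import pandas as pd
            import d6tjoin.utils

            s = pd.Series(['AAP_US_Equity', 'AAPL_US_Equity', 'BMW_NA_Equity'])
            d6tjoin.utils.splitcharTokenCount(s, mincount=2)
            # tokens occurring at least twice, eg 'Equity' (3x) and 'US' (2x)
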
83 | """
84 | def funsplit(s):
85 | return re.split(splitchars,s)
86 | return tokenCount(dfs, funsplit, mincount, minlength)
87 |
88 | def ncharTokenCount(dfs, nchars=None, overlapping=False, mincount=2, minlength=1):
89 | """
90 | Tokenize a series of strings by splitting strings into tokens of `nchars` length. Then count occurrence of tokens in series.
91 |
92 | Args:
93 | dfs (pd.series): pd.series of values
94 | nchars (int): number of characters in each token
95 | overlapping (bool): make overlapping tokens
96 | mincount (int): discard tokens with count less than mincount
97 | minlength (int): discard tokens with string length less than minlength
98 |
99 | Returns:
100 | dataframe: count of tokens
101 |
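    Example:
        Illustrative sketch showing overlapping vs non-overlapping tokens (made-up values)::

            import pandas as pd
            import d6tjoin.utils

            # with nchars=2, 'ABCD' tokenizes to ['AB', 'CD'] (non-overlapping)
            # or to ['AB', 'BC', 'CD'] (overlapping)
            d6tjoin.utils.ncharTokenCount(pd.Series(['ABCD', 'ABCE']), nchars=2, mincount=2)
            # counts tokens across the series, here 'AB' appears twice
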
102 | """
103 | if not nchars:
104 | smax = dfs.str.len().max()
105 | smin = dfs.str.len().min()
106 | if smax-smin>2:
107 | warnings.warn('Tokenize works best if strings have similar length')
108 | nchars = dfs.str.len().max()//4
109 |
110 | if overlapping:
111 | def funtokenize(s):
112 | return [s[i:i+nchars] for i in range(0, len(s)-nchars+1)]
113 | else:
114 | def funtokenize(s):
115 | return [s[i:i+nchars] for i in range(0, len(s), nchars)]
116 | return tokenCount(dfs, funtokenize, mincount, minlength)
117 |
118 |
119 | def unique_contains(dfs, strlist):
120 | """
121 | Find values which contain a set of substrings
122 |
123 | Args:
124 | dfs (pd.series): pd.series of values
125 | strlist (list): substrings to find
126 |
127 | Returns:
128 | list: unique values which contain substring
129 |
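    Example:
        Illustrative sketch (made-up values)::

            import pandas as pd
            import d6tjoin.utils

            s = pd.Series(['PRIVATE_NMG', 'PRIVATE_JLP', 'BMW_NA_Equity'])
            d6tjoin.utils.unique_contains(s, ['PRIVATE'])
            # [('PRIVATE', ['PRIVATE_JLP', 'PRIVATE_NMG'])]
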
130 | """
131 | assert len(dfs.shape)==1
132 | dfs=np.unique(dfs)
133 | outlist = [(x, [s for s in dfs if x in s]) for x in strlist]
134 | return outlist
135 |
136 | import collections
137 |
138 | def typeSeries(dfs):
139 | """
140 | Find type of a pandas series
141 |
142 | Args:
143 | dfs (pd.series): pd.series of values
144 |
145 | Returns:
146 | str: type
147 |
148 | """
149 | c = collections.Counter([type(x) for x in dfs.values])
150 | cnt = c.most_common()
151 | if len(cnt)>1:
152 | return 'mixed'
153 | else:
154 | return cnt[0][0]
155 |
156 | def typeDataFrame(df):
157 | """
158 | Find the type of each column of a pandas dataframe
159 |
160 | Args:
161 | df (pd.dataframe): pandas dataframe
162 |
163 | Returns:
164 | dict: column, type
165 |
166 | """
167 | return dict(zip(df.columns,[typeSeries(df[s]) for s in df]))
168 |
169 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = python -msphinx
7 | SPHINXPROJ = d6tjoin
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=python -msphinx
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 | set SPHINXPROJ=d6t-lib
13 |
14 | if "%1" == "" goto help
15 |
16 | %SPHINXBUILD% >NUL 2>NUL
17 | if errorlevel 9009 (
18 | echo.
19 | echo.The Sphinx module was not found. Make sure you have Sphinx installed,
20 | echo.then set the SPHINXBUILD environment variable to point to the full
21 | echo.path of the 'sphinx-build' executable. Alternatively you may add the
22 | echo.Sphinx directory to PATH.
23 | echo.
24 | echo.If you don't have Sphinx installed, grab it from
25 | echo.http://sphinx-doc.org/
26 | exit /b 1
27 | )
28 |
29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
30 | goto end
31 |
32 | :help
33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
34 |
35 | :end
36 | popd
37 |
--------------------------------------------------------------------------------
/docs/samples.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import itertools
4 | from faker import Faker
5 | import importlib
6 |
7 | import d6tjoin.top1
8 | import d6tjoin.utils
9 |
10 | importlib.reload(d6tjoin.top1)
11 |
12 | # *******************************************************
13 | # generate sample time series data with id and value
14 | # *******************************************************
15 | nobs = 10
16 | f1 = Faker()
17 | f1.seed(0)
18 | uuid1 = [str(f1.uuid4()).split('-')[0] for _ in range(nobs)]
19 | dates1 = pd.date_range('1/1/2010','1/1/2011')
20 |
21 | df1 = pd.DataFrame(list(itertools.product(dates1,uuid1)),columns=['date','id'])
22 | df1['val1']=np.round(np.random.sample(df1.shape[0]),3)
23 |
24 | # create mismatch
25 | df2 = df1.copy()
26 | df2['id'] = df1['id'].str[1:-1]
27 | df2['val2']=np.round(np.random.sample(df2.shape[0]),3)
28 |
29 | d6tjoin.utils.PreJoin([df1,df2],['id','date']).stats_prejoin()
30 |
31 | result = d6tjoin.top1.MergeTop1(df1.head(),df2,fuzzy_left_on=['id'],fuzzy_right_on=['id'],exact_left_on=['date'],exact_right_on=['date']).merge()
32 |
33 | print(result['top1']['id'].head(2))
34 |
35 | print(result['merged'].head(2))
36 |
--------------------------------------------------------------------------------
/docs/shell-napoleon-html.sh:
--------------------------------------------------------------------------------
1 | make html
2 |
--------------------------------------------------------------------------------
/docs/shell-napoleon-recreate.sh:
--------------------------------------------------------------------------------
1 | #rm ./source/*
2 | #cp ./source-bak/* ./source/
3 | sphinx-apidoc -f -o ./source ..
4 | make clean
5 | make html
6 |
--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | #
4 | # d6t-lib documentation build configuration file, created by
5 | # sphinx-quickstart on Tue Nov 28 11:32:56 2017.
6 | #
7 | # This file is execfile()d with the current directory set to its
8 | # containing dir.
9 | #
10 | # Note that not all possible configuration values are present in this
11 | # autogenerated file.
12 | #
13 | # All configuration values have a default; values that are commented out
14 | # serve to show the default.
15 |
16 | # If extensions (or modules to document with autodoc) are in another directory,
17 | # add these directories to sys.path here. If the directory is relative to the
18 | # documentation root, use os.path.abspath to make it absolute, like shown here.
19 | #
20 | import os
21 | import sys
22 |
23 | sys.path.insert(0, os.path.abspath('.'))
24 | sys.path.insert(0, os.path.dirname(os.path.abspath('.'))) # todo: why is this not working?
25 | sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath('.'))))
26 | sys.path.insert(0, os.path.join(os.path.dirname((os.path.abspath('.'))), "d6tjoin"))
27 |
28 | # -- General configuration ------------------------------------------------
29 |
30 | # If your documentation needs a minimal Sphinx version, state it here.
31 | #
32 | # needs_sphinx = '1.0'
33 |
34 | # Add any Sphinx extension module names here, as strings. They can be
35 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
36 | # ones.
37 | extensions = ['sphinx.ext.autodoc',
38 | 'sphinx.ext.todo',
39 | 'sphinx.ext.viewcode',
40 | 'sphinx.ext.githubpages',
41 | 'sphinx.ext.napoleon']
42 |
43 | # Add any paths that contain templates here, relative to this directory.
44 | templates_path = ['_templates']
45 |
46 | # The suffix(es) of source filenames.
47 | # You can specify multiple suffix as a list of string:
48 | #
49 | # source_suffix = ['.rst', '.md']
50 | source_suffix = '.rst'
51 |
52 | # The master toctree document.
53 | master_doc = 'index'
54 |
55 | # General information about the project.
56 | project = 'd6tjoin'
57 | copyright = '2017, databolt'
58 | author = 'databolt'
59 |
60 | # The version info for the project you're documenting, acts as replacement for
61 | # |version| and |release|, also used in various other places throughout the
62 | # built documents.
63 | #
64 | # The short X.Y version.
65 | version = '0.1'
66 | # The full version, including alpha/beta/rc tags.
67 | release = '0.1'
68 |
69 | # The language for content autogenerated by Sphinx. Refer to documentation
70 | # for a list of supported languages.
71 | #
72 | # This is also used if you do content translation via gettext catalogs.
73 | # Usually you set "language" from the command line for these cases.
74 | language = None
75 |
76 | # List of patterns, relative to source directory, that match files and
77 | # directories to ignore when looking for source files.
78 | # This patterns also effect to html_static_path and html_extra_path
79 | exclude_patterns = []
80 |
81 | # The name of the Pygments (syntax highlighting) style to use.
82 | pygments_style = 'sphinx'
83 |
84 | # If true, `todo` and `todoList` produce output, else they produce nothing.
85 | todo_include_todos = True
86 |
87 | # -- Options for HTML output ----------------------------------------------
88 |
89 | # The theme to use for HTML and HTML Help pages. See the documentation for
90 | # a list of builtin themes.
91 | #
92 | html_theme = 'sphinx_rtd_theme' # 'alabaster'
93 |
94 | # Theme options are theme-specific and customize the look and feel of a theme
95 | # further. For a list of options available for each theme, see the
96 | # documentation.
97 | #
98 | # html_theme_options = {}
99 |
100 | # Add any paths that contain custom static files (such as style sheets) here,
101 | # relative to this directory. They are copied after the builtin static files,
102 | # so a file named "default.css" will overwrite the builtin "default.css".
103 | html_static_path = ['_static']
104 |
105 | # Custom sidebar templates, must be a dictionary that maps document names
106 | # to template names.
107 | #
108 | # This is required for the alabaster theme
109 | # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
110 | # html_sidebars = {
111 | # '**': [
112 | # 'about.html',
113 | # 'navigation.html',
114 | # 'relations.html', # needs 'show_related': True theme option to display
115 | # 'searchbox.html',
116 | # 'donate.html',
117 | # ]
118 | # }
119 |
120 |
121 | # -- Options for HTMLHelp output ------------------------------------------
122 |
123 | # Output file base name for HTML help builder.
124 | htmlhelp_basename = 'd6tjoin-doc'
125 |
126 | # -- Options for LaTeX output ---------------------------------------------
127 |
128 | latex_elements = {
129 | # The paper size ('letterpaper' or 'a4paper').
130 | #
131 | # 'papersize': 'letterpaper',
132 |
133 | # The font size ('10pt', '11pt' or '12pt').
134 | #
135 | # 'pointsize': '10pt',
136 |
137 | # Additional stuff for the LaTeX preamble.
138 | #
139 | # 'preamble': '',
140 |
141 | # Latex figure (float) alignment
142 | #
143 | # 'figure_align': 'htbp',
144 | }
145 |
146 | # Grouping the document tree into LaTeX files. List of tuples
147 | # (source start file, target name, title,
148 | # author, documentclass [howto, manual, or own class]).
149 | latex_documents = [
150 | (master_doc, 'd6tjoin.tex', 'd6tjoin Documentation',
151 | 'nn', 'manual'),
152 | ]
153 |
154 | # -- Options for manual page output ---------------------------------------
155 |
156 | # One entry per manual page. List of tuples
157 | # (source start file, name, description, authors, manual section).
158 | man_pages = [
159 | (master_doc, 'd6tjoin', 'd6tjoin Documentation',
160 | [author], 1)
161 | ]
162 |
163 | # -- Options for Texinfo output -------------------------------------------
164 |
165 | # Grouping the document tree into Texinfo files. List of tuples
166 | # (source start file, target name, title, author,
167 | # dir menu entry, description, category)
168 | texinfo_documents = [
169 | (master_doc, 'd6tjoin', 'd6tjoin Documentation',
170 | author, 'd6tjoin', 'Databolt python library - Accelerate data engineering',
171 | 'Miscellaneous'),
172 | ]
173 |
--------------------------------------------------------------------------------
/docs/source/d6tjoin.rst:
--------------------------------------------------------------------------------
1 | d6tjoin package
2 | ===============
3 |
4 | Submodules
5 | ----------
6 |
7 | d6tjoin\.top1 module
8 | --------------------
9 |
10 | .. automodule:: d6tjoin.top1
11 | :members:
12 | :undoc-members:
13 | :show-inheritance:
14 |
15 | d6tjoin\.utils module
16 | ---------------------
17 |
18 | .. automodule:: d6tjoin.utils
19 | :members:
20 | :undoc-members:
21 | :show-inheritance:
22 |
23 |
24 | Module contents
25 | ---------------
26 |
27 | .. automodule:: d6tjoin
28 | :members:
29 | :undoc-members:
30 | :show-inheritance:
31 |
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | .. d6tjoin documentation master file, created by
2 | sphinx-quickstart on Tue Nov 28 11:32:56 2017.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | Welcome to d6tjoin documentation!
7 | ==============================================
8 |
9 | Documentation for using the databolt python Smart Join library.
10 |
11 | Library Docs
12 | ==================
13 |
14 | * :ref:`modindex`
15 |
16 | Search
17 | ==================
18 |
19 | * :ref:`search`
20 |
--------------------------------------------------------------------------------
/docs/source/modules.rst:
--------------------------------------------------------------------------------
1 | d6tjoin
2 | =======
3 |
4 | .. toctree::
5 | :maxdepth: 4
6 |
7 | d6tjoin
8 | setup
9 | tests
10 |
--------------------------------------------------------------------------------
/docs/source/setup.rst:
--------------------------------------------------------------------------------
1 | setup module
2 | ============
3 |
4 | .. automodule:: setup
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/examples-prejoin.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Engineering in Python with databolt - Identify and analyze join problems (d6tjoin.Prejoin)\n",
8 | "\n",
9 | "## Introduction\n",
10 | "\n",
11 | "Joining datasets is a common data engineering operation. However, often there are problems merging datasets from different sources because of mismatched identifiers, date conventions etc. \n",
12 | "\n",
13 | "** `d6tjoin.Prejoin` module allows you to test for join accuracy and quickly identify and analyze join problems. **\n",
14 | "\n",
15 | "Here are some examples which show you how to:\n",
16 | "* do join quality analysis prior to attempting a join\n",
17 | "* detect and analyze a string-based identifiers mismatch\n",
18 | "* detect and analyze a date mismatch"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "## Generate sample data\n",
26 | "\n",
27 | "Let's generate some random respresentative data:\n",
28 | "* identifier (string)\n",
29 | "* date (np.datetime)\n",
30 | "* values (flaot)"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 1,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "import pandas as pd\n",
40 | "import numpy as np\n",
41 | "import uuid\n",
42 | "import itertools\n",
43 | "import importlib\n",
44 | "\n",
45 | "import d6tjoin\n",
46 | "\n",
47 | "# ******************************************\n",
48 | "# generate sample data\n",
49 | "# ******************************************\n",
50 | "nobs = 10\n",
51 | "uuid1 = [str(uuid.uuid4()) for _ in range(nobs)]\n",
52 | "dates1 = pd.date_range('1/1/2010','1/1/2011')\n",
53 | "\n",
54 | "df1 = pd.DataFrame(list(itertools.product(uuid1,dates1)),columns=['id','date'])\n",
55 | "df1['v']=np.random.sample(df1.shape[0])"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": 2,
61 | "metadata": {},
62 | "outputs": [
63 | {
64 | "data": {
65 | "text/html": [
66 | "
\n",
67 | "\n",
80 | "
\n",
81 | " \n",
82 | " \n",
83 | " | \n",
84 | " id | \n",
85 | " date | \n",
86 | " v | \n",
87 | "
\n",
88 | " \n",
89 | " \n",
90 | " \n",
91 | " 0 | \n",
92 | " 26e41c83-630e-47c5-a410-83fd7865e826 | \n",
93 | " 2010-01-01 | \n",
94 | " 0.589946 | \n",
95 | "
\n",
96 | " \n",
97 | " 1 | \n",
98 | " 26e41c83-630e-47c5-a410-83fd7865e826 | \n",
99 | " 2010-01-02 | \n",
100 | " 0.367214 | \n",
101 | "
\n",
102 | " \n",
103 | " 366 | \n",
104 | " 049676df-998a-4322-9121-84dac8b7547f | \n",
105 | " 2010-01-01 | \n",
106 | " 0.570425 | \n",
107 | "
\n",
108 | " \n",
109 | " 367 | \n",
110 | " 049676df-998a-4322-9121-84dac8b7547f | \n",
111 | " 2010-01-02 | \n",
112 | " 0.524693 | \n",
113 | "
\n",
114 | " \n",
115 | " 732 | \n",
116 | " ad14d610-3a0b-4d87-8a29-236c9b6e817e | \n",
117 | " 2010-01-01 | \n",
118 | " 0.681610 | \n",
119 | "
\n",
120 | " \n",
121 | " 733 | \n",
122 | " ad14d610-3a0b-4d87-8a29-236c9b6e817e | \n",
123 | " 2010-01-02 | \n",
124 | " 0.236658 | \n",
125 | "
\n",
126 | " \n",
127 | "
\n",
128 | "
"
129 | ],
130 | "text/plain": [
131 | " id date v\n",
132 | "0 26e41c83-630e-47c5-a410-83fd7865e826 2010-01-01 0.589946\n",
133 | "1 26e41c83-630e-47c5-a410-83fd7865e826 2010-01-02 0.367214\n",
134 | "366 049676df-998a-4322-9121-84dac8b7547f 2010-01-01 0.570425\n",
135 | "367 049676df-998a-4322-9121-84dac8b7547f 2010-01-02 0.524693\n",
136 | "732 ad14d610-3a0b-4d87-8a29-236c9b6e817e 2010-01-01 0.681610\n",
137 | "733 ad14d610-3a0b-4d87-8a29-236c9b6e817e 2010-01-02 0.236658"
138 | ]
139 | },
140 | "execution_count": 2,
141 | "metadata": {},
142 | "output_type": "execute_result"
143 | }
144 | ],
145 | "source": [
146 | "df1.groupby(['id']).head(2).head(6)"
147 | ]
148 | },
149 | {
150 | "cell_type": "markdown",
151 | "metadata": {},
152 | "source": [
153 | "## Use Case: assert 100% join accuracy for data integrity checks \n",
154 | "\n",
155 | "In data enginerring QA you want to test that data is joined correctly. This is particularly useful for detecting potential data problems in production."
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": 3,
161 | "metadata": {},
162 | "outputs": [],
163 | "source": [
164 | "df2 = df1.copy()\n",
165 | "\n",
166 | "j = d6tjoin.Prejoin([df1,df2],['id','date'])\n",
167 | "assert j.is_all_matched() # succeeds\n",
168 | "assert j.is_all_matched('id') # succeeds\n",
169 | "assert j.is_all_matched('date') # succeeds\n"
170 | ]
171 | },
172 | {
173 | "cell_type": "markdown",
174 | "metadata": {},
175 | "source": [
176 | "## Use Case: detect and analyze id mismatch \n",
177 | "\n",
178 | "When joining data from different sources, eg different vendors, often your ids don't match and then you need to manually analyze the situation. With databolt this becomes much easier."
179 | ]
180 | },
181 | {
182 | "cell_type": "markdown",
183 | "metadata": {},
184 | "source": [
185 | "### 100% id mismatch\n",
186 | "\n",
187 | "Let's look at an example where say vendor 1 uses a different id convention than vendor 2 and none of the ids match."
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": 4,
193 | "metadata": {},
194 | "outputs": [
195 | {
196 | "name": "stdout",
197 | "output_type": "stream",
198 | "text": [
199 | "assert fails!\n"
200 | ]
201 | }
202 | ],
203 | "source": [
204 | "# create mismatch\n",
205 | "df2['id'] = df1['id'].str[1:-1]\n",
206 | "\n",
207 | "j = d6tjoin.Prejoin([df1,df2],['id','date'])\n",
208 | "\n",
209 | "try:\n",
210 | " assert j.is_all_matched() # fails\n",
211 | "except:\n",
212 | " print('assert fails!')"
213 | ]
214 | },
215 | {
216 | "cell_type": "markdown",
217 | "metadata": {},
218 | "source": [
219 | "The QA check shows there's a problem, lets analyze the issue with `Prejoin.match_quality()`. We can immediately see that none of the ids match."
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": 5,
225 | "metadata": {},
226 | "outputs": [
227 | {
228 | "name": "stdout",
229 | "output_type": "stream",
230 | "text": [
231 | " key left key right all matched inner left right outer unmatched total unmatched left unmatched right\n",
232 | "0 id id False 0 10 10 20 20 10 10\n",
233 | "1 date date True 366 366 366 366 0 0 0\n",
234 | "2 __all__ __all__ False 0 3660 3660 7320 7320 3660 3660\n"
235 | ]
236 | }
237 | ],
238 | "source": [
239 | "j.match_quality()"
240 | ]
241 | },
242 | {
243 | "cell_type": "markdown",
244 | "metadata": {},
245 | "source": [
246 | "Let's look at some of the mismatched records with `Prejoin.show_unmatched()`. Looks like there might be a length problem."
247 | ]
248 | },
249 | {
250 | "cell_type": "code",
251 | "execution_count": 6,
252 | "metadata": {},
253 | "outputs": [
254 | {
255 | "name": "stdout",
256 | "output_type": "stream",
257 | "text": [
258 | " id date v\n",
259 | "1098 b5c945ce-fdf6-4774-bbae-ff5f5787a7eb 2010-01-01 0.194907\n",
260 | "1099 b5c945ce-fdf6-4774-bbae-ff5f5787a7eb 2010-01-02 0.558549\n",
261 | "1100 b5c945ce-fdf6-4774-bbae-ff5f5787a7eb 2010-01-03 0.316138\n",
262 | " id date v\n",
263 | "0 6e41c83-630e-47c5-a410-83fd7865e82 2010-01-01 0.589946\n",
264 | "1 6e41c83-630e-47c5-a410-83fd7865e82 2010-01-02 0.367214\n",
265 | "2 6e41c83-630e-47c5-a410-83fd7865e82 2010-01-03 0.290587\n"
266 | ]
267 | }
268 | ],
269 | "source": [
270 | "print(j.show_unmatched('id')['left'])\n",
271 | "print(j.show_unmatched('id')['right'])"
272 | ]
273 | },
274 | {
275 | "cell_type": "markdown",
276 | "metadata": {},
277 | "source": [
278 | "We can show string length statistics using `d6tjoin.Prejoin().describe_str()` which confirms that the id string lenghts are different."
279 | ]
280 | },
281 | {
282 | "cell_type": "code",
283 | "execution_count": 7,
284 | "metadata": {
285 | "scrolled": true
286 | },
287 | "outputs": [
288 | {
289 | "name": "stdout",
290 | "output_type": "stream",
291 | "text": [
292 | "dataframe #0\n",
293 | " median min max nrecords\n",
294 | "id 36.0 36.0 36.0 3660.0\n",
295 | "dataframe #1\n",
296 | " median min max nrecords\n",
297 | "id 34.0 34.0 34.0 3660.0\n",
298 | "None\n"
299 | ]
300 | }
301 | ],
302 | "source": [
303 | "print(j.describe_str())\n"
304 | ]
305 | },
306 | {
307 | "cell_type": "markdown",
308 | "metadata": {},
309 | "source": [
310 | "### Partial id mismatch\n",
311 | "\n",
312 | "Let's look at another example where there is a partial mismatch. In this case let's say vendor 2 only has a certain percentage of ids covered."
313 | ]
314 | },
315 | {
316 | "cell_type": "code",
317 | "execution_count": 8,
318 | "metadata": {
319 | "scrolled": true
320 | },
321 | "outputs": [
322 | {
323 | "name": "stdout",
324 | "output_type": "stream",
325 | "text": [
326 | "assert fails!\n"
327 | ]
328 | }
329 | ],
330 | "source": [
331 | "# create partial mismatch\n",
332 | "uuid_sel = np.array(uuid1)[np.random.choice(nobs, nobs//5, replace=False)].tolist()\n",
333 | "df2 = df1[~df1['id'].isin(uuid_sel)]\n",
334 | "\n",
335 | "j = d6tjoin.Prejoin([df1,df2],['id','date'])\n",
336 | "\n",
337 | "try:\n",
338 | " assert j.is_all_matched() # fails\n",
339 | "except:\n",
340 | " print('assert fails!')"
341 | ]
342 | },
343 | {
344 | "cell_type": "markdown",
345 | "metadata": {},
346 | "source": [
347 | "Again we've quickly identified a problem. This would typically cause you to do manual and tedious manual QA work but with `Prejoin().match_quality()` you can quickly see how many ids were mismatched."
348 | ]
349 | },
350 | {
351 | "cell_type": "code",
352 | "execution_count": 9,
353 | "metadata": {
354 | "scrolled": true
355 | },
356 | "outputs": [
357 | {
358 | "name": "stdout",
359 | "output_type": "stream",
360 | "text": [
361 | " key left key right all matched inner left right outer unmatched total unmatched left unmatched right\n",
362 | "0 id id False 8 10 8 10 2 2 0\n",
363 | "1 date date True 366 366 366 366 0 0 0\n",
364 | "2 __all__ __all__ False 2928 3660 2928 3660 732 732 0\n"
365 | ]
366 | }
367 | ],
368 | "source": [
369 | "j.match_quality()"
370 | ]
371 | },
372 | {
373 | "cell_type": "markdown",
374 | "metadata": {},
375 | "source": [
376 | "## Use Case: detect and analyze date mismatch \n",
377 | "\n",
378 | "Dates are another common sources of frustration for data engineers working with time series data. Dates come in a variety of different formats and conventions. Let's use databolt to analyze a date mismatch situation."
379 | ]
380 | },
381 | {
382 | "cell_type": "code",
383 | "execution_count": 10,
384 | "metadata": {},
385 | "outputs": [],
386 | "source": [
387 | "dates2 = pd.bdate_range('1/1/2010','1/1/2011') # business instead of calendar dates\n",
388 | "df2 = pd.DataFrame(list(itertools.product(uuid1,dates2)),columns=['id','date'])\n",
389 | "df2['v']=np.random.sample(df2.shape[0])"
390 | ]
391 | },
392 | {
393 | "cell_type": "markdown",
394 | "metadata": {},
395 | "source": [
396 | "To highlight some different functionality for `Prejoin().match_quality()`. The QA test for all matches fails."
397 | ]
398 | },
399 | {
400 | "cell_type": "code",
401 | "execution_count": 11,
402 | "metadata": {},
403 | "outputs": [
404 | {
405 | "name": "stdout",
406 | "output_type": "stream",
407 | "text": [
408 | " key left key right all matched inner left right outer unmatched total unmatched left unmatched right\n",
409 | "0 id id True 10 10 10 10 0 0 0\n",
410 | "1 date date False 261 366 261 366 105 105 0\n",
411 | "2 __all__ __all__ False 2610 3660 2610 3660 1050 1050 0\n",
412 | "assert fails!\n"
413 | ]
414 | }
415 | ],
416 | "source": [
417 | "j = d6tjoin.Prejoin([df1,df2],['id','date'])\n",
418 | "dfr = j.match_quality()\n",
419 | "try:\n",
420 | " assert dfr['all matched'].all() # fails\n",
421 | "except:\n",
422 | " print('assert fails!')"
423 | ]
424 | },
425 | {
426 | "cell_type": "markdown",
427 | "metadata": {},
428 | "source": [
429 | "We can look at the dataframe to see 105 dates are not matched."
430 | ]
431 | },
432 | {
433 | "cell_type": "code",
434 | "execution_count": 12,
435 | "metadata": {},
436 | "outputs": [],
437 | "source": [
438 | "dfr"
439 | ]
440 | },
441 | {
442 | "cell_type": "markdown",
443 | "metadata": {},
444 | "source": [
445 | "We can look at mismatched records using `Prejoin.show_unmatched()`. Here we will return all mismatched records into a dataframe you can analyze."
446 | ]
447 | },
448 | {
449 | "cell_type": "code",
450 | "execution_count": 13,
451 | "metadata": {},
452 | "outputs": [],
453 | "source": [
454 | "dft = j.show_unmatched('date',keys_only=False,nrecords=-1,nrows=-1)['left']"
455 | ]
456 | },
457 | {
458 | "cell_type": "code",
459 | "execution_count": 14,
460 | "metadata": {},
461 | "outputs": [
462 | {
463 | "data": {
464 | "text/html": [
465 | "\n",
466 | "\n",
479 | "
\n",
480 | " \n",
481 | " \n",
482 | " | \n",
483 | " id | \n",
484 | " date | \n",
485 | " v | \n",
486 | "
\n",
487 | " \n",
488 | " \n",
489 | " \n",
490 | " 1 | \n",
491 | " 26e41c83-630e-47c5-a410-83fd7865e826 | \n",
492 | " 2010-01-02 | \n",
493 | " 0.367214 | \n",
494 | "
\n",
495 | " \n",
496 | " 2 | \n",
497 | " 26e41c83-630e-47c5-a410-83fd7865e826 | \n",
498 | " 2010-01-03 | \n",
499 | " 0.290587 | \n",
500 | "
\n",
501 | " \n",
502 | " 8 | \n",
503 | " 26e41c83-630e-47c5-a410-83fd7865e826 | \n",
504 | " 2010-01-09 | \n",
505 | " 0.663732 | \n",
506 | "
\n",
507 | " \n",
508 | " 9 | \n",
509 | " 26e41c83-630e-47c5-a410-83fd7865e826 | \n",
510 | " 2010-01-10 | \n",
511 | " 0.210751 | \n",
512 | "
\n",
513 | " \n",
514 | " 15 | \n",
515 | " 26e41c83-630e-47c5-a410-83fd7865e826 | \n",
516 | " 2010-01-16 | \n",
517 | " 0.889254 | \n",
518 | "
\n",
519 | " \n",
520 | "
\n",
521 | "
"
522 | ],
523 | "text/plain": [
524 | " id date v\n",
525 | "1 26e41c83-630e-47c5-a410-83fd7865e826 2010-01-02 0.367214\n",
526 | "2 26e41c83-630e-47c5-a410-83fd7865e826 2010-01-03 0.290587\n",
527 | "8 26e41c83-630e-47c5-a410-83fd7865e826 2010-01-09 0.663732\n",
528 | "9 26e41c83-630e-47c5-a410-83fd7865e826 2010-01-10 0.210751\n",
529 | "15 26e41c83-630e-47c5-a410-83fd7865e826 2010-01-16 0.889254"
530 | ]
531 | },
532 | "execution_count": 14,
533 | "metadata": {},
534 | "output_type": "execute_result"
535 | }
536 | ],
537 | "source": [
538 | "dft.head()"
539 | ]
540 | },
541 | {
542 | "cell_type": "markdown",
543 | "metadata": {},
544 | "source": [
545 | "Looking at the weekdays of the mismatched entries, you can see they are all weekends. "
546 | ]
547 | },
548 | {
549 | "cell_type": "code",
550 | "execution_count": 15,
551 | "metadata": {},
552 | "outputs": [
553 | {
554 | "data": {
555 | "text/plain": [
556 | "array([5, 6])"
557 | ]
558 | },
559 | "execution_count": 15,
560 | "metadata": {},
561 | "output_type": "execute_result"
562 | }
563 | ],
564 | "source": [
565 | "dft['date_wkday']=dft['date'].dt.weekday\n",
566 | "dft['date_wkday'].unique()"
567 | ]
568 | },
569 | {
570 | "cell_type": "markdown",
571 | "metadata": {},
572 | "source": [
573 | "## Conclusion\n",
574 | "\n",
575 | "Joining datasets from different sources can be a big time waster for data engineers! With databolt you can quickly do join QA and analyze problems without doing manual tedious work."
576 | ]
577 | },
578 | {
579 | "cell_type": "code",
580 | "execution_count": null,
581 | "metadata": {},
582 | "outputs": [],
583 | "source": []
584 | }
585 | ],
586 | "metadata": {
587 | "kernelspec": {
588 | "display_name": "Python 3",
589 | "language": "python",
590 | "name": "python3"
591 | },
592 | "language_info": {
593 | "codemirror_mode": {
594 | "name": "ipython",
595 | "version": 3
596 | },
597 | "file_extension": ".py",
598 | "mimetype": "text/x-python",
599 | "name": "python",
600 | "nbconvert_exporter": "python",
601 | "pygments_lexer": "ipython3",
602 | "version": "3.7.6"
603 | }
604 | },
605 | "nbformat": 4,
606 | "nbformat_minor": 2
607 | }
608 |
--------------------------------------------------------------------------------
/examples-tokencluster.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Engineering in Python with databolt - Find Token Clusters for Fuzzy Merging Identifiers (d6tlib/d6tjoin.utils)\n",
8 | "\n",
9 | "## Introduction\n",
10 | "\n",
11 | "Identifiers such as securities IDs often come in different conventions which makes joining them difficult. Normal joins don't work and fuzzy joins often get tripped up by commonly occuring tokens. \n",
12 | "\n",
13 | "In this notebook we will show how to use `d6tstack.utils.tokenCount` to find clusters of tokens and match on tokens."
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": 1,
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "import d6tjoin.utils\n",
23 | "import d6tjoin.top1\n",
24 | "import pandas as pd\n",
25 | "pd.set_option('display.expand_frame_repr', False)\n",
26 | "import numpy as np"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 2,
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "# data is tickers from two different vendors which try to join\n",
36 | "df1 = pd.DataFrame({'id':[\"AAP\",\"AAPL\",\"APRN\",\"AMZN-AMZN\",\"BBW\",\"NMG\",\"JLP\"]})\n",
37 | "df2 = pd.DataFrame({'id':[\"AAP_US_Equity\",\"AAPL_US_Equity\",\"AMZN_US_Equity\",\"APRN_US_Equity\",\"AD_NA_Equity\",\"BBY_US_Equity\",\"BMW_NA_Equity\",\"PRIVATE_NMG\",\"PRIVATE_JLP\"]})\n"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 3,
43 | "metadata": {},
44 | "outputs": [
45 | {
46 | "name": "stdout",
47 | "output_type": "stream",
48 | "text": [
49 | " key left key right all matched inner left right outer unmatched total unmatched left unmatched right\n",
50 | "0 id id False 0 7 9 16 16 7 9\n",
51 | "1 __all__ __all__ False 0 7 9 16 16 7 9\n"
52 | ]
53 | }
54 | ],
55 | "source": [
56 | "# d6tjoin.Prejoin() shows none of the ids match\n",
57 | "\n",
58 | "d6tjoin.Prejoin([df1,df2],['id']).match_quality()"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 4,
64 | "metadata": {},
65 | "outputs": [
66 | {
67 | "name": "stdout",
68 | "output_type": "stream",
69 | "text": [
70 | " key left key right all matched inner left right outer unmatched total unmatched left unmatched right\n",
71 | "0 id_cleaned id_cleaned False 4 7 8 11 7 3 4\n",
72 | "1 __all__ __all__ False 4 7 8 11 7 3 4\n"
73 | ]
74 | }
75 | ],
76 | "source": [
77 | "# attempt to join manually, better but still missing a few\n",
78 | "\n",
79 | "df1['id_cleaned'] = df1['id'].str.split('-').str[0]\n",
80 | "df2['id_cleaned'] = df2['id'].str.split('_').str[0]\n",
81 | "\n",
82 | "d6tjoin.Prejoin([df1,df2],['id_cleaned']).match_quality()"
83 | ]
84 | },
85 | {
86 | "cell_type": "markdown",
87 | "metadata": {},
88 | "source": [
89 | "## Fuzzy joins get confused by tokens\n",
90 | "\n",
91 | "Fuzzy joins to the rescue? Unfortunately, the presence of commonly occuring string tokens is messing with the string similarity functions."
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": 5,
97 | "metadata": {},
98 | "outputs": [
99 | {
100 | "name": "stderr",
101 | "output_type": "stream",
102 | "text": [
103 | "/Users/haijing/Haijing/ds_project/d6t/d6tjoin/d6tjoin/top1.py:118: UserWarning: Top1 join for id has duplicates\n",
104 | " warnings.warn('Top1 join for %s has duplicates' %self.cfg_fuzzy_left_on)\n"
105 | ]
106 | },
107 | {
108 | "data": {
109 | "text/html": [
110 | "\n",
111 | "\n",
124 | "
\n",
125 | " \n",
126 | " \n",
127 | " | \n",
128 | " __top1left__ | \n",
129 | " __top1right__ | \n",
130 | " __matchtype__ | \n",
131 | " __top1diff__ | \n",
132 | "
\n",
133 | " \n",
134 | " \n",
135 | " \n",
136 | " 40 | \n",
137 | " AAP | \n",
138 | " PRIVATE_JLP | \n",
139 | " top1 left | \n",
140 | " 9 | \n",
141 | "
\n",
142 | " \n",
143 | " 58 | \n",
144 | " AAPL | \n",
145 | " PRIVATE_JLP | \n",
146 | " top1 left | \n",
147 | " 9 | \n",
148 | "
\n",
149 | " \n",
150 | " 27 | \n",
151 | " AMZN-AMZN | \n",
152 | " PRIVATE_NMG | \n",
153 | " top1 left | \n",
154 | " 10 | \n",
155 | "
\n",
156 | " \n",
157 | " 30 | \n",
158 | " AMZN-AMZN | \n",
159 | " AD_NA_Equity | \n",
160 | " top1 left | \n",
161 | " 10 | \n",
162 | "
\n",
163 | " \n",
164 | " 34 | \n",
165 | " AMZN-AMZN | \n",
166 | " AMZN_US_Equity | \n",
167 | " top1 left | \n",
168 | " 10 | \n",
169 | "
\n",
170 | " \n",
171 | " 9 | \n",
172 | " APRN | \n",
173 | " PRIVATE_NMG | \n",
174 | " top1 left | \n",
175 | " 9 | \n",
176 | "
\n",
177 | " \n",
178 | " 0 | \n",
179 | " BBW | \n",
180 | " PRIVATE_NMG | \n",
181 | " top1 left | \n",
182 | " 11 | \n",
183 | "
\n",
184 | " \n",
185 | " 1 | \n",
186 | " BBW | \n",
187 | " BBY_US_Equity | \n",
188 | " top1 left | \n",
189 | " 11 | \n",
190 | "
\n",
191 | " \n",
192 | " 4 | \n",
193 | " BBW | \n",
194 | " PRIVATE_JLP | \n",
195 | " top1 left | \n",
196 | " 11 | \n",
197 | "
\n",
198 | " \n",
199 | " 5 | \n",
200 | " BBW | \n",
201 | " BMW_NA_Equity | \n",
202 | " top1 left | \n",
203 | " 11 | \n",
204 | "
\n",
205 | " \n",
206 | " 22 | \n",
207 | " JLP | \n",
208 | " PRIVATE_JLP | \n",
209 | " top1 left | \n",
210 | " 8 | \n",
211 | "
\n",
212 | " \n",
213 | " 45 | \n",
214 | " NMG | \n",
215 | " PRIVATE_NMG | \n",
216 | " top1 left | \n",
217 | " 8 | \n",
218 | "
\n",
219 | " \n",
220 | "
\n",
221 | "
"
222 | ],
223 | "text/plain": [
224 | " __top1left__ __top1right__ __matchtype__ __top1diff__\n",
225 | "40 AAP PRIVATE_JLP top1 left 9\n",
226 | "58 AAPL PRIVATE_JLP top1 left 9\n",
227 | "27 AMZN-AMZN PRIVATE_NMG top1 left 10\n",
228 | "30 AMZN-AMZN AD_NA_Equity top1 left 10\n",
229 | "34 AMZN-AMZN AMZN_US_Equity top1 left 10\n",
230 | "9 APRN PRIVATE_NMG top1 left 9\n",
231 | "0 BBW PRIVATE_NMG top1 left 11\n",
232 | "1 BBW BBY_US_Equity top1 left 11\n",
233 | "4 BBW PRIVATE_JLP top1 left 11\n",
234 | "5 BBW BMW_NA_Equity top1 left 11\n",
235 | "22 JLP PRIVATE_JLP top1 left 8\n",
236 | "45 NMG PRIVATE_NMG top1 left 8"
237 | ]
238 | },
239 | "execution_count": 5,
240 | "metadata": {},
241 | "output_type": "execute_result"
242 | }
243 | ],
244 | "source": [
245 | "# attempt a fuzzy join using edit distance => not looking good\n",
246 | "d6tjoin.top1.MergeTop1(df1,df2,fuzzy_left_on=['id'],fuzzy_right_on=['id']).merge()['top1']['id']"
247 | ]
248 | },
249 | {
250 | "cell_type": "code",
251 | "execution_count": 6,
252 | "metadata": {},
253 | "outputs": [
254 | {
255 | "name": "stderr",
256 | "output_type": "stream",
257 | "text": [
258 | "/Users/haijing/Haijing/ds_project/d6t/d6tjoin/d6tjoin/top1.py:118: UserWarning: Top1 join for id has duplicates\n",
259 | " warnings.warn('Top1 join for %s has duplicates' %self.cfg_fuzzy_left_on)\n"
260 | ]
261 | },
262 | {
263 | "data": {
264 | "text/html": [
265 | "\n",
266 | "\n",
279 | "
\n",
280 | " \n",
281 | " \n",
282 | " | \n",
283 | " __top1left__ | \n",
284 | " __top1right__ | \n",
285 | " __matchtype__ | \n",
286 | " __top1diff__ | \n",
287 | "
\n",
288 | " \n",
289 | " \n",
290 | " \n",
291 | " 42 | \n",
292 | " AAP | \n",
293 | " AAP_US_Equity | \n",
294 | " top1 left | \n",
295 | " 13.000 | \n",
296 | "
\n",
297 | " \n",
298 | " 56 | \n",
299 | " AAPL | \n",
300 | " AAPL_US_Equity | \n",
301 | " top1 left | \n",
302 | " 14.000 | \n",
303 | "
\n",
304 | " \n",
305 | " 34 | \n",
306 | " AMZN-AMZN | \n",
307 | " AMZN_US_Equity | \n",
308 | " top1 left | \n",
309 | " 64.625 | \n",
310 | "
\n",
311 | " \n",
312 | " 17 | \n",
313 | " APRN | \n",
314 | " APRN_US_Equity | \n",
315 | " top1 left | \n",
316 | " 14.000 | \n",
317 | "
\n",
318 | " \n",
319 | " 1 | \n",
320 | " BBW | \n",
321 | " BBY_US_Equity | \n",
322 | " top1 left | \n",
323 | " 23.000 | \n",
324 | "
\n",
325 | " \n",
326 | " 5 | \n",
327 | " BBW | \n",
328 | " BMW_NA_Equity | \n",
329 | " top1 left | \n",
330 | " 23.000 | \n",
331 | "
\n",
332 | " \n",
333 | " 24 | \n",
334 | " JLP | \n",
335 | " AAP_US_Equity | \n",
336 | " top1 left | \n",
337 | " 33.000 | \n",
338 | "
\n",
339 | " \n",
340 | " 50 | \n",
341 | " NMG | \n",
342 | " BMW_NA_Equity | \n",
343 | " top1 left | \n",
344 | " 33.000 | \n",
345 | "
\n",
346 | " \n",
347 | "
\n",
348 | "
"
349 | ],
350 | "text/plain": [
351 | " __top1left__ __top1right__ __matchtype__ __top1diff__\n",
352 | "42 AAP AAP_US_Equity top1 left 13.000\n",
353 | "56 AAPL AAPL_US_Equity top1 left 14.000\n",
354 | "34 AMZN-AMZN AMZN_US_Equity top1 left 64.625\n",
355 | "17 APRN APRN_US_Equity top1 left 14.000\n",
356 | "1 BBW BBY_US_Equity top1 left 23.000\n",
357 | "5 BBW BMW_NA_Equity top1 left 23.000\n",
358 | "24 JLP AAP_US_Equity top1 left 33.000\n",
359 | "50 NMG BMW_NA_Equity top1 left 33.000"
360 | ]
361 | },
362 | "execution_count": 6,
363 | "metadata": {},
364 | "output_type": "execute_result"
365 | }
366 | ],
367 | "source": [
368 | "# attempt a fuzzy join using affine gap distance => not looking good\n",
369 | "import affinegap\n",
370 | "d6tjoin.top1.MergeTop1(df1,df2,fuzzy_left_on=['id'],fuzzy_right_on=['id'], fun_diff=[affinegap.affineGapDistance]).merge()['top1']['id']"
371 | ]
372 | },
373 | {
374 | "cell_type": "markdown",
375 | "metadata": {},
376 | "source": [
377 | "## Token-based clustering\n",
378 | "\n",
379 | "With `d6tjoin.utils.splitcharTokenCount` you can quickly split the ids into tokens to find commonly occuring substrings. You can then use that knowledge to join the data."
380 | ]
381 | },
382 | {
383 | "cell_type": "code",
384 | "execution_count": 7,
385 | "metadata": {},
386 | "outputs": [
387 | {
388 | "name": "stdout",
389 | "output_type": "stream",
390 | "text": [
391 | "*** token counts ***\n",
392 | " word count\n",
393 | "0 Equity 7\n",
394 | "1 US 5\n",
395 | "2 NA 2\n",
396 | "3 PRIVATE 2\n",
397 | "\n",
398 | " *** token occurance ***\n",
399 | "[('Equity', ['AAPL_US_Equity', 'AAP_US_Equity', 'AD_NA_Equity', 'AMZN_US_Equity', 'APRN_US_Equity', 'BBY_US_Equity', 'BMW_NA_Equity']), ('US', ['AAPL_US_Equity', 'AAP_US_Equity', 'AMZN_US_Equity', 'APRN_US_Equity', 'BBY_US_Equity']), ('NA', ['AD_NA_Equity', 'BMW_NA_Equity']), ('PRIVATE', ['PRIVATE_JLP', 'PRIVATE_NMG'])]\n"
400 | ]
401 | }
402 | ],
403 | "source": [
404 | "dftoken=d6tjoin.utils.splitcharTokenCount(df2['id'])\n",
405 | "print('*** token counts ***')\n",
406 | "print(dftoken)\n",
407 | "print('\\n *** token occurance ***')\n",
408 | "print(d6tjoin.utils.unique_contains(df2['id'], dftoken['word'].values))\n"
409 | ]
410 | },
411 | {
412 | "cell_type": "markdown",
413 | "metadata": {},
414 | "source": [
415 | "## Token-based joins\n",
416 | "\n",
417 | "Based on the analysis above, we want to join pairs which have at least 1 common token. It's easy to define a function which computes that and pass that to `d6tjoin.top1.MergeTop1()` to get a good join."
418 | ]
419 | },
420 | {
421 | "cell_type": "code",
422 | "execution_count": 8,
423 | "metadata": {},
424 | "outputs": [
425 | {
426 | "data": {
427 | "text/html": [
428 | "\n",
429 | "\n",
442 | "
\n",
443 | " \n",
444 | " \n",
445 | " | \n",
446 | " __top1left__ | \n",
447 | " __top1right__ | \n",
448 | " __matchtype__ | \n",
449 | " __top1diff__ | \n",
450 | "
\n",
451 | " \n",
452 | " \n",
453 | " \n",
454 | " 42 | \n",
455 | " AAP | \n",
456 | " AAP_US_Equity | \n",
457 | " top1 left | \n",
458 | " 2 | \n",
459 | "
\n",
460 | " \n",
461 | " 56 | \n",
462 | " AAPL | \n",
463 | " AAPL_US_Equity | \n",
464 | " top1 left | \n",
465 | " 2 | \n",
466 | "
\n",
467 | " \n",
468 | " 34 | \n",
469 | " AMZN-AMZN | \n",
470 | " AMZN_US_Equity | \n",
471 | " top1 left | \n",
472 | " 2 | \n",
473 | "
\n",
474 | " \n",
475 | " 17 | \n",
476 | " APRN | \n",
477 | " APRN_US_Equity | \n",
478 | " top1 left | \n",
479 | " 2 | \n",
480 | "
\n",
481 | " \n",
482 | " 22 | \n",
483 | " JLP | \n",
484 | " PRIVATE_JLP | \n",
485 | " top1 left | \n",
486 | " 2 | \n",
487 | "
\n",
488 | " \n",
489 | " 45 | \n",
490 | " NMG | \n",
491 | " PRIVATE_NMG | \n",
492 | " top1 left | \n",
493 | " 2 | \n",
494 | "
\n",
495 | " \n",
496 | "
\n",
497 | "
"
498 | ],
499 | "text/plain": [
500 | " __top1left__ __top1right__ __matchtype__ __top1diff__\n",
501 | "42 AAP AAP_US_Equity top1 left 2\n",
502 | "56 AAPL AAPL_US_Equity top1 left 2\n",
503 | "34 AMZN-AMZN AMZN_US_Equity top1 left 2\n",
504 | "17 APRN APRN_US_Equity top1 left 2\n",
505 | "22 JLP PRIVATE_JLP top1 left 2\n",
506 | "45 NMG PRIVATE_NMG top1 left 2"
507 | ]
508 | },
509 | "execution_count": 8,
510 | "metadata": {},
511 | "output_type": "execute_result"
512 | }
513 | ],
514 | "source": [
515 | "import re\n",
516 | "splitchars=\"[^a-zA-Z0-9]+\"\n",
517 | "def tokenmatch(s1,s2):\n",
518 | " s1=set(re.split(splitchars,s1))\n",
519 | " s2=set(re.split(splitchars,s2))\n",
520 | " return 3-len(s1 & s2)\n",
521 | "\n",
522 | "d6tjoin.top1.MergeTop1(df1,df2,fuzzy_left_on=['id'],fuzzy_right_on=['id'], fun_diff=[tokenmatch], top_limit=[2]).merge()['top1']['id']\n"
523 | ]
524 | },
525 | {
526 | "cell_type": "code",
527 | "execution_count": 9,
528 | "metadata": {},
529 | "outputs": [
530 | {
531 | "name": "stderr",
532 | "output_type": "stream",
533 | "text": [
534 | "/Users/haijing/Haijing/ds_project/d6t/d6tjoin/d6tjoin/top1.py:118: UserWarning: Top1 join for id has duplicates\n",
535 | " warnings.warn('Top1 join for %s has duplicates' %self.cfg_fuzzy_left_on)\n"
536 | ]
537 | },
538 | {
539 | "data": {
540 | "text/html": [
541 | "\n",
542 | "\n",
555 | "
\n",
556 | " \n",
557 | " \n",
558 | " | \n",
559 | " __top1left__ | \n",
560 | " __top1right__ | \n",
561 | " __matchtype__ | \n",
562 | " __top1diff__ | \n",
563 | "
\n",
564 | " \n",
565 | " \n",
566 | " \n",
567 | " 42 | \n",
568 | " AAP | \n",
569 | " AAP_US_Equity | \n",
570 | " top1 left | \n",
571 | " 2 | \n",
572 | "
\n",
573 | " \n",
574 | " 56 | \n",
575 | " AAPL | \n",
576 | " AAPL_US_Equity | \n",
577 | " top1 left | \n",
578 | " 2 | \n",
579 | "
\n",
580 | " \n",
581 | " 34 | \n",
582 | " AMZN-AMZN | \n",
583 | " AMZN_US_Equity | \n",
584 | " top1 left | \n",
585 | " 2 | \n",
586 | "
\n",
587 | " \n",
588 | " 17 | \n",
589 | " APRN | \n",
590 | " APRN_US_Equity | \n",
591 | " top1 left | \n",
592 | " 2 | \n",
593 | "
\n",
594 | " \n",
595 | " 0 | \n",
596 | " BBW | \n",
597 | " PRIVATE_NMG | \n",
598 | " top1 left | \n",
599 | " 3 | \n",
600 | "
\n",
601 | " \n",
602 | " 1 | \n",
603 | " BBW | \n",
604 | " BBY_US_Equity | \n",
605 | " top1 left | \n",
606 | " 3 | \n",
607 | "
\n",
608 | " \n",
609 | " 2 | \n",
610 | " BBW | \n",
611 | " AAPL_US_Equity | \n",
612 | " top1 left | \n",
613 | " 3 | \n",
614 | "
\n",
615 | " \n",
616 | " 3 | \n",
617 | " BBW | \n",
618 | " AD_NA_Equity | \n",
619 | " top1 left | \n",
620 | " 3 | \n",
621 | "
\n",
622 | " \n",
623 | " 4 | \n",
624 | " BBW | \n",
625 | " PRIVATE_JLP | \n",
626 | " top1 left | \n",
627 | " 3 | \n",
628 | "
\n",
629 | " \n",
630 | " 5 | \n",
631 | " BBW | \n",
632 | " BMW_NA_Equity | \n",
633 | " top1 left | \n",
634 | " 3 | \n",
635 | "
\n",
636 | " \n",
637 | " 6 | \n",
638 | " BBW | \n",
639 | " AAP_US_Equity | \n",
640 | " top1 left | \n",
641 | " 3 | \n",
642 | "
\n",
643 | " \n",
644 | " 7 | \n",
645 | " BBW | \n",
646 | " AMZN_US_Equity | \n",
647 | " top1 left | \n",
648 | " 3 | \n",
649 | "
\n",
650 | " \n",
651 | " 8 | \n",
652 | " BBW | \n",
653 | " APRN_US_Equity | \n",
654 | " top1 left | \n",
655 | " 3 | \n",
656 | "
\n",
657 | " \n",
658 | " 22 | \n",
659 | " JLP | \n",
660 | " PRIVATE_JLP | \n",
661 | " top1 left | \n",
662 | " 2 | \n",
663 | "
\n",
664 | " \n",
665 | " 45 | \n",
666 | " NMG | \n",
667 | " PRIVATE_NMG | \n",
668 | " top1 left | \n",
669 | " 2 | \n",
670 | "
\n",
671 | " \n",
672 | "
\n",
673 | "
"
674 | ],
675 | "text/plain": [
676 | " __top1left__ __top1right__ __matchtype__ __top1diff__\n",
677 | "42 AAP AAP_US_Equity top1 left 2\n",
678 | "56 AAPL AAPL_US_Equity top1 left 2\n",
679 | "34 AMZN-AMZN AMZN_US_Equity top1 left 2\n",
680 | "17 APRN APRN_US_Equity top1 left 2\n",
681 | "0 BBW PRIVATE_NMG top1 left 3\n",
682 | "1 BBW BBY_US_Equity top1 left 3\n",
683 | "2 BBW AAPL_US_Equity top1 left 3\n",
684 | "3 BBW AD_NA_Equity top1 left 3\n",
685 | "4 BBW PRIVATE_JLP top1 left 3\n",
686 | "5 BBW BMW_NA_Equity top1 left 3\n",
687 | "6 BBW AAP_US_Equity top1 left 3\n",
688 | "7 BBW AMZN_US_Equity top1 left 3\n",
689 | "8 BBW APRN_US_Equity top1 left 3\n",
690 | "22 JLP PRIVATE_JLP top1 left 2\n",
691 | "45 NMG PRIVATE_NMG top1 left 2"
692 | ]
693 | },
694 | "execution_count": 9,
695 | "metadata": {},
696 | "output_type": "execute_result"
697 | }
698 | ],
699 | "source": [
700 | "# note that we applied top_limit=[2], meaning strings should have at most 2 tokens mismatched, to exclude bad matches for BBW\n",
701 | "d6tjoin.top1.MergeTop1(df1,df2,fuzzy_left_on=['id'],fuzzy_right_on=['id'], fun_diff=[tokenmatch]).merge()['top1']['id']\n"
702 | ]
703 | },
704 | {
705 | "cell_type": "code",
706 | "execution_count": null,
707 | "metadata": {},
708 | "outputs": [],
709 | "source": []
710 | }
711 | ],
712 | "metadata": {
713 | "kernelspec": {
714 | "display_name": "Python 3",
715 | "language": "python",
716 | "name": "python3"
717 | },
718 | "language_info": {
719 | "codemirror_mode": {
720 | "name": "ipython",
721 | "version": 3
722 | },
723 | "file_extension": ".py",
724 | "mimetype": "text/x-python",
725 | "name": "python",
726 | "nbconvert_exporter": "python",
727 | "pygments_lexer": "ipython3",
728 | "version": "3.7.6"
729 | }
730 | },
731 | "nbformat": 4,
732 | "nbformat_minor": 2
733 | }
734 |
--------------------------------------------------------------------------------
/examples-top1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Engineering in Python with databolt - Fuzzy Joins (d6tlib/d6tjoin.utils)\n",
8 | "\n",
9 | "## Introduction\n",
10 | "\n",
11 | "Joining datasets is a common data engineering operation. However, often there are problems merging datasets from different sources because of mismatched identifiers, date conventions etc. \n",
12 | "\n",
13 | "** `d6tjoin.top1` module allows you to quickly join datasets even if they don't perfectly match. **\n",
14 | "Easily join different datasets without writing custom code. Does fuzzy top1 similarity joins for strings, dates and numbers, for example you can quickly join similar but not identical stock tickers, addresses, names without manual processing. It will find the top 1 matched entry from the right dataframe to join onto the left dataframe.\n",
15 | "\n",
16 | "Here are some examples which show you how to:\n",
17 | "1. join on mismatched identifiers\n",
18 | "2. join on calendar vs business dates\n",
19 | "3. join on both mismatched dates and identifiers"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 1,
25 | "metadata": {},
26 | "outputs": [
27 | {
28 | "data": {
29 | "text/html": [
30 | "\n",
31 | "\n",
44 | "
\n",
45 | " \n",
46 | " \n",
47 | " | \n",
48 | " date | \n",
49 | " id | \n",
50 | " v | \n",
51 | "
\n",
52 | " \n",
53 | " \n",
54 | " \n",
55 | " 0 | \n",
56 | " 2010-01-01 | \n",
57 | " e3e70682 | \n",
58 | " 0.393 | \n",
59 | "
\n",
60 | " \n",
61 | " 1 | \n",
62 | " 2010-01-01 | \n",
63 | " f728b4fa | \n",
64 | " 0.837 | \n",
65 | "
\n",
66 | " \n",
67 | " 2 | \n",
68 | " 2010-01-01 | \n",
69 | " eb1167b3 | \n",
70 | " 0.389 | \n",
71 | "
\n",
72 | " \n",
73 | " 3 | \n",
74 | " 2010-01-01 | \n",
75 | " f7c1bd87 | \n",
76 | " 0.555 | \n",
77 | "
\n",
78 | " \n",
79 | " 4 | \n",
80 | " 2010-01-01 | \n",
81 | " e443df78 | \n",
82 | " 0.886 | \n",
83 | "
\n",
84 | " \n",
85 | "
\n",
86 | "
"
87 | ],
88 | "text/plain": [
89 | " date id v\n",
90 | "0 2010-01-01 e3e70682 0.393\n",
91 | "1 2010-01-01 f728b4fa 0.837\n",
92 | "2 2010-01-01 eb1167b3 0.389\n",
93 | "3 2010-01-01 f7c1bd87 0.555\n",
94 | "4 2010-01-01 e443df78 0.886"
95 | ]
96 | },
97 | "execution_count": 1,
98 | "metadata": {},
99 | "output_type": "execute_result"
100 | }
101 | ],
102 | "source": [
103 | "import pandas as pd\n",
104 | "import numpy as np\n",
105 | "import itertools\n",
106 | "from faker import Faker\n",
107 | "import importlib\n",
108 | "\n",
109 | "import d6tjoin.top1\n",
110 | "importlib.reload(d6tjoin.top1)\n",
111 | "import d6tjoin.utils\n",
112 | "\n",
113 | "# *******************************************************\n",
114 | "# generate sample time series data with id and value\n",
115 | "# *******************************************************\n",
116 | "nobs = 10\n",
117 | "f1 = Faker()\n",
118 | "Faker.seed(0)\n",
119 | "uuid1 = [str(f1.uuid4()).split('-')[0] for _ in range(nobs)]\n",
120 | "dates1 = pd.date_range('1/1/2010','1/1/2011')\n",
121 | "\n",
122 | "df1 = pd.DataFrame(list(itertools.product(dates1,uuid1)),columns=['date','id'])\n",
123 | "df1['v']=np.round(np.random.sample(df1.shape[0]),3)\n",
124 | "df1.head()"
125 | ]
126 | },
127 | {
128 | "cell_type": "markdown",
129 | "metadata": {},
130 | "source": [
131 | "# Example 1: join datasets on misalgined ids\n",
132 | "\n",
133 | "When joining data from different sources, eg different vendors, often your ids don't match perfect and then you need to manually analyze the situation. With databolt this becomes much easier.\n",
134 | "\n",
135 | "Let's create another dataset where the `id` is slightly different."
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": 2,
141 | "metadata": {},
142 | "outputs": [
143 | {
144 | "data": {
145 | "text/html": [
146 | "\n",
147 | "\n",
160 | "
\n",
161 | " \n",
162 | " \n",
163 | " | \n",
164 | " date | \n",
165 | " id | \n",
166 | " v | \n",
167 | "
\n",
168 | " \n",
169 | " \n",
170 | " \n",
171 | " 0 | \n",
172 | " 2010-01-01 | \n",
173 | " 3e7068 | \n",
174 | " 0.393 | \n",
175 | "
\n",
176 | " \n",
177 | " 1 | \n",
178 | " 2010-01-01 | \n",
179 | " 728b4f | \n",
180 | " 0.837 | \n",
181 | "
\n",
182 | " \n",
183 | " 2 | \n",
184 | " 2010-01-01 | \n",
185 | " b1167b | \n",
186 | " 0.389 | \n",
187 | "
\n",
188 | " \n",
189 | " 3 | \n",
190 | " 2010-01-01 | \n",
191 | " 7c1bd8 | \n",
192 | " 0.555 | \n",
193 | "
\n",
194 | " \n",
195 | " 4 | \n",
196 | " 2010-01-01 | \n",
197 | " 443df7 | \n",
198 | " 0.886 | \n",
199 | "
\n",
200 | " \n",
201 | "
\n",
202 | "
"
203 | ],
204 | "text/plain": [
205 | " date id v\n",
206 | "0 2010-01-01 3e7068 0.393\n",
207 | "1 2010-01-01 728b4f 0.837\n",
208 | "2 2010-01-01 b1167b 0.389\n",
209 | "3 2010-01-01 7c1bd8 0.555\n",
210 | "4 2010-01-01 443df7 0.886"
211 | ]
212 | },
213 | "execution_count": 2,
214 | "metadata": {},
215 | "output_type": "execute_result"
216 | }
217 | ],
218 | "source": [
219 | "# create mismatch\n",
220 | "df2 = df1.copy()\n",
221 | "df2['id'] = df1['id'].str[1:-1]\n",
222 | "df2.head()"
223 | ]
224 | },
225 | {
226 | "cell_type": "markdown",
227 | "metadata": {},
228 | "source": [
229 | "`d6tjoin.Prejoin.match_quality()` shows you there is none of `id` match so a normal join won't work well."
230 | ]
231 | },
232 | {
233 | "cell_type": "code",
234 | "execution_count": 3,
235 | "metadata": {},
236 | "outputs": [
237 | {
238 | "name": "stdout",
239 | "output_type": "stream",
240 | "text": [
241 | " key left key right all matched inner left right outer unmatched total unmatched left unmatched right\n",
242 | "0 id id False 0 10 10 20 20 10 10\n",
243 | "1 date date True 366 366 366 366 0 0 0\n",
244 | "2 __all__ __all__ False 0 3660 3660 7320 7320 3660 3660\n"
245 | ]
246 | }
247 | ],
248 | "source": [
249 | "d6tjoin.Prejoin([df1,df2],['id','date']).match_quality()"
250 | ]
251 | },
252 | {
253 | "cell_type": "markdown",
254 | "metadata": {},
255 | "source": [
256 | "Using `d6tjoin.top1.MergeTop1()` you can quickly merge this dataset without having to do any manual processing. It will find the closest matching id using the Levenstein string similarity metric. We want to look at the closest id by date so we will pass in date as an exact match key."
257 | ]
258 | },
259 | {
260 | "cell_type": "code",
261 | "execution_count": 4,
262 | "metadata": {},
263 | "outputs": [],
264 | "source": [
265 | "result = d6tjoin.top1.MergeTop1(df1.head(),df2,fuzzy_left_on=['id'],fuzzy_right_on=['id'],exact_left_on=['date'],exact_right_on=['date']).merge()"
266 | ]
267 | },
268 | {
269 | "cell_type": "markdown",
270 | "metadata": {},
271 | "source": [
272 | "Lets check what matches it found. Looking at the top1 match table, it shows the closest string with only 2 character difference in id, meaning it found the correct substring. "
273 | ]
274 | },
275 | {
276 | "cell_type": "code",
277 | "execution_count": 5,
278 | "metadata": {},
279 | "outputs": [
280 | {
281 | "data": {
282 | "text/html": [
283 | "\n",
284 | "\n",
297 | "
\n",
298 | " \n",
299 | " \n",
300 | " | \n",
301 | " date | \n",
302 | " __top1left__ | \n",
303 | " __top1right__ | \n",
304 | " __top1diff__ | \n",
305 | " __matchtype__ | \n",
306 | "
\n",
307 | " \n",
308 | " \n",
309 | " \n",
310 | " 10 | \n",
311 | " 2010-01-01 | \n",
312 | " e3e70682 | \n",
313 | " 3e7068 | \n",
314 | " 2 | \n",
315 | " top1 left | \n",
316 | "
\n",
317 | " \n",
318 | " 34 | \n",
319 | " 2010-01-01 | \n",
320 | " e443df78 | \n",
321 | " 443df7 | \n",
322 | " 2 | \n",
323 | " top1 left | \n",
324 | "
\n",
325 | " \n",
326 | " 42 | \n",
327 | " 2010-01-01 | \n",
328 | " eb1167b3 | \n",
329 | " b1167b | \n",
330 | " 2 | \n",
331 | " top1 left | \n",
332 | "
\n",
333 | " \n",
334 | " 21 | \n",
335 | " 2010-01-01 | \n",
336 | " f728b4fa | \n",
337 | " 728b4f | \n",
338 | " 2 | \n",
339 | " top1 left | \n",
340 | "
\n",
341 | " \n",
342 | " 3 | \n",
343 | " 2010-01-01 | \n",
344 | " f7c1bd87 | \n",
345 | " 7c1bd8 | \n",
346 | " 2 | \n",
347 | " top1 left | \n",
348 | "
\n",
349 | " \n",
350 | "
\n",
351 | "
"
352 | ],
353 | "text/plain": [
354 | " date __top1left__ __top1right__ __top1diff__ __matchtype__\n",
355 | "10 2010-01-01 e3e70682 3e7068 2 top1 left\n",
356 | "34 2010-01-01 e443df78 443df7 2 top1 left\n",
357 | "42 2010-01-01 eb1167b3 b1167b 2 top1 left\n",
358 | "21 2010-01-01 f728b4fa 728b4f 2 top1 left\n",
359 | "3 2010-01-01 f7c1bd87 7c1bd8 2 top1 left"
360 | ]
361 | },
362 | "execution_count": 5,
363 | "metadata": {},
364 | "output_type": "execute_result"
365 | }
366 | ],
367 | "source": [
368 | "result['top1']['id']"
369 | ]
370 | },
371 | {
372 | "cell_type": "markdown",
373 | "metadata": {},
374 | "source": [
375 | "Since the match results look good, you can use the merged dataset."
376 | ]
377 | },
378 | {
379 | "cell_type": "code",
380 | "execution_count": 6,
381 | "metadata": {},
382 | "outputs": [
383 | {
384 | "data": {
385 | "text/html": [
386 | "\n",
387 | "\n",
400 | "
\n",
401 | " \n",
402 | " \n",
403 | " | \n",
404 | " date | \n",
405 | " id | \n",
406 | " v | \n",
407 | " id_right | \n",
408 | " v_right | \n",
409 | "
\n",
410 | " \n",
411 | " \n",
412 | " \n",
413 | " 0 | \n",
414 | " 2010-01-01 | \n",
415 | " e3e70682 | \n",
416 | " 0.393 | \n",
417 | " 3e7068 | \n",
418 | " 0.393 | \n",
419 | "
\n",
420 | " \n",
421 | " 1 | \n",
422 | " 2010-01-01 | \n",
423 | " f728b4fa | \n",
424 | " 0.837 | \n",
425 | " 728b4f | \n",
426 | " 0.837 | \n",
427 | "
\n",
428 | " \n",
429 | " 2 | \n",
430 | " 2010-01-01 | \n",
431 | " eb1167b3 | \n",
432 | " 0.389 | \n",
433 | " b1167b | \n",
434 | " 0.389 | \n",
435 | "
\n",
436 | " \n",
437 | " 3 | \n",
438 | " 2010-01-01 | \n",
439 | " f7c1bd87 | \n",
440 | " 0.555 | \n",
441 | " 7c1bd8 | \n",
442 | " 0.555 | \n",
443 | "
\n",
444 | " \n",
445 | " 4 | \n",
446 | " 2010-01-01 | \n",
447 | " e443df78 | \n",
448 | " 0.886 | \n",
449 | " 443df7 | \n",
450 | " 0.886 | \n",
451 | "
\n",
452 | " \n",
453 | "
\n",
454 | "
"
455 | ],
456 | "text/plain": [
457 | " date id v id_right v_right\n",
458 | "0 2010-01-01 e3e70682 0.393 3e7068 0.393\n",
459 | "1 2010-01-01 f728b4fa 0.837 728b4f 0.837\n",
460 | "2 2010-01-01 eb1167b3 0.389 b1167b 0.389\n",
461 | "3 2010-01-01 f7c1bd87 0.555 7c1bd8 0.555\n",
462 | "4 2010-01-01 e443df78 0.886 443df7 0.886"
463 | ]
464 | },
465 | "execution_count": 6,
466 | "metadata": {},
467 | "output_type": "execute_result"
468 | }
469 | ],
470 | "source": [
471 | "result['merged'].head()"
472 | ]
473 | },
474 | {
475 | "cell_type": "code",
476 | "execution_count": 7,
477 | "metadata": {},
478 | "outputs": [],
479 | "source": [
480 | "assert not result['duplicates']"
481 | ]
482 | },
483 | {
484 | "cell_type": "markdown",
485 | "metadata": {},
486 | "source": [
487 | "# Example 2: join 2 datasets with misalgined dates\n",
488 | "\n",
489 | "As another example, instead of the ids not matching, lets look at an example where the dates don't match. We will look at calendar vs business month end dates."
490 | ]
491 | },
492 | {
493 | "cell_type": "code",
494 | "execution_count": 8,
495 | "metadata": {},
496 | "outputs": [],
497 | "source": [
498 | "dates2 = pd.bdate_range('1/1/2010','1/1/2011') # business instead of calendar dates\n",
499 | "df2 = pd.DataFrame(list(itertools.product(dates2,uuid1)),columns=['date','id'])\n",
500 | "df2['v']=np.round(np.random.sample(df2.shape[0]),3)"
501 | ]
502 | },
503 | {
504 | "cell_type": "markdown",
505 | "metadata": {},
506 | "source": [
507 | "`d6tjoin.Prejoin()` shows some but not all of the dates match. All the ids match."
508 | ]
509 | },
510 | {
511 | "cell_type": "code",
512 | "execution_count": 9,
513 | "metadata": {},
514 | "outputs": [
515 | {
516 | "name": "stdout",
517 | "output_type": "stream",
518 | "text": [
519 | " key left key right all matched inner left right outer unmatched total unmatched left unmatched right\n",
520 | "0 id id True 10 10 10 10 0 0 0\n",
521 | "1 date date False 261 366 261 366 105 105 0\n",
522 | "2 __all__ __all__ False 2610 3660 2610 3660 1050 1050 0\n"
523 | ]
524 | }
525 | ],
526 | "source": [
527 | "d6tjoin.Prejoin([df1,df2],['id','date']).match_quality()"
528 | ]
529 | },
530 | {
531 | "cell_type": "markdown",
532 | "metadata": {},
533 | "source": [
534 | "So we want to do a fuzzy match on dates but have the id match perfectly."
535 | ]
536 | },
537 | {
538 | "cell_type": "code",
539 | "execution_count": 10,
540 | "metadata": {},
541 | "outputs": [],
542 | "source": [
543 | "result = d6tjoin.top1.MergeTop1(df1,df2,fuzzy_left_on=['date'],fuzzy_right_on=['date'],exact_left_on=['id'],exact_right_on=['id']).merge()"
544 | ]
545 | },
546 | {
547 | "cell_type": "markdown",
548 | "metadata": {},
549 | "source": [
550 | "Again lets check if the fuzzy matches are correct. If either matches or is off by a day most, looks good!"
551 | ]
552 | },
553 | {
554 | "cell_type": "code",
555 | "execution_count": 11,
556 | "metadata": {},
557 | "outputs": [
558 | {
559 | "data": {
560 | "text/html": [
561 | "\n",
562 | "\n",
575 | "
\n",
576 | " \n",
577 | " \n",
578 | " | \n",
579 | " id | \n",
580 | " __top1left__ | \n",
581 | " __top1right__ | \n",
582 | " __top1diff__ | \n",
583 | " __matchtype__ | \n",
584 | "
\n",
585 | " \n",
586 | " \n",
587 | " \n",
588 | " 0 | \n",
589 | " 1846d424 | \n",
590 | " 2010-01-01 | \n",
591 | " 2010-01-01 | \n",
592 | " 0 days | \n",
593 | " exact | \n",
594 | "
\n",
595 | " \n",
596 | " 1 | \n",
597 | " eb1167b3 | \n",
598 | " 2010-01-01 | \n",
599 | " 2010-01-01 | \n",
600 | " 0 days | \n",
601 | " exact | \n",
602 | "
\n",
603 | " \n",
604 | " 2 | \n",
605 | " e443df78 | \n",
606 | " 2010-01-01 | \n",
607 | " 2010-01-01 | \n",
608 | " 0 days | \n",
609 | " exact | \n",
610 | "
\n",
611 | " \n",
612 | "
\n",
613 | "
"
614 | ],
615 | "text/plain": [
616 | " id __top1left__ __top1right__ __top1diff__ __matchtype__\n",
617 | "0 1846d424 2010-01-01 2010-01-01 0 days exact\n",
618 | "1 eb1167b3 2010-01-01 2010-01-01 0 days exact\n",
619 | "2 e443df78 2010-01-01 2010-01-01 0 days exact"
620 | ]
621 | },
622 | "execution_count": 11,
623 | "metadata": {},
624 | "output_type": "execute_result"
625 | }
626 | ],
627 | "source": [
628 | "result['top1']['date'].head(3)"
629 | ]
630 | },
631 | {
632 | "cell_type": "code",
633 | "execution_count": 12,
634 | "metadata": {},
635 | "outputs": [
636 | {
637 | "data": {
638 | "text/html": [
639 | "\n",
640 | "\n",
653 | "
\n",
654 | " \n",
655 | " \n",
656 | " | \n",
657 | " id | \n",
658 | " __top1left__ | \n",
659 | " __top1right__ | \n",
660 | " __top1diff__ | \n",
661 | " __matchtype__ | \n",
662 | "
\n",
663 | " \n",
664 | " \n",
665 | " \n",
666 | " 3657 | \n",
667 | " 1846d424 | \n",
668 | " 2011-01-01 | \n",
669 | " 2010-12-31 | \n",
670 | " 1 days | \n",
671 | " top1 left | \n",
672 | "
\n",
673 | " \n",
674 | " 3658 | \n",
675 | " f7c1bd87 | \n",
676 | " 2011-01-01 | \n",
677 | " 2010-12-31 | \n",
678 | " 1 days | \n",
679 | " top1 left | \n",
680 | "
\n",
681 | " \n",
682 | " 3659 | \n",
683 | " fcbd04c3 | \n",
684 | " 2011-01-01 | \n",
685 | " 2010-12-31 | \n",
686 | " 1 days | \n",
687 | " top1 left | \n",
688 | "
\n",
689 | " \n",
690 | "
\n",
691 | "
"
692 | ],
693 | "text/plain": [
694 | " id __top1left__ __top1right__ __top1diff__ __matchtype__\n",
695 | "3657 1846d424 2011-01-01 2010-12-31 1 days top1 left\n",
696 | "3658 f7c1bd87 2011-01-01 2010-12-31 1 days top1 left\n",
697 | "3659 fcbd04c3 2011-01-01 2010-12-31 1 days top1 left"
698 | ]
699 | },
700 | "execution_count": 12,
701 | "metadata": {},
702 | "output_type": "execute_result"
703 | }
704 | ],
705 | "source": [
706 | "result['top1']['date'].tail(3)"
707 | ]
708 | },
709 | {
710 | "cell_type": "code",
711 | "execution_count": 13,
712 | "metadata": {},
713 | "outputs": [
714 | {
715 | "data": {
716 | "text/plain": [
717 | "Timedelta('1 days 00:00:00')"
718 | ]
719 | },
720 | "execution_count": 13,
721 | "metadata": {},
722 | "output_type": "execute_result"
723 | }
724 | ],
725 | "source": [
726 | "result['top1']['date']['__top1diff__'].max()"
727 | ]
728 | },
729 | {
730 | "cell_type": "markdown",
731 | "metadata": {},
732 | "source": [
733 | "Again with very little effort we were able to join this dataset together."
734 | ]
735 | },
736 | {
737 | "cell_type": "code",
738 | "execution_count": 14,
739 | "metadata": {},
740 | "outputs": [
741 | {
742 | "data": {
743 | "text/html": [
744 | "\n",
745 | "\n",
758 | "
\n",
759 | " \n",
760 | " \n",
761 | " | \n",
762 | " date | \n",
763 | " id | \n",
764 | " v | \n",
765 | " date_right | \n",
766 | " v_right | \n",
767 | "
\n",
768 | " \n",
769 | " \n",
770 | " \n",
771 | " 0 | \n",
772 | " 2010-01-01 | \n",
773 | " e3e70682 | \n",
774 | " 0.393 | \n",
775 | " 2010-01-01 | \n",
776 | " 0.110 | \n",
777 | "
\n",
778 | " \n",
779 | " 1 | \n",
780 | " 2010-01-02 | \n",
781 | " e3e70682 | \n",
782 | " 0.537 | \n",
783 | " 2010-01-01 | \n",
784 | " 0.110 | \n",
785 | "
\n",
786 | " \n",
787 | " 2 | \n",
788 | " 2010-01-01 | \n",
789 | " f728b4fa | \n",
790 | " 0.837 | \n",
791 | " 2010-01-01 | \n",
792 | " 0.197 | \n",
793 | "
\n",
794 | " \n",
795 | " 3 | \n",
796 | " 2010-01-02 | \n",
797 | " f728b4fa | \n",
798 | " 0.517 | \n",
799 | " 2010-01-01 | \n",
800 | " 0.197 | \n",
801 | "
\n",
802 | " \n",
803 | " 4 | \n",
804 | " 2010-01-01 | \n",
805 | " eb1167b3 | \n",
806 | " 0.389 | \n",
807 | " 2010-01-01 | \n",
808 | " 0.385 | \n",
809 | "
\n",
810 | " \n",
811 | "
\n",
812 | "
"
813 | ],
814 | "text/plain": [
815 | " date id v date_right v_right\n",
816 | "0 2010-01-01 e3e70682 0.393 2010-01-01 0.110\n",
817 | "1 2010-01-02 e3e70682 0.537 2010-01-01 0.110\n",
818 | "2 2010-01-01 f728b4fa 0.837 2010-01-01 0.197\n",
819 | "3 2010-01-02 f728b4fa 0.517 2010-01-01 0.197\n",
820 | "4 2010-01-01 eb1167b3 0.389 2010-01-01 0.385"
821 | ]
822 | },
823 | "execution_count": 14,
824 | "metadata": {},
825 | "output_type": "execute_result"
826 | }
827 | ],
828 | "source": [
829 | "result['merged'].head()"
830 | ]
831 | },
832 | {
833 | "cell_type": "markdown",
834 | "metadata": {},
835 | "source": [
836 | "# Example 3: join 2 datasets with misalgined dates AND ids\n",
837 | "\n",
838 | "In the final example, we combine the above cases. None of the ids match and some of the dates are mismatched. As before with little manual effort we are able to correctly merge the dataset."
839 | ]
840 | },
841 | {
842 | "cell_type": "code",
843 | "execution_count": 15,
844 | "metadata": {},
845 | "outputs": [],
846 | "source": [
847 | "dates2 = pd.bdate_range('1/1/2010','1/1/2011') # business instead of calendar dates\n",
848 | "df2 = pd.DataFrame(list(itertools.product(dates2,uuid1)),columns=['date','id'])\n",
849 | "df2['v']=np.round(np.random.sample(df2.shape[0]),3)\n",
850 | "df2['id'] = df2['id'].str[1:-1]"
851 | ]
852 | },
853 | {
854 | "cell_type": "code",
855 | "execution_count": 16,
856 | "metadata": {},
857 | "outputs": [
858 | {
859 | "name": "stdout",
860 | "output_type": "stream",
861 | "text": [
862 | " key left key right all matched inner left right outer unmatched total unmatched left unmatched right\n",
863 | "0 id id False 0 10 10 20 20 10 10\n",
864 | "1 date date False 261 366 261 366 105 105 0\n",
865 | "2 __all__ __all__ False 0 3660 2610 6270 6270 3660 2610\n"
866 | ]
867 | }
868 | ],
869 | "source": [
870 | "d6tjoin.Prejoin([df1,df2],['id','date']).match_quality()"
871 | ]
872 | },
873 | {
874 | "cell_type": "code",
875 | "execution_count": 17,
876 | "metadata": {},
877 | "outputs": [],
878 | "source": [
879 | "result = d6tjoin.top1.MergeTop1(df1,df2,['date','id'],['date','id']).merge()"
880 | ]
881 | },
882 | {
883 | "cell_type": "code",
884 | "execution_count": 18,
885 | "metadata": {},
886 | "outputs": [
887 | {
888 | "data": {
889 | "text/html": [
890 | "\n",
891 | "\n",
904 | "
\n",
905 | " \n",
906 | " \n",
907 | " | \n",
908 | " date | \n",
909 | " id | \n",
910 | " v | \n",
911 | " date_right | \n",
912 | " id_right | \n",
913 | " v_right | \n",
914 | "
\n",
915 | " \n",
916 | " \n",
917 | " \n",
918 | " 0 | \n",
919 | " 2010-01-01 | \n",
920 | " e3e70682 | \n",
921 | " 0.393 | \n",
922 | " 2010-01-01 | \n",
923 | " 3e7068 | \n",
924 | " 0.693 | \n",
925 | "
\n",
926 | " \n",
927 | " 1 | \n",
928 | " 2010-01-02 | \n",
929 | " e3e70682 | \n",
930 | " 0.537 | \n",
931 | " 2010-01-01 | \n",
932 | " 3e7068 | \n",
933 | " 0.693 | \n",
934 | "
\n",
935 | " \n",
936 | " 2 | \n",
937 | " 2010-01-01 | \n",
938 | " f728b4fa | \n",
939 | " 0.837 | \n",
940 | " 2010-01-01 | \n",
941 | " 728b4f | \n",
942 | " 0.463 | \n",
943 | "
\n",
944 | " \n",
945 | " 3 | \n",
946 | " 2010-01-02 | \n",
947 | " f728b4fa | \n",
948 | " 0.517 | \n",
949 | " 2010-01-01 | \n",
950 | " 728b4f | \n",
951 | " 0.463 | \n",
952 | "
\n",
953 | " \n",
954 | " 4 | \n",
955 | " 2010-01-01 | \n",
956 | " eb1167b3 | \n",
957 | " 0.389 | \n",
958 | " 2010-01-01 | \n",
959 | " b1167b | \n",
960 | " 0.227 | \n",
961 | "
\n",
962 | " \n",
963 | "
\n",
964 | "
"
965 | ],
966 | "text/plain": [
967 | " date id v date_right id_right v_right\n",
968 | "0 2010-01-01 e3e70682 0.393 2010-01-01 3e7068 0.693\n",
969 | "1 2010-01-02 e3e70682 0.537 2010-01-01 3e7068 0.693\n",
970 | "2 2010-01-01 f728b4fa 0.837 2010-01-01 728b4f 0.463\n",
971 | "3 2010-01-02 f728b4fa 0.517 2010-01-01 728b4f 0.463\n",
972 | "4 2010-01-01 eb1167b3 0.389 2010-01-01 b1167b 0.227"
973 | ]
974 | },
975 | "execution_count": 18,
976 | "metadata": {},
977 | "output_type": "execute_result"
978 | }
979 | ],
980 | "source": [
981 | "result['merged'].head()"
982 | ]
983 | },
984 | {
985 | "cell_type": "code",
986 | "execution_count": 19,
987 | "metadata": {
988 | "scrolled": true
989 | },
990 | "outputs": [
991 | {
992 | "data": {
993 | "text/html": [
994 | "\n",
995 | "\n",
1008 | "
\n",
1009 | " \n",
1010 | " \n",
1011 | " | \n",
1012 | " __top1left__ | \n",
1013 | " __top1right__ | \n",
1014 | " __top1diff__ | \n",
1015 | " __matchtype__ | \n",
1016 | "
\n",
1017 | " \n",
1018 | " \n",
1019 | " \n",
1020 | " 361 | \n",
1021 | " 2010-12-28 | \n",
1022 | " 2010-12-28 | \n",
1023 | " 0 days | \n",
1024 | " exact | \n",
1025 | "
\n",
1026 | " \n",
1027 | " 362 | \n",
1028 | " 2010-12-29 | \n",
1029 | " 2010-12-29 | \n",
1030 | " 0 days | \n",
1031 | " exact | \n",
1032 | "
\n",
1033 | " \n",
1034 | " 363 | \n",
1035 | " 2010-12-30 | \n",
1036 | " 2010-12-30 | \n",
1037 | " 0 days | \n",
1038 | " exact | \n",
1039 | "
\n",
1040 | " \n",
1041 | " 364 | \n",
1042 | " 2010-12-31 | \n",
1043 | " 2010-12-31 | \n",
1044 | " 0 days | \n",
1045 | " exact | \n",
1046 | "
\n",
1047 | " \n",
1048 | " 365 | \n",
1049 | " 2011-01-01 | \n",
1050 | " 2010-12-31 | \n",
1051 | " 1 days | \n",
1052 | " top1 left | \n",
1053 | "
\n",
1054 | " \n",
1055 | "
\n",
1056 | "
"
1057 | ],
1058 | "text/plain": [
1059 | " __top1left__ __top1right__ __top1diff__ __matchtype__\n",
1060 | "361 2010-12-28 2010-12-28 0 days exact\n",
1061 | "362 2010-12-29 2010-12-29 0 days exact\n",
1062 | "363 2010-12-30 2010-12-30 0 days exact\n",
1063 | "364 2010-12-31 2010-12-31 0 days exact\n",
1064 | "365 2011-01-01 2010-12-31 1 days top1 left"
1065 | ]
1066 | },
1067 | "execution_count": 19,
1068 | "metadata": {},
1069 | "output_type": "execute_result"
1070 | }
1071 | ],
1072 | "source": [
1073 | "result['top1']['date'].tail()"
1074 | ]
1075 | },
1076 | {
1077 | "cell_type": "code",
1078 | "execution_count": 20,
1079 | "metadata": {},
1080 | "outputs": [
1081 | {
1082 | "data": {
1083 | "text/html": [
1084 | "\n",
1085 | "\n",
1098 | "
\n",
1099 | " \n",
1100 | " \n",
1101 | " | \n",
1102 | " __top1right__date | \n",
1103 | " __top1left__ | \n",
1104 | " __top1right__ | \n",
1105 | " __top1diff__ | \n",
1106 | " __matchtype__ | \n",
1107 | "
\n",
1108 | " \n",
1109 | " \n",
1110 | " \n",
1111 | " 9396 | \n",
1112 | " 2010-01-01 | \n",
1113 | " 1846d424 | \n",
1114 | " 846d42 | \n",
1115 | " 2 | \n",
1116 | " top1 left | \n",
1117 | "
\n",
1118 | " \n",
1119 | " 3915 | \n",
1120 | " 2010-01-01 | \n",
1121 | " 23a7711a | \n",
1122 | " 3a7711 | \n",
1123 | " 2 | \n",
1124 | " top1 left | \n",
1125 | "
\n",
1126 | " \n",
1127 | " 20619 | \n",
1128 | " 2010-01-01 | \n",
1129 | " 259f4329 | \n",
1130 | " 59f432 | \n",
1131 | " 2 | \n",
1132 | " top1 left | \n",
1133 | "
\n",
1134 | " \n",
1135 | " 12528 | \n",
1136 | " 2010-01-01 | \n",
1137 | " b4862b21 | \n",
1138 | " 4862b2 | \n",
1139 | " 2 | \n",
1140 | " top1 left | \n",
1141 | "
\n",
1142 | " \n",
1143 | " 13050 | \n",
1144 | " 2010-01-01 | \n",
1145 | " e3e70682 | \n",
1146 | " 3e7068 | \n",
1147 | " 2 | \n",
1148 | " top1 left | \n",
1149 | "
\n",
1150 | " \n",
1151 | "
\n",
1152 | "
"
1153 | ],
1154 | "text/plain": [
1155 | " __top1right__date __top1left__ __top1right__ __top1diff__ __matchtype__\n",
1156 | "9396 2010-01-01 1846d424 846d42 2 top1 left\n",
1157 | "3915 2010-01-01 23a7711a 3a7711 2 top1 left\n",
1158 | "20619 2010-01-01 259f4329 59f432 2 top1 left\n",
1159 | "12528 2010-01-01 b4862b21 4862b2 2 top1 left\n",
1160 | "13050 2010-01-01 e3e70682 3e7068 2 top1 left"
1161 | ]
1162 | },
1163 | "execution_count": 20,
1164 | "metadata": {},
1165 | "output_type": "execute_result"
1166 | }
1167 | ],
1168 | "source": [
1169 | "result['top1']['id'].head()"
1170 | ]
1171 | },
1172 | {
1173 | "cell_type": "markdown",
1174 | "metadata": {
1175 | "collapsed": true
1176 | },
1177 | "source": [
1178 | "# Advanced Usage Options"
1179 | ]
1180 | },
1181 | {
1182 | "cell_type": "markdown",
1183 | "metadata": {},
1184 | "source": [
1185 | "## Passing a difference limit\n",
1186 | "By default every record in the left dataframe will be matched with a record in the right dataframe. Sometimes the difference is too large though to be considered a match. You can control this by passing the `top_limit` parameter."
1187 | ]
1188 | },
1189 | {
1190 | "cell_type": "code",
1191 | "execution_count": 21,
1192 | "metadata": {},
1193 | "outputs": [],
1194 | "source": [
1195 | "dates2 = pd.bdate_range('1/1/2010','1/1/2011') # business instead of calendar dates\n",
1196 | "df2 = pd.DataFrame(list(itertools.product(dates2,uuid1[:-2])),columns=['date','id'])\n",
1197 | "df2['v']=np.random.sample(df2.shape[0])\n",
1198 | "df2['id'] = df2['id'].str[1:-1]"
1199 | ]
1200 | },
1201 | {
1202 | "cell_type": "code",
1203 | "execution_count": 22,
1204 | "metadata": {},
1205 | "outputs": [
1206 | {
1207 | "data": {
1208 | "text/html": [
1209 | "\n",
1210 | "\n",
1223 | "
\n",
1224 | " \n",
1225 | " \n",
1226 | " | \n",
1227 | " __top1right__date | \n",
1228 | " __top1left__ | \n",
1229 | " __top1right__ | \n",
1230 | " __top1diff__ | \n",
1231 | " __matchtype__ | \n",
1232 | "
\n",
1233 | " \n",
1234 | " \n",
1235 | " \n",
1236 | " 7830 | \n",
1237 | " 2010-01-01 | \n",
1238 | " 1846d424 | \n",
1239 | " 846d42 | \n",
1240 | " 2 | \n",
1241 | " top1 left | \n",
1242 | "
\n",
1243 | " \n",
1244 | " 3393 | \n",
1245 | " 2010-01-01 | \n",
1246 | " 23a7711a | \n",
1247 | " 3a7711 | \n",
1248 | " 2 | \n",
1249 | " top1 left | \n",
1250 | "
\n",
1251 | " \n",
1252 | " 16182 | \n",
1253 | " 2010-01-01 | \n",
1254 | " 259f4329 | \n",
1255 | " 846d42 | \n",
1256 | " 6 | \n",
1257 | " top1 left | \n",
1258 | "
\n",
1259 | " \n",
1260 | " 8874 | \n",
1261 | " 2010-01-01 | \n",
1262 | " b4862b21 | \n",
1263 | " b1167b | \n",
1264 | " 5 | \n",
1265 | " top1 left | \n",
1266 | "
\n",
1267 | " \n",
1268 | " 9918 | \n",
1269 | " 2010-01-01 | \n",
1270 | " b4862b21 | \n",
1271 | " 846d42 | \n",
1272 | " 5 | \n",
1273 | " top1 left | \n",
1274 | "
\n",
1275 | " \n",
1276 | "
\n",
1277 | "
"
1278 | ],
1279 | "text/plain": [
1280 | " __top1right__date __top1left__ __top1right__ __top1diff__ __matchtype__\n",
1281 | "7830 2010-01-01 1846d424 846d42 2 top1 left\n",
1282 | "3393 2010-01-01 23a7711a 3a7711 2 top1 left\n",
1283 | "16182 2010-01-01 259f4329 846d42 6 top1 left\n",
1284 | "8874 2010-01-01 b4862b21 b1167b 5 top1 left\n",
1285 | "9918 2010-01-01 b4862b21 846d42 5 top1 left"
1286 | ]
1287 | },
1288 | "execution_count": 22,
1289 | "metadata": {},
1290 | "output_type": "execute_result"
1291 | }
1292 | ],
1293 | "source": [
1294 | "result = d6tjoin.top1.MergeTop1(df1,df2,['date','id'],['date','id']).merge()\n",
1295 | "result['top1']['id'].head()"
1296 | ]
1297 | },
1298 | {
1299 | "cell_type": "markdown",
1300 | "metadata": {},
1301 | "source": [
1302 | "We have some correct matches but also some bad matches with `__top1diff__`>2. We will restrict `top_limit` to be at most 2."
1303 | ]
1304 | },
1305 | {
1306 | "cell_type": "code",
1307 | "execution_count": 23,
1308 | "metadata": {},
1309 | "outputs": [],
1310 | "source": [
1311 | "result = d6tjoin.top1.MergeTop1(df1,df2,['date','id'],['date','id'], top_limit=[None,2]).merge()"
1312 | ]
1313 | },
1314 | {
1315 | "cell_type": "code",
1316 | "execution_count": 24,
1317 | "metadata": {},
1318 | "outputs": [
1319 | {
1320 | "data": {
1321 | "text/html": [
1322 | "\n",
1323 | "\n",
1336 | "
\n",
1337 | " \n",
1338 | " \n",
1339 | " | \n",
1340 | " __top1right__date | \n",
1341 | " __top1left__ | \n",
1342 | " __top1right__ | \n",
1343 | " __top1diff__ | \n",
1344 | " __matchtype__ | \n",
1345 | "
\n",
1346 | " \n",
1347 | " \n",
1348 | " \n",
1349 | " 7830 | \n",
1350 | " 2010-01-01 | \n",
1351 | " 1846d424 | \n",
1352 | " 846d42 | \n",
1353 | " 2 | \n",
1354 | " top1 left | \n",
1355 | "
\n",
1356 | " \n",
1357 | " 3393 | \n",
1358 | " 2010-01-01 | \n",
1359 | " 23a7711a | \n",
1360 | " 3a7711 | \n",
1361 | " 2 | \n",
1362 | " top1 left | \n",
1363 | "
\n",
1364 | " \n",
1365 | " 10440 | \n",
1366 | " 2010-01-01 | \n",
1367 | " e3e70682 | \n",
1368 | " 3e7068 | \n",
1369 | " 2 | \n",
1370 | " top1 left | \n",
1371 | "
\n",
1372 | " \n",
1373 | " 5220 | \n",
1374 | " 2010-01-01 | \n",
1375 | " e443df78 | \n",
1376 | " 443df7 | \n",
1377 | " 2 | \n",
1378 | " top1 left | \n",
1379 | "
\n",
1380 | " \n",
1381 | " 17226 | \n",
1382 | " 2010-01-01 | \n",
1383 | " eb1167b3 | \n",
1384 | " b1167b | \n",
1385 | " 2 | \n",
1386 | " top1 left | \n",
1387 | "
\n",
1388 | " \n",
1389 | "
\n",
1390 | "
"
1391 | ],
1392 | "text/plain": [
1393 | " __top1right__date __top1left__ __top1right__ __top1diff__ __matchtype__\n",
1394 | "7830 2010-01-01 1846d424 846d42 2 top1 left\n",
1395 | "3393 2010-01-01 23a7711a 3a7711 2 top1 left\n",
1396 | "10440 2010-01-01 e3e70682 3e7068 2 top1 left\n",
1397 | "5220 2010-01-01 e443df78 443df7 2 top1 left\n",
1398 | "17226 2010-01-01 eb1167b3 b1167b 2 top1 left"
1399 | ]
1400 | },
1401 | "execution_count": 24,
1402 | "metadata": {},
1403 | "output_type": "execute_result"
1404 | }
1405 | ],
1406 | "source": [
1407 | "result['top1']['id'].head()"
1408 | ]
1409 | },
1410 | {
1411 | "cell_type": "markdown",
1412 | "metadata": {},
1413 | "source": [
1414 | "## Passing a custom difference function\n",
1415 | "By default string matches are done using Levenstein edit distance. You can pass a custom function using `fun_diff`. For example lets pass Hamming distance."
1416 | ]
1417 | },
1418 | {
1419 | "cell_type": "code",
1420 | "execution_count": 25,
1421 | "metadata": {},
1422 | "outputs": [],
1423 | "source": [
1424 | "import jellyfish\n",
1425 | "result = d6tjoin.top1.MergeTop1(df1,df2,['date','id'],['date','id'], fun_diff=[None,jellyfish.hamming_distance]).merge()"
1426 | ]
1427 | },
1428 | {
1429 | "cell_type": "code",
1430 | "execution_count": 26,
1431 | "metadata": {},
1432 | "outputs": [
1433 | {
1434 | "data": {
1435 | "text/html": [
1436 | "\n",
1437 | "\n",
1450 | "
\n",
1451 | " \n",
1452 | " \n",
1453 | " | \n",
1454 | " __top1right__date | \n",
1455 | " __top1left__ | \n",
1456 | " __top1right__ | \n",
1457 | " __top1diff__ | \n",
1458 | " __matchtype__ | \n",
1459 | "
\n",
1460 | " \n",
1461 | " \n",
1462 | " \n",
1463 | " 6786 | \n",
1464 | " 2010-01-01 | \n",
1465 | " 1846d424 | \n",
1466 | " b1167b | \n",
1467 | " 7 | \n",
1468 | " top1 left | \n",
1469 | "
\n",
1470 | " \n",
1471 | " 7047 | \n",
1472 | " 2010-01-01 | \n",
1473 | " 1846d424 | \n",
1474 | " 7c1bd8 | \n",
1475 | " 7 | \n",
1476 | " top1 left | \n",
1477 | "
\n",
1478 | " \n",
1479 | " 3393 | \n",
1480 | " 2010-01-01 | \n",
1481 | " 23a7711a | \n",
1482 | " 3a7711 | \n",
1483 | " 6 | \n",
1484 | " top1 left | \n",
1485 | "
\n",
1486 | " \n",
1487 | " 14877 | \n",
1488 | " 2010-01-01 | \n",
1489 | " 259f4329 | \n",
1490 | " 728b4f | \n",
1491 | " 7 | \n",
1492 | " top1 left | \n",
1493 | "
\n",
1494 | " \n",
1495 | " 16182 | \n",
1496 | " 2010-01-01 | \n",
1497 | " 259f4329 | \n",
1498 | " 846d42 | \n",
1499 | " 7 | \n",
1500 | " top1 left | \n",
1501 | "
\n",
1502 | " \n",
1503 | "
\n",
1504 | "
"
1505 | ],
1506 | "text/plain": [
1507 | " __top1right__date __top1left__ __top1right__ __top1diff__ __matchtype__\n",
1508 | "6786 2010-01-01 1846d424 b1167b 7 top1 left\n",
1509 | "7047 2010-01-01 1846d424 7c1bd8 7 top1 left\n",
1510 | "3393 2010-01-01 23a7711a 3a7711 6 top1 left\n",
1511 | "14877 2010-01-01 259f4329 728b4f 7 top1 left\n",
1512 | "16182 2010-01-01 259f4329 846d42 7 top1 left"
1513 | ]
1514 | },
1515 | "execution_count": 26,
1516 | "metadata": {},
1517 | "output_type": "execute_result"
1518 | }
1519 | ],
1520 | "source": [
1521 | "result['top1']['id'].head()"
1522 | ]
1523 | },
1524 | {
1525 | "cell_type": "code",
1526 | "execution_count": null,
1527 | "metadata": {},
1528 | "outputs": [],
1529 | "source": []
1530 | }
1531 | ],
1532 | "metadata": {
1533 | "kernelspec": {
1534 | "display_name": "Python 3",
1535 | "language": "python",
1536 | "name": "python3"
1537 | },
1538 | "language_info": {
1539 | "codemirror_mode": {
1540 | "name": "ipython",
1541 | "version": 3
1542 | },
1543 | "file_extension": ".py",
1544 | "mimetype": "text/x-python",
1545 | "name": "python",
1546 | "nbconvert_exporter": "python",
1547 | "pygments_lexer": "ipython3",
1548 | "version": "3.7.6"
1549 | }
1550 | },
1551 | "nbformat": 4,
1552 | "nbformat_minor": 2
1553 | }
1554 |
--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | pytest
2 | sphinx
3 | sphinxcontrib-napoleon
4 | sphinx_rtd_theme
5 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | pandas
3 | jellyfish
4 | d6tstack
5 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 |
3 | setup(
4 | name='d6tjoin',
5 | version='0.2.1',
6 | packages=['d6tjoin'],
7 | url='https://github.com/d6t/d6tjoin',
8 | license='MIT',
9 | author='DataBolt Team',
10 | author_email='support@databolt.tech',
11 | description='Easily join python pandas dataframes',
12 |     long_description='Easily join python pandas dataframes. '
13 | 'See https://github.com/d6t/d6tjoin for details',
14 | install_requires=[
15 | 'numpy',
16 | 'pandas',
17 | 'jellyfish',
18 | 'joblib',
19 | 'd6tstack',
20 | 'affinegap'
21 | ],
22 | include_package_data=True,
23 | python_requires='>=3.6'
24 | )
25 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/d6t/d6tjoin/9618b129601aa0b4a9247d7001da8c2220d36d9c/tests/__init__.py
--------------------------------------------------------------------------------
/tests/test_pre_pd.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 |
4 | import pytest
5 |
6 | import d6tjoin
7 |
8 | def fake_2dfs_identical():
9 | df = pd.DataFrame({'a':range(10)})
10 | df['b'] = ['b']*5+['bb']*5
11 | return [df, df.copy()]
12 |
13 | def fake_2dfs_1missing():
14 | df = pd.DataFrame({'a':range(10)})
15 | df['b'] = ['b']*5+['bb']*5
16 | return [df, df.copy().drop(['b'],1)]
17 |
18 | def test_internals():
19 | dfs = fake_2dfs_identical()
20 |
21 | pdj = d6tjoin.Prejoin(dfs, print_only=False)
22 | assert pdj.keys is None and pdj.keysdf is None
23 | assert all([dfg.shape==dfs[0].shape for dfg in pdj.dfs])
24 | assert all([dfg.shape==(pdj.nrows, dfs[0].shape[1]) for dfg in pdj.dfshead])
25 | dfc = pdj.head()
26 | assert all([dfg.head().equals(dfc[idx]) for idx,dfg in enumerate(dfs)])
27 | dfc = pdj.head(10)
28 | assert all([dfg.head(10).equals(dfc[idx]) for idx,dfg in enumerate(dfs)])
29 |
30 | # single keys param
31 | cfg_keys = ['b']
32 | pdj = d6tjoin.Prejoin(dfs,keys=cfg_keys)
33 | assert pdj.keys == [['b','b']] and pdj.keysdf == [['b'],['b']]
34 | assert all([dfg.shape==dfs[0].shape for dfg in pdj.dfs])
35 | assert all([dfg.shape==(pdj.nrows, len(cfg_keys)) for dfg in pdj.dfshead])
36 |
37 | dfs[1] = dfs[1].rename(columns={'b': 'c'})
38 | with pytest.raises(KeyError, match='Columns missing'):
39 | pdj = d6tjoin.Prejoin(dfs, keys=['b'])
40 |
41 | # different keys for dfs
42 | pdj = d6tjoin.Prejoin(dfs,keys=[['b'],['c']])
43 | assert pdj.keys == [['b','c']] and pdj.keysdf == [['b'],['c']]
44 | assert all([dfg.shape==dfs[0].shape for dfg in pdj.dfs])
45 | assert all([dfg.shape==(pdj.nrows, 1) for dfg in pdj.dfshead])
46 | pdj = d6tjoin.Prejoin(dfs,keys=[['b','c']], keys_bydf=False)
47 | assert pdj.keys == [['b','c']] and pdj.keysdf == [['b'],['c']]
48 |
49 | # multi keys param
50 | dfs[0]['b1']=dfs[0]['b'];dfs[1]['c1']=dfs[1]['c'];
51 | pdj = d6tjoin.Prejoin(dfs,keys=[['b','b1'],['c','c1']])
52 | assert pdj.keys == [['b','c'],['b1','c1']] and pdj.keysdf == [['b','b1'],['c','c1']]
53 | assert all([dfg.shape==dfs[0].shape for dfg in pdj.dfs])
54 | assert all([dfg.shape==(pdj.nrows, 2) for dfg in pdj.dfshead])
55 |
56 | # joins with keys specified
57 | dfs = fake_2dfs_identical()
58 | pdj = d6tjoin.Prejoin(dfs,keys=['b'], print_only=False)
59 | assert pdj.columns_common()==['b']
60 | assert pdj.columns_all()==['b']
61 |
62 | dfs[1] = dfs[1].rename(columns={'b': 'c'})
63 | pdj = d6tjoin.Prejoin(dfs,keys=[['b'],['c']], print_only=False)
64 | assert pdj.columns_all()==['b','c']
65 |
66 |
67 | def test_pre_columns():
68 | dfs = fake_2dfs_identical()
69 | pdj = d6tjoin.Prejoin(dfs,print_only=False)
70 | assert pdj.columns_common()==['a','b']
71 | assert pdj.columns_all()==['a','b']
72 |
73 | pdj.describe()
74 | assert pdj.shape() == {0: (10, 2), 1: (10, 2)}
75 |
76 | dfs = fake_2dfs_1missing()
77 | pdj = d6tjoin.Prejoin(dfs,print_only=False)
78 | assert pdj.columns_common()==['a']
79 | assert pdj.columns_all()==['a','b']
80 |
81 | def test_pre_describe():
82 | # describe_str
83 | chk = {'b': {'median': 1.5, 'min': 1.0, 'max': 2.0, 'nrecords': 10.0}}
84 | dfs = fake_2dfs_identical()
85 | pdj = d6tjoin.Prejoin(dfs,print_only=False)
86 | assert pdj.describe_str()[0].to_dict(orient='index')==chk
87 | pdj = d6tjoin.Prejoin(dfs,keys=['b'],print_only=False)
88 | assert pdj.describe_str()[0].to_dict(orient='index')==chk
89 |
90 |     # describe_data
91 | chk = {'a': {'nrecords': 10, 'unique': 10, 'nan': 0, 'unique rate': 1.0},
92 | 'b': {'nrecords': 10, 'unique': 2, 'nan': 0, 'unique rate': 0.2}}
93 | pdj = d6tjoin.Prejoin(dfs,print_only=False)
94 | assert pdj.describe_data()[0].to_dict(orient='index')==chk
95 | pdj = d6tjoin.Prejoin(dfs,keys=['b'],print_only=False)
96 | assert pdj.describe_data()[0].to_dict(orient='index')==chk
97 |
98 | def test_pre_data_match():
99 | dfs = fake_2dfs_identical()
100 | pdj = d6tjoin.Prejoin(dfs,print_only=False)
101 |
102 | dfc = {'__left__': {0: 'b'},
103 | '__right__': {0: 'b'},
104 | '__similarity__': {0: 1.0},
105 | '__left-sample__': {0: 'bb'},
106 | '__right-sample__': {0: 'bb'},
107 | '__left-nunique__': {0: 2},
108 | '__right-nunique__': {0: 2}}
109 |
110 | assert pd.DataFrame(dfc).equals(pdj.data_match())
111 |
112 | dfc = {0: {'__left__': 'a',
113 | '__right__': 'a',
114 | '__similarity__': 1.0,
115 | '__left-sample__': 0,
116 | '__right-sample__': 0,
117 | '__left-nunique__': 10,
118 | '__right-nunique__': 10},
119 | 1: {'__left__': 'b',
120 | '__right__': 'b',
121 | '__similarity__': 1.0,
122 | '__left-sample__': 'bb',
123 | '__right-sample__': 'bb',
124 | '__left-nunique__': 2,
125 | '__right-nunique__': 2}}
126 |
127 | assert dfc==pdj.data_match(ignore_value_columns=False, max_unique_pct=1.0).to_dict(orient='index')
128 |
129 |
130 |
131 |
132 |
133 |
134 |
--------------------------------------------------------------------------------
/tests/test_smartjoin.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import pandas as pd
3 | import numpy as np
4 |
5 | # fuzzy join
6 | from faker import Faker
7 | import importlib
8 |
9 | import d6tjoin.smart_join
10 | importlib.reload(d6tjoin.smart_join)
11 | cfg_num = 10
12 | cfg_num_unmatched = 2
13 | cfg_num_matched = cfg_num-cfg_num_unmatched
14 |
15 | # d6t
16 | from d6tjoin.utils import df_str_summary, BaseJoin, PreJoin
17 |
18 | # ******************************************
19 | # helpers
20 | # ******************************************
21 | def gen_multikey_simple():
22 | fake = Faker()
23 | fake.seed(1)
24 |
25 | pool_names = [fake.name() for _ in range(cfg_num)]
26 | pool_dates = pd.date_range('1/1/2018', periods=cfg_num)
27 |
28 | # case multikey
29 | df1 = pd.DataFrame({'key': pool_names[:-cfg_num_unmatched], 'date': pool_dates[:-cfg_num_unmatched]})
30 | df2 = pd.DataFrame({'key': pool_names[cfg_num_unmatched:], 'date': pool_dates[cfg_num_unmatched:]})
31 | df1['val1'] = range(df1.shape[0])
32 | df2['val2'] = range(df2.shape[0])
33 |
34 | return df1, df2
35 |
36 | def gen_multikey_complex(unmatched_date=True):
37 |
38 | fake = Faker()
39 | fake.seed(1)
40 |
41 | pool_names = [fake.name() for _ in range(cfg_num)]
42 | cfg_num_per_group = 4
43 | pool_date1 = pd.date_range('1/1/2010', periods=cfg_num_per_group, freq='1M')
44 | if unmatched_date:
45 | pool_date2 = pd.bdate_range('1/1/2010', periods=cfg_num_per_group, freq='1BM')
46 | else:
47 | pool_date2 = pool_date1
48 |
49 | def gen_df(cfg_pool_rates, cfg_offset=0):
50 | dfg = []
51 | for i in range(cfg_num_per_group):
52 | dft = pd.DataFrame({'key': np.roll(pool_names, i + cfg_offset)[:cfg_num_per_group]})
53 | dft['date'] = cfg_pool_rates[i]
54 | dft['value'] = np.random.randn(dft.shape[0])
55 | dfg.append(dft)
56 | return pd.concat(dfg)
57 |
58 | df1 = gen_df(pool_date1)
59 | df2 = gen_df(pool_date2, 2)
60 |
61 | return df1, df2
62 |
63 |
64 | # ******************************************
65 | # utils
66 | # ******************************************
67 |
68 | def test_df_str_summary():
69 | df = pd.DataFrame({'a': ['a', 'aa'] * 2})
70 | df['b'] = ['aa', 'aaa'] * 2
71 |
72 | dft = df_str_summary(df)
73 | assert np.all(dft.values == np.array([[ 1.5, 1.5, 1. , 2. , 4. ],
74 | [ 2.5, 2.5, 2. , 3. , 4. ]]))
75 | dft = df_str_summary(df,['a'])
76 | assert np.all(dft.values == np.array([1.5, 1.5, 1. , 2. , 4.]))
77 |
78 | dft = df_str_summary(df,unique_count=True)
79 | assert np.all(dft.values == np.array([[ 1.5, 1.5, 1. , 2. , 4. , 2. ],
80 | [ 2.5, 2.5, 2. , 3. , 4. , 2. ]]))
81 |
82 |
83 | def test_basejoin():
84 | df1 = pd.DataFrame({'a': range(3), 'b': range(3)})
85 | df2 = pd.DataFrame({'a': range(3), 'b': range(3)})
86 |
87 | with pytest.raises(ValueError) as e:
88 | j = PreJoin([df1], ['a'])
89 | with pytest.raises(NotImplementedError) as e:
90 | j = PreJoin([df1,df2,df1], ['a'])
91 |
92 | j1 = PreJoin([df1,df2], ['a','b'])
93 | j2 = PreJoin([df1,df2], [['a','b'],['a','b']], keys_bydf=True)
94 | j3 = PreJoin([df1,df2], [['a','a'],['b','b']])
95 | assert j1.keys == [['a', 'a'], ['b', 'b']]
96 | assert j1.keys == j2.keys
97 | assert j2.keys == j3.keys
98 | assert j1.keysdf == [['a', 'b'], ['a', 'b']]
99 | assert j1.keysdf == j2.keysdf
100 | assert j3.keysdf == j2.keysdf
101 |
102 | df2 = pd.DataFrame({'a': range(3), 'c': range(3)})
103 |
104 | with pytest.raises(KeyError) as e:
105 | j1 = PreJoin([df1,df2], ['a','c'])
106 |
107 | j2 = PreJoin([df1,df2], [['a','b'],['a','c']], keys_bydf=True)
108 | j3 = PreJoin([df1,df2], [['a','a'],['b','c']])
109 | assert j2.keys == [['a', 'a'], ['b', 'c']]
110 | assert j3.keys == j2.keys
111 | assert j2.keysdf == [['a', 'b'], ['a', 'c']]
112 | assert j3.keysdf == j2.keysdf
113 |
114 | # ******************************************
115 | # prejoin
116 | # ******************************************
117 | def test_prejoin():
118 | df1 = pd.DataFrame({'a': range(3), 'b': range(3)})
119 | df2 = pd.DataFrame({'a': range(3), 'c': range(3)})
120 |
121 | j = PreJoin([df1,df2],['a'])
122 | dfr = j.stats_prejoin(print_only=False)
123 | results = dfr.to_dict()
124 | check = {'all matched': {0: True, 1: True},
125 | 'inner': {0: 3, 1: 3},
126 | 'key left': {0: 'a', 1: '__all__'},
127 | 'key right': {0: 'a', 1: '__all__'},
128 | 'left': {0: 3, 1: 3},
129 | 'outer': {0: 3, 1: 3},
130 | 'right': {0: 3, 1: 3},
131 | 'unmatched left': {0: 0, 1: 0},
132 | 'unmatched right': {0: 0, 1: 0},
133 | 'unmatched total': {0: 0, 1: 0}}
134 | assert results == check
135 | assert j.is_all_matched()
136 | assert j.is_all_matched('a')
137 |
138 | df2 = pd.DataFrame({'a': range(3,6), 'c': range(3)})
139 |
140 | j = PreJoin([df1,df2],['a'])
141 | dfr = j.stats_prejoin(print_only=False)
142 | assert (~dfr['all matched']).all()
143 | assert not j.is_all_matched()
144 | assert not j.is_all_matched('a')
145 |
146 | df2 = pd.DataFrame({'b': range(3,6), 'a': range(3), 'v':range(3)})
147 | cfg_keys = ['a', 'b']
148 | j = PreJoin([df1,df2],cfg_keys)
149 | dfr = j.stats_prejoin(print_only=False)
150 | assert dfr['all matched'].tolist()==[True, False, False]
151 | assert not j.is_all_matched()
152 | assert j.is_all_matched('a')
153 | assert not j.is_all_matched('b')
154 |
155 | # test show_input
156 | dfr = j.show_input(1,keys_only=False)
157 | assert dfr[0].equals(df1.head(1))
158 | assert dfr[1].equals(df2.head(1))
159 | dfr = j.show_input(-1,keys_only=True)
160 | assert dfr[0][cfg_keys].equals(df1[cfg_keys])
161 | assert dfr[1][cfg_keys].equals(df2[cfg_keys])
162 |
163 | # test show_unmatched
164 | j.show_unmatched('b',print_only=True) # just make sure print_only runs without errors
165 | dfr = j.show_unmatched('b',nrecords=-1)
166 | assert dfr['left'].equals(df1['b'])
167 | assert dfr['right'].equals(df2['b'])
168 | dfr = j.show_matched('a',nrecords=-1)
169 | assert dfr['left'].equals(df1['a'])
170 | assert dfr['right'].equals(df2['a'])
171 | dfr = j.show_unmatched('__all__',nrecords=-1)
172 | assert dfr['left'].equals(df1[cfg_keys])
173 | assert dfr['right'].equals(df2[cfg_keys])
174 | dfr = j.show_matched('__all__')
175 | assert dfr['left'].empty
176 | assert dfr['right'].empty
177 |
178 | dfr = j.show_unmatched('b',nrecords=1)
179 | assert dfr['left'].equals(df1['b'].head(1))
180 | assert dfr['right'].equals(df2['b'].head(1))
181 |
182 | dfr = j.show_unmatched('b',keys_only=False,nrecords=-1)
183 | assert dfr['left'].equals(df1)
184 | assert dfr['right'].equals(df2)
185 |
186 | dfr = j.show_unmatched('a')
187 | assert dfr['left'].empty
188 | assert dfr['right'].empty
189 | dfr = j.show_matched('b')
190 | assert dfr['left'].empty
191 | assert dfr['right'].empty
192 |
193 | # test show_unmatched
194 | j = PreJoin([df1,df2],['a'])
195 | with pytest.raises(RuntimeError) as e:
196 | j.show_unmatched('a', print_only=True)
197 | j.stats_prejoin()
198 | dfr = j.show_matched('__all__',nrecords=-1)
199 | assert dfr['left'].equals(df1[['a']])
200 | assert dfr['right'].equals(df2[['a']])
201 | dfr = j.show_unmatched('__all__',nrecords=-1)
202 | assert dfr['left'].empty
203 | assert dfr['right'].empty
204 |
205 |
206 | # ******************************************
207 | # fuzzy join
208 | # ******************************************
209 | def test_fakedata_singlekey_string():
210 |
211 | fake = Faker()
212 | fake.seed(1)
213 |
214 | pool_names = [fake.name() for _ in range(cfg_num)]
215 | pool_names_unmatched_left = pool_names[:cfg_num_unmatched]
216 |
217 | # case single key unmatched
218 | df1=pd.DataFrame({'key':pool_names[:-cfg_num_unmatched]})
219 | df2=pd.DataFrame({'key':pool_names[cfg_num_unmatched:]})
220 | df1['val1']=range(df1.shape[0])
221 | df2['val2']=range(df2.shape[0])
222 |
223 |
224 | with pytest.raises(ValueError) as e_info:
225 | d6tjoin.smart_join.FuzzyJoinTop1([df1, df2], [], [])
226 | with pytest.raises(KeyError) as e_info:
227 | d6tjoin.smart_join.FuzzyJoinTop1([df1,df2], fuzzy_keys=['unmatched'])
228 |
229 | importlib.reload(d6tjoin.smart_join)
230 | sj = d6tjoin.smart_join.FuzzyJoinTop1([df1,df2],fuzzy_keys=['key'])
231 | assert sj.keysdf_fuzzy == [['key']]*2
232 | assert sj.keysdf_exact == []
233 |
234 | import jellyfish
235 | def diff_edit(a, b):
236 | return jellyfish.levenshtein_distance(a, b)
237 | def diff_hamming(a, b):
238 | return jellyfish.hamming_distance(a, b)
239 |
240 | sj = d6tjoin.smart_join.FuzzyJoinTop1([df1,df2],fuzzy_keys=['key'])
241 | dfr = sj._gen_match_top1(0)['table'].copy()
242 | assert sj._gen_match_top1(0)['has duplicates']
243 | assert set(dfr.loc[dfr['__top1diff__']>0,'__top1left__'].unique()) == set(pool_names_unmatched_left)
244 | assert dfr.loc[dfr['__top1diff__']>0,'__top1right__'].values.tolist() == ['Teresa James', 'Rachel Davis', 'Teresa James']
245 | dfr['__top1diff__check'] = dfr.apply(lambda x: diff_edit(x['__top1left__'],x['__top1right__']),1)
246 | assert (dfr['__top1diff__']==dfr['__top1diff__check']).all()
247 |
248 | sj.set_fuzzy_how(0,{'fun_diff':[diff_hamming,diff_edit]})
249 | dfr = sj._gen_match_top1(0)['table'].copy()
250 | assert dfr.loc[dfr['__top1diff__']>0,'__top1right__'].values.tolist() == ['Teresa James', 'Amanda Johnson']
251 | assert not sj._gen_match_top1(0)['has duplicates']
252 |
253 |
254 | sj = d6tjoin.smart_join.FuzzyJoinTop1([df1,df2],fuzzy_keys=['key'])
255 | dfr1 = sj._gen_match_top1(0)['table']
256 | # assert df1.shape[0] == dfr1.shape[0] # todo: deal with duplicates
257 | dfr2 = sj.join(True)
258 | assert np.array_equal(dfr1['__top1diff__'].sort_values().values, dfr2['__top1diff__key'].sort_values().values)
259 |
260 | def test_fakedata_singlekey_number():
261 | pool_dates = pd.date_range('1/1/2018',periods=cfg_num)
262 |
263 | # case single key date
264 | df1=pd.DataFrame({'date':pool_dates[:-cfg_num_unmatched]})
265 | df2=pd.DataFrame({'date':pool_dates[cfg_num_unmatched:]})
266 |
267 | sj = d6tjoin.smart_join.FuzzyJoinTop1([df1,df2],fuzzy_keys=['date'])
268 | dfr = sj._gen_match_top1_left_number([],[],'date','date',None)
269 |
270 | df_check = pd.DataFrame({'__top1left__':pool_dates[:-cfg_num_unmatched],'__top1right__':[pool_dates[cfg_num_unmatched]]*cfg_num_unmatched+pool_dates[cfg_num_unmatched:-cfg_num_unmatched].tolist()})
271 | df_check['__top1diff__'] = (df_check['__top1left__'] - df_check['__top1right__']).abs()
272 |
273 | assert dfr.equals(df_check)
274 |
275 | # apply top_nrecords
276 | sj = d6tjoin.smart_join.FuzzyJoinTop1([df1,df2],fuzzy_keys=['date'],fuzzy_how={0:{'top_limit':1}})
277 | dfr = sj._gen_match_top1_left_number([],[],'date','date',None)
278 |
279 | df_check = pd.DataFrame({'__top1left__':pool_dates[:-cfg_num_unmatched],'__top1right__':[pool_dates[cfg_num_unmatched]]*cfg_num_unmatched+pool_dates[cfg_num_unmatched:-cfg_num_unmatched].tolist()})
280 | df_check['__top1diff__'] = (df_check['__top1left__'] - df_check['__top1right__']).abs()
281 |
282 | assert dfr.equals(df_check)
283 |
284 | # case single key date, with exact keys
285 | pool_dates2 = pd.date_range('12/31/2017',periods=cfg_num)
286 | df1=pd.DataFrame({'grp':['a']*cfg_num_matched+['b']*cfg_num_matched,'date':pool_dates[:-cfg_num_unmatched].tolist()+pool_dates2[:-cfg_num_unmatched].tolist()})
287 | df2=pd.DataFrame({'grp':['a']*cfg_num_matched+['b']*cfg_num_matched,'date2':pool_dates[cfg_num_unmatched:].tolist()+pool_dates2[cfg_num_unmatched:].tolist()})
288 | sj = d6tjoin.smart_join.FuzzyJoinTop1([df1,df2],exact_keys=['grp'],fuzzy_keys=[['date', 'date2']])
289 | dfr = sj._gen_match_top1_left_number(['grp'],['grp'],'date','date2',None)
290 |
291 | dfc0 = pd.merge_asof(df1.sort_values('date'), df2.sort_values('date2'), left_on='date', right_on='date2', by='grp', direction='nearest')
292 | dfc = dfc0.rename(columns={'date':'__top1left__','date2':'__top1right__'})
293 | dfc['__top1diff__'] = (dfc['__top1left__'] - dfc['__top1right__']).abs()
294 | dfc = dfc[dfr.columns.tolist()]
295 |
296 | assert dfr.equals(dfc)
297 |
298 | dfc['__match type__'] = 'exact'
299 | dfc.loc[dfc['__top1diff__'].dt.days>0,'__match type__'] = 'top1 left'
300 |
301 | assert sj._gen_match_top1(0)['table'].equals(dfc)
302 | assert sj.join().sort_values(['date','grp']).reset_index(drop=True).equals(dfc0)
303 |
304 |
305 | def fakedata_multikey():
306 |
307 | df1, df2 = gen_multikey_simple()
308 |
309 | cfg_group_left=['date']
310 | cfg_group_right=cfg_group_left
311 | keyleft='key'
312 | keyright=keyleft
313 |
314 | '''
315 | from d6tjoin.smart_join import apply_gen_candidates_group
316 | df_keys_left = pd.DataFrame(df1.groupby(cfg_group_left)[keyleft].unique())
317 | df_keys_right = pd.DataFrame(df2.groupby(cfg_group_right)[keyright].unique())
318 | df_keysets_groups = df_keys_left.merge(df_keys_right, left_index=True, right_index=True)
319 | df_keysets_groups.columns = ['__top1left__', '__top1right__']
320 | dfg = df_keysets_groups.reset_index().groupby(cfg_group_left).apply(apply_gen_candidates_group)
321 | dfg = dfg.reset_index(-1, drop=True).reset_index()
322 | '''
323 | with pytest.raises(NotImplementedError) as e_info:
324 | d6tjoin.smart_join.FuzzyJoinTop1([df1,df2], fuzzy_keys=['key','date'])
325 |
326 |
327 | '''
328 | df1
329 | df2
330 |
331 |
332 | tests fuzzy string, exact keys
333 | tests fuzzy number int+float
334 | tests with nans
335 | groupby unique deal with nans
336 |
337 | merge just the keys together [often date, key = 1 row...]
338 | => as soon as there is >1 fuzzy key, need to specify whether the match is hierarchical (per-group sketch below)
339 | // does it increase compute complexity? would have to redo the same all-pairs compute for every date!!
340 | => do a global match, then from there find the closest ones by date
341 |
342 | explain: warnings.warn('Multi-key fuzzy joins are currently done globally for each key individually, not hierarchically for each unique fuzzy key value pair')
343 | tests for factor data id vs date, id matching
344 |
345 | '''
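# Illustrative sketch, not part of the original test: the notes above contrast a
# global top-1 match with a hierarchical (per unique group value) match. Assuming
# plain pandas and a string-distance function such as jellyfish.levenshtein_distance,
# a per-group variant could look roughly like this -- the helper name and signature
# are hypothetical, not d6tjoin API.
def top1_per_group(dfl, dfr, group, key, fun_diff):
    out = []
    for grp, dfg_left in dfl.groupby(group):
        dfg_right = dfr[dfr[group] == grp]
        for left in dfg_left[key].unique():
            # compute distances only against right-hand keys within the same group
            diffs = {right: fun_diff(left, right) for right in dfg_right[key].unique()}
            if diffs:
                best, dist = min(diffs.items(), key=lambda kv: kv[1])
                out.append({group: grp, '__top1left__': left,
                            '__top1right__': best, '__top1diff__': dist})
    return pd.DataFrame(out)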
346 | # with pytest.raises(ValueError) as e_info:
347 | # d6tjoin.smart_join.FuzzyJoinTop1([df1,df2], fuzzy_keys=['key','key'], fuzzy_how=[])
348 | #
349 | # importlib.reload(d6tjoin.smart_join)
350 | # sj = d6tjoin.smart_join.FuzzyJoinTop1([df1,df2],fuzzy_keys=['key','date'])
351 | # dfr = sj.join(True)
352 | # assert df1.shape[0] == dfr.shape[0]
353 |
354 | # fakedata_multikey()
355 |
356 |
357 | def test_fakedata_multikey_iddate():
358 | import uuid
359 | import itertools
360 |
361 | nobs = 10
362 | uuid1 = [str(uuid.uuid4()) for _ in range(nobs)]
363 | dates1 = pd.date_range('1/1/2010','1/1/2011')
364 |
365 | dates2 = pd.bdate_range('1/1/2010', '1/1/2011') # business instead of calendar dates
366 |
367 | df1 = pd.DataFrame(list(itertools.product(uuid1, dates1)), columns=['id', 'date'])
368 | df1['v'] = np.random.sample(df1.shape[0])
369 |
370 | df2 = pd.DataFrame(list(itertools.product(uuid1, dates2)), columns=['id', 'date'])
371 | df2['v'] = np.random.sample(df2.shape[0])
372 |
373 | sj = d6tjoin.smart_join.FuzzyJoinTop1([df1, df2], exact_keys=['id'], fuzzy_keys=['date'])
374 | dft = sj.preview_fuzzy(0)
375 |
376 |
377 | df2 = df1.copy()
378 | df2['id'] = df1['id'].str[1:-1]
379 |
380 | sj = d6tjoin.smart_join.FuzzyJoinTop1([df1, df2], exact_keys=['date'], fuzzy_keys=['id'])
381 | dft = sj.preview_fuzzy(0)
382 | dft.shape
383 | dft = sj._gen_match_top1(0)
384 | dft['table'].shape
385 |
386 | print('a')
387 |
388 |
389 | def fiddle():
390 | cfg_path_folder_base = '/mnt/data/data.raw/travelclick/'
391 | from d6tstack.read_excel_adv import read_excel_advanced
392 | cfg_path = cfg_path_folder_base+'predict/STR Rolling Weekly Since 9-11-01 to 4-14-18 values weekly.xlsx'
393 | df_str=read_excel_advanced(cfg_path, header_xls_start="A7", header_xls_end="D7",remove_blank_cols=True,remove_blank_rows=True)
394 | df_str['STAY_WEEK'] = df_str['Date']-pd.DateOffset(days=6)
395 | df_str.head()
396 |
397 | df_alltier2 = pd.read_excel(cfg_path_folder_base + 'predict/travelcity-revpar-unsorted.xlsx')
398 | sj = d6tjoin.smart_join.FuzzyJoinTop1([df_alltier2,df_str],fuzzy_keys=['STAY_WEEK'])
399 | sj._gen_match_top1(0)
400 |
401 | # fiddle()
402 |
403 | # test_fakedata_multikey_iddate()
--------------------------------------------------------------------------------
/tests/test_top1.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | pd.set_option('display.expand_frame_repr', False)
4 | import importlib
5 | import d6tjoin.top1
6 | import jellyfish
7 | from faker import Faker
8 |
9 | import tests.test_smartjoin
10 |
11 | def gen_df2_str():
12 | l1 = ['a', 'b']
13 | l2 = [l1[0], 'ba', 'cd']
14 | df1 = pd.DataFrame({'id':l1*4})
15 | df2 = pd.DataFrame({'id':l2*4})
16 | df1['v1']=range(df1.shape[0])
17 | df2['v2']=range(df2.shape[0])
18 | return df1, df2
19 |
20 | def gen_df2_num():
21 | l1 = [1,2]
22 | l2 = [l1[0],1.1,1.2]
23 | df1 = pd.DataFrame({'id': l1 * 4})
24 | df2 = pd.DataFrame({'id': l2 * 4})
25 | return df1, df2
26 |
27 |
28 | def test_top1_gen_candidates():
29 |
30 | def helper(df1, df2):
31 |
32 | dfr = d6tjoin.top1.MergeTop1Diff(df1, df2,'id','id',jellyfish.levenshtein_distance)._allpairs_candidates()
33 | assert dfr.shape==(4, 3)
34 | assert (dfr['__top1left__'].values[0]==df1['id'].values[0])
35 | assert np.all(dfr['__top1left__'].values[1:]==df1['id'].values[1])
36 | assert (dfr['__top1right__'].values[0]==df1['id'].values[0])
37 | assert (dfr['__top1right__']==df2['id'].values[1]).sum()==1
38 | assert (dfr['__top1right__']==df2['id'].values[2]).sum()==1
39 | assert (dfr['__matchtype__']=='exact').sum()==1
40 | assert (dfr['__matchtype__']=='top1 left').sum()==3
41 |
42 | df1, df2 = gen_df2_str()
43 | helper(df1, df2)
44 |
45 | df1, df2 = gen_df2_num()
46 | helper(df1, df2)
47 |
48 |
49 | def test_top1_str():
50 |
51 | df1, df2 = gen_df2_str()
52 |
53 | r = d6tjoin.top1.MergeTop1Diff(df1, df2,'id','id',jellyfish.levenshtein_distance).merge()
54 | dfr = r['top1']
55 | assert dfr['__top1diff__'].min()==0
56 | assert dfr['__top1diff__'].max()==1
57 | assert dfr.shape==(3, 4)
58 | dfr = r['merged']
59 | assert dfr.shape==(48, 4)
60 | assert np.all(dfr.groupby('id').size().values==np.array([16, 32]))
61 |
62 | df1, df2 = tests.test_smartjoin.gen_multikey_complex(unmatched_date=False)
63 | r = d6tjoin.top1.MergeTop1Diff(df1, df2,'key','key',jellyfish.levenshtein_distance,['date'],['date']).merge()
64 | dfr = r['merged']
65 | assert dfr.shape==(18, 5)
66 | assert np.all(dfr.groupby(['date','key']).size().values==np.array([1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]))
67 |
68 | df1.head()
69 | df1.merge(df2, on=['date','key']).head()
70 | dfr.head()
71 |
72 | def test_top1_num():
73 |
74 | df1, df2 = tests.test_smartjoin.gen_multikey_complex(unmatched_date=True)
75 | r = d6tjoin.top1.MergeTop1Number(df1, df2,'date','date',is_keep_debug=True).merge()
76 | dfr = r['top1']
77 | assert dfr.shape==(4, 4)
78 | assert np.all(dfr.groupby('__matchtype__').size().values==np.array([2, 2]))
79 | assert dfr['__top1diff__'].dt.days.max()==2
80 | assert dfr['__top1diff__'].dt.days.min()==0
81 |
82 | df1, df2 = tests.test_smartjoin.gen_multikey_complex(unmatched_date=True)
83 | r = d6tjoin.top1.MergeTop1Number(df1, df2,'date','date',['key'],['key']).merge()
84 | dfr = r['merged']
85 | dfr.sort_values(['date','key'])
86 | r['top1'].sort_values(['__top1left__','key'])
87 | df1.sort_values(['key','date'])
88 | df2.sort_values(['key','date'])
89 | r['top1']
90 |
91 | def test_top1_multi():
92 |
93 | df1, df2 = tests.test_smartjoin.gen_multikey_complex(unmatched_date=True)
94 | df2['key'] = 'Mr. '+df1['key']
95 |
96 | r = d6tjoin.top1.MergeTop1(df1, df2,['date','key'],['date','key']).merge()
97 |
98 |
99 | assert True
100 |
101 |
102 | def test_top1_examples():
103 | import uuid
104 | import itertools
105 |
106 | # ******************************************
107 | # generate sample data
108 | # ******************************************
109 | nobs = 10
110 | # todo: set uuid seed
111 | # todo: only pick first 2 blocks
112 | f1 = Faker()
113 | f1.seed(0)
114 | uuid1 = [str(f1.uuid4()).split('-')[0] for _ in range(nobs)]
115 | dates1 = pd.date_range('1/1/2010', '1/1/2011')
116 | dates2 = pd.bdate_range('1/1/2010', '1/1/2011') # business instead of calendar dates
117 |
118 | df1 = pd.DataFrame(list(itertools.product(uuid1, dates1)), columns=['id', 'date'])
119 | df1['v'] = np.random.sample(df1.shape[0])
120 | df2 = df1.copy()
121 | df2['id'] = df1['id'].str[1:-1]
122 |
123 | # r = d6tjoin.top1.MergeTop1Number(df1, df2, 'id', 'id', ['date'], ['date']).merge()
124 | # assert raises ValueError => should check the key column is numeric before doing a number join (see guard sketch below)
125 |
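# Illustrative guard only, not the d6tjoin implementation: the note above says a
# number-based merge should verify the key column is numeric (or datetime) before
# joining. A minimal check with plain pandas could look like this; the helper
# name is hypothetical.
def _assert_number_key(df, col):
    if not (pd.api.types.is_numeric_dtype(df[col])
            or pd.api.types.is_datetime64_any_dtype(df[col])):
        raise ValueError("column '%s' must be numeric or datetime for a number join" % col)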
126 | # r = d6tjoin.top1.MergeTop1Diff(df1, df2, 'id', 'id', jellyfish.levenshtein_distance, ['date'], ['date']).merge()
127 | # assert min()==2
128 | # assert diff no duplicates
129 | # assert diff found == substring
130 | # assert only 100 candidates (not 366*100)
131 |
132 | # r = d6tjoin.top1.MergeTop1(df1, df2, ['id'], ['id'], ['date'], ['date']).merge()
133 | # assert merged==merged
134 | # assert diff==diff
135 |
136 | # dates2 = pd.bdate_range('1/1/2010', '1/1/2011') # business instead of calendar dates
137 | # df2 = pd.DataFrame(list(itertools.product(uuid1, dates2)), columns=['id', 'date'])
138 | # df2['v'] = np.random.sample(df2.shape[0])
139 | # r = d6tjoin.top1.MergeTop1(df1, df2, ['date'], ['date'], ['id'], ['id']).merge()
140 | # # why does this cause an error?
141 | # r = d6tjoin.top1.MergeTop1(df1.head(), df2, ['date'], ['date'], ['id'], ['id']).merge()
142 |
143 | df2 = pd.DataFrame(list(itertools.product(uuid1, dates2)), columns=['id', 'date'])
144 | df2['v'] = np.random.sample(df2.shape[0])
145 | df2['id'] = df1['id'].str[1:-1]
146 |
147 | result = d6tjoin.top1.MergeTop1(df1, df2, ['date', 'id'], ['date', 'id']).merge()
148 | result['merged']
149 | # o=d6tjoin.top1.MergeTop1(df1, df2, ['date', 'id'], ['date', 'id'])
150 | # o.cfg_exact_left_on
151 | result = d6tjoin.top1.MergeTop1(df1, df2, ['date', 'id'], ['date', 'id']).merge()
152 |
153 | d6tjoin.utils.PreJoin([df1, df2], ['id', 'date']).stats_prejoin(print_only=False)
154 |
155 | assert True
156 |
157 |
158 | def fiddle_set():
159 |
160 | import pandas as pd
161 | import numpy as np
162 | import importlib
163 | import d6tjoin.top1
164 |
165 | import ciseau
166 | import scipy.spatial.distance
167 |
168 | df_db = pd.read_csv('~/database.csv',index_col=0)
169 |
170 | def diff_jaccard(a, b):
171 | # pad the shorter token vector so both have equal length
172 | a = np.pad(a, (0, max(0, len(b) - len(a))), 'constant', constant_values=(0, 0))
173 | b = np.pad(b, (0, max(0, len(a) - len(b))), 'constant', constant_values=(0, 0))
174 | return scipy.spatial.distance.jaccard(a, b)
175 |
176 | def strsplit(t):
177 | return [s for s in [s.replace(" ", "") for s in ciseau.tokenize(t)] if s not in ['.', ',', '-', ';', '(', ')']]
178 |
179 | importlib.reload(d6tjoin.top1)
180 | j = d6tjoin.top1.MergeTop1Diff(df_db.head(),df_db,'description','description',fun_diff=diff_jaccard,topn=2,fun_preapply=strsplit,fun_postapply=lambda x: ' '.join(x))
181 | j.merge()['merged']
182 |
183 |
184 | def test_multicore():
185 | nobs = 10
186 | f1 = Faker()
187 | f1.seed(0)
188 | uuid1 = [str(f1.uuid4()).split('-')[0] for _ in range(nobs)]
189 |
190 | df1 = pd.DataFrame(uuid1, columns=['id'])
191 | df1['val1'] = np.round(np.random.sample(df1.shape[0]), 3)
192 |
193 | # create mismatch
194 | df2 = df1.copy()
195 | df2['id'] = df1['id'].str[1:-1]
196 | df2['val2'] = np.round(np.random.sample(df2.shape[0]), 3)
197 |
198 |
199 | m = d6tjoin.top1.MergeTop1Diff(df1,df2,'id','id',fun_diff=jellyfish.levenshtein_distance)
200 | df_candidates = m._allpairs_candidates()
201 |
202 | idxSel = df_candidates['__matchtype__'] != 'exact'
203 | dfd2 = df_candidates.copy()
204 | dfd2.loc[idxSel,'__top1diff__'] = d6tjoin.top1._applyFunMulticore(df_candidates.loc[idxSel,'__top1left__'].values, df_candidates.loc[idxSel,'__top1right__'].values,jellyfish.levenshtein_distance)
205 |
206 | dfd1 = df_candidates.copy()
207 | dfd1.loc[idxSel, '__top1diff__'] = df_candidates[idxSel].apply(lambda x: jellyfish.levenshtein_distance(x['__top1left__'], x['__top1right__']), axis=1)
208 | assert dfd2.equals(dfd1)
209 |
210 | assert True
211 |
212 | '''
213 | multicore in caller class
214 | pass multicore on
215 | make ifelse multicore for every apply diff
216 |
217 | default yes?
218 | part of requirements
219 |
220 | update setup.py requirements
221 |
222 |
223 | '''
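# Illustrative sketch, not d6tjoin's actual _applyFunMulticore: the notes above are
# about pushing the pairwise diff computation onto multiple cores. One simple
# approach uses multiprocessing.Pool.starmap over the candidate pairs; fun_diff
# must be a picklable top-level function (e.g. jellyfish.levenshtein_distance).
def apply_diff_multicore(left_values, right_values, fun_diff, processes=4):
    from multiprocessing import Pool
    with Pool(processes=processes) as pool:
        # evaluate fun_diff(left, right) for each candidate pair in parallel
        return pool.starmap(fun_diff, zip(left_values, right_values))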
224 |
225 |
226 | test_top1_gen_candidates()
--------------------------------------------------------------------------------
/tests/tmp.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import importlib
3 |
4 | import d6tjoin
5 | import d6tjoin.utils
6 | importlib.reload(d6tjoin.utils)
7 |
8 | df1=pd.DataFrame({'v':list(range(10))*2,'g':['a']*10+['b']*10})
9 | df2=df1.copy()
10 |
11 | j = d6tjoin.PreJoin([df1,df2])
12 | j.str_describe()
13 | j.data_describe()
14 | j.columns_common()
15 | j.columns_ispresent()
16 | j.data_match()
17 |
18 | j = d6tjoin.PreJoin([df1,df2], print_only=False)
19 | r = j.data_match()
20 | dfc = {'__left__': {0: 'g', 1: 'v'},
21 | '__right__': {0: 'g', 1: 'v'},
22 | '__similarity__': {0: 1.0, 1: 1.0}}
23 | dfc = pd.DataFrame(dfc)
24 | assert r.equals(dfc)
25 | print(r)
26 |
27 | quit()
28 |
29 | df1=pd.DataFrame({'a':range(3),'b':range(3)})
30 | df2=pd.DataFrame({'a':range(3),'c':range(3)})
31 | df2=pd.DataFrame({'a':range(3),'b':range(3,6)})
32 | df2=pd.DataFrame({'a':range(3,6),'c':range(3)})
33 |
34 |
35 | j = d6tjoin.utils.BaseJoin([df1,df2],['a'])
36 |
37 | j = d6tjoin.utils.BaseJoin([df1,df2],['a','b'])
38 | j.keys
39 | dfr = j.stats_prejoin(return_results=True)
40 | dfr
41 | (~dfr['all matched']).all()
42 |
43 | j = d6tjoin.utils.BaseJoin([df1,df2],['a'])
44 | j.stats_prejoin(return_results=True).to_dict()
45 |
46 |
--------------------------------------------------------------------------------