├── .gitignore
├── MANIFEST.in
├── README.md
├── d6tjoin
│   ├── __init__.py
│   ├── pre.py
│   ├── smart_join.py
│   ├── top1.py
│   └── utils.py
├── docs
│   ├── Makefile
│   ├── make.bat
│   ├── samples.py
│   ├── shell-napoleon-html.sh
│   ├── shell-napoleon-recreate.sh
│   └── source
│       ├── conf.py
│       ├── d6tjoin.rst
│       ├── index.rst
│       ├── modules.rst
│       └── setup.rst
├── examples-prejoin.ipynb
├── examples-tokencluster.ipynb
├── examples-top1.ipynb
├── requirements-dev.txt
├── requirements.txt
├── setup.py
└── tests
    ├── __init__.py
    ├── test_pre_pd.py
    ├── test_smartjoin.py
    ├── test_top1.py
    └── tmp.py
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | .env
3 | temp/
4 | fiddle*
5 | .pytest_cache/
6 | tests/tmp-local.py
7 | tests/tmp*.py
8 |
9 | docs-examples/
10 |
11 | # Byte-compiled / optimized / DLL files
12 | __pycache__/
13 | *.py[cod]
14 | *$py.class
15 |
16 | # C extensions
17 | *.so
18 |
19 | # Distribution / packaging
20 | .Python
21 | build/
22 | develop-eggs/
23 | dist/
24 | downloads/
25 | eggs/
26 | .eggs/
27 | lib/
28 | lib64/
29 | parts/
30 | sdist/
31 | var/
32 | wheels/
33 | *.egg-info/
34 | .installed.cfg
35 | *.egg
36 | MANIFEST
37 |
38 | # PyInstaller
39 | # Usually these files are written by a python script from a template
40 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
41 | *.manifest
42 | *.spec
43 |
44 | # Installer logs
45 | pip-log.txt
46 | pip-delete-this-directory.txt
47 |
48 | # Unit test / coverage reports
49 | htmlcov/
50 | .tox/
51 | .coverage
52 | .coverage.*
53 | .cache
54 | nosetests.xml
55 | coverage.xml
56 | *.cover
57 | .hypothesis/
58 |
59 | # Translations
60 | *.mo
61 | *.pot
62 |
63 | # Django stuff:
64 | *.log
65 | .static_storage/
66 | .media/
67 | local_settings.py
68 |
69 | # Flask stuff:
70 | instance/
71 | .webassets-cache
72 |
73 | # Scrapy stuff:
74 | .scrapy
75 |
76 | # Sphinx documentation
77 | docs/_build/
78 |
79 | # PyBuilder
80 | target/
81 |
82 | # Jupyter Notebook
83 | .ipynb_checkpoints
84 |
85 | # pyenv
86 | .python-version
87 |
88 | # celery beat schedule file
89 | celerybeat-schedule
90 |
91 | # SageMath parsed files
92 | *.sage.py
93 |
94 | # Environments
95 | .env
96 | .venv
97 | env/
98 | venv/
99 | ENV/
100 | env.bak/
101 | venv.bak/
102 |
103 | # Spyder project settings
104 | .spyderproject
105 | .spyproject
106 |
107 | # Rope project settings
108 | .ropeproject
109 |
110 | # mkdocs documentation
111 | /site
112 |
113 | # mypy
114 | .mypy_cache/
115 |
116 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md
2 | include LICENSE
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Databolt Smart Join
2 |
3 | Easily join different datasets without writing custom code. Does best-match joins on strings, dates and numbers. For example, you can quickly join similar but not identical stock tickers, addresses, names and dates without manual processing.
4 |
5 | ## Installation
6 |
7 | **0.2.x is currently in beta. The GitHub master branch is the latest development version. The docs refer to versions <0.2.0.**
8 | 
9 | We recommend using the latest version from GitHub: `pip install git+https://github.com/d6t/d6tjoin.git`
10 | 
11 | If you cannot install from GitHub, use the latest published version: `pip install d6tjoin`. To update, run `pip install d6tjoin -U --no-deps`
12 |
13 | We recommend [AffineGap](https://github.com/dedupeio/affinegap), which is not an official requirement; you can install it with `pip install affinegap`.
14 |
15 | For the `jellyfish` library, make sure the C implementation is working, otherwise `d6tjoin` will be very slow. You can check whether the C version is installed by running `import jellyfish.cjellyfish`. If you don't have a C compiler, you can run `conda install -c conda-forge jellyfish`.
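
A quick way to check (a minimal sketch, not part of the library):

```
# succeeds only if the fast C extension of jellyfish was built
try:
    import jellyfish.cjellyfish
    print('C implementation available - string joins will be fast')
except ImportError:
    print('pure-Python jellyfish detected - string joins will be slow')
```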
16 |
17 | ## Sample Use
18 |
19 | ```
20 |
21 | import d6tjoin.top1
22 | import d6tjoin.utils
23 | import d6tjoin
24 |
25 | #************************
26 | # pre join diagnostics
27 | #************************
28 |
29 | # check join quality => none of the ids match
30 |
31 | d6tjoin.Prejoin([df1,df2],['id','date']).match_quality()
32 |
33 | key left key right all matched inner left right outer unmatched total unmatched left unmatched right
34 | 0 id id False 0 10 10 20 20 10 10
35 | 1 date date True 366 366 366 366 0 0 0
36 | 2 __all__ __all__ False 0 3660 3660 7320 7320 3660 3660
37 |
38 | #************************
39 | # best match join on id
40 | #************************
41 |
42 | result = d6tjoin.top1.MergeTop1(df1,df2,fuzzy_left_on=['id'],fuzzy_right_on=['id'],
43 | exact_left_on=['date'],exact_right_on=['date']).merge()
44 |
45 | result['merged'].head(2)
46 |
47 | date id val1 id_right val1_right val2
48 | 0 2010-01-01 e3e70682 0.020 3e7068 0.020 0.034
49 | 1 2010-01-01 f728b4fa 0.806 728b4f 0.806 0.849
50 |
51 | #************************
52 | # debug best matches
53 | #************************
54 |
55 | result['top1']['id'].head(2)
56 |
57 | date __top1left__ __top1right__ __top1diff__ __matchtype__
58 | 10 2010-01-01 e3e70682 3e7068 2 top1 left
59 | 34 2010-01-01 e443df78 443df7 2 top1 left
60 |
61 | #************************
62 | # customize similarity fct
63 | #************************
64 | import affinegap
65 |
66 | result = d6tjoin.top1.MergeTop1(df1,df2,fuzzy_left_on=['id'],fuzzy_right_on=['id'],
67 | fun_diff=[affinegap.affineGapDistance]).merge()
68 |
69 | #************************
70 | # token-based substring clusters and joins
71 | #************************
72 | dftoken=d6tjoin.utils.splitcharTokenCount(df2['id'])
73 |
74 | word count
75 | 0 Equity 7
76 | 1 US 5
77 | 2 NA 2
78 | 3 PRIVATE 2
79 |
80 |
81 | d6tjoin.utils.unique_contains(df2['id'], dftoken['word'].values)
82 | >>> [('Equity', ['AAPL_US_Equity', 'AAP_US_Equity', 'AD_NA_Equity', 'AMZN_US_Equity', 'APRN_US_Equity', 'BBY_US_Equity', 'BMW_NA_Equity']), ('US', ['AAPL_US_Equity', 'AAP_US_Equity', 'AMZN_US_Equity', 'APRN_US_Equity', 'BBY_US_Equity']), ('NA', ['AD_NA_Equity', 'BMW_NA_Equity']), ('PRIVATE', ['PRIVATE_JLP', 'PRIVATE_NMG'])]
83 |
84 | import re
85 | splitchars="[^a-zA-Z0-9]+"
86 | def tokenmatch(s1,s2):
87 | return 3-len(set(re.split(splitchars,s1)) & set(re.split(splitchars,s2)))
88 |
89 | d6tjoin.top1.MergeTop1(df1,df2,fuzzy_left_on=['id'],fuzzy_right_on=['id'], fun_diff=[tokenmatch], top_limit=[2]).merge()['top1']['id']
90 |
91 | __top1left__ __top1right__ __matchtype__ __top1diff__
92 | 0 AAP AAP_US_Equity top1 left 2
93 | 1 AAPL AAPL_US_Equity top1 left 2
94 | 2 AMZN-AMZN AMZN_US_Equity top1 left 2
95 | 3 APRN APRN_US_Equity top1 left 2
96 | 4 JLP PRIVATE_JLP top1 left 2
97 | 5 NMG PRIVATE_NMG top1 left 2
98 |
99 | ```
100 |
101 | ## Features
102 | Enhances the `pd.merge()` function with:
103 | * Pre-join diagnostics to identify mismatched join keys (see the sketch below)
104 | * Best match joins that find the top1 most similar value
105 | * Quickly join stock identifiers, addresses and names without manual processing
106 | * Ability to customize similarity functions, set a maximum difference and other advanced features
107 |
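As a minimal sketch of the pre-join diagnostics (assuming `df1` and `df2` as in the sample above):

```
pre = d6tjoin.Prejoin([df1, df2], ['id', 'date'])
pre.match_quality()             # summary table of matched vs unmatched join keys
dfs = pre.show_unmatched('id')  # sample records whose 'id' did not match, per side
```
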
108 | ## Documentation
109 |
110 | * [PreJoin examples notebook](https://github.com/d6t/d6tjoin/blob/master/examples-prejoin.ipynb) - Examples for diagnosing join problems
111 | * [MergeTop1 notebook](https://github.com/d6t/d6tjoin/blob/master/examples-top1.ipynb) - Best match join examples notebook
112 | * [Token substring join notebook](https://github.com/d6t/d6tjoin/blob/master/examples-tokencluster.ipynb) - Find common substrings and join on token substrings
113 | * [Official docs](http://d6tjoin.readthedocs.io/en/latest/py-modindex.html) - Detailed documentation for modules, classes, functions
114 |
115 | ## Pro version
116 |
117 | Additional features:
118 | * Join >2 dataframes
119 | * Automatic Content-based similarity joins
120 | * Advanced join quality checks
121 | * Fast approximations for big data
122 |
123 | [Request demo](https://pipe.databolt.tech/gui/request-premium/)
124 |
125 | ## Faster Data Engineering
126 |
127 | Check out other d6t libraries to solve common data engineering problems, including
128 | * data ingest, quickly ingest raw data
129 | * fuzzy joins, quickly join data
130 | * data pipes, quickly share and distribute data
131 |
132 | https://github.com/d6t/d6t-python
133 |
134 | We also encourage you to follow the Databolt blog for updates and tips + tricks: http://blog.databolt.tech
--------------------------------------------------------------------------------
/d6tjoin/__init__.py:
--------------------------------------------------------------------------------
1 | # import d6tjoin.top1
2 | import d6tjoin.utils
3 |
4 | from d6tjoin.pre import Prejoin
5 | pd = Prejoin  # short alias: d6tjoin.pd refers to the Prejoin class
--------------------------------------------------------------------------------
/d6tjoin/pre.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 | import itertools, warnings
3 |
4 | import pandas as pd
5 | pd.set_option('display.expand_frame_repr', False)
6 | import numpy as np
7 |
8 | from d6tstack.helpers import *
9 | from scipy.stats import mode
10 |
11 |
12 | # ******************************************
13 | # utils
14 | # ******************************************
15 | def head(dfs, nrows=1000):
16 | return [dfg.head(nrows) for dfg in dfs]
17 |
18 | # ******************************************
19 | # prejoin stats class
20 | # ******************************************
21 |
22 | class Prejoin(object):
23 | """
24 | Analyze, slice & dice join keys and dataframes before joining. Useful for checking how good a join will be and quickly looking at unmatched join keys.
25 |
26 | Args:
27 | dfs (list): list of data frames to join
28 | keys (var): either list of strings `['a','b']` if join keys have the same names in all dataframes or list of lists if join keys are different across dataframes `[[leftkeys],[rightkeys]]`, eg `[['left1','left2'],['right1','right2']]`
29 | keys_bydf (bool): if False, specify multi-key join keys by join level eg `[['left1','right1'],['left2','right2']]`
30 | nrows (int): for `df.head(nrows)`
31 | print_only (bool): if False return results instead of printing
32 | """
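# A minimal usage sketch (hypothetical column names, not executed):
#   Prejoin([df1, df2], keys=['id', 'date'])                                             # same key names in both dataframes
#   Prejoin([df1, df2], keys=[['id_l', 'date_l'], ['id_r', 'date_r']])                   # keys listed per dataframe (default)
#   Prejoin([df1, df2], keys=[['id_l', 'id_r'], ['date_l', 'date_r']], keys_bydf=False)  # keys listed per join level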
33 |
34 | def __init__(self, dfs, keys=None, keys_bydf=True, nrows=5, print_only=True):
35 |
36 | # inputs dfs
37 | self._init_dfs(dfs)
38 |
39 | if keys is not None:
40 | self.set_keys(keys, keys_bydf)
41 | else:
42 | self.keys = None; self.keysdf = None
43 |
44 | self.nrows = nrows
45 | self.print_only = print_only
46 |
47 | # df heads
48 | self.dfshead = [dfg.head(nrows) for idx, dfg in self._enumerate_dfs()]
49 |
50 | # init column scan
51 | self.columns_sniff()
52 |
53 | def _init_dfs(self, dfs):
54 | # check and save dfs
55 | if len(dfs)<2:
56 | raise ValueError('Need to pass at least 2 dataframes')
57 |
58 | if len(dfs)>2:
59 | raise NotImplementedError('Only handles 2 dataframes for now')
60 |
61 | self.dfs = dfs
62 | self.cfg_ndfs = len(dfs)
63 |
64 | def _enumerate_dfs(self):
65 | if self.keys is None:
66 | return enumerate(self.dfs)
67 | else:
68 | return [(idx, dfg[self.keysdf[idx]]) for idx, dfg in enumerate(self.dfs)]
69 |
70 | def set_keys(self, keys, keys_bydf=True):
71 | # check and save join keys
72 | self._check_keys(keys)
73 | keys, keysdf = self._prep_keys(keys, keys_bydf)
74 | self._check_keysdfs(keys, keysdf)
75 | # join keys
76 | self.cfg_njoins = len(keysdf[0])
77 | self.keys = keys # keys by join level
78 | self.keysall = keys + [['__all__'] * len(self.dfs)]
79 | self.keysdf = keysdf # keys by df
80 | self.keysdfall = keysdf + [['__all__']] * len(self.dfs)
81 | self.uniques = [] # set of unique values for each join key individually
82 | self.keysets = [] # set of unique values for all join keys together __all__
83 |
84 | return keys, keysdf
85 |
86 | def _check_keys(self, keys):
87 | if not keys or len(keys)<1:
88 | raise ValueError("Need to have join keys")
89 | # todo: no duplicate join keys passed
90 |
91 | def _check_keysdfs(self, keys, keysdf):
92 | if not all([len(k)==len(self.dfs) for k in keys]):
93 | raise ValueError("Need to provide join keys for all dataframes")
94 |
95 | for idf,dfg in enumerate(self.dfs): # check that keys present in dataframe
96 | missing = set(keysdf[idf]).difference(dfg.columns)
97 | if missing:
98 | raise KeyError(f'Columns missing in df#{idf}: {missing}')
99 |
100 | def _prep_keys(self, keys, keys_bydf):
101 | # deal with empty keys
102 | if not keys:
103 | return [], []
104 |
105 | # get keys in correct format given user input
106 | if isinstance(keys[0], (str,)):
107 | keysdf = [keys]*len(self.dfs)
108 | keys = list(map(list, zip(*keysdf)))
109 |
110 | elif isinstance(keys[0], (list,)):
111 | keysdf = list(map(list, zip(*keys)))
112 |
113 | if keys_bydf:
114 | keys, keysdf = keysdf, keys
115 |
116 | else:
117 | raise ValueError("keys need to be either list of strings or list of lists")
118 |
119 | return keys, keysdf
120 |
121 | def _return(self, result):
122 | if self.print_only:
123 | print(result)
124 | else:
125 | return result
126 |
127 | def _returndict(self, result):
128 | if self.print_only:
129 | for idx,d in result.items():
130 | print(f'dataframe #{idx}')
131 | print(d)
132 | else:
133 | return result
134 |
135 | def columns_sniff(self):
136 | # from d6tstack
137 | # todo: modularize d6tstack
138 | # todo: rewrite scipy mode function
139 |
140 | dfl_all = self.dfshead
141 | fname_list = range(len(self.dfs))
142 |
143 | # process columns
144 | dfl_all_col = [df.columns.tolist() for df in dfl_all]
145 | col_files = dict(zip(fname_list, dfl_all_col))
146 | col_common = list_common(list(col_files.values()))
147 | col_all = list_unique(list(col_files.values()))
148 |
149 | # find index in column list so can check order is correct
150 | df_col_present = {}
151 | for iFileName, iFileCol in col_files.items():
152 | df_col_present[iFileName] = [iCol in iFileCol for iCol in col_all]
153 |
154 | df_col_present = pd.DataFrame(df_col_present, index=col_all).T
155 | df_col_present.index.names = ['file_path']
156 |
157 | # find index in column list so can check order is correct
158 | df_col_idx = {}
159 | for iFileName, iFileCol in col_files.items():
160 | df_col_idx[iFileName] = [iFileCol.index(iCol) if iCol in iFileCol else np.nan for iCol in col_all]
161 | df_col_idx = pd.DataFrame(df_col_idx, index=col_all).T
162 |
163 | # order columns by where they appear in file
164 | m=mode(df_col_idx,axis=0)
165 | df_col_pos = pd.DataFrame({'o':m[0][0],'c':m[1][0]},index=df_col_idx.columns)
166 | df_col_pos = df_col_pos.sort_values(['o','c'])
167 | df_col_pos['iscommon']=df_col_pos.index.isin(col_common)
168 |
169 |
170 | # reorder by position
171 | col_all = df_col_pos.index.values.tolist()
172 | col_common = df_col_pos[df_col_pos['iscommon']].index.values.tolist()
173 | col_unique = df_col_pos[~df_col_pos['iscommon']].index.values.tolist()
174 | df_col_present = df_col_present[col_all]
175 | df_col_idx = df_col_idx[col_all]
176 |
177 | sniff_results = {'files_columns': col_files, 'columns_all': col_all, 'columns_common': col_common,
178 | 'columns_unique': col_unique, 'is_all_equal': columns_all_equal(dfl_all_col),
179 | 'df_columns_present': df_col_present, 'df_columns_order': df_col_idx}
180 |
181 | self.sniff_results = sniff_results
182 |
183 |
184 | def _calc_keysets(self):
185 |
186 | self.keysets = [] # reset
187 |
188 | # find set of unique values for each join key
189 | for idx, dfg in enumerate(self.dfs):
190 |
191 | # keys individually
192 | uniquedict = OrderedDict()
193 | for key in self.keysdf[idx]:
194 | v = dfg[key].unique()
195 | uniquedict[key] = set(v[~pd.isnull(v)])
196 |
197 | # keys _all__
198 | dft = dfg[self.keysdf[idx]].drop_duplicates()
199 | uniquedict['__all__'] = {tuple(x) for x in dft.values}
200 | self.uniques.append(uniquedict)
201 |
202 | # perform set logic
203 | for keys in self.keysall:
204 | df_key = {}
205 | df_key['key left'] = keys[0]
206 | df_key['key right'] = keys[1]
207 | df_key['keyset left'] = self.uniques[0][df_key['key left']]
208 | df_key['keyset right'] = self.uniques[1][df_key['key right']]
209 |
210 | df_key['inner'] = df_key['keyset left'].intersection(df_key['keyset right'])
211 | df_key['outer'] = df_key['keyset left'].union(df_key['keyset right'])
212 | df_key['unmatched total'] = df_key['keyset left'].symmetric_difference(df_key['keyset right'])
213 | df_key['unmatched left'] = df_key['keyset left'].difference(df_key['keyset right'])
214 | df_key['unmatched right'] = df_key['keyset right'].difference(df_key['keyset left'])
215 |
216 | # check types are consistent
217 | vl = next(iter(df_key['keyset left'])) # take first element
218 | vr = next(iter(df_key['keyset right'])) # take first element
219 |
220 | df_key['value type'] = type(vl)
221 |
222 | self.keysets.append(df_key)
223 |
224 | def head(self, nrows=None):
225 | """
226 | .head() of input dataframes
227 |
228 | Args:
229 | nrows (int): number of rows to show; if None, uses the nrows
230 | passed to the constructor. Dataframes are restricted to the
231 | join key columns when join keys are set
232 |
233 | """
234 | if nrows is None:
235 | result = {idx: dfg for idx, dfg in enumerate(self.dfshead)}
236 | else:
237 | result = {idx: dfg.head(nrows) for idx, dfg in self._enumerate_dfs()}
238 | return self._returndict(result)
239 |
240 | def columns_common(self):
241 | return self._return(self.sniff_results['columns_common'])
242 |
243 | def columns_all(self):
244 | return self._return(self.sniff_results['columns_all'])
245 |
246 | def columns_ispresent(self, as_bool=False):
247 | # todo: maintain column order of first dataframe => take from d6tstack
248 | col_union = list(set().union(*[dfg.columns.tolist() for dfg in self.dfs]))
249 | dfr = dict(zip(range(self.cfg_ndfs),[dfg.columns.isin(col_union) for dfg in self.dfs]))
250 | dfr = pd.DataFrame(dfr,index=col_union).sort_index()
251 | if not as_bool:
252 | dfr = dfr.replace([True,False],['+','-'])
253 | return self._return(dfr)
254 |
255 | def describe(self, **kwargs):
256 | """
257 | .describe() of input dataframes
258 |
259 | Args:
260 | kwargs (misc): to pass to .describe()
261 |
262 | """
263 | result = {idx: dfg.describe(**kwargs) for idx, dfg in self._enumerate_dfs()}
264 | return self._returndict(result)
265 |
266 | def shape(self):
267 | """
268 | .shape of input dataframes
269 |
270 | Returns:
271 | dict: `.shape` of each input dataframe, keyed by dataframe index
272 |
273 | """
274 | result = {idx: dfg.shape for idx, dfg in self._enumerate_dfs()}
275 | return self._returndict(result)
276 |
277 | def describe_str(self, unique_count=False):
278 | """
279 | Returns statistics on the length of all strings and other objects in the input dataframes. Statistics include median, mean, min, max and record count, with an optional unique count.
280 | 
281 | Args:
282 | unique_count (:obj:`bool`, optional): include count of unique
283 | values for each column. Only object (string) columns
284 | are analyzed
285 | 
286 | Returns:
287 | dict: dataframe of string length statistics for each input dataframe
288 | """
289 | def _apply_strlen(dfg, unique_count=False):
290 | lenv = np.vectorize(len)
291 | alens = lenv(dfg.values)
292 | r = {'median':np.median(alens),'mean':np.mean(alens),'min':np.min(alens),'max':np.max(alens),'nrecords':dfg.shape[0]}
293 | if unique_count:
294 | r['uniques'] = len(dfg.unique())
295 | return pd.Series(r)
296 |
297 | result = {}
298 | for idx, dfg in enumerate(self.dfs):
299 | if unique_count:
300 | cfg_col_sel = ['median','min','max','nrecords','uniques']
301 | else:
302 | cfg_col_sel = ['median','min','max','nrecords']
303 | dfo = dfg.select_dtypes(include=['object']).apply(lambda x: _apply_strlen(x.dropna(), unique_count)).T[cfg_col_sel]
304 | result[idx] = dfo
305 | return self._returndict(result)
306 |
307 | def describe_data(self, ignore_value_columns=False):
308 | result = {}
309 | for idx, dfg in enumerate(self.dfs):
310 |
311 | if ignore_value_columns:
312 | columns_sel = dfg.select_dtypes(include=['object']).columns
313 | else:
314 | columns_sel = dfg.columns
315 |
316 | nunique = dfg[columns_sel].apply(lambda x: x.dropna().unique().shape[0]).rename('unique')
317 | nrecords = dfg[columns_sel].apply(lambda x: x.dropna().shape[0]).rename('nrecords')
318 | nnan = dfg[columns_sel].isna().sum().rename('nan')
319 | dfr = pd.concat([nrecords, nunique, nnan], axis=1)
320 | dfr['unique rate'] = dfr['unique']/dfr['nrecords']
321 | result[idx] = dfr
322 |
323 | return self._returndict(result)
324 |
325 | def data_match(self, how=None, topn=1, ignore_value_columns=True, max_unique_pct=0.8, min_unique_count=1, min_match_rate=0.5):
326 | '''
327 | todo:
328 | order matters, sequential inner or left joins (no right or outer joins)
329 | jaccard 1:2 => intersection for inner, same set for left
330 |
331 | '''
332 | how = 'inner' if how is None else how
333 |
334 | if self.cfg_ndfs >2:
335 | warnings.warn('Upgrade to PRO version to join >2 dataframes')
336 |
337 | from d6tjoin.utils import _filter_group_min
338 |
339 | if ignore_value_columns:
340 | df_left, df_right = [dfg.select_dtypes(include=['object']) for _, dfg in self._enumerate_dfs()]
341 | print('ignored columns (value type)', 'left:',set(self.dfs[0].columns)-set(df_left.columns), 'right:', set(self.dfs[1].columns)-set(df_right.columns))
342 | else:
343 | df_left, df_right = [dfg for _, dfg in self._enumerate_dfs()]
344 |
345 | def unique_dict(dfg):
346 | d = dict(zip(dfg.columns, [set(dfg[x].dropna().unique()) for x in dfg.columns]))
347 | d = {k: v for k, v in d.items() if (len(v) > min_unique_count) and (len(v)/dfg[k].shape[0] <= max_unique_pct)}
348 | return d
349 |
350 | # todo: add len(key) and sample=next(key)
351 | values_left = unique_dict(df_left)
352 | values_right = unique_dict(df_right)
353 | values_left_ignored = set(df_left.columns)-set(values_left.keys())
354 | values_right_ignored = set(df_right.columns)-set(values_right.keys())
355 | if values_left_ignored: print('ignored columns (unique count)', 'left:', values_left_ignored)
356 | if values_right_ignored: print('ignored columns (unique count)', 'right:', values_right_ignored)
357 |
358 | df_candidates = list(itertools.product(values_left.keys(), values_right.keys()))
359 | df_candidates = pd.DataFrame(df_candidates, columns=['__left__', '__right__'])
360 |
361 | def jaccard_similarity(s1, s2, how):
362 | intersection = len(s1.intersection(s2))
363 | if how=='left':
364 | ratio = float(intersection / len(s1))
365 | else:
366 | union = (len(s1) + len(s2)) - intersection
367 | ratio = float(intersection / union)
368 | return ratio
369 |
370 | def jaccard_caller(col_left, col_right):
371 | return jaccard_similarity(values_left[col_left], values_right[col_right], how)
372 |
373 | df_candidates['__similarity__'] = df_candidates.apply(lambda x: jaccard_caller(x['__left__'], x['__right__']), axis=1)
374 | df_candidates = df_candidates.dropna(subset=['__similarity__'])
375 | if df_candidates.empty:
376 | raise ValueError('Failed to compute meaningful similarity, might need to loosen parameters')
377 | df_candidates['__similarity__'] = -df_candidates['__similarity__']
378 | df_diff = df_candidates.groupby('__left__',group_keys=False).apply(lambda x: _filter_group_min(x,'__similarity__',topn)).reset_index(drop=True)
379 | df_diff['__similarity__'] = -df_diff['__similarity__']
380 |
381 | df_diff['__left-sample__'] = df_diff['__left__'].map(lambda x: next(iter(values_left[x]),None))
382 | df_diff['__right-sample__'] = df_diff['__right__'].map(lambda x: next(iter(values_right[x]),None))
383 | df_diff['__left-nunique__'] = df_diff['__left__'].map(lambda x: len(values_left[x]))
384 | df_diff['__right-nunique__'] = df_diff['__right__'].map(lambda x: len(values_right[x]))
385 |
386 | if min_match_rate is not None:
387 | df_diff = df_diff[df_diff['__similarity__']>min_match_rate]
388 |
389 | # todo: sort by left df columns and then by similarity descending
390 |
391 | return self._return(df_diff)
392 |
393 | def data_similarity(self, how=None, columns=None):
394 | # goal: which columns data is most "similar"
395 | # todo: run similarity function show median/min/max similarity across columns
396 | # similarity on all vs all values?
397 | # find the top1/n similarity for each value. median across all values
398 | # above is strings. for numbers and dates:
399 | # numbers: "same distribution" => distribution similarity
400 | # dates: "same distribution" => distribution similarity
401 | # distribution similarity: non-parametric. interquartile range similar
402 | # want to find join keys not join value columns
403 | #
404 |
405 | raise NotImplementedError()
406 |
407 |
408 | def match_quality(self, rerun=False):
409 | """
410 | Show prejoin statistics
411 |
412 | Args:
413 | rerun (bool): recompute key sets even if already calculated
414 |
415 | """
416 |
417 | if not self.keysets or rerun:
418 | self._calc_keysets()
419 |
420 | df_out = []
421 |
422 | for key_set in self.keysets:
423 | df_key = {}
424 | for k in ['keyset left','keyset right','inner','outer','unmatched total','unmatched left','unmatched right']:
425 | df_key[k] = len(key_set[k])
426 | for k in ['key left','key right']:
427 | df_key[k] = key_set[k]
428 | df_key['all matched'] = df_key['inner']==df_key['outer']
429 | df_out.append(df_key)
430 |
431 | df_out = pd.DataFrame(df_out)
432 | df_out = df_out.rename(columns={'keyset left':'left','keyset right':'right'})
433 | df_out = df_out[['key left','key right','all matched','inner','left','right','outer','unmatched total','unmatched left','unmatched right']]
434 |
435 | return self._return(df_out)
436 |
437 | def is_all_matched(self, key='__all__',rerun=False):
438 |
439 | if not self.keysets or rerun:
440 | self._calc_keysets()
441 |
442 | keymask = [key in e for e in self.keysall]
443 | if not any(keymask):
444 | raise ValueError(f"key {key} not a join key in {self.keys}")
445 | ilevel = keymask.index(True)
446 |
447 | return (self.keysets[ilevel]['key left']==key or self.keysets[ilevel]['key right']==key) and len(self.keysets[ilevel]['unmatched total'])==0
448 |
449 | def _show_prep_df(self, idf, mode):
450 | """
451 | PRIVATE. prepare data for self.show() functions
452 |
453 | Args:
454 | idf (int): which df in self.dfs
455 | mode (str): matched vs unmatched
456 |
457 | """
458 |
459 | if idf==0:
460 | side='left'
461 | elif idf==1:
462 | side='right'
463 | else:
464 | raise ValueError('invalid idx')
465 |
466 | if self.cfg_show_keys_only:
467 | if self.cfg_show_key == '__all__':
468 | cfg_col_sel = self.keysdf[idf]
469 | else:
470 | cfg_col_sel = self.cfg_show_key
471 | else:
472 | cfg_col_sel = self.dfs[idf].columns
473 |
474 | # which set to return?
475 | if mode=='matched':
476 | cfg_mode_sel = 'inner'
477 | elif mode=='unmatched':
478 | cfg_mode_sel = mode + ' ' + side
479 | else:
480 | raise ValueError('invalid mode', mode)
481 |
482 | keys = list(self.keysets[self.cfg_show_level][cfg_mode_sel])
483 | if self.cfg_show_nrecords > 0:
484 | keys = keys[:self.cfg_show_nrecords]
485 |
486 | if self.cfg_show_key == '__all__' and self.cfg_njoins>1:
487 | dfg = self.dfs[idf].copy()
488 | dfg = self.dfs[idf].reset_index().set_index(self.keysdf[idf])
489 | dfg = dfg.loc[keys]
490 | dfg = dfg.reset_index().sort_values('index')[cfg_col_sel].reset_index(drop=True) # reorder to original order
491 | elif self.cfg_show_key == '__all__' and self.cfg_njoins==1:
492 | dfg = self.dfs[idf]
493 | dfg = dfg.loc[dfg[self.keysdf[idf][0]].isin([e[0] for e in keys]), cfg_col_sel]
494 | else:
495 | dfg = self.dfs[idf]
496 | dfg = dfg.loc[dfg[self.cfg_show_key].isin(keys),cfg_col_sel]
497 |
498 | if self.cfg_show_nrows > 0:
499 | dfg = dfg.head(self.cfg_show_nrows)
500 |
501 | if self.cfg_show_print_only:
502 | print('%s %s for key %s' %(mode, side, self.cfg_show_key))
503 | print(dfg)
504 | else:
505 | self.df_show_out[side] = dfg.copy()
506 |
507 | def _show(self, mode):
508 | if not self.keysets:
509 | raise RuntimeError('run .match_quality() first')
510 |
511 | keymask = [self.cfg_show_key in e for e in self.keysall]
512 | if not any(keymask):
513 | raise ValueError(f"key {self.cfg_show_key} not a join key in {self.keys}")
514 | self.cfg_show_level = keymask.index(True)
515 |
516 | for idf in range(self.cfg_ndfs): # run for all self.dfs
517 | if self.keysall[self.cfg_show_level][idf] == self.cfg_show_key: # check if key applies
518 | self._show_prep_df(idf, mode)
519 |
520 | def show_unmatched(self, key, nrecords=3, nrows=3, keys_only=False, print_only=False):
521 | """
522 | Show unmatched records
523 |
524 | Args:
525 | key (str): join key
526 | nrecords (int): number of unmatched records
527 | nrows (int): number of rows
528 | keys_only (bool): show only join keys
529 | print_only (bool): if false return results instead of printing
530 | """
531 | self.df_show_out = {}
532 | self.cfg_show_key = key
533 | self.cfg_show_nrecords = nrecords
534 | self.cfg_show_nrows = nrows
535 | self.cfg_show_keys_only = keys_only
536 | self.cfg_show_print_only = print_only
537 |
538 | self._show('unmatched')
539 | if not self.cfg_show_print_only:
540 | return self.df_show_out
541 |
542 | def show_matched(self, key, nrecords=3, nrows=3, keys_only=False, print_only=False):
543 | """
544 | Show matched records
545 |
546 | Args:
547 | key (str): join key
548 | nrecords (int): number of matched records
549 | nrows (int): number of rows
550 | keys_only (bool): show only join keys
551 | print_only (bool): if false return results instead of printing
552 | """
553 | self.df_show_out = {}
554 | self.cfg_show_key = key
555 | self.cfg_show_nrecords = nrecords
556 | self.cfg_show_nrows = nrows
557 | self.cfg_show_keys_only = keys_only
558 | self.cfg_show_print_only = print_only
559 |
560 | self._show('matched')
561 | if not self.cfg_show_print_only:
562 | return self.df_show_out
563 |
564 | def merge(self, **kwargs):
565 | """
566 | Perform merge using keys
567 |
568 | Args:
569 | kwargs (misc): parameters to pass to `pd.merge()`
570 | """
571 | if len(self.dfs) > 2:
572 | raise NotImplementedError('Only handles 2 dataframes for now')
573 |
574 | return self.dfs[0].merge(self.dfs[1], left_on=self.keysdf[0], right_on=self.keysdf[1], **kwargs)
575 |
576 |
--------------------------------------------------------------------------------
/d6tjoin/smart_join.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | from collections import OrderedDict
4 | import itertools
5 | import warnings
6 | import jellyfish
7 |
8 | from d6tjoin.pre import Prejoin as BaseJoin
9 |
10 |
11 | # ******************************************
12 | # helpers
13 | # ******************************************
14 | def set_values(dfg, key):
15 | v = dfg[key].unique()
16 | return v[~pd.isnull(v)]
17 |
18 |
19 | def apply_gen_candidates_group(dfg):
20 | return pd.DataFrame(list(itertools.product(dfg['__top1left__'].values[0],dfg['__top1right__'].values[0])),columns=['__top1left__','__top1right__'])
21 |
22 |
23 | def apply_gen_candidates(set1, set2):
24 | df_candidates = list(itertools.product(set1, set2))
25 | df_candidates = pd.DataFrame(df_candidates,columns=['__top1left__','__top1right__'])
26 |
27 | return df_candidates
28 |
29 |
30 | def diff_arithmetic(x,y):
31 | return abs(x - y)
32 |
33 |
34 | def diff_edit(a,b):
35 | return jellyfish.levenshtein_distance(a,b)
36 |
37 |
38 | def filter_group_minmax(dfg, col):
39 | """
40 |
41 | Returns all rows equal to min in col
42 |
43 | """
44 | return dfg[dfg[col] == dfg[col].min()]
45 |
46 |
47 | def prep_match_df(dfg):
48 | dfg = dfg[['__top1left__', '__top1right__', '__top1diff__', '__match type__']]
49 | return dfg
50 |
51 | # ******************************************
52 | # fuzzy join
53 | # ******************************************
54 | class FuzzyJoinTop1(BaseJoin):
55 |
56 | def __init__(self, dfs, exact_keys=[], fuzzy_keys=[], exact_how='inner', fuzzy_how = {}, keys_bydf=False, init_merge=False):
57 |
58 | """
59 |
60 | Smart joiner for top 1 similarity joins. By setting fuzzy keys, it calculates similarity metrics for strings, numbers and dates to join on the closest matching entry.
61 |
62 | Args:
63 | dfs (list): list of dataframes
64 | exact_keys (list): list of join keys for exact joins. See notes for details
65 | fuzzy_keys (list): list of join keys for fuzzy joins. See notes for details
66 | exact_how (str): exact join mode same as `pd.merge(how='inner')`
67 | fuzzy_how (dict): specify fuzzy join options by merge level eg {0:{'top_limit':1}}
68 | keys_bydf (bool): if True, keys are listed by dataframe; if False (default), by join level. See notes for details
69 |
70 | Note:
71 | * specifying join keys:
72 | * if both dataframes have matching columns: `fuzzy_keys=['key1','key2']`
73 | * else: `fuzzy_keys=[['key1df1','key1df2'],['key2df1','key2df2']]`
74 | * by default you provide keys by join level eg `[['key1df1','key1df2'],['key2df1','key2df2']]` instead you can also provide keys by dataframe `[['key1df1','key2df1'],['key1df2','key2df2']], keys_bydf=True`
75 | * fuzzy_how: controls join options by join level
76 | * dict keys are join level eg with `fuzzy_keys=[['key1df1','key1df2'],['key2df1','key2df2']]` you set `fuzzy_how={0:{'top_nrecords':5},1:{'top_nrecords':5}}`
77 | * options are:
78 | * fun_diff: difference function or list of difference functions applied sequentially. Needs to be 0=similar and >0 dissimilar
79 | * top_limit: maximum difference, keep only candidates with difference <= top_limit
80 | * top_nrecords: keep only n top_nrecords, good for generating previews
81 |
82 | """
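# A minimal usage sketch (hypothetical dataframes and columns, not executed):
#   sj = FuzzyJoinTop1([df1, df2], exact_keys=['date'], fuzzy_keys=['id'],
#                      fuzzy_how={0: {'top_limit': 3}})  # keep candidates with difference <= 3
#   dfjoined = sj.join()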
83 |
84 | # inputs dfs
85 | self._init_dfs(dfs)
86 |
87 | # check and save join keys
88 | if not exact_keys and not fuzzy_keys:
89 | raise ValueError("Must provide at least one of exact_keys or fuzzy_keys")
90 |
91 | self.keys_exact, self.keysdf_exact = self._prep_keys(exact_keys, keys_bydf)
92 | if self.keys_exact:
93 | self._check_keysdfs(self.keys_exact, self.keysdf_exact)
94 |
95 | self.keys_fuzzy, self.keysdf_fuzzy = self._prep_keys(fuzzy_keys, keys_bydf)
96 | if self.keys_fuzzy:
97 | self._check_keysdfs(self.keys_fuzzy, self.keysdf_fuzzy)
98 |
99 | # todo: no duplicate join keys passed
100 |
101 | if not isinstance(exact_how, (str,)):
102 | raise NotImplementedError('exact_how can only be applied globally for now')
103 | elif exact_how not in ('left','right','inner','outer'):
104 | raise ValueError("Invalid how parameter, check documentation for valid values")
105 |
106 | self.cfg_njoins_exact = len(self.keysdf_exact[0]) if self.keysdf_exact else 0
107 | self.cfg_njoins_fuzzy = len(self.keysdf_fuzzy[0]) if self.keysdf_fuzzy else 0
108 |
109 | if self.cfg_njoins_fuzzy>1:
110 | # raise NotImplementedError('Currently supports only 1 fuzzy key')
111 | warnings.warn('Multi-key fuzzy joins are currently done globally for each key individually, not hierarchically for each unique fuzzy key value pair')
112 |
113 | self.exact_how = exact_how
114 | self.set_fuzzy_how_all(fuzzy_how)
115 |
116 | if init_merge:
117 | self.join()
118 | else:
119 | self.dfjoined = None
120 |
121 | self.table_fuzzy = {}
122 |
123 |
124 | def set_fuzzy_how(self, ilevel, fuzzy_how):
125 | self.fuzzy_how[ilevel] = fuzzy_how
126 | self._gen_fuzzy_how(ilevel)
127 |
128 | def set_fuzzy_how_all(self, fuzzy_how):
129 | if not isinstance(fuzzy_how, (dict,)):
130 | raise ValueError('fuzzy_how needs to be a dict')
131 | self.fuzzy_how = fuzzy_how
132 | self._gen_fuzzy_how_all()
133 |
134 | def _gen_fuzzy_how_all(self):
135 |
136 | for ilevel in range(self.cfg_njoins_fuzzy):
137 | self._gen_fuzzy_how(ilevel)
138 |
139 | def _gen_fuzzy_how(self, ilevel):
140 |
141 | # check if entry exists
142 | cfg_top1 = self.fuzzy_how.get(ilevel,{})
143 |
144 | keyleft = self.keys_fuzzy[ilevel][0]
145 | keyright = self.keys_fuzzy[ilevel][1]
146 |
147 | typeleft = self.dfs[0][keyleft].dtype
148 | typeright = self.dfs[1][keyright].dtype
149 |
150 | if 'type' not in cfg_top1:
151 | if typeleft == 'int64' or typeleft == 'float64' or typeleft == 'datetime64[ns]':
152 | cfg_top1['type'] = 'number'
153 | elif typeleft == 'object' and type(self.dfs[0][keyleft].values[~self.dfs[0][keyleft].isnull()][0])==str:
154 | cfg_top1['type'] = 'string'
155 | else:
156 | raise ValueError('Unrecognized data type for top match, need to pass fun_diff in arguments')
157 |
158 | # make defaults if no settings provided
159 | if 'fun_diff' not in cfg_top1:
160 |
161 | if cfg_top1['type'] == 'number':
162 | cfg_top1['fun_diff'] = pd.merge_asof
163 | elif cfg_top1['type'] == 'string':
164 | cfg_top1['fun_diff'] = diff_edit
165 | else:
166 | raise ValueError('Unrecognized data type for top match, need to pass fun_diff in arguments')
167 | else:
168 | is_valid = callable(cfg_top1['fun_diff']) or (type(cfg_top1['fun_diff']) == list and all([callable(f) for f in cfg_top1['fun_diff']]))
169 | if not is_valid:
170 | raise ValueError("'fun_diff' needs to be a function or a list of functions")
171 |
172 | if not type(cfg_top1['fun_diff']) == list:
173 | cfg_top1['fun_diff'] = [cfg_top1['fun_diff']]
174 |
175 | if 'top_limit' not in cfg_top1:
176 | cfg_top1['top_limit'] = None
177 |
178 | if 'top_nrecords' not in cfg_top1:
179 | cfg_top1['top_nrecords'] = None
180 |
181 | cfg_top1['dir'] = 'left'
182 |
183 | # save config
184 | # check if entry exists
185 | self.fuzzy_how[ilevel] = cfg_top1
186 |
187 | def preview_fuzzy(self, ilevel, top_nrecords=5):
188 | if top_nrecords>0:
189 | return self._gen_match_top1(ilevel, top_nrecords)
190 | else:
191 | return self._gen_match_top1(ilevel)
192 |
193 | def _gen_match_top1_left_number(self, cfg_group_left, cfg_group_right, keyleft, keyright, top_nrecords):
194 | if len(cfg_group_left) > 0:
195 |
196 | # unique values
197 | if top_nrecords is None:
198 | # df_keys_left = pd.DataFrame(self.dfs[0].groupby(cfg_group_left)[keyleft].unique())
199 | df_keys_left = self.dfs[0].groupby(cfg_group_left)[keyleft].apply(lambda x: pd.Series(x.unique()))
200 | df_keys_left.index = df_keys_left.index.droplevel(1)
201 | df_keys_left = pd.DataFrame(df_keys_left)
202 | else:
203 | # df_keys_left = pd.DataFrame(self.dfs[0].groupby(cfg_group_left)[keyleft].unique()[:top_nrecords])
204 | df_keys_left = self.dfs[0].groupby(cfg_group_left)[keyleft].apply(lambda x: pd.Series(x.unique()[:top_nrecords]))
205 | df_keys_left.index = df_keys_left.index.droplevel(1)
206 | df_keys_left = pd.DataFrame(df_keys_left)
207 | df_keys_right = self.dfs[1].groupby(cfg_group_right)[keyright].apply(lambda x: pd.Series(x.unique()))
208 | df_keys_right.index = df_keys_right.index.droplevel(1)
209 | df_keys_right = pd.DataFrame(df_keys_right)
210 | # df_keys_right = pd.DataFrame(self.dfs[1].groupby(cfg_group_right)[keyright].unique())
211 |
212 | # sort
213 | df_keys_left = df_keys_left.sort_values(keyleft).reset_index().rename(columns={keyleft:'__top1left__'})
214 | df_keys_right = df_keys_right.sort_values(keyright).reset_index().rename(columns={keyright:'__top1right__'})
215 |
216 | df_match = pd.merge_asof(df_keys_left, df_keys_right, left_on='__top1left__', right_on='__top1right__', left_by=cfg_group_left, right_by=cfg_group_right, direction='nearest')
217 | else:
218 | # uniques
219 | values_left = set_values(self.dfs[0], keyleft)
220 | values_right = set_values(self.dfs[1], keyright)
221 |
222 | if top_nrecords:
223 | values_left = values_left[:top_nrecords]
224 |
225 | df_keys_left = pd.DataFrame({'__top1left__':values_left}).sort_values('__top1left__')
226 | df_keys_right = pd.DataFrame({'__top1right__':values_right}).sort_values('__top1right__')
227 |
228 | df_match = pd.merge_asof(df_keys_left, df_keys_right, left_on='__top1left__', right_on='__top1right__', direction='nearest')
229 |
230 | df_match['__top1diff__'] = (df_match['__top1left__']-df_match['__top1right__']).abs()
231 |
232 | return df_match
233 |
234 | def _gen_match_top1(self, ilevel, top_nrecords=None):
235 | """
236 |
237 | Generates match table between two sets
238 |
239 | Args:
240 | keyssets (dict): values for join keys ['key left', 'key right', 'keyset left', 'keyset right', 'inner', 'outer', 'unmatched total', 'unmatched left', 'unmatched right']
241 | mode (str, list): global string or list for each join. Possible values: ['exact inner', 'exact left', 'exact right', 'exact outer', 'top1 left', 'top1 right', 'top1 bidir all', 'top1 bidir unmatched']
242 | is_lower_better (bool): True = difference, False = Similarity
243 |
244 | """
245 |
246 | cfg_top1 = self.fuzzy_how[ilevel]
247 | fun_diff = cfg_top1['fun_diff']
248 | top_limit = cfg_top1['top_limit']
249 | if not top_nrecords:
250 | top_nrecords = cfg_top1['top_nrecords']
251 |
252 | keyleft = self.keys_fuzzy[ilevel][0]
253 | keyright = self.keys_fuzzy[ilevel][1]
254 |
255 | #******************************************
256 | # table LEFT
257 | #******************************************
258 | if cfg_top1['dir']=='left':
259 |
260 | # exact keys for groupby
261 | cfg_group_left = self.keysdf_exact[0] if self.keysdf_exact else []
262 | cfg_group_right = self.keysdf_exact[1] if self.keysdf_exact else []
263 |
264 | if cfg_top1['type'] == 'string' or (cfg_top1['type'] == 'number' and cfg_top1['fun_diff'] != [pd.merge_asof]):
265 |
266 | if len(cfg_group_left)>0:
267 | # generate candidates if exact matches are present (= blocking index)
268 |
269 | if top_nrecords is None:
270 | df_keys_left = pd.DataFrame(self.dfs[0].groupby(cfg_group_left)[keyleft].unique())
271 | else:
272 | df_keys_left = pd.DataFrame(self.dfs[0].groupby(cfg_group_left)[keyleft].unique()[:top_nrecords])
273 | df_keys_right = pd.DataFrame(self.dfs[1].groupby(cfg_group_right)[keyright].unique())
274 | df_keysets_groups = df_keys_left.merge(df_keys_right,left_index=True, right_index=True)
275 | df_keysets_groups.columns = ['__top1left__','__top1right__']
276 | dfg = df_keysets_groups.reset_index().groupby(cfg_group_left).apply(apply_gen_candidates_group)
277 | dfg = dfg.reset_index(-1,drop=True).reset_index()
278 | dfg = dfg.dropna()
279 |
280 | else:
281 | # generate candidates if NO exact matches
282 | values_left = set_values(self.dfs[0],keyleft)
283 | values_right = set_values(self.dfs[1],keyright)
284 |
285 | if top_nrecords is None:
286 | dfg = apply_gen_candidates(values_left,values_right)
287 | else:
288 | dfg = apply_gen_candidates(values_left[:top_nrecords], values_right)
289 |
290 |
291 | # find exact matches and remove from candidates
292 | # todo: use set logic before generating candidates
293 | idxSelExact = dfg['__top1left__']==dfg['__top1right__']
294 | df_match_exact = dfg[idxSelExact].copy()
295 | df_match_exact['__match type__'] = 'exact'
296 | df_match_exact['__top1diff__'] = 0
297 |
298 | idxSel = dfg['__top1left__'].isin(df_match_exact['__top1left__'])
299 | dfg = dfg[~idxSel]
300 |
301 | for fun_diff in cfg_top1['fun_diff']:
302 | dfg['__top1diff__'] = dfg.apply(lambda x: fun_diff(x['__top1left__'], x['__top1right__']), axis=1)
303 |
304 | # filtering
305 | if top_limit is not None:
306 | dfg = dfg[dfg['__top1diff__'] <= top_limit]
307 |
308 | # get top 1
309 | dfg = dfg.groupby('__top1left__',group_keys=False).apply(lambda x: filter_group_minmax(x,'__top1diff__'))
310 |
311 | # return results
312 | dfg['__match type__'] = 'top1 left'
313 | df_match = pd.concat([dfg,df_match_exact])
314 |
315 | elif cfg_top1['type'] == 'number' and cfg_top1['fun_diff'] == [pd.merge_asof]:
316 | df_match = self._gen_match_top1_left_number(cfg_group_left, cfg_group_right, keyleft, keyright, top_nrecords).copy()
317 |
318 | # filtering
319 | if top_limit is not None:
320 | df_match = df_match[df_match['__top1diff__'] <= top_limit]
321 |
322 | df_match['__match type__'] = 'top1 left'
323 | df_match.loc[df_match['__top1left__'] == df_match['__top1right__'], '__match type__'] = 'exact'
324 | else:
325 | raise ValueError('Dev error: cfg_top1["type/fun_diff"]')
326 |
327 |
328 | #******************************************
329 | # table RIGHT
330 | #******************************************
331 | elif cfg_top1['dir']=='right' or cfg_top1['dir'] == 'inner':
332 | raise NotImplementedError('Only use left join for now')
333 | else:
334 | raise ValueError("wrong 'how' parameter for top1 join, check documentation")
335 |
336 | return {'key left':keyleft, 'key right':keyright,
337 | 'table':df_match,'has duplicates':df_match.groupby('__top1left__').size().max()>1}
338 |
339 | def run_match_top1_all(self, cfg_top1=None):
340 |
341 | for ilevel in range(self.cfg_njoins_fuzzy):
342 | self.table_fuzzy[ilevel] = self._gen_match_top1(ilevel)
343 |
344 | def join(self, is_keep_debug=False):
345 | if self.cfg_njoins_fuzzy==0:
346 | self.dfjoined = self.dfs[0].merge(self.dfs[1], left_on=self.keysdf_exact[0], right_on=self.keysdf_exact[1], how=self.exact_how)
347 | else:
348 |
349 | self.run_match_top1_all()
350 |
351 | cfg_group_left = self.keysdf_exact[0] if self.keysdf_exact else []
352 | cfg_group_right = self.keysdf_exact[1] if self.keysdf_exact else []
353 | self.dfjoined = self.dfs[0]
354 | for ilevel in range(self.cfg_njoins_fuzzy):
355 | keyleft = self.keys_fuzzy[ilevel][0]
356 | keyright = self.keys_fuzzy[ilevel][1]
357 | dft = self.table_fuzzy[ilevel]['table'].copy()
358 | dft.columns = [s + keyleft if s.startswith('__') else s for s in dft.columns]
359 | self.dfjoined = self.dfjoined.merge(dft, left_on=cfg_group_left+[keyleft], right_on=cfg_group_left+['__top1left__'+keyleft])
360 | pass
361 |
362 | cfg_keys_left = cfg_group_left+['__top1right__'+k for k in self.keysdf_fuzzy[0]]
363 | cfg_keys_right = cfg_group_right+[k for k in self.keysdf_fuzzy[1]]
364 |
365 | self.dfjoined = self.dfjoined.merge(self.dfs[1], left_on = cfg_keys_left, right_on = cfg_keys_right, suffixes=['','__right__'])
366 |
367 | if not is_keep_debug:
368 | self.dfjoined = self.dfjoined[self.dfjoined.columns[~self.dfjoined.columns.str.startswith('__')]]
369 |
370 | return self.dfjoined
371 |
372 |
373 |
--------------------------------------------------------------------------------
/d6tjoin/top1.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | from collections import OrderedDict
4 | import itertools
5 | import warnings
6 | import jellyfish
7 |
8 | # ******************************************
9 | # helpers
10 | # ******************************************
11 |
12 |
13 | from d6tjoin.utils import _applyFunMulticore, _filter_group_min, _set_values
14 |
15 | class MergeTop1Diff(object):
16 | """
17 |
18 | Top1 minimum difference join. Mostly used for strings. Helper for `MergeTop1`.
19 |
20 | """
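# A minimal usage sketch (hypothetical dataframes, not executed):
#   m = MergeTop1Diff(df1, df2, fuzzy_left_on='id', fuzzy_right_on='id',
#                     fun_diff=jellyfish.levenshtein_distance,
#                     exact_left_on=['date'], exact_right_on=['date'])
#   out = m.merge()   # dict with keys 'merged', 'top1', 'duplicates'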
21 |
22 | def __init__(self, df1, df2, fuzzy_left_on, fuzzy_right_on, fun_diff=None, exact_left_on=None, exact_right_on=None,
23 | top_limit=None, topn=1, fun_preapply = None, fun_postapply = None, is_keep_debug=False, use_multicore=True):
24 |
25 | # check exact keys
26 | if not exact_left_on:
27 | exact_left_on = []
28 | if not exact_right_on:
29 | exact_right_on = []
30 |
31 | if not isinstance(fuzzy_left_on, (str,)) or not isinstance(fuzzy_right_on, (str,)):
32 | raise ValueError('fuzzy_on needs to be a string')
33 |
34 | if len(exact_left_on) != len(exact_right_on):
35 | raise ValueError('Need to pass same number of exact keys')
36 | if not isinstance(exact_left_on, (list)) or not isinstance(exact_right_on, (list)):
37 | raise ValueError('Exact keys need to be a list')
38 |
39 | if not callable(fun_diff):
40 | raise ValueError('fun_diff needs to be a function')
41 |
42 | if (fun_preapply and not callable(fun_preapply)) or (fun_postapply and not callable(fun_postapply)):
43 | raise ValueError('fun_preapply and fun_postapply need to be functions')
44 |
45 | # use blocking index?
46 | if not exact_left_on and not exact_right_on:
47 | self.cfg_is_block = False
48 | elif exact_left_on and exact_right_on:
49 | self.cfg_is_block = True
50 | else:
51 | raise ValueError('Need to pass exact keys for both or neither dataframe')
52 |
53 | # store data
54 | self.dfs = [df1,df2]
55 |
56 | # store config
57 | self.cfg_fuzzy_left_on = fuzzy_left_on
58 | self.cfg_fuzzy_right_on = fuzzy_right_on
59 | self.cfg_exact_left_on = exact_left_on
60 | self.cfg_exact_right_on = exact_right_on
61 | self.cfg_fun_diff = fun_diff
62 | self.cfg_fun_preapply = fun_preapply
63 | self.cfg_fun_postapply = fun_postapply
64 | self.cfg_top_limit = top_limit
65 | self.cfg_is_keep_debug = is_keep_debug
66 | self.cfg_topn = topn
67 | self.cfg_use_multicore = use_multicore
68 |
69 | def _allpairs_candidates(self):
70 | values_left = _set_values(self.dfs[0], self.cfg_fuzzy_left_on)
71 | values_right = _set_values(self.dfs[1], self.cfg_fuzzy_right_on)
72 |
73 | if self.cfg_topn>1:
74 | values_left_exact = set()
75 | values_left_fuzzy = values_left
76 | else:
77 | values_left_exact = values_left.intersection(values_right)
78 | values_left_fuzzy = values_left.difference(values_right)
79 |
80 | # pre apply a function
81 | if self.cfg_fun_preapply:
82 | values_left_fuzzy = [self.cfg_fun_preapply(v) for v in values_left_fuzzy]
83 | values_right = [self.cfg_fun_preapply(v) for v in values_right]
84 |
85 | df_candidates_fuzzy = list(itertools.product(values_left_fuzzy, values_right))
86 | df_candidates_fuzzy = pd.DataFrame(df_candidates_fuzzy,columns=['__top1left__','__top1right__'])
87 | df_candidates_fuzzy['__matchtype__'] = 'top1 left'
88 |
89 | df_candidates_exact = pd.DataFrame({'__top1left__': list(values_left_exact)})
90 | df_candidates_exact['__top1right__'] = df_candidates_exact['__top1left__']
91 | df_candidates_exact['__matchtype__'] = 'exact'
92 |
93 | df_candidates = pd.concat([df_candidates_exact, df_candidates_fuzzy], ignore_index=True)
94 |
95 | return df_candidates
96 |
97 | def _top1_diff_noblock(self):
98 | df_candidates = self._allpairs_candidates()
99 |
100 | idxSel = df_candidates['__matchtype__'] != 'exact'
101 | if self.cfg_use_multicore:
102 | df_candidates.loc[idxSel, '__top1diff__'] = _applyFunMulticore(df_candidates.loc[idxSel,'__top1left__'].values, df_candidates.loc[idxSel,'__top1right__'].values,self.cfg_fun_diff)
103 | else:
104 | df_candidates.loc[idxSel,'__top1diff__'] = df_candidates[idxSel].apply(lambda x: self.cfg_fun_diff(x['__top1left__'], x['__top1right__']), axis=1)
105 |
106 | df_candidates.loc[~idxSel, '__top1diff__'] = 0
107 | has_duplicates = False
108 |
109 | if self.cfg_fun_postapply:
110 | df_candidates['__top1left__'] = df_candidates['__top1left__'].apply(self.cfg_fun_postapply)
111 | df_candidates['__top1right__'] = df_candidates['__top1right__'].apply(self.cfg_fun_postapply)
112 |
113 | df_diff = df_candidates.groupby('__top1left__',group_keys=False).apply(lambda x: _filter_group_min(x,'__top1diff__',self.cfg_topn))
114 | if self.cfg_top_limit is not None:
115 | df_diff = df_diff[df_diff['__top1diff__']<=self.cfg_top_limit]
116 | has_duplicates = df_diff.groupby('__top1left__').size().max()>1
117 | if has_duplicates:
118 | warnings.warn('Top1 join for %s has duplicates' %self.cfg_fuzzy_left_on)
119 |
120 | return df_diff, has_duplicates
121 |
122 |
123 | def _merge_top1_diff_noblock(self):
124 | df_diff, has_duplicates = self._top1_diff_noblock()
125 | dfjoin = self.dfs[0].merge(df_diff, left_on=self.cfg_fuzzy_left_on, right_on='__top1left__')
126 | dfjoin = dfjoin.merge(self.dfs[1], left_on='__top1right__', right_on=self.cfg_fuzzy_right_on, suffixes=['','__right__'])
127 |
128 | if not self.cfg_is_keep_debug:
129 | dfjoin = dfjoin[dfjoin.columns[~dfjoin.columns.str.startswith('__')]]
130 |
131 | return {'merged':dfjoin, 'top1':df_diff, 'duplicates':has_duplicates}
132 |
133 |
134 | def _top1_diff_withblock(self):
135 |
136 | def apply_gen_candidates_group(dfg):
137 | return pd.DataFrame(list(itertools.product(dfg['__top1left__'].values[0],dfg['__top1right__'].values[0])),columns=['__top1left__','__top1right__'])
138 |
139 | # find key unique values
140 | keysleft = self.dfs[0][self.cfg_exact_left_on+[self.cfg_fuzzy_left_on]].drop_duplicates().dropna()
141 | keysright = self.dfs[1][self.cfg_exact_right_on+[self.cfg_fuzzy_right_on]].drop_duplicates().dropna()
142 | keysleft = {tuple(x) for x in keysleft.values}
143 | keysright = {tuple(x) for x in keysright.values}
144 | values_left_exact = keysleft.intersection(keysright)
145 | values_left_fuzzy = keysleft.difference(keysright)
146 |
147 | df_keys_left_exact = pd.DataFrame(list(values_left_exact))
148 | if not df_keys_left_exact.empty:
149 | df_keys_left_exact.columns = self.cfg_exact_left_on+['__top1left__']
150 | df_keys_left_exact['__top1right__']=df_keys_left_exact['__top1left__']
151 | df_keys_left_exact['__matchtype__'] = 'exact'
152 |
153 | df_keys_left_fuzzy = pd.DataFrame(list(values_left_fuzzy))
154 | if not df_keys_left_fuzzy.empty:
155 | df_keys_left_fuzzy.columns = self.cfg_exact_left_on+[self.cfg_fuzzy_left_on]
156 |
157 | # fuzzy pair candidates
158 | df_keys_left = pd.DataFrame(df_keys_left_fuzzy.groupby(self.cfg_exact_left_on)[self.cfg_fuzzy_left_on].unique())
159 | df_keys_right = pd.DataFrame(self.dfs[1].groupby(self.cfg_exact_right_on)[self.cfg_fuzzy_right_on].unique())
160 | df_keysets_groups = df_keys_left.merge(df_keys_right, left_index=True, right_index=True)
161 | df_keysets_groups.columns = ['__top1left__', '__top1right__']
162 | df_keysets_groups = df_keysets_groups.reset_index().groupby(self.cfg_exact_left_on).apply(apply_gen_candidates_group)
163 | df_keysets_groups = df_keysets_groups.reset_index(-1, drop=True).reset_index()
164 | df_keysets_groups = df_keysets_groups.dropna()
165 |
166 | df_candidates = df_keysets_groups[['__top1left__', '__top1right__']].drop_duplicates()
167 | if self.cfg_use_multicore:
168 | df_candidates['__top1diff__'] = _applyFunMulticore(df_candidates['__top1left__'].values, df_candidates['__top1right__'].values, self.cfg_fun_diff)
169 | else:
170 | df_candidates['__top1diff__'] = df_candidates.apply(lambda x: self.cfg_fun_diff(x['__top1left__'], x['__top1right__']), axis=1)
171 | df_candidates['__matchtype__'] = 'top1 left'
172 |
173 | # calculate difference
174 | df_diff = df_keysets_groups.merge(df_candidates, on=['__top1left__', '__top1right__'])
175 |
176 | df_diff = pd.concat([df_diff, df_keys_left_exact])
177 | df_diff['__top1diff__']=df_diff['__top1diff__'].fillna(0) # exact keys
178 | df_diff = df_diff.groupby(self.cfg_exact_left_on+['__top1left__'],group_keys=False).apply(lambda x: _filter_group_min(x,'__top1diff__'))
179 | if self.cfg_top_limit is not None:
180 | df_diff = df_diff[df_diff['__top1diff__']<=self.cfg_top_limit]
181 | has_duplicates = df_diff.groupby(self.cfg_exact_left_on+['__top1left__']).size().max()>1
182 |
183 | return df_diff, has_duplicates
184 |
185 |
186 | def _merge_top1_diff_withblock(self):
187 |
188 | df_diff, has_duplicates = self._top1_diff_withblock()
189 |
190 | dfjoin = self.dfs[0].merge(df_diff, left_on=self.cfg_exact_left_on+[self.cfg_fuzzy_left_on], right_on=self.cfg_exact_left_on+['__top1left__'])
191 | # todo: add exact join keys
192 | dfjoin = dfjoin.merge(self.dfs[1], left_on=self.cfg_exact_left_on+['__top1right__'], right_on=self.cfg_exact_right_on+[self.cfg_fuzzy_right_on], suffixes=['','__right__'])
193 |
194 | if not self.cfg_is_keep_debug:
195 | dfjoin = dfjoin[dfjoin.columns[~dfjoin.columns.str.startswith('__')]]
196 |
197 | return {'merged':dfjoin, 'top1':df_diff, 'duplicates':has_duplicates}
198 |
199 | def top1_diff(self):
200 | if self.cfg_is_block:
201 | return self._top1_diff_withblock()
202 | else:
203 | return self._top1_diff_noblock()
204 |
205 | def merge(self):
206 |
207 | if not self.cfg_exact_left_on and not self.cfg_exact_right_on:
208 | return self._merge_top1_diff_noblock()
209 | elif self.cfg_exact_left_on and self.cfg_exact_right_on:
210 | return self._merge_top1_diff_withblock()
211 | else:
212 | raise ValueError('Need to pass exact keys for both or neither dataframe')
213 |
214 |
215 | class MergeTop1Number(object):
216 | """
217 |
218 | Top1 minimum difference join for numbers. Helper for `MergeTop1`.
219 |
220 | """
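# A minimal usage sketch (hypothetical dataframes, not executed):
#   m = MergeTop1Number(df1, df2, fuzzy_left_on='date', fuzzy_right_on='date',
#                       exact_left_on=['id'], exact_right_on=['id'])
#   out = m.merge()   # dict with keys 'merged', 'top1', 'duplicates'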
221 |
222 | def __init__(self, df1, df2, fuzzy_left_on, fuzzy_right_on, exact_left_on=None, exact_right_on=None,
223 | direction='nearest', top_limit=None, is_keep_debug=False):
224 |
225 | # check exact keys
226 | if not exact_left_on:
227 | exact_left_on = []
228 | if not exact_right_on:
229 | exact_right_on = []
230 |
231 | if len(exact_left_on) != len(exact_right_on):
232 | raise ValueError('Need to pass same number of exact keys')
233 | if not isinstance(exact_left_on, (list)) or not isinstance(exact_right_on, (list)):
234 | raise ValueError('Exact keys need to be a list')
235 |
236 | # use blocking index?
237 | if not exact_left_on and not exact_right_on:
238 | self.cfg_is_block = False
239 | elif exact_left_on and exact_right_on:
240 | self.cfg_is_block = True
241 | else:
242 | raise ValueError('Need to pass exact keys for both or neither dataframe')
243 |
244 | # store data
245 | self.dfs = [df1,df2]
246 |
247 | # store config
248 | self.cfg_fuzzy_left_on = fuzzy_left_on
249 | self.cfg_fuzzy_right_on = fuzzy_right_on
250 | self.cfg_exact_left_on = exact_left_on
251 | self.cfg_exact_right_on = exact_right_on
252 | self.cfg_direction = direction
253 | self.cfg_top_limit = top_limit
254 | self.cfg_is_keep_debug = is_keep_debug
255 |
256 | def _top1_diff_withblock(self):
257 |
258 | # unique values
259 | df_keys_left = self.dfs[0].groupby(self.cfg_exact_left_on)[self.cfg_fuzzy_left_on].apply(lambda x: pd.Series(x.unique()))
260 | df_keys_left.index = df_keys_left.index.droplevel(-1)
261 | df_keys_left = pd.DataFrame(df_keys_left)
262 | df_keys_right = self.dfs[1].groupby(self.cfg_exact_right_on)[self.cfg_fuzzy_right_on].apply(lambda x: pd.Series(x.unique()))
263 | df_keys_right.index = df_keys_right.index.droplevel(-1)
264 | df_keys_right = pd.DataFrame(df_keys_right)
265 |
266 | # todo: global consolidation like with MergeTop1Diff
267 |
268 | # sort
269 | df_keys_left = df_keys_left.sort_values(self.cfg_fuzzy_left_on).reset_index().rename(columns={self.cfg_fuzzy_left_on:'__top1left__'})
270 | df_keys_right = df_keys_right.sort_values(self.cfg_fuzzy_right_on).reset_index().rename(columns={self.cfg_fuzzy_right_on:'__top1right__'})
271 |
272 | # merge
273 | df_diff = pd.merge_asof(df_keys_left, df_keys_right, left_on='__top1left__', right_on='__top1right__', left_by=self.cfg_exact_left_on, right_by=self.cfg_exact_right_on, direction=self.cfg_direction)
274 | df_diff['__top1diff__'] = (df_diff['__top1left__']-df_diff['__top1right__']).abs()
275 | df_diff['__matchtype__'] = 'top1 left'
276 | df_diff.loc[df_diff['__top1left__'] == df_diff['__top1right__'], '__matchtype__'] = 'exact'
277 | if self.cfg_top_limit is not None:
278 | df_diff = df_diff[df_diff['__top1diff__']<=self.cfg_top_limit]
279 |
280 | return df_diff
281 |
282 | def _top1_diff_noblock(self):
283 | # uniques
284 | values_left = _set_values(self.dfs[0], self.cfg_fuzzy_left_on)
285 | values_right = _set_values(self.dfs[1], self.cfg_fuzzy_right_on)
286 |
287 | # sort
288 | df_keys_left = pd.DataFrame({'__top1left__':list(values_left)}).sort_values('__top1left__')
289 | df_keys_right = pd.DataFrame({'__top1right__':list(values_right)}).sort_values('__top1right__')
290 |
291 | # merge
292 | df_diff = pd.merge_asof(df_keys_left, df_keys_right, left_on='__top1left__', right_on='__top1right__', direction=self.cfg_direction)
293 | df_diff['__top1diff__'] = (df_diff['__top1left__']-df_diff['__top1right__']).abs()
294 | df_diff['__matchtype__'] = 'top1 left'
295 | df_diff.loc[df_diff['__top1left__'] == df_diff['__top1right__'], '__matchtype__'] = 'exact'
296 |
297 | return df_diff
298 |
299 | def top1_diff(self):
300 | if self.cfg_is_block:
301 | return self._top1_diff_withblock()
302 | else:
303 | return self._top1_diff_noblock()
304 |
305 | def merge(self):
306 | df_diff = self.top1_diff()
307 |
308 | dfjoin = self.dfs[0].merge(df_diff, left_on=self.cfg_exact_left_on+[self.cfg_fuzzy_left_on], right_on=self.cfg_exact_left_on+['__top1left__'])
309 | dfjoin = dfjoin.merge(self.dfs[1], left_on=self.cfg_exact_left_on+['__top1right__'], right_on=self.cfg_exact_right_on+[self.cfg_fuzzy_right_on], suffixes=['','__right__'])
310 |
311 | if not self.cfg_is_keep_debug:
312 | dfjoin = dfjoin[dfjoin.columns[~dfjoin.columns.str.startswith('__')]]
313 |
314 | return {'merged': dfjoin, 'top1': df_diff, 'duplicates': None}
315 |
316 | class MergeTop1(object):
317 | """
318 |
319 | Left best match join. It applies a difference function to find the key pair with the smallest difference to the join key.
320 |
321 | Args:
322 | df1 (dataframe): left dataframe onto which the right dataframe is joined
323 | df2 (dataframe): right dataframe
324 | fuzzy_left_on (list): join keys for similarity match, left dataframe
325 | fuzzy_right_on (list): join keys for similarity match, right dataframe
326 | exact_left_on (list, default None): join keys for exact match, left dataframe
327 | exact_right_on (list, default None): join keys for exact match, right dataframe
328 | fun_diff (list, default None): list of difference functions to be applied for each fuzzy key
329 | top_limit (list, default None): list of values to cap similarity matches
330 | is_keep_debug (bool): keep diagnostics columns, good for debugging
331 |
332 | Note:
333 | * fun_diff: applies the difference function to find the best match with minimum distance
334 | * By default it is determined automatically depending on whether the key is a string or a date/number
335 | * Use `None` to keep the default, for example [None, lambda x, y: x-y]
336 | * Functions in the list are applied in the same order as the fuzzy join keys
337 | * Needs to be a difference function, so lower is better. For similarity measures like Jaccard, higher is better, so you need to adjust for that
338 | * top_limit: only keeps matches with a difference at or below that value. For example, if two strings differ by 3 but top_limit is 2, that match is ignored
339 | * for dates you can use `pd.offsets.Day(1)` or similar
340 |
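    Example:
        A minimal usage sketch (the dataframes and column names below are illustrative, not taken from the library)::

            import pandas as pd
            import d6tjoin.top1

            df1 = pd.DataFrame({'id': ['AAPL', 'MSFT'], 'val1': [1, 2]})
            df2 = pd.DataFrame({'id': ['AAPL US', 'MSFT US'], 'val2': [10, 20]})

            # best-match join on 'id' using the default string difference (levenshtein)
            result = d6tjoin.top1.MergeTop1(df1, df2, fuzzy_left_on=['id'], fuzzy_right_on=['id']).merge()
            result['merged']      # merged dataframe
            result['top1']['id']  # best matches for the 'id' fuzzy key
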
341 | """
342 |
343 | def __init__(self, df1, df2, fuzzy_left_on=None, fuzzy_right_on=None, exact_left_on=None, exact_right_on=None,
344 | fun_diff = None, top_limit=None, is_keep_debug=False, use_multicore=True):
345 |
346 |
347 | # todo: pass custom merge asof param
348 | # todo: pass list of fundiff
349 |
350 |
351 | # check fuzzy keys
352 | if not fuzzy_left_on or not fuzzy_right_on:
353 | raise ValueError('Need to pass fuzzy left and right keys')
354 | if len(fuzzy_left_on) != len(fuzzy_right_on):
355 | raise ValueError('Need to pass same number of fuzzy left and right keys')
356 | self.cfg_njoins_fuzzy = len(fuzzy_left_on)
357 |
358 | # check exact keys
359 | if not exact_left_on:
360 | exact_left_on = []
361 | if not exact_right_on:
362 | exact_right_on = []
363 |
364 | if len(exact_left_on) != len(exact_right_on):
365 | raise ValueError('Need to pass same number of exact keys')
366 | if not isinstance(exact_left_on, (list)) or not isinstance(exact_right_on, (list)):
367 | raise ValueError('Exact keys need to be a list')
368 |
369 |
370 | # use blocking index?
371 | if not exact_left_on and not exact_right_on:
372 | self.cfg_is_block = False
373 | elif exact_left_on and exact_right_on:
374 | self.cfg_is_block = True
375 | else:
376 | raise ValueError('Need to pass exact keys for both or neither dataframe')
377 |
378 | # check custom params
379 | if not top_limit:
380 | top_limit = [None,]*self.cfg_njoins_fuzzy
381 | if not fun_diff:
382 | fun_diff = [None,]*self.cfg_njoins_fuzzy
383 | elif len(fun_diff)!=len(fuzzy_left_on):
384 | raise ValueError('fun_diff needs to be the same length as fuzzy_left_on. Use None in the list to use the default')
385 | if not isinstance(top_limit, (list,)) or not len(top_limit)==self.cfg_njoins_fuzzy:
386 | raise NotImplementedError('top_limit needs to be a list with an entry for each fuzzy join key')
387 | if not isinstance(fun_diff, (list,)) or not len(fun_diff)==self.cfg_njoins_fuzzy:
388 | raise NotImplementedError('fun_diff needs to be a list with an entry for each fuzzy join key')
389 |
390 | # store data
391 | self.dfs = [df1,df2]
392 |
393 | # store config
394 | self.cfg_fuzzy_left_on = fuzzy_left_on
395 | self.cfg_fuzzy_right_on = fuzzy_right_on
396 | # todo: exact keys by fuzzy key? or just global?
397 | self.cfg_exact_left_on = exact_left_on
398 | self.cfg_exact_right_on = exact_right_on
399 | self.cfg_top_limit = top_limit
400 | self.cfg_fun_diff = fun_diff
401 | self.cfg_is_keep_debug = is_keep_debug
402 | self.cfg_use_multicore = use_multicore
403 |
404 | def merge(self):
405 | """
406 |
407 | Executes merge
408 |
409 | Returns:
410 | dict: key 'merged' has the merged dataframe, key 'top1' has the best matches by fuzzy_left_on. See example notebooks for details
411 |
412 | """
413 | df_diff_bylevel = OrderedDict()
414 |
415 | self.dfjoined = self.dfs[0].copy()
416 | cfg_exact_left_on = self.cfg_exact_left_on
417 | cfg_exact_right_on = self.cfg_exact_right_on
418 |
419 | a=1
420 | for ilevel, ikey in enumerate(self.cfg_fuzzy_left_on):
421 | keyleft = ikey
422 | keyright = self.cfg_fuzzy_right_on[ilevel]
423 | typeleft = self.dfs[0][keyleft].dtype
424 |
425 | if self.cfg_fun_diff[ilevel]:
426 | df_diff_bylevel[ikey] = MergeTop1Diff(self.dfjoined, self.dfs[1], keyleft, keyright, self.cfg_fun_diff[ilevel], cfg_exact_left_on, cfg_exact_right_on, top_limit=self.cfg_top_limit[ilevel], use_multicore=self.cfg_use_multicore).top1_diff()[0]
427 | else:
428 | if typeleft == 'int64' or typeleft == 'float64' or typeleft == 'datetime64[ns]':
429 | df_diff_bylevel[ikey] = MergeTop1Number(self.dfjoined, self.dfs[1], keyleft, keyright, cfg_exact_left_on, cfg_exact_right_on, top_limit=self.cfg_top_limit[ilevel]).top1_diff()
430 | elif typeleft == 'object' and type(self.dfs[0][keyleft].values[0])==str:
431 | df_diff_bylevel[ikey] = MergeTop1Diff(self.dfjoined, self.dfs[1], keyleft, keyright, jellyfish.levenshtein_distance, cfg_exact_left_on, cfg_exact_right_on, top_limit=self.cfg_top_limit[ilevel], use_multicore=self.cfg_use_multicore).top1_diff()[0]
432 | # todo: handle duplicates
433 | else:
434 | raise ValueError('Unrecognized data type for top match, need to pass fun_diff in arguments')
435 |
436 | self.dfjoined = self.dfjoined.merge(df_diff_bylevel[ikey], left_on=cfg_exact_left_on+[keyleft], right_on=cfg_exact_left_on+['__top1left__'], suffixes=['',keyleft])
437 | cfg_col_rename = ['__top1left__','__top1right__','__top1diff__','__matchtype__']
438 | self.dfjoined = self.dfjoined.rename(columns=dict((k,k+keyleft) for k in cfg_col_rename))
439 | cfg_exact_left_on += ['__top1right__%s'%keyleft,]
440 | cfg_exact_right_on += [keyright,]
441 |
442 | self.dfjoined = self.dfjoined.merge(self.dfs[1], left_on=cfg_exact_left_on, right_on=cfg_exact_right_on, suffixes=['','_right'])
443 |
444 | if not self.cfg_is_keep_debug:
445 | self.dfjoined = self.dfjoined[self.dfjoined.columns[~self.dfjoined.columns.str.startswith('__')]]
446 |
447 | return {'merged': self.dfjoined, 'top1': df_diff_bylevel, 'duplicates': None}
448 |
449 | '''
450 | multikey: want to merge left match onto right df
451 | don't do the numbers (non-key) join until the very end
452 | '''
--------------------------------------------------------------------------------
/d6tjoin/utils.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 |
3 | import pandas as pd
4 | pd.set_option('display.expand_frame_repr', False)
5 | import numpy as np
6 |
7 | # ******************************************
8 | # helpers
9 | # ******************************************
10 | def _set_values_series(dfs):
11 | return set(dfs[~pd.isnull(dfs)])
12 |
13 | def _set_values(dfg, key):
14 | return _set_values_series(dfg[key])
15 |
16 | def _filter_group_min(dfg, col, topn=1):
17 | """
18 |
19 | Returns all rows equal to min in col
20 |
21 | """
22 | if topn==1:
23 | return dfg[dfg[col] == dfg[col].min()]
24 | else:
25 | return dfg[dfg[col].isin(np.sort(dfg[col].unique())[:topn])]
26 |
27 | from joblib import Parallel, delayed
28 | import multiprocessing
29 | def _applyFunMulticore(values1, values2, func):
30 | retLst = Parallel(n_jobs=multiprocessing.cpu_count())(delayed(func)(p[0],p[1]) for p in zip(values1,values2))
31 | return retLst
32 |
33 |
34 | # ******************************************
35 | # tfidf
36 | # ******************************************
37 | import re
38 | import collections
39 | from joblib import Parallel, delayed
40 | import multiprocessing
41 | import itertools
42 | import warnings
43 |
44 | def tokenCount(dfs, fun, mincount=2, minlength=1):
45 | """
46 | Tokenize a series of strings and count occurrence of string tokens
47 |
48 | Args:
49 | dfs (pd.series): pd.series of values
50 | fun (function): tokenize function
51 | mincount (int): discard tokens with count less than mincount
52 | minlength (int): discard tokens with string length less than minlength
53 |
54 | Returns:
55 | dataframe: count of tokens
56 |
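    Example:
        A small sketch with a custom tokenizer (series values made up for illustration)::

            import pandas as pd
            import d6tjoin.utils

            s = pd.Series(['AAP US', 'AAPL US'])
            d6tjoin.utils.tokenCount(s, lambda x: x.split(' '), mincount=2)
            # returns a dataframe with columns ['word', 'count'], here a single row: 'US', 2
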
57 | """
58 | assert len(dfs.shape)==1
59 | dfs=dfs.dropna().unique()
60 |
61 | if dfs.shape[0]>1000:
62 | words = Parallel(n_jobs=multiprocessing.cpu_count())(delayed(fun)(s) for s in dfs)
63 | else:
64 | words = [fun(s) for s in dfs]
65 | words = list(itertools.chain.from_iterable(words))
66 | df_count = [t for t in collections.Counter(words).most_common() if t[1]>=mincount and len(t[0])>=minlength]
67 | df_count = pd.DataFrame(df_count, columns=['word','count'])
68 | return df_count
69 |
70 | def splitcharTokenCount(dfs, splitchars="[^a-zA-Z0-9]+", mincount=2, minlength=1): #"[ -_|]+"
71 | """
72 | Tokenize a series of strings by splitting strings on a set of characters. Then count occurrence of tokens in series.
73 |
74 | Args:
75 | dfs (pd.series): pd.series of values
76 | splitchars (str): regex by which to split string into tokens. For example `"[^a-zA-Z0-9]+"` for anything not alpha-numeric or `"[ -_|]+"` for common ID tokens.
77 | mincount (int): discard tokens with count less than mincount
78 | minlength (int): discard tokens with string length less than minlength
79 |
80 | Returns:
81 | dataframe: count of tokens
82 |
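    Example:
        A small sketch (illustrative ids, not from the library docs)::

            import pandas as pd
            import d6tjoin.utils

            s = pd.Series(['AAP_US_Equity', 'AAPL_US_Equity', 'BMW_NA_Equity'])
            d6tjoin.utils.splitcharTokenCount(s, mincount=2)
            # tokens occurring at least twice, eg 'Equity' (3x) and 'US' (2x)
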
83 | """
84 | def funsplit(s):
85 | return re.split(splitchars,s)
86 | return tokenCount(dfs, funsplit, mincount, minlength)
87 |
88 | def ncharTokenCount(dfs, nchars=None, overlapping=False, mincount=2, minlength=1):
89 | """
90 | Tokenize a series of strings by splitting strings into tokens of `nchars` length. Then count occurrence of tokens in series.
91 |
92 | Args:
93 | dfs (pd.series): pd.series of values
94 | nchars (int): number of characters in each token
95 | overlapping (bool): make overlapping tokens
96 | mincount (int): discard tokens with count less than mincount
97 | minlength (int): discard tokens with string length less than minlength
98 |
99 | Returns:
100 | dataframe: count of tokens
101 |
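    Example:
        Illustrative sketch showing overlapping vs non-overlapping tokens (made-up values)::

            import pandas as pd
            import d6tjoin.utils

            # with nchars=2, 'ABCD' tokenizes to ['AB', 'CD'] (non-overlapping)
            # or to ['AB', 'BC', 'CD'] (overlapping)
            d6tjoin.utils.ncharTokenCount(pd.Series(['ABCD', 'ABCE']), nchars=2, mincount=2)
            # counts tokens across the series, here 'AB' appears twice
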
102 | """
103 | if not nchars:
104 | smax = dfs.str.len().max()
105 | smin = dfs.str.len().min()
106 | if smax-smin>2:
107 | warnings.warn('Tokenize works best if strings have similar length')
108 | nchars = dfs.str.len().max()//4
109 |
110 | if overlapping:
111 | def funtokenize(s):
112 | return [s[i:i+nchars] for i in range(0, len(s)-nchars+1)]
113 | else:
114 | def funtokenize(s):
115 | return [s[i:i+nchars] for i in range(0, len(s), nchars)]
116 | return tokenCount(dfs, funtokenize, mincount, minlength)
117 |
118 |
119 | def unique_contains(dfs, strlist):
120 | """
121 | Find values which contain a set of substrings
122 |
123 | Args:
124 | dfs (pd.series): pd.series of values
125 | strlist (list): substrings to find
126 |
127 | Returns:
128 | list: unique values which contain substring
129 |
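    Example:
        Illustrative sketch (made-up values)::

            import pandas as pd
            import d6tjoin.utils

            s = pd.Series(['PRIVATE_NMG', 'PRIVATE_JLP', 'BMW_NA_Equity'])
            d6tjoin.utils.unique_contains(s, ['PRIVATE'])
            # [('PRIVATE', ['PRIVATE_JLP', 'PRIVATE_NMG'])]
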
130 | """
131 | assert len(dfs.shape)==1
132 | dfs=np.unique(dfs)
133 | outlist = [(x, [s for s in dfs if x in s]) for x in strlist]
134 | return outlist
135 |
136 | import collections
137 |
138 | def typeSeries(dfs):
139 | """
140 | Find type of a pandas series
141 |
142 | Args:
143 | dfs (pd.series): pd.series of values
144 |
145 | Returns:
146 | str: type
147 |
148 | """
149 | c = collections.Counter([type(x) for x in dfs.values])
150 | cnt = c.most_common()
151 | if len(cnt)>1:
152 | return 'mixed'
153 | else:
154 | return cnt[0][0]
155 |
156 | def typeDataFrame(df):
157 | """
158 | Find the type of each column of a pandas dataframe
159 |
160 | Args:
161 | df (pd.dataframe): pandas dataframe
162 |
163 | Returns:
164 | dict: column, type
165 |
166 | """
167 | return dict(zip(df.columns,[typeSeries(df[s]) for s in df]))
168 |
169 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = python -msphinx
7 | SPHINXPROJ = d6tjoin
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=python -msphinx
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 | set SPHINXPROJ=d6t-lib
13 |
14 | if "%1" == "" goto help
15 |
16 | %SPHINXBUILD% >NUL 2>NUL
17 | if errorlevel 9009 (
18 | echo.
19 | echo.The Sphinx module was not found. Make sure you have Sphinx installed,
20 | echo.then set the SPHINXBUILD environment variable to point to the full
21 | echo.path of the 'sphinx-build' executable. Alternatively you may add the
22 | echo.Sphinx directory to PATH.
23 | echo.
24 | echo.If you don't have Sphinx installed, grab it from
25 | echo.http://sphinx-doc.org/
26 | exit /b 1
27 | )
28 |
29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
30 | goto end
31 |
32 | :help
33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
34 |
35 | :end
36 | popd
37 |
--------------------------------------------------------------------------------
/docs/samples.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import itertools
4 | from faker import Faker
5 | import importlib
6 |
7 | import d6tjoin.top1
8 | import d6tjoin.utils
9 |
10 | importlib.reload(d6tjoin.top1)
11 |
12 | # *******************************************************
13 | # generate sample time series data with id and value
14 | # *******************************************************
15 | nobs = 10
16 | f1 = Faker()
17 | f1.seed(0)
18 | uuid1 = [str(f1.uuid4()).split('-')[0] for _ in range(nobs)]
19 | dates1 = pd.date_range('1/1/2010','1/1/2011')
20 |
21 | df1 = pd.DataFrame(list(itertools.product(dates1,uuid1)),columns=['date','id'])
22 | df1['val1']=np.round(np.random.sample(df1.shape[0]),3)
23 |
24 | # create mismatch
25 | df2 = df1.copy()
26 | df2['id'] = df1['id'].str[1:-1]
27 | df2['val2']=np.round(np.random.sample(df2.shape[0]),3)
28 |
29 | d6tjoin.utils.PreJoin([df1,df2],['id','date']).stats_prejoin()
30 |
31 | result = d6tjoin.top1.MergeTop1(df1.head(),df2,fuzzy_left_on=['id'],fuzzy_right_on=['id'],exact_left_on=['date'],exact_right_on=['date']).merge()
32 |
33 | print(result['top1']['id'].head(2))
34 |
35 | print(result['merged'].head(2))
36 |
--------------------------------------------------------------------------------
/docs/shell-napoleon-html.sh:
--------------------------------------------------------------------------------
1 | make html
2 |
--------------------------------------------------------------------------------
/docs/shell-napoleon-recreate.sh:
--------------------------------------------------------------------------------
1 | #rm ./source/*
2 | #cp ./source-bak/* ./source/
3 | sphinx-apidoc -f -o ./source ..
4 | make clean
5 | make html
6 |
--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | #
4 | # d6t-lib documentation build configuration file, created by
5 | # sphinx-quickstart on Tue Nov 28 11:32:56 2017.
6 | #
7 | # This file is execfile()d with the current directory set to its
8 | # containing dir.
9 | #
10 | # Note that not all possible configuration values are present in this
11 | # autogenerated file.
12 | #
13 | # All configuration values have a default; values that are commented out
14 | # serve to show the default.
15 |
16 | # If extensions (or modules to document with autodoc) are in another directory,
17 | # add these directories to sys.path here. If the directory is relative to the
18 | # documentation root, use os.path.abspath to make it absolute, like shown here.
19 | #
20 | import os
21 | import sys
22 |
23 | sys.path.insert(0, os.path.abspath('.'))
24 | sys.path.insert(0, os.path.dirname(os.path.abspath('.'))) # todo: why is this not working?
25 | sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath('.'))))
26 | sys.path.insert(0, os.path.join(os.path.dirname((os.path.abspath('.'))), "d6tjoin"))
27 |
28 | # -- General configuration ------------------------------------------------
29 |
30 | # If your documentation needs a minimal Sphinx version, state it here.
31 | #
32 | # needs_sphinx = '1.0'
33 |
34 | # Add any Sphinx extension module names here, as strings. They can be
35 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
36 | # ones.
37 | extensions = ['sphinx.ext.autodoc',
38 | 'sphinx.ext.todo',
39 | 'sphinx.ext.viewcode',
40 | 'sphinx.ext.githubpages',
41 | 'sphinx.ext.napoleon']
42 |
43 | # Add any paths that contain templates here, relative to this directory.
44 | templates_path = ['_templates']
45 |
46 | # The suffix(es) of source filenames.
47 | # You can specify multiple suffix as a list of string:
48 | #
49 | # source_suffix = ['.rst', '.md']
50 | source_suffix = '.rst'
51 |
52 | # The master toctree document.
53 | master_doc = 'index'
54 |
55 | # General information about the project.
56 | project = 'd6tjoin'
57 | copyright = '2017, databolt'
58 | author = 'databolt'
59 |
60 | # The version info for the project you're documenting, acts as replacement for
61 | # |version| and |release|, also used in various other places throughout the
62 | # built documents.
63 | #
64 | # The short X.Y version.
65 | version = '0.1'
66 | # The full version, including alpha/beta/rc tags.
67 | release = '0.1'
68 |
69 | # The language for content autogenerated by Sphinx. Refer to documentation
70 | # for a list of supported languages.
71 | #
72 | # This is also used if you do content translation via gettext catalogs.
73 | # Usually you set "language" from the command line for these cases.
74 | language = None
75 |
76 | # List of patterns, relative to source directory, that match files and
77 | # directories to ignore when looking for source files.
78 | # This patterns also effect to html_static_path and html_extra_path
79 | exclude_patterns = []
80 |
81 | # The name of the Pygments (syntax highlighting) style to use.
82 | pygments_style = 'sphinx'
83 |
84 | # If true, `todo` and `todoList` produce output, else they produce nothing.
85 | todo_include_todos = True
86 |
87 | # -- Options for HTML output ----------------------------------------------
88 |
89 | # The theme to use for HTML and HTML Help pages. See the documentation for
90 | # a list of builtin themes.
91 | #
92 | html_theme = 'sphinx_rtd_theme' # 'alabaster'
93 |
94 | # Theme options are theme-specific and customize the look and feel of a theme
95 | # further. For a list of options available for each theme, see the
96 | # documentation.
97 | #
98 | # html_theme_options = {}
99 |
100 | # Add any paths that contain custom static files (such as style sheets) here,
101 | # relative to this directory. They are copied after the builtin static files,
102 | # so a file named "default.css" will overwrite the builtin "default.css".
103 | html_static_path = ['_static']
104 |
105 | # Custom sidebar templates, must be a dictionary that maps document names
106 | # to template names.
107 | #
108 | # This is required for the alabaster theme
109 | # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
110 | # html_sidebars = {
111 | # '**': [
112 | # 'about.html',
113 | # 'navigation.html',
114 | # 'relations.html', # needs 'show_related': True theme option to display
115 | # 'searchbox.html',
116 | # 'donate.html',
117 | # ]
118 | # }
119 |
120 |
121 | # -- Options for HTMLHelp output ------------------------------------------
122 |
123 | # Output file base name for HTML help builder.
124 | htmlhelp_basename = 'd6tjoin-doc'
125 |
126 | # -- Options for LaTeX output ---------------------------------------------
127 |
128 | latex_elements = {
129 | # The paper size ('letterpaper' or 'a4paper').
130 | #
131 | # 'papersize': 'letterpaper',
132 |
133 | # The font size ('10pt', '11pt' or '12pt').
134 | #
135 | # 'pointsize': '10pt',
136 |
137 | # Additional stuff for the LaTeX preamble.
138 | #
139 | # 'preamble': '',
140 |
141 | # Latex figure (float) alignment
142 | #
143 | # 'figure_align': 'htbp',
144 | }
145 |
146 | # Grouping the document tree into LaTeX files. List of tuples
147 | # (source start file, target name, title,
148 | # author, documentclass [howto, manual, or own class]).
149 | latex_documents = [
150 | (master_doc, 'd6tjoin.tex', 'd6tjoin Documentation',
151 | 'nn', 'manual'),
152 | ]
153 |
154 | # -- Options for manual page output ---------------------------------------
155 |
156 | # One entry per manual page. List of tuples
157 | # (source start file, name, description, authors, manual section).
158 | man_pages = [
159 | (master_doc, 'd6tjoin', 'd6tjoin Documentation',
160 | [author], 1)
161 | ]
162 |
163 | # -- Options for Texinfo output -------------------------------------------
164 |
165 | # Grouping the document tree into Texinfo files. List of tuples
166 | # (source start file, target name, title, author,
167 | # dir menu entry, description, category)
168 | texinfo_documents = [
169 | (master_doc, 'd6tjoin', 'd6tjoin Documentation',
170 | author, 'd6tjoin', 'Databolt python library - Accelerate data engineering',
171 | 'Miscellaneous'),
172 | ]
173 |
--------------------------------------------------------------------------------
/docs/source/d6tjoin.rst:
--------------------------------------------------------------------------------
1 | d6tjoin package
2 | ===============
3 |
4 | Submodules
5 | ----------
6 |
7 | d6tjoin\.top1 module
8 | --------------------
9 |
10 | .. automodule:: d6tjoin.top1
11 | :members:
12 | :undoc-members:
13 | :show-inheritance:
14 |
15 | d6tjoin\.utils module
16 | ---------------------
17 |
18 | .. automodule:: d6tjoin.utils
19 | :members:
20 | :undoc-members:
21 | :show-inheritance:
22 |
23 |
24 | Module contents
25 | ---------------
26 |
27 | .. automodule:: d6tjoin
28 | :members:
29 | :undoc-members:
30 | :show-inheritance:
31 |
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | .. d6tjoin documentation master file, created by
2 | sphinx-quickstart on Tue Nov 28 11:32:56 2017.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | Welcome to d6tjoin documentation!
7 | ==============================================
8 |
9 | Documentation for using the databolt python Smart Join library.
10 |
11 | Library Docs
12 | ==================
13 |
14 | * :ref:`modindex`
15 |
16 | Search
17 | ==================
18 |
19 | * :ref:`search`
20 |
--------------------------------------------------------------------------------
/docs/source/modules.rst:
--------------------------------------------------------------------------------
1 | d6tjoin
2 | =======
3 |
4 | .. toctree::
5 | :maxdepth: 4
6 |
7 | d6tjoin
8 | setup
9 | tests
10 |
--------------------------------------------------------------------------------
/docs/source/setup.rst:
--------------------------------------------------------------------------------
1 | setup module
2 | ============
3 |
4 | .. automodule:: setup
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/examples-prejoin.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Engineering in Python with databolt - Identify and analyze join problems (d6tjoin.Prejoin)\n",
8 | "\n",
9 | "## Introduction\n",
10 | "\n",
11 | "Joining datasets is a common data engineering operation. However, often there are problems merging datasets from different sources because of mismatched identifiers, date conventions etc. \n",
12 | "\n",
13 | "** `d6tjoin.Prejoin` module allows you to test for join accuracy and quickly identify and analyze join problems. **\n",
14 | "\n",
15 | "Here are some examples which show you how to:\n",
16 | "* do join quality analysis prior to attempting a join\n",
17 | "* detect and analyze a string-based identifiers mismatch\n",
18 | "* detect and analyze a date mismatch"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "## Generate sample data\n",
26 | "\n",
27 | "Let's generate some random respresentative data:\n",
28 | "* identifier (string)\n",
29 | "* date (np.datetime)\n",
30 | "* values (flaot)"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 1,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "import pandas as pd\n",
40 | "import numpy as np\n",
41 | "import uuid\n",
42 | "import itertools\n",
43 | "import importlib\n",
44 | "\n",
45 | "import d6tjoin\n",
46 | "\n",
47 | "# ******************************************\n",
48 | "# generate sample data\n",
49 | "# ******************************************\n",
50 | "nobs = 10\n",
51 | "uuid1 = [str(uuid.uuid4()) for _ in range(nobs)]\n",
52 | "dates1 = pd.date_range('1/1/2010','1/1/2011')\n",
53 | "\n",
54 | "df1 = pd.DataFrame(list(itertools.product(uuid1,dates1)),columns=['id','date'])\n",
55 | "df1['v']=np.random.sample(df1.shape[0])"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": 2,
61 | "metadata": {},
62 | "outputs": [
63 | {
64 | "data": {
65 | "text/html": [
66 | "
\n",
67 | "\n",
80 | "
\n",
81 | " \n",
82 | " \n",
83 | " | \n",
84 | " id | \n",
85 | " date | \n",
86 | " v | \n",
87 | "
\n",
88 | " \n",
89 | " \n",
90 | " \n",
91 | " 0 | \n",
92 | " 26e41c83-630e-47c5-a410-83fd7865e826 | \n",
93 | " 2010-01-01 | \n",
94 | " 0.589946 | \n",
95 | "
\n",
96 | " \n",
97 | " 1 | \n",
98 | " 26e41c83-630e-47c5-a410-83fd7865e826 | \n",
99 | " 2010-01-02 | \n",
100 | " 0.367214 | \n",
101 | "
\n",
102 | " \n",
103 | " 366 | \n",
104 | " 049676df-998a-4322-9121-84dac8b7547f | \n",
105 | " 2010-01-01 | \n",
106 | " 0.570425 | \n",
107 | "
\n",
108 | " \n",
109 | " 367 | \n",
110 | " 049676df-998a-4322-9121-84dac8b7547f | \n",
111 | " 2010-01-02 | \n",
112 | " 0.524693 | \n",
113 | "
\n",
114 | " \n",
115 | " 732 | \n",
116 | " ad14d610-3a0b-4d87-8a29-236c9b6e817e | \n",
117 | " 2010-01-01 | \n",
118 | " 0.681610 | \n",
119 | "
\n",
120 | " \n",
121 | " 733 | \n",
122 | " ad14d610-3a0b-4d87-8a29-236c9b6e817e | \n",
123 | " 2010-01-02 | \n",
124 | " 0.236658 | \n",
125 | "
\n",
126 | " \n",
127 | "
\n",
128 | "
"
129 | ],
130 | "text/plain": [
131 | " id date v\n",
132 | "0 26e41c83-630e-47c5-a410-83fd7865e826 2010-01-01 0.589946\n",
133 | "1 26e41c83-630e-47c5-a410-83fd7865e826 2010-01-02 0.367214\n",
134 | "366 049676df-998a-4322-9121-84dac8b7547f 2010-01-01 0.570425\n",
135 | "367 049676df-998a-4322-9121-84dac8b7547f 2010-01-02 0.524693\n",
136 | "732 ad14d610-3a0b-4d87-8a29-236c9b6e817e 2010-01-01 0.681610\n",
137 | "733 ad14d610-3a0b-4d87-8a29-236c9b6e817e 2010-01-02 0.236658"
138 | ]
139 | },
140 | "execution_count": 2,
141 | "metadata": {},
142 | "output_type": "execute_result"
143 | }
144 | ],
145 | "source": [
146 | "df1.groupby(['id']).head(2).head(6)"
147 | ]
148 | },
149 | {
150 | "cell_type": "markdown",
151 | "metadata": {},
152 | "source": [
153 | "## Use Case: assert 100% join accuracy for data integrity checks \n",
154 | "\n",
155 | "In data enginerring QA you want to test that data is joined correctly. This is particularly useful for detecting potential data problems in production."
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": 3,
161 | "metadata": {},
162 | "outputs": [],
163 | "source": [
164 | "df2 = df1.copy()\n",
165 | "\n",
166 | "j = d6tjoin.Prejoin([df1,df2],['id','date'])\n",
167 | "assert j.is_all_matched() # succeeds\n",
168 | "assert j.is_all_matched('id') # succeeds\n",
169 | "assert j.is_all_matched('date') # succeeds\n"
170 | ]
171 | },
172 | {
173 | "cell_type": "markdown",
174 | "metadata": {},
175 | "source": [
176 | "## Use Case: detect and analyze id mismatch \n",
177 | "\n",
178 | "When joining data from different sources, eg different vendors, often your ids don't match and then you need to manually analyze the situation. With databolt this becomes much easier."
179 | ]
180 | },
181 | {
182 | "cell_type": "markdown",
183 | "metadata": {},
184 | "source": [
185 | "### 100% id mismatch\n",
186 | "\n",
187 | "Let's look at an example where say vendor 1 uses a different id convention than vendor 2 and none of the ids match."
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": 4,
193 | "metadata": {},
194 | "outputs": [
195 | {
196 | "name": "stdout",
197 | "output_type": "stream",
198 | "text": [
199 | "assert fails!\n"
200 | ]
201 | }
202 | ],
203 | "source": [
204 | "# create mismatch\n",
205 | "df2['id'] = df1['id'].str[1:-1]\n",
206 | "\n",
207 | "j = d6tjoin.Prejoin([df1,df2],['id','date'])\n",
208 | "\n",
209 | "try:\n",
210 | " assert j.is_all_matched() # fails\n",
211 | "except:\n",
212 | " print('assert fails!')"
213 | ]
214 | },
215 | {
216 | "cell_type": "markdown",
217 | "metadata": {},
218 | "source": [
219 | "The QA check shows there's a problem, lets analyze the issue with `Prejoin.match_quality()`. We can immediately see that none of the ids match."
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": 5,
225 | "metadata": {},
226 | "outputs": [
227 | {
228 | "name": "stdout",
229 | "output_type": "stream",
230 | "text": [
231 | " key left key right all matched inner left right outer unmatched total unmatched left unmatched right\n",
232 | "0 id id False 0 10 10 20 20 10 10\n",
233 | "1 date date True 366 366 366 366 0 0 0\n",
234 | "2 __all__ __all__ False 0 3660 3660 7320 7320 3660 3660\n"
235 | ]
236 | }
237 | ],
238 | "source": [
239 | "j.match_quality()"
240 | ]
241 | },
242 | {
243 | "cell_type": "markdown",
244 | "metadata": {},
245 | "source": [
246 | "Let's look at some of the mismatched records with `Prejoin.show_unmatched()`. Looks like there might be a length problem."
247 | ]
248 | },
249 | {
250 | "cell_type": "code",
251 | "execution_count": 6,
252 | "metadata": {},
253 | "outputs": [
254 | {
255 | "name": "stdout",
256 | "output_type": "stream",
257 | "text": [
258 | " id date v\n",
259 | "1098 b5c945ce-fdf6-4774-bbae-ff5f5787a7eb 2010-01-01 0.194907\n",
260 | "1099 b5c945ce-fdf6-4774-bbae-ff5f5787a7eb 2010-01-02 0.558549\n",
261 | "1100 b5c945ce-fdf6-4774-bbae-ff5f5787a7eb 2010-01-03 0.316138\n",
262 | " id date v\n",
263 | "0 6e41c83-630e-47c5-a410-83fd7865e82 2010-01-01 0.589946\n",
264 | "1 6e41c83-630e-47c5-a410-83fd7865e82 2010-01-02 0.367214\n",
265 | "2 6e41c83-630e-47c5-a410-83fd7865e82 2010-01-03 0.290587\n"
266 | ]
267 | }
268 | ],
269 | "source": [
270 | "print(j.show_unmatched('id')['left'])\n",
271 | "print(j.show_unmatched('id')['right'])"
272 | ]
273 | },
274 | {
275 | "cell_type": "markdown",
276 | "metadata": {},
277 | "source": [
278 | "We can show string length statistics using `d6tjoin.Prejoin().describe_str()` which confirms that the id string lenghts are different."
279 | ]
280 | },
281 | {
282 | "cell_type": "code",
283 | "execution_count": 7,
284 | "metadata": {
285 | "scrolled": true
286 | },
287 | "outputs": [
288 | {
289 | "name": "stdout",
290 | "output_type": "stream",
291 | "text": [
292 | "dataframe #0\n",
293 | " median min max nrecords\n",
294 | "id 36.0 36.0 36.0 3660.0\n",
295 | "dataframe #1\n",
296 | " median min max nrecords\n",
297 | "id 34.0 34.0 34.0 3660.0\n",
298 | "None\n"
299 | ]
300 | }
301 | ],
302 | "source": [
303 | "print(j.describe_str())\n"
304 | ]
305 | },
306 | {
307 | "cell_type": "markdown",
308 | "metadata": {},
309 | "source": [
310 | "### Partial id mismatch\n",
311 | "\n",
312 | "Let's look at another example where there is a partial mismatch. In this case let's say vendor 2 only has a certain percentage of ids covered."
313 | ]
314 | },
315 | {
316 | "cell_type": "code",
317 | "execution_count": 8,
318 | "metadata": {
319 | "scrolled": true
320 | },
321 | "outputs": [
322 | {
323 | "name": "stdout",
324 | "output_type": "stream",
325 | "text": [
326 | "assert fails!\n"
327 | ]
328 | }
329 | ],
330 | "source": [
331 | "# create partial mismatch\n",
332 | "uuid_sel = np.array(uuid1)[np.random.choice(nobs, nobs//5, replace=False)].tolist()\n",
333 | "df2 = df1[~df1['id'].isin(uuid_sel)]\n",
334 | "\n",
335 | "j = d6tjoin.Prejoin([df1,df2],['id','date'])\n",
336 | "\n",
337 | "try:\n",
338 | " assert j.is_all_matched() # fails\n",
339 | "except:\n",
340 | " print('assert fails!')"
341 | ]
342 | },
343 | {
344 | "cell_type": "markdown",
345 | "metadata": {},
346 | "source": [
347 | "Again we've quickly identified a problem. This would typically cause you to do manual and tedious manual QA work but with `Prejoin().match_quality()` you can quickly see how many ids were mismatched."
348 | ]
349 | },
350 | {
351 | "cell_type": "code",
352 | "execution_count": 9,
353 | "metadata": {
354 | "scrolled": true
355 | },
356 | "outputs": [
357 | {
358 | "name": "stdout",
359 | "output_type": "stream",
360 | "text": [
361 | " key left key right all matched inner left right outer unmatched total unmatched left unmatched right\n",
362 | "0 id id False 8 10 8 10 2 2 0\n",
363 | "1 date date True 366 366 366 366 0 0 0\n",
364 | "2 __all__ __all__ False 2928 3660 2928 3660 732 732 0\n"
365 | ]
366 | }
367 | ],
368 | "source": [
369 | "j.match_quality()"
370 | ]
371 | },
372 | {
373 | "cell_type": "markdown",
374 | "metadata": {},
375 | "source": [
376 | "## Use Case: detect and analyze date mismatch \n",
377 | "\n",
378 | "Dates are another common sources of frustration for data engineers working with time series data. Dates come in a variety of different formats and conventions. Let's use databolt to analyze a date mismatch situation."
379 | ]
380 | },
381 | {
382 | "cell_type": "code",
383 | "execution_count": 10,
384 | "metadata": {},
385 | "outputs": [],
386 | "source": [
387 | "dates2 = pd.bdate_range('1/1/2010','1/1/2011') # business instead of calendar dates\n",
388 | "df2 = pd.DataFrame(list(itertools.product(uuid1,dates2)),columns=['id','date'])\n",
389 | "df2['v']=np.random.sample(df2.shape[0])"
390 | ]
391 | },
392 | {
393 | "cell_type": "markdown",
394 | "metadata": {},
395 | "source": [
396 | "To highlight some different functionality for `Prejoin().match_quality()`. The QA test for all matches fails."
397 | ]
398 | },
399 | {
400 | "cell_type": "code",
401 | "execution_count": 11,
402 | "metadata": {},
403 | "outputs": [
404 | {
405 | "name": "stdout",
406 | "output_type": "stream",
407 | "text": [
408 | " key left key right all matched inner left right outer unmatched total unmatched left unmatched right\n",
409 | "0 id id True 10 10 10 10 0 0 0\n",
410 | "1 date date False 261 366 261 366 105 105 0\n",
411 | "2 __all__ __all__ False 2610 3660 2610 3660 1050 1050 0\n",
412 | "assert fails!\n"
413 | ]
414 | }
415 | ],
416 | "source": [
417 | "j = d6tjoin.Prejoin([df1,df2],['id','date'])\n",
418 | "dfr = j.match_quality()\n",
419 | "try:\n",
420 | " assert dfr['all matched'].all() # fails\n",
421 | "except:\n",
422 | " print('assert fails!')"
423 | ]
424 | },
425 | {
426 | "cell_type": "markdown",
427 | "metadata": {},
428 | "source": [
429 | "We can look at the dataframe to see 105 dates are not matched."
430 | ]
431 | },
432 | {
433 | "cell_type": "code",
434 | "execution_count": 12,
435 | "metadata": {},
436 | "outputs": [],
437 | "source": [
438 | "dfr"
439 | ]
440 | },
441 | {
442 | "cell_type": "markdown",
443 | "metadata": {},
444 | "source": [
445 | "We can look at mismatched records using `Prejoin.show_unmatched()`. Here we will return all mismatched records into a dataframe you can analyze."
446 | ]
447 | },
448 | {
449 | "cell_type": "code",
450 | "execution_count": 13,
451 | "metadata": {},
452 | "outputs": [],
453 | "source": [
454 | "dft = j.show_unmatched('date',keys_only=False,nrecords=-1,nrows=-1)['left']"
455 | ]
456 | },
457 | {
458 | "cell_type": "code",
459 | "execution_count": 14,
460 | "metadata": {},
461 | "outputs": [
462 | {
463 | "data": {
464 | "text/html": [
465 | "\n",
466 | "\n",
479 | "
\n",
480 | " \n",
481 | " \n",
482 | " | \n",
483 | " id | \n",
484 | " date | \n",
485 | " v | \n",
486 | "
\n",
487 | " \n",
488 | " \n",
489 | " \n",
490 | " 1 | \n",
491 | " 26e41c83-630e-47c5-a410-83fd7865e826 | \n",
492 | " 2010-01-02 | \n",
493 | " 0.367214 | \n",
494 | "
\n",
495 | " \n",
496 | " 2 | \n",
497 | " 26e41c83-630e-47c5-a410-83fd7865e826 | \n",
498 | " 2010-01-03 | \n",
499 | " 0.290587 | \n",
500 | "
\n",
501 | " \n",
502 | " 8 | \n",
503 | " 26e41c83-630e-47c5-a410-83fd7865e826 | \n",
504 | " 2010-01-09 | \n",
505 | " 0.663732 | \n",
506 | "
\n",
507 | " \n",
508 | " 9 | \n",
509 | " 26e41c83-630e-47c5-a410-83fd7865e826 | \n",
510 | " 2010-01-10 | \n",
511 | " 0.210751 | \n",
512 | "
\n",
513 | " \n",
514 | " 15 | \n",
515 | " 26e41c83-630e-47c5-a410-83fd7865e826 | \n",
516 | " 2010-01-16 | \n",
517 | " 0.889254 | \n",
518 | "
\n",
519 | " \n",
520 | "
\n",
521 | "
"
522 | ],
523 | "text/plain": [
524 | " id date v\n",
525 | "1 26e41c83-630e-47c5-a410-83fd7865e826 2010-01-02 0.367214\n",
526 | "2 26e41c83-630e-47c5-a410-83fd7865e826 2010-01-03 0.290587\n",
527 | "8 26e41c83-630e-47c5-a410-83fd7865e826 2010-01-09 0.663732\n",
528 | "9 26e41c83-630e-47c5-a410-83fd7865e826 2010-01-10 0.210751\n",
529 | "15 26e41c83-630e-47c5-a410-83fd7865e826 2010-01-16 0.889254"
530 | ]
531 | },
532 | "execution_count": 14,
533 | "metadata": {},
534 | "output_type": "execute_result"
535 | }
536 | ],
537 | "source": [
538 | "dft.head()"
539 | ]
540 | },
541 | {
542 | "cell_type": "markdown",
543 | "metadata": {},
544 | "source": [
545 | "Looking at the weekdays of the mismatched entries, you can see they are all weekends. "
546 | ]
547 | },
548 | {
549 | "cell_type": "code",
550 | "execution_count": 15,
551 | "metadata": {},
552 | "outputs": [
553 | {
554 | "data": {
555 | "text/plain": [
556 | "array([5, 6])"
557 | ]
558 | },
559 | "execution_count": 15,
560 | "metadata": {},
561 | "output_type": "execute_result"
562 | }
563 | ],
564 | "source": [
565 | "dft['date_wkday']=dft['date'].dt.weekday\n",
566 | "dft['date_wkday'].unique()"
567 | ]
568 | },
569 | {
570 | "cell_type": "markdown",
571 | "metadata": {},
572 | "source": [
573 | "## Conclusion\n",
574 | "\n",
575 | "Joining datasets from different sources can be a big time waster for data engineers! With databolt you can quickly do join QA and analyze problems without doing manual tedious work."
576 | ]
577 | },
578 | {
579 | "cell_type": "code",
580 | "execution_count": null,
581 | "metadata": {},
582 | "outputs": [],
583 | "source": []
584 | }
585 | ],
586 | "metadata": {
587 | "kernelspec": {
588 | "display_name": "Python 3",
589 | "language": "python",
590 | "name": "python3"
591 | },
592 | "language_info": {
593 | "codemirror_mode": {
594 | "name": "ipython",
595 | "version": 3
596 | },
597 | "file_extension": ".py",
598 | "mimetype": "text/x-python",
599 | "name": "python",
600 | "nbconvert_exporter": "python",
601 | "pygments_lexer": "ipython3",
602 | "version": "3.7.6"
603 | }
604 | },
605 | "nbformat": 4,
606 | "nbformat_minor": 2
607 | }
608 |
--------------------------------------------------------------------------------
/examples-tokencluster.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Engineering in Python with databolt - Find Token Clusters for Fuzzy Merging Identifiers (d6tlib/d6tjoin.utils)\n",
8 | "\n",
9 | "## Introduction\n",
10 | "\n",
11 | "Identifiers such as securities IDs often come in different conventions which makes joining them difficult. Normal joins don't work and fuzzy joins often get tripped up by commonly occuring tokens. \n",
12 | "\n",
13 | "In this notebook we will show how to use `d6tstack.utils.tokenCount` to find clusters of tokens and match on tokens."
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": 1,
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "import d6tjoin.utils\n",
23 | "import d6tjoin.top1\n",
24 | "import pandas as pd\n",
25 | "pd.set_option('display.expand_frame_repr', False)\n",
26 | "import numpy as np"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 2,
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "# data is tickers from two different vendors which try to join\n",
36 | "df1 = pd.DataFrame({'id':[\"AAP\",\"AAPL\",\"APRN\",\"AMZN-AMZN\",\"BBW\",\"NMG\",\"JLP\"]})\n",
37 | "df2 = pd.DataFrame({'id':[\"AAP_US_Equity\",\"AAPL_US_Equity\",\"AMZN_US_Equity\",\"APRN_US_Equity\",\"AD_NA_Equity\",\"BBY_US_Equity\",\"BMW_NA_Equity\",\"PRIVATE_NMG\",\"PRIVATE_JLP\"]})\n"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 3,
43 | "metadata": {},
44 | "outputs": [
45 | {
46 | "name": "stdout",
47 | "output_type": "stream",
48 | "text": [
49 | " key left key right all matched inner left right outer unmatched total unmatched left unmatched right\n",
50 | "0 id id False 0 7 9 16 16 7 9\n",
51 | "1 __all__ __all__ False 0 7 9 16 16 7 9\n"
52 | ]
53 | }
54 | ],
55 | "source": [
56 | "# d6tjoin.Prejoin() shows none of the ids match\n",
57 | "\n",
58 | "d6tjoin.Prejoin([df1,df2],['id']).match_quality()"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 4,
64 | "metadata": {},
65 | "outputs": [
66 | {
67 | "name": "stdout",
68 | "output_type": "stream",
69 | "text": [
70 | " key left key right all matched inner left right outer unmatched total unmatched left unmatched right\n",
71 | "0 id_cleaned id_cleaned False 4 7 8 11 7 3 4\n",
72 | "1 __all__ __all__ False 4 7 8 11 7 3 4\n"
73 | ]
74 | }
75 | ],
76 | "source": [
77 | "# attempt to join manually, better but still missing a few\n",
78 | "\n",
79 | "df1['id_cleaned'] = df1['id'].str.split('-').str[0]\n",
80 | "df2['id_cleaned'] = df2['id'].str.split('_').str[0]\n",
81 | "\n",
82 | "d6tjoin.Prejoin([df1,df2],['id_cleaned']).match_quality()"
83 | ]
84 | },
85 | {
86 | "cell_type": "markdown",
87 | "metadata": {},
88 | "source": [
89 | "## Fuzzy joins get confused by tokens\n",
90 | "\n",
91 | "Fuzzy joins to the rescue? Unfortunately, the presence of commonly occuring string tokens is messing with the string similarity functions."
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": 5,
97 | "metadata": {},
98 | "outputs": [
99 | {
100 | "name": "stderr",
101 | "output_type": "stream",
102 | "text": [
103 | "/Users/haijing/Haijing/ds_project/d6t/d6tjoin/d6tjoin/top1.py:118: UserWarning: Top1 join for id has duplicates\n",
104 | " warnings.warn('Top1 join for %s has duplicates' %self.cfg_fuzzy_left_on)\n"
105 | ]
106 | },
107 | {
108 | "data": {
109 | "text/html": [
110 | "\n",
111 | "\n",
124 | "
\n",
125 | " \n",
126 | " \n",
127 | " | \n",
128 | " __top1left__ | \n",
129 | " __top1right__ | \n",
130 | " __matchtype__ | \n",
131 | " __top1diff__ | \n",
132 | "
\n",
133 | " \n",
134 | " \n",
135 | " \n",
136 | " 40 | \n",
137 | " AAP | \n",
138 | " PRIVATE_JLP | \n",
139 | " top1 left | \n",
140 | " 9 | \n",
141 | "
\n",
142 | " \n",
143 | " 58 | \n",
144 | " AAPL | \n",
145 | " PRIVATE_JLP | \n",
146 | " top1 left | \n",
147 | " 9 | \n",
148 | "
\n",
149 | " \n",
150 | " 27 | \n",
151 | " AMZN-AMZN | \n",
152 | " PRIVATE_NMG | \n",
153 | " top1 left | \n",
154 | " 10 | \n",
155 | "
\n",
156 | " \n",
157 | " 30 | \n",
158 | " AMZN-AMZN | \n",
159 | " AD_NA_Equity | \n",
160 | " top1 left | \n",
161 | " 10 | \n",
162 | "
\n",
163 | " \n",
164 | " 34 | \n",
165 | " AMZN-AMZN | \n",
166 | " AMZN_US_Equity | \n",
167 | " top1 left | \n",
168 | " 10 | \n",
169 | "
\n",
170 | " \n",
171 | " 9 | \n",
172 | " APRN | \n",
173 | " PRIVATE_NMG | \n",
174 | " top1 left | \n",
175 | " 9 | \n",
176 | "
\n",
177 | " \n",
178 | " 0 | \n",
179 | " BBW | \n",
180 | " PRIVATE_NMG | \n",
181 | " top1 left | \n",
182 | " 11 | \n",
183 | "
\n",
184 | " \n",
185 | " 1 | \n",
186 | " BBW | \n",
187 | " BBY_US_Equity | \n",
188 | " top1 left | \n",
189 | " 11 | \n",
190 | "
\n",
191 | " \n",
192 | " 4 | \n",
193 | " BBW | \n",
194 | " PRIVATE_JLP | \n",
195 | " top1 left | \n",
196 | " 11 | \n",
197 | "
\n",
198 | " \n",
199 | " 5 | \n",
200 | " BBW | \n",
201 | " BMW_NA_Equity | \n",
202 | " top1 left | \n",
203 | " 11 | \n",
204 | "
\n",
205 | " \n",
206 | " 22 | \n",
207 | " JLP | \n",
208 | " PRIVATE_JLP | \n",
209 | " top1 left | \n",
210 | " 8 | \n",
211 | "
\n",
212 | " \n",
213 | " 45 | \n",
214 | " NMG | \n",
215 | " PRIVATE_NMG | \n",
216 | " top1 left | \n",
217 | " 8 | \n",
218 | "
\n",
219 | " \n",
220 | "
\n",
221 | "
"
222 | ],
223 | "text/plain": [
224 | " __top1left__ __top1right__ __matchtype__ __top1diff__\n",
225 | "40 AAP PRIVATE_JLP top1 left 9\n",
226 | "58 AAPL PRIVATE_JLP top1 left 9\n",
227 | "27 AMZN-AMZN PRIVATE_NMG top1 left 10\n",
228 | "30 AMZN-AMZN AD_NA_Equity top1 left 10\n",
229 | "34 AMZN-AMZN AMZN_US_Equity top1 left 10\n",
230 | "9 APRN PRIVATE_NMG top1 left 9\n",
231 | "0 BBW PRIVATE_NMG top1 left 11\n",
232 | "1 BBW BBY_US_Equity top1 left 11\n",
233 | "4 BBW PRIVATE_JLP top1 left 11\n",
234 | "5 BBW BMW_NA_Equity top1 left 11\n",
235 | "22 JLP PRIVATE_JLP top1 left 8\n",
236 | "45 NMG PRIVATE_NMG top1 left 8"
237 | ]
238 | },
239 | "execution_count": 5,
240 | "metadata": {},
241 | "output_type": "execute_result"
242 | }
243 | ],
244 | "source": [
245 | "# attempt a fuzzy join using edit distance => not looking good\n",
246 | "d6tjoin.top1.MergeTop1(df1,df2,fuzzy_left_on=['id'],fuzzy_right_on=['id']).merge()['top1']['id']"
247 | ]
248 | },
249 | {
250 | "cell_type": "code",
251 | "execution_count": 6,
252 | "metadata": {},
253 | "outputs": [
254 | {
255 | "name": "stderr",
256 | "output_type": "stream",
257 | "text": [
258 | "/Users/haijing/Haijing/ds_project/d6t/d6tjoin/d6tjoin/top1.py:118: UserWarning: Top1 join for id has duplicates\n",
259 | " warnings.warn('Top1 join for %s has duplicates' %self.cfg_fuzzy_left_on)\n"
260 | ]
261 | },
262 | {
263 | "data": {
264 | "text/html": [
265 | "\n",
266 | "\n",
279 | "
\n",
280 | " \n",
281 | " \n",
282 | " | \n",
283 | " __top1left__ | \n",
284 | " __top1right__ | \n",
285 | " __matchtype__ | \n",
286 | " __top1diff__ | \n",
287 | "
\n",
288 | " \n",
289 | " \n",
290 | " \n",
291 | " 42 | \n",
292 | " AAP | \n",
293 | " AAP_US_Equity | \n",
294 | " top1 left | \n",
295 | " 13.000 | \n",
296 | "
\n",
297 | " \n",
298 | " 56 | \n",
299 | " AAPL | \n",
300 | " AAPL_US_Equity | \n",
301 | " top1 left | \n",
302 | " 14.000 | \n",
303 | "
\n",
304 | " \n",
305 | " 34 | \n",
306 | " AMZN-AMZN | \n",
307 | " AMZN_US_Equity | \n",
308 | " top1 left | \n",
309 | " 64.625 | \n",
310 | "
\n",
311 | " \n",
312 | " 17 | \n",
313 | " APRN | \n",
314 | " APRN_US_Equity | \n",
315 | " top1 left | \n",
316 | " 14.000 | \n",
317 | "
\n",
318 | " \n",
319 | " 1 | \n",
320 | " BBW | \n",
321 | " BBY_US_Equity | \n",
322 | " top1 left | \n",
323 | " 23.000 | \n",
324 | "
\n",
325 | " \n",
326 | " 5 | \n",
327 | " BBW | \n",
328 | " BMW_NA_Equity | \n",
329 | " top1 left | \n",
330 | " 23.000 | \n",
331 | "
\n",
332 | " \n",
333 | " 24 | \n",
334 | " JLP | \n",
335 | " AAP_US_Equity | \n",
336 | " top1 left | \n",
337 | " 33.000 | \n",
338 | "
\n",
339 | " \n",
340 | " 50 | \n",
341 | " NMG | \n",
342 | " BMW_NA_Equity | \n",
343 | " top1 left | \n",
344 | " 33.000 | \n",
345 | "
\n",
346 | " \n",
347 | "
\n",
348 | "
"
349 | ],
350 | "text/plain": [
351 | " __top1left__ __top1right__ __matchtype__ __top1diff__\n",
352 | "42 AAP AAP_US_Equity top1 left 13.000\n",
353 | "56 AAPL AAPL_US_Equity top1 left 14.000\n",
354 | "34 AMZN-AMZN AMZN_US_Equity top1 left 64.625\n",
355 | "17 APRN APRN_US_Equity top1 left 14.000\n",
356 | "1 BBW BBY_US_Equity top1 left 23.000\n",
357 | "5 BBW BMW_NA_Equity top1 left 23.000\n",
358 | "24 JLP AAP_US_Equity top1 left 33.000\n",
359 | "50 NMG BMW_NA_Equity top1 left 33.000"
360 | ]
361 | },
362 | "execution_count": 6,
363 | "metadata": {},
364 | "output_type": "execute_result"
365 | }
366 | ],
367 | "source": [
368 | "# attempt a fuzzy join using affine gap distance => not looking good\n",
369 | "import affinegap\n",
370 | "d6tjoin.top1.MergeTop1(df1,df2,fuzzy_left_on=['id'],fuzzy_right_on=['id'], fun_diff=[affinegap.affineGapDistance]).merge()['top1']['id']"
371 | ]
372 | },
373 | {
374 | "cell_type": "markdown",
375 | "metadata": {},
376 | "source": [
377 | "## Token-based clustering\n",
378 | "\n",
379 | "With `d6tjoin.utils.splitcharTokenCount` you can quickly split the ids into tokens to find commonly occuring substrings. You can then use that knowledge to join the data."
380 | ]
381 | },
382 | {
383 | "cell_type": "code",
384 | "execution_count": 7,
385 | "metadata": {},
386 | "outputs": [
387 | {
388 | "name": "stdout",
389 | "output_type": "stream",
390 | "text": [
391 | "*** token counts ***\n",
392 | " word count\n",
393 | "0 Equity 7\n",
394 | "1 US 5\n",
395 | "2 NA 2\n",
396 | "3 PRIVATE 2\n",
397 | "\n",
398 | " *** token occurance ***\n",
399 | "[('Equity', ['AAPL_US_Equity', 'AAP_US_Equity', 'AD_NA_Equity', 'AMZN_US_Equity', 'APRN_US_Equity', 'BBY_US_Equity', 'BMW_NA_Equity']), ('US', ['AAPL_US_Equity', 'AAP_US_Equity', 'AMZN_US_Equity', 'APRN_US_Equity', 'BBY_US_Equity']), ('NA', ['AD_NA_Equity', 'BMW_NA_Equity']), ('PRIVATE', ['PRIVATE_JLP', 'PRIVATE_NMG'])]\n"
400 | ]
401 | }
402 | ],
403 | "source": [
404 | "dftoken=d6tjoin.utils.splitcharTokenCount(df2['id'])\n",
405 | "print('*** token counts ***')\n",
406 | "print(dftoken)\n",
407 | "print('\\n *** token occurance ***')\n",
408 | "print(d6tjoin.utils.unique_contains(df2['id'], dftoken['word'].values))\n"
409 | ]
410 | },
411 | {
412 | "cell_type": "markdown",
413 | "metadata": {},
414 | "source": [
415 | "## Token-based joins\n",
416 | "\n",
417 | "Based on the analysis above, we want to join pairs which have at least 1 common token. It's easy to define a function which computes that and pass that to `d6tjoin.top1.MergeTop1()` to get a good join."
418 | ]
419 | },
420 | {
421 | "cell_type": "code",
422 | "execution_count": 8,
423 | "metadata": {},
424 | "outputs": [
425 | {
426 | "data": {
427 | "text/html": [
428 | "\n",
429 | "\n",
442 | "
\n",
443 | " \n",
444 | " \n",
445 | " | \n",
446 | " __top1left__ | \n",
447 | " __top1right__ | \n",
448 | " __matchtype__ | \n",
449 | " __top1diff__ | \n",
450 | "
\n",
451 | " \n",
452 | " \n",
453 | " \n",
454 | " 42 | \n",
455 | " AAP | \n",
456 | " AAP_US_Equity | \n",
457 | " top1 left | \n",
458 | " 2 | \n",
459 | "
\n",
460 | " \n",
461 | " 56 | \n",
462 | " AAPL | \n",
463 | " AAPL_US_Equity | \n",
464 | " top1 left | \n",
465 | " 2 | \n",
466 | "
\n",
467 | " \n",
468 | " 34 | \n",
469 | " AMZN-AMZN | \n",
470 | " AMZN_US_Equity | \n",
471 | " top1 left | \n",
472 | " 2 | \n",
473 | "
\n",
474 | " \n",
475 | " 17 | \n",
476 | " APRN | \n",
477 | " APRN_US_Equity | \n",
478 | " top1 left | \n",
479 | " 2 | \n",
480 | "
\n",
481 | " \n",
482 | " 22 | \n",
483 | " JLP | \n",
484 | " PRIVATE_JLP | \n",
485 | " top1 left | \n",
486 | " 2 | \n",
487 | "
\n",
488 | " \n",
489 | " 45 | \n",
490 | " NMG | \n",
491 | " PRIVATE_NMG | \n",
492 | " top1 left | \n",
493 | " 2 | \n",
494 | "
\n",
495 | " \n",
496 | "
\n",
497 | "
"
498 | ],
499 | "text/plain": [
500 | " __top1left__ __top1right__ __matchtype__ __top1diff__\n",
501 | "42 AAP AAP_US_Equity top1 left 2\n",
502 | "56 AAPL AAPL_US_Equity top1 left 2\n",
503 | "34 AMZN-AMZN AMZN_US_Equity top1 left 2\n",
504 | "17 APRN APRN_US_Equity top1 left 2\n",
505 | "22 JLP PRIVATE_JLP top1 left 2\n",
506 | "45 NMG PRIVATE_NMG top1 left 2"
507 | ]
508 | },
509 | "execution_count": 8,
510 | "metadata": {},
511 | "output_type": "execute_result"
512 | }
513 | ],
514 | "source": [
515 | "import re\n",
516 | "splitchars=\"[^a-zA-Z0-9]+\"\n",
517 | "def tokenmatch(s1,s2):\n",
518 | " s1=set(re.split(splitchars,s1))\n",
519 | " s2=set(re.split(splitchars,s2))\n",
520 | " return 3-len(s1 & s2)\n",
521 | "\n",
522 | "d6tjoin.top1.MergeTop1(df1,df2,fuzzy_left_on=['id'],fuzzy_right_on=['id'], fun_diff=[tokenmatch], top_limit=[2]).merge()['top1']['id']\n"
523 | ]
524 | },
525 | {
526 | "cell_type": "code",
527 | "execution_count": 9,
528 | "metadata": {},
529 | "outputs": [
530 | {
531 | "name": "stderr",
532 | "output_type": "stream",
533 | "text": [
534 | "/Users/haijing/Haijing/ds_project/d6t/d6tjoin/d6tjoin/top1.py:118: UserWarning: Top1 join for id has duplicates\n",
535 | " warnings.warn('Top1 join for %s has duplicates' %self.cfg_fuzzy_left_on)\n"
536 | ]
537 | },
538 | {
539 | "data": {
540 | "text/html": [
541 | "\n",
542 | "\n",
555 | "
\n",
556 | " \n",
557 | " \n",
558 | " | \n",
559 | " __top1left__ | \n",
560 | " __top1right__ | \n",
561 | " __matchtype__ | \n",
562 | " __top1diff__ | \n",
563 | "
\n",
564 | " \n",
565 | " \n",
566 | " \n",
567 | " 42 | \n",
568 | " AAP | \n",
569 | " AAP_US_Equity | \n",
570 | " top1 left | \n",
571 | " 2 | \n",
572 | "
\n",
573 | " \n",
574 | " 56 | \n",
575 | " AAPL | \n",
576 | " AAPL_US_Equity | \n",
577 | " top1 left | \n",
578 | " 2 | \n",
579 | "
\n",
580 | " \n",
581 | " 34 | \n",
582 | " AMZN-AMZN | \n",
583 | " AMZN_US_Equity | \n",
584 | " top1 left | \n",
585 | " 2 | \n",
586 | "
\n",
587 | " \n",
588 | " 17 | \n",
589 | " APRN | \n",
590 | " APRN_US_Equity | \n",
591 | " top1 left | \n",
592 | " 2 | \n",
593 | "
\n",
594 | " \n",
595 | " 0 | \n",
596 | " BBW | \n",
597 | " PRIVATE_NMG | \n",
598 | " top1 left | \n",
599 | " 3 | \n",
600 | "
\n",
601 | " \n",
602 | " 1 | \n",
603 | " BBW | \n",
604 | " BBY_US_Equity | \n",
605 | " top1 left | \n",
606 | " 3 | \n",
607 | "
\n",
608 | " \n",
609 | " 2 | \n",
610 | " BBW | \n",
611 | " AAPL_US_Equity | \n",
612 | " top1 left | \n",
613 | " 3 | \n",
614 | "
\n",
615 | " \n",
616 | " 3 | \n",
617 | " BBW | \n",
618 | " AD_NA_Equity | \n",
619 | " top1 left | \n",
620 | " 3 | \n",
621 | "
\n",
622 | " \n",
623 | " 4 | \n",
624 | " BBW | \n",
625 | " PRIVATE_JLP | \n",
626 | " top1 left | \n",
627 | " 3 | \n",
628 | "
\n",
629 | " \n",
630 | " 5 | \n",
631 | " BBW | \n",
632 | " BMW_NA_Equity | \n",
633 | " top1 left | \n",
634 | " 3 | \n",
635 | "
\n",
636 | " \n",
637 | " 6 | \n",
638 | " BBW | \n",
639 | " AAP_US_Equity | \n",
640 | " top1 left | \n",
641 | " 3 | \n",
642 | "
\n",
643 | " \n",
644 | " 7 | \n",
645 | " BBW | \n",
646 | " AMZN_US_Equity | \n",
647 | " top1 left | \n",
648 | " 3 | \n",
649 | "
\n",
650 | " \n",
651 | " 8 | \n",
652 | " BBW | \n",
653 | " APRN_US_Equity | \n",
654 | " top1 left | \n",
655 | " 3 | \n",
656 | "
\n",
657 | " \n",
658 | " 22 | \n",
659 | " JLP | \n",
660 | " PRIVATE_JLP | \n",
661 | " top1 left | \n",
662 | " 2 | \n",
663 | "
\n",
664 | " \n",
665 | " 45 | \n",
666 | " NMG | \n",
667 | " PRIVATE_NMG | \n",
668 | " top1 left | \n",
669 | " 2 | \n",
670 | "
\n",
671 | " \n",
672 | "
\n",
673 | "
"
674 | ],
675 | "text/plain": [
676 | " __top1left__ __top1right__ __matchtype__ __top1diff__\n",
677 | "42 AAP AAP_US_Equity top1 left 2\n",
678 | "56 AAPL AAPL_US_Equity top1 left 2\n",
679 | "34 AMZN-AMZN AMZN_US_Equity top1 left 2\n",
680 | "17 APRN APRN_US_Equity top1 left 2\n",
681 | "0 BBW PRIVATE_NMG top1 left 3\n",
682 | "1 BBW BBY_US_Equity top1 left 3\n",
683 | "2 BBW AAPL_US_Equity top1 left 3\n",
684 | "3 BBW AD_NA_Equity top1 left 3\n",
685 | "4 BBW PRIVATE_JLP top1 left 3\n",
686 | "5 BBW BMW_NA_Equity top1 left 3\n",
687 | "6 BBW AAP_US_Equity top1 left 3\n",
688 | "7 BBW AMZN_US_Equity top1 left 3\n",
689 | "8 BBW APRN_US_Equity top1 left 3\n",
690 | "22 JLP PRIVATE_JLP top1 left 2\n",
691 | "45 NMG PRIVATE_NMG top1 left 2"
692 | ]
693 | },
694 | "execution_count": 9,
695 | "metadata": {},
696 | "output_type": "execute_result"
697 | }
698 | ],
699 | "source": [
700 | "# note that we applied top_limit=[2], meaning strings should have at most 2 tokens mismatched, to exclude bad matches for BBW\n",
701 | "d6tjoin.top1.MergeTop1(df1,df2,fuzzy_left_on=['id'],fuzzy_right_on=['id'], fun_diff=[tokenmatch]).merge()['top1']['id']\n"
702 | ]
703 | },
704 | {
705 | "cell_type": "code",
706 | "execution_count": null,
707 | "metadata": {},
708 | "outputs": [],
709 | "source": []
710 | }
711 | ],
712 | "metadata": {
713 | "kernelspec": {
714 | "display_name": "Python 3",
715 | "language": "python",
716 | "name": "python3"
717 | },
718 | "language_info": {
719 | "codemirror_mode": {
720 | "name": "ipython",
721 | "version": 3
722 | },
723 | "file_extension": ".py",
724 | "mimetype": "text/x-python",
725 | "name": "python",
726 | "nbconvert_exporter": "python",
727 | "pygments_lexer": "ipython3",
728 | "version": "3.7.6"
729 | }
730 | },
731 | "nbformat": 4,
732 | "nbformat_minor": 2
733 | }
734 |
--------------------------------------------------------------------------------
/examples-top1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data Engineering in Python with databolt - Fuzzy Joins (d6tlib/d6tjoin.utils)\n",
8 | "\n",
9 | "## Introduction\n",
10 | "\n",
11 | "Joining datasets is a common data engineering operation. However, often there are problems merging datasets from different sources because of mismatched identifiers, date conventions etc. \n",
12 | "\n",
13 | "** `d6tjoin.top1` module allows you to quickly join datasets even if they don't perfectly match. **\n",
14 | "Easily join different datasets without writing custom code. Does fuzzy top1 similarity joins for strings, dates and numbers, for example you can quickly join similar but not identical stock tickers, addresses, names without manual processing. It will find the top 1 matched entry from the right dataframe to join onto the left dataframe.\n",
15 | "\n",
16 | "Here are some examples which show you how to:\n",
17 | "1. join on mismatched identifiers\n",
18 | "2. join on calendar vs business dates\n",
19 | "3. join on both mismatched dates and identifiers"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 1,
25 | "metadata": {},
26 | "outputs": [
27 | {
28 | "data": {
29 | "text/html": [
30 | "\n",
31 | "\n",
44 | "
\n",
45 | " \n",
46 | " \n",
47 | " | \n",
48 | " date | \n",
49 | " id | \n",
50 | " v | \n",
51 | "
\n",
52 | " \n",
53 | " \n",
54 | " \n",
55 | " 0 | \n",
56 | " 2010-01-01 | \n",
57 | " e3e70682 | \n",
58 | " 0.393 | \n",
59 | "
\n",
60 | " \n",
61 | " 1 | \n",
62 | " 2010-01-01 | \n",
63 | " f728b4fa | \n",
64 | " 0.837 | \n",
65 | "
\n",
66 | " \n",
67 | " 2 | \n",
68 | " 2010-01-01 | \n",
69 | " eb1167b3 | \n",
70 | " 0.389 | \n",
71 | "
\n",
72 | " \n",
73 | " 3 | \n",
74 | " 2010-01-01 | \n",
75 | " f7c1bd87 | \n",
76 | " 0.555 | \n",
77 | "
\n",
78 | " \n",
79 | " 4 | \n",
80 | " 2010-01-01 | \n",
81 | " e443df78 | \n",
82 | " 0.886 | \n",
83 | "
\n",
84 | " \n",
85 | "
\n",
86 | "
"
87 | ],
88 | "text/plain": [
89 | " date id v\n",
90 | "0 2010-01-01 e3e70682 0.393\n",
91 | "1 2010-01-01 f728b4fa 0.837\n",
92 | "2 2010-01-01 eb1167b3 0.389\n",
93 | "3 2010-01-01 f7c1bd87 0.555\n",
94 | "4 2010-01-01 e443df78 0.886"
95 | ]
96 | },
97 | "execution_count": 1,
98 | "metadata": {},
99 | "output_type": "execute_result"
100 | }
101 | ],
102 | "source": [
103 | "import pandas as pd\n",
104 | "import numpy as np\n",
105 | "import itertools\n",
106 | "from faker import Faker\n",
107 | "import importlib\n",
108 | "\n",
109 | "import d6tjoin.top1\n",
110 | "importlib.reload(d6tjoin.top1)\n",
111 | "import d6tjoin.utils\n",
112 | "\n",
113 | "# *******************************************************\n",
114 | "# generate sample time series data with id and value\n",
115 | "# *******************************************************\n",
116 | "nobs = 10\n",
117 | "f1 = Faker()\n",
118 | "Faker.seed(0)\n",
119 | "uuid1 = [str(f1.uuid4()).split('-')[0] for _ in range(nobs)]\n",
120 | "dates1 = pd.date_range('1/1/2010','1/1/2011')\n",
121 | "\n",
122 | "df1 = pd.DataFrame(list(itertools.product(dates1,uuid1)),columns=['date','id'])\n",
123 | "df1['v']=np.round(np.random.sample(df1.shape[0]),3)\n",
124 | "df1.head()"
125 | ]
126 | },
127 | {
128 | "cell_type": "markdown",
129 | "metadata": {},
130 | "source": [
131 | "# Example 1: join datasets on misalgined ids\n",
132 | "\n",
133 | "When joining data from different sources, eg different vendors, often your ids don't match perfect and then you need to manually analyze the situation. With databolt this becomes much easier.\n",
134 | "\n",
135 | "Let's create another dataset where the `id` is slightly different."
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": 2,
141 | "metadata": {},
142 | "outputs": [
143 | {
144 | "data": {
145 | "text/html": [
146 | "\n",
147 | "\n",
160 | "
\n",
161 | " \n",
162 | " \n",
163 | " | \n",
164 | " date | \n",
165 | " id | \n",
166 | " v | \n",
167 | "
\n",
168 | " \n",
169 | " \n",
170 | " \n",
171 | " 0 | \n",
172 | " 2010-01-01 | \n",
173 | " 3e7068 | \n",
174 | " 0.393 | \n",
175 | "
\n",
176 | " \n",
177 | " 1 | \n",
178 | " 2010-01-01 | \n",
179 | " 728b4f | \n",
180 | " 0.837 | \n",
181 | "
\n",
182 | " \n",
183 | " 2 | \n",
184 | " 2010-01-01 | \n",
185 | " b1167b | \n",
186 | " 0.389 | \n",
187 | "
\n",
188 | " \n",
189 | " 3 | \n",
190 | " 2010-01-01 | \n",
191 | " 7c1bd8 | \n",
192 | " 0.555 | \n",
193 | "
\n",
194 | " \n",
195 | " 4 | \n",
196 | " 2010-01-01 | \n",
197 | " 443df7 | \n",
198 | " 0.886 | \n",
199 | "
\n",
200 | " \n",
201 | "
\n",
202 | "
"
203 | ],
204 | "text/plain": [
205 | " date id v\n",
206 | "0 2010-01-01 3e7068 0.393\n",
207 | "1 2010-01-01 728b4f 0.837\n",
208 | "2 2010-01-01 b1167b 0.389\n",
209 | "3 2010-01-01 7c1bd8 0.555\n",
210 | "4 2010-01-01 443df7 0.886"
211 | ]
212 | },
213 | "execution_count": 2,
214 | "metadata": {},
215 | "output_type": "execute_result"
216 | }
217 | ],
218 | "source": [
219 | "# create mismatch\n",
220 | "df2 = df1.copy()\n",
221 | "df2['id'] = df1['id'].str[1:-1]\n",
222 | "df2.head()"
223 | ]
224 | },
225 | {
226 | "cell_type": "markdown",
227 | "metadata": {},
228 | "source": [
229 | "`d6tjoin.Prejoin.match_quality()` shows you there is none of `id` match so a normal join won't work well."
230 | ]
231 | },
232 | {
233 | "cell_type": "code",
234 | "execution_count": 3,
235 | "metadata": {},
236 | "outputs": [
237 | {
238 | "name": "stdout",
239 | "output_type": "stream",
240 | "text": [
241 | " key left key right all matched inner left right outer unmatched total unmatched left unmatched right\n",
242 | "0 id id False 0 10 10 20 20 10 10\n",
243 | "1 date date True 366 366 366 366 0 0 0\n",
244 | "2 __all__ __all__ False 0 3660 3660 7320 7320 3660 3660\n"
245 | ]
246 | }
247 | ],
248 | "source": [
249 | "d6tjoin.Prejoin([df1,df2],['id','date']).match_quality()"
250 | ]
251 | },
252 | {
253 | "cell_type": "markdown",
254 | "metadata": {},
255 | "source": [
256 | "Using `d6tjoin.top1.MergeTop1()` you can quickly merge this dataset without having to do any manual processing. It will find the closest matching id using the Levenstein string similarity metric. We want to look at the closest id by date so we will pass in date as an exact match key."
257 | ]
258 | },
259 | {
260 | "cell_type": "code",
261 | "execution_count": 4,
262 | "metadata": {},
263 | "outputs": [],
264 | "source": [
265 | "result = d6tjoin.top1.MergeTop1(df1.head(),df2,fuzzy_left_on=['id'],fuzzy_right_on=['id'],exact_left_on=['date'],exact_right_on=['date']).merge()"
266 | ]
267 | },
268 | {
269 | "cell_type": "markdown",
270 | "metadata": {},
271 | "source": [
272 | "Lets check what matches it found. Looking at the top1 match table, it shows the closest string with only 2 character difference in id, meaning it found the correct substring. "
273 | ]
274 | },
275 | {
276 | "cell_type": "code",
277 | "execution_count": 5,
278 | "metadata": {},
279 | "outputs": [
280 | {
281 | "data": {
282 | "text/html": [
283 | "\n",
284 | "\n",
297 | "
\n",
298 | " \n",
299 | " \n",
300 | " | \n",
301 | " date | \n",
302 | " __top1left__ | \n",
303 | " __top1right__ | \n",
304 | " __top1diff__ | \n",
305 | " __matchtype__ | \n",
306 | "
\n",
307 | " \n",
308 | " \n",
309 | " \n",
310 | " 10 | \n",
311 | " 2010-01-01 | \n",
312 | " e3e70682 | \n",
313 | " 3e7068 | \n",
314 | " 2 | \n",
315 | " top1 left | \n",
316 | "
\n",
317 | " \n",
318 | " 34 | \n",
319 | " 2010-01-01 | \n",
320 | " e443df78 | \n",
321 | " 443df7 | \n",
322 | " 2 | \n",
323 | " top1 left | \n",
324 | "
\n",
325 | " \n",
326 | " 42 | \n",
327 | " 2010-01-01 | \n",
328 | " eb1167b3 | \n",
329 | " b1167b | \n",
330 | " 2 | \n",
331 | " top1 left | \n",
332 | "
\n",
333 | " \n",
334 | " 21 | \n",
335 | " 2010-01-01 | \n",
336 | " f728b4fa | \n",
337 | " 728b4f | \n",
338 | " 2 | \n",
339 | " top1 left | \n",
340 | "
\n",
341 | " \n",
342 | " 3 | \n",
343 | " 2010-01-01 | \n",
344 | " f7c1bd87 | \n",
345 | " 7c1bd8 | \n",
346 | " 2 | \n",
347 | " top1 left | \n",
348 | "
\n",
349 | " \n",
350 | "
\n",
351 | "
"
352 | ],
353 | "text/plain": [
354 | " date __top1left__ __top1right__ __top1diff__ __matchtype__\n",
355 | "10 2010-01-01 e3e70682 3e7068 2 top1 left\n",
356 | "34 2010-01-01 e443df78 443df7 2 top1 left\n",
357 | "42 2010-01-01 eb1167b3 b1167b 2 top1 left\n",
358 | "21 2010-01-01 f728b4fa 728b4f 2 top1 left\n",
359 | "3 2010-01-01 f7c1bd87 7c1bd8 2 top1 left"
360 | ]
361 | },
362 | "execution_count": 5,
363 | "metadata": {},
364 | "output_type": "execute_result"
365 | }
366 | ],
367 | "source": [
368 | "result['top1']['id']"
369 | ]
370 | },
371 | {
372 | "cell_type": "markdown",
373 | "metadata": {},
374 | "source": [
375 | "Since the match results look good, you can use the merged dataset."
376 | ]
377 | },
378 | {
379 | "cell_type": "code",
380 | "execution_count": 6,
381 | "metadata": {},
382 | "outputs": [
383 | {
384 | "data": {
385 | "text/html": [
386 | "\n",
387 | "\n",
400 | "
\n",
401 | " \n",
402 | " \n",
403 | " | \n",
404 | " date | \n",
405 | " id | \n",
406 | " v | \n",
407 | " id_right | \n",
408 | " v_right | \n",
409 | "
\n",
410 | " \n",
411 | " \n",
412 | " \n",
413 | " 0 | \n",
414 | " 2010-01-01 | \n",
415 | " e3e70682 | \n",
416 | " 0.393 | \n",
417 | " 3e7068 | \n",
418 | " 0.393 | \n",
419 | "
\n",
420 | " \n",
421 | " 1 | \n",
422 | " 2010-01-01 | \n",
423 | " f728b4fa | \n",
424 | " 0.837 | \n",
425 | " 728b4f | \n",
426 | " 0.837 | \n",
427 | "
\n",
428 | " \n",
429 | " 2 | \n",
430 | " 2010-01-01 | \n",
431 | " eb1167b3 | \n",
432 | " 0.389 | \n",
433 | " b1167b | \n",
434 | " 0.389 | \n",
435 | "
\n",
436 | " \n",
437 | " 3 | \n",
438 | " 2010-01-01 | \n",
439 | " f7c1bd87 | \n",
440 | " 0.555 | \n",
441 | " 7c1bd8 | \n",
442 | " 0.555 | \n",
443 | "
\n",
444 | " \n",
445 | " 4 | \n",
446 | " 2010-01-01 | \n",
447 | " e443df78 | \n",
448 | " 0.886 | \n",
449 | " 443df7 | \n",
450 | " 0.886 | \n",
451 | "
\n",
452 | " \n",
453 | "
\n",
454 | "
"
455 | ],
456 | "text/plain": [
457 | " date id v id_right v_right\n",
458 | "0 2010-01-01 e3e70682 0.393 3e7068 0.393\n",
459 | "1 2010-01-01 f728b4fa 0.837 728b4f 0.837\n",
460 | "2 2010-01-01 eb1167b3 0.389 b1167b 0.389\n",
461 | "3 2010-01-01 f7c1bd87 0.555 7c1bd8 0.555\n",
462 | "4 2010-01-01 e443df78 0.886 443df7 0.886"
463 | ]
464 | },
465 | "execution_count": 6,
466 | "metadata": {},
467 | "output_type": "execute_result"
468 | }
469 | ],
470 | "source": [
471 | "result['merged'].head()"
472 | ]
473 | },
474 | {
475 | "cell_type": "code",
476 | "execution_count": 7,
477 | "metadata": {},
478 | "outputs": [],
479 | "source": [
480 | "assert not result['duplicates']"
481 | ]
482 | },
483 | {
484 | "cell_type": "markdown",
485 | "metadata": {},
486 | "source": [
487 | "# Example 2: join 2 datasets with misalgined dates\n",
488 | "\n",
489 | "As another example, instead of the ids not matching, lets look at an example where the dates don't match. We will look at calendar vs business month end dates."
490 | ]
491 | },
492 | {
493 | "cell_type": "code",
494 | "execution_count": 8,
495 | "metadata": {},
496 | "outputs": [],
497 | "source": [
498 | "dates2 = pd.bdate_range('1/1/2010','1/1/2011') # business instead of calendar dates\n",
499 | "df2 = pd.DataFrame(list(itertools.product(dates2,uuid1)),columns=['date','id'])\n",
500 | "df2['v']=np.round(np.random.sample(df2.shape[0]),3)"
501 | ]
502 | },
503 | {
504 | "cell_type": "markdown",
505 | "metadata": {},
506 | "source": [
507 | "`d6tjoin.Prejoin()` shows some but not all of the dates match. All the ids match."
508 | ]
509 | },
510 | {
511 | "cell_type": "code",
512 | "execution_count": 9,
513 | "metadata": {},
514 | "outputs": [
515 | {
516 | "name": "stdout",
517 | "output_type": "stream",
518 | "text": [
519 | " key left key right all matched inner left right outer unmatched total unmatched left unmatched right\n",
520 | "0 id id True 10 10 10 10 0 0 0\n",
521 | "1 date date False 261 366 261 366 105 105 0\n",
522 | "2 __all__ __all__ False 2610 3660 2610 3660 1050 1050 0\n"
523 | ]
524 | }
525 | ],
526 | "source": [
527 | "d6tjoin.Prejoin([df1,df2],['id','date']).match_quality()"
528 | ]
529 | },
530 | {
531 | "cell_type": "markdown",
532 | "metadata": {},
533 | "source": [
534 | "So we want to do a fuzzy match on dates but have the id match perfectly."
535 | ]
536 | },
537 | {
538 | "cell_type": "code",
539 | "execution_count": 10,
540 | "metadata": {},
541 | "outputs": [],
542 | "source": [
543 | "result = d6tjoin.top1.MergeTop1(df1,df2,fuzzy_left_on=['date'],fuzzy_right_on=['date'],exact_left_on=['id'],exact_right_on=['id']).merge()"
544 | ]
545 | },
546 | {
547 | "cell_type": "markdown",
548 | "metadata": {},
549 | "source": [
550 | "Again lets check if the fuzzy matches are correct. If either matches or is off by a day most, looks good!"
551 | ]
552 | },
553 | {
554 | "cell_type": "code",
555 | "execution_count": 11,
556 | "metadata": {},
557 | "outputs": [
558 | {
559 | "data": {
560 | "text/html": [
561 | "\n",
562 | "\n",
575 | "
\n",
576 | " \n",
577 | " \n",
578 | " | \n",
579 | " id | \n",
580 | " __top1left__ | \n",
581 | " __top1right__ | \n",
582 | " __top1diff__ | \n",
583 | " __matchtype__ | \n",
584 | "
\n",
585 | " \n",
586 | " \n",
587 | " \n",
588 | " 0 | \n",
589 | " 1846d424 | \n",
590 | " 2010-01-01 | \n",
591 | " 2010-01-01 | \n",
592 | " 0 days | \n",
593 | " exact | \n",
594 | "
\n",
595 | " \n",
596 | " 1 | \n",
597 | " eb1167b3 | \n",
598 | " 2010-01-01 | \n",
599 | " 2010-01-01 | \n",
600 | " 0 days | \n",
601 | " exact | \n",
602 | "
\n",
603 | " \n",
604 | " 2 | \n",
605 | " e443df78 | \n",
606 | " 2010-01-01 | \n",
607 | " 2010-01-01 | \n",
608 | " 0 days | \n",
609 | " exact | \n",
610 | "
\n",
611 | " \n",
612 | "
\n",
613 | "
"
614 | ],
615 | "text/plain": [
616 | " id __top1left__ __top1right__ __top1diff__ __matchtype__\n",
617 | "0 1846d424 2010-01-01 2010-01-01 0 days exact\n",
618 | "1 eb1167b3 2010-01-01 2010-01-01 0 days exact\n",
619 | "2 e443df78 2010-01-01 2010-01-01 0 days exact"
620 | ]
621 | },
622 | "execution_count": 11,
623 | "metadata": {},
624 | "output_type": "execute_result"
625 | }
626 | ],
627 | "source": [
628 | "result['top1']['date'].head(3)"
629 | ]
630 | },
631 | {
632 | "cell_type": "code",
633 | "execution_count": 12,
634 | "metadata": {},
635 | "outputs": [
636 | {
637 | "data": {
638 | "text/html": [
639 | "\n",
640 | "\n",
653 | "
\n",
654 | " \n",
655 | " \n",
656 | " | \n",
657 | " id | \n",
658 | " __top1left__ | \n",
659 | " __top1right__ | \n",
660 | " __top1diff__ | \n",
661 | " __matchtype__ | \n",
662 | "
\n",
663 | " \n",
664 | " \n",
665 | " \n",
666 | " 3657 | \n",
667 | " 1846d424 | \n",
668 | " 2011-01-01 | \n",
669 | " 2010-12-31 | \n",
670 | " 1 days | \n",
671 | " top1 left | \n",
672 | "
\n",
673 | " \n",
674 | " 3658 | \n",
675 | " f7c1bd87 | \n",
676 | " 2011-01-01 | \n",
677 | " 2010-12-31 | \n",
678 | " 1 days | \n",
679 | " top1 left | \n",
680 | "
\n",
681 | " \n",
682 | " 3659 | \n",
683 | " fcbd04c3 | \n",
684 | " 2011-01-01 | \n",
685 | " 2010-12-31 | \n",
686 | " 1 days | \n",
687 | " top1 left | \n",
688 | "
\n",
689 | " \n",
690 | "
\n",
691 | "
"
692 | ],
693 | "text/plain": [
694 | " id __top1left__ __top1right__ __top1diff__ __matchtype__\n",
695 | "3657 1846d424 2011-01-01 2010-12-31 1 days top1 left\n",
696 | "3658 f7c1bd87 2011-01-01 2010-12-31 1 days top1 left\n",
697 | "3659 fcbd04c3 2011-01-01 2010-12-31 1 days top1 left"
698 | ]
699 | },
700 | "execution_count": 12,
701 | "metadata": {},
702 | "output_type": "execute_result"
703 | }
704 | ],
705 | "source": [
706 | "result['top1']['date'].tail(3)"
707 | ]
708 | },
709 | {
710 | "cell_type": "code",
711 | "execution_count": 13,
712 | "metadata": {},
713 | "outputs": [
714 | {
715 | "data": {
716 | "text/plain": [
717 | "Timedelta('1 days 00:00:00')"
718 | ]
719 | },
720 | "execution_count": 13,
721 | "metadata": {},
722 | "output_type": "execute_result"
723 | }
724 | ],
725 | "source": [
726 | "result['top1']['date']['__top1diff__'].max()"
727 | ]
728 | },
729 | {
730 | "cell_type": "markdown",
731 | "metadata": {},
732 | "source": [
733 | "Again with very little effort we were able to join this dataset together."
734 | ]
735 | },
736 | {
737 | "cell_type": "code",
738 | "execution_count": 14,
739 | "metadata": {},
740 | "outputs": [
741 | {
742 | "data": {
743 | "text/html": [
744 | "\n",
745 | "\n",
758 | "
\n",
759 | " \n",
760 | " \n",
761 | " | \n",
762 | " date | \n",
763 | " id | \n",
764 | " v | \n",
765 | " date_right | \n",
766 | " v_right | \n",
767 | "
\n",
768 | " \n",
769 | " \n",
770 | " \n",
771 | " 0 | \n",
772 | " 2010-01-01 | \n",
773 | " e3e70682 | \n",
774 | " 0.393 | \n",
775 | " 2010-01-01 | \n",
776 | " 0.110 | \n",
777 | "
\n",
778 | " \n",
779 | " 1 | \n",
780 | " 2010-01-02 | \n",
781 | " e3e70682 | \n",
782 | " 0.537 | \n",
783 | " 2010-01-01 | \n",
784 | " 0.110 | \n",
785 | "
\n",
786 | " \n",
787 | " 2 | \n",
788 | " 2010-01-01 | \n",
789 | " f728b4fa | \n",
790 | " 0.837 | \n",
791 | " 2010-01-01 | \n",
792 | " 0.197 | \n",
793 | "
\n",
794 | " \n",
795 | " 3 | \n",
796 | " 2010-01-02 | \n",
797 | " f728b4fa | \n",
798 | " 0.517 | \n",
799 | " 2010-01-01 | \n",
800 | " 0.197 | \n",
801 | "
\n",
802 | " \n",
803 | " 4 | \n",
804 | " 2010-01-01 | \n",
805 | " eb1167b3 | \n",
806 | " 0.389 | \n",
807 | " 2010-01-01 | \n",
808 | " 0.385 | \n",
809 | "
\n",
810 | " \n",
811 | "
\n",
812 | "
"
813 | ],
814 | "text/plain": [
815 | " date id v date_right v_right\n",
816 | "0 2010-01-01 e3e70682 0.393 2010-01-01 0.110\n",
817 | "1 2010-01-02 e3e70682 0.537 2010-01-01 0.110\n",
818 | "2 2010-01-01 f728b4fa 0.837 2010-01-01 0.197\n",
819 | "3 2010-01-02 f728b4fa 0.517 2010-01-01 0.197\n",
820 | "4 2010-01-01 eb1167b3 0.389 2010-01-01 0.385"
821 | ]
822 | },
823 | "execution_count": 14,
824 | "metadata": {},
825 | "output_type": "execute_result"
826 | }
827 | ],
828 | "source": [
829 | "result['merged'].head()"
830 | ]
831 | },
832 | {
833 | "cell_type": "markdown",
834 | "metadata": {},
835 | "source": [
836 | "# Example 3: join 2 datasets with misalgined dates AND ids\n",
837 | "\n",
838 | "In the final example, we combine the above cases. None of the ids match and some of the dates are mismatched. As before with little manual effort we are able to correctly merge the dataset."
839 | ]
840 | },
841 | {
842 | "cell_type": "code",
843 | "execution_count": 15,
844 | "metadata": {},
845 | "outputs": [],
846 | "source": [
847 | "dates2 = pd.bdate_range('1/1/2010','1/1/2011') # business instead of calendar dates\n",
848 | "df2 = pd.DataFrame(list(itertools.product(dates2,uuid1)),columns=['date','id'])\n",
849 | "df2['v']=np.round(np.random.sample(df2.shape[0]),3)\n",
850 | "df2['id'] = df2['id'].str[1:-1]"
851 | ]
852 | },
853 | {
854 | "cell_type": "code",
855 | "execution_count": 16,
856 | "metadata": {},
857 | "outputs": [
858 | {
859 | "name": "stdout",
860 | "output_type": "stream",
861 | "text": [
862 | " key left key right all matched inner left right outer unmatched total unmatched left unmatched right\n",
863 | "0 id id False 0 10 10 20 20 10 10\n",
864 | "1 date date False 261 366 261 366 105 105 0\n",
865 | "2 __all__ __all__ False 0 3660 2610 6270 6270 3660 2610\n"
866 | ]
867 | }
868 | ],
869 | "source": [
870 | "d6tjoin.Prejoin([df1,df2],['id','date']).match_quality()"
871 | ]
872 | },
873 | {
874 | "cell_type": "code",
875 | "execution_count": 17,
876 | "metadata": {},
877 | "outputs": [],
878 | "source": [
879 | "result = d6tjoin.top1.MergeTop1(df1,df2,['date','id'],['date','id']).merge()"
880 | ]
881 | },
882 | {
883 | "cell_type": "code",
884 | "execution_count": 18,
885 | "metadata": {},
886 | "outputs": [
887 | {
888 | "data": {
889 | "text/html": [
890 | "\n",
891 | "\n",
904 | "
\n",
905 | " \n",
906 | " \n",
907 | " | \n",
908 | " date | \n",
909 | " id | \n",
910 | " v | \n",
911 | " date_right | \n",
912 | " id_right | \n",
913 | " v_right | \n",
914 | "
\n",
915 | " \n",
916 | " \n",
917 | " \n",
918 | " 0 | \n",
919 | " 2010-01-01 | \n",
920 | " e3e70682 | \n",
921 | " 0.393 | \n",
922 | " 2010-01-01 | \n",
923 | " 3e7068 | \n",
924 | " 0.693 | \n",
925 | "
\n",
926 | " \n",
927 | " 1 | \n",
928 | " 2010-01-02 | \n",
929 | " e3e70682 | \n",
930 | " 0.537 | \n",
931 | " 2010-01-01 | \n",
932 | " 3e7068 | \n",
933 | " 0.693 | \n",
934 | "
\n",
935 | " \n",
936 | " 2 | \n",
937 | " 2010-01-01 | \n",
938 | " f728b4fa | \n",
939 | " 0.837 | \n",
940 | " 2010-01-01 | \n",
941 | " 728b4f | \n",
942 | " 0.463 | \n",
943 | "
\n",
944 | " \n",
945 | " 3 | \n",
946 | " 2010-01-02 | \n",
947 | " f728b4fa | \n",
948 | " 0.517 | \n",
949 | " 2010-01-01 | \n",
950 | " 728b4f | \n",
951 | " 0.463 | \n",
952 | "
\n",
953 | " \n",
954 | " 4 | \n",
955 | " 2010-01-01 | \n",
956 | " eb1167b3 | \n",
957 | " 0.389 | \n",
958 | " 2010-01-01 | \n",
959 | " b1167b | \n",
960 | " 0.227 | \n",
961 | "
\n",
962 | " \n",
963 | "
\n",
964 | "
"
965 | ],
966 | "text/plain": [
967 | " date id v date_right id_right v_right\n",
968 | "0 2010-01-01 e3e70682 0.393 2010-01-01 3e7068 0.693\n",
969 | "1 2010-01-02 e3e70682 0.537 2010-01-01 3e7068 0.693\n",
970 | "2 2010-01-01 f728b4fa 0.837 2010-01-01 728b4f 0.463\n",
971 | "3 2010-01-02 f728b4fa 0.517 2010-01-01 728b4f 0.463\n",
972 | "4 2010-01-01 eb1167b3 0.389 2010-01-01 b1167b 0.227"
973 | ]
974 | },
975 | "execution_count": 18,
976 | "metadata": {},
977 | "output_type": "execute_result"
978 | }
979 | ],
980 | "source": [
981 | "result['merged'].head()"
982 | ]
983 | },
984 | {
985 | "cell_type": "code",
986 | "execution_count": 19,
987 | "metadata": {
988 | "scrolled": true
989 | },
990 | "outputs": [
991 | {
992 | "data": {
993 | "text/html": [
994 | "\n",
995 | "\n",
1008 | "
\n",
1009 | " \n",
1010 | " \n",
1011 | " | \n",
1012 | " __top1left__ | \n",
1013 | " __top1right__ | \n",
1014 | " __top1diff__ | \n",
1015 | " __matchtype__ | \n",
1016 | "
\n",
1017 | " \n",
1018 | " \n",
1019 | " \n",
1020 | " 361 | \n",
1021 | " 2010-12-28 | \n",
1022 | " 2010-12-28 | \n",
1023 | " 0 days | \n",
1024 | " exact | \n",
1025 | "
\n",
1026 | " \n",
1027 | " 362 | \n",
1028 | " 2010-12-29 | \n",
1029 | " 2010-12-29 | \n",
1030 | " 0 days | \n",
1031 | " exact | \n",
1032 | "
\n",
1033 | " \n",
1034 | " 363 | \n",
1035 | " 2010-12-30 | \n",
1036 | " 2010-12-30 | \n",
1037 | " 0 days | \n",
1038 | " exact | \n",
1039 | "
\n",
1040 | " \n",
1041 | " 364 | \n",
1042 | " 2010-12-31 | \n",
1043 | " 2010-12-31 | \n",
1044 | " 0 days | \n",
1045 | " exact | \n",
1046 | "
\n",
1047 | " \n",
1048 | " 365 | \n",
1049 | " 2011-01-01 | \n",
1050 | " 2010-12-31 | \n",
1051 | " 1 days | \n",
1052 | " top1 left | \n",
1053 | "
\n",
1054 | " \n",
1055 | "
\n",
1056 | "
"
1057 | ],
1058 | "text/plain": [
1059 | " __top1left__ __top1right__ __top1diff__ __matchtype__\n",
1060 | "361 2010-12-28 2010-12-28 0 days exact\n",
1061 | "362 2010-12-29 2010-12-29 0 days exact\n",
1062 | "363 2010-12-30 2010-12-30 0 days exact\n",
1063 | "364 2010-12-31 2010-12-31 0 days exact\n",
1064 | "365 2011-01-01 2010-12-31 1 days top1 left"
1065 | ]
1066 | },
1067 | "execution_count": 19,
1068 | "metadata": {},
1069 | "output_type": "execute_result"
1070 | }
1071 | ],
1072 | "source": [
1073 | "result['top1']['date'].tail()"
1074 | ]
1075 | },
1076 | {
1077 | "cell_type": "code",
1078 | "execution_count": 20,
1079 | "metadata": {},
1080 | "outputs": [
1081 | {
1082 | "data": {
1083 | "text/html": [
1084 | "\n",
1085 | "\n",
1098 | "
\n",
1099 | " \n",
1100 | " \n",
1101 | " | \n",
1102 | " __top1right__date | \n",
1103 | " __top1left__ | \n",
1104 | " __top1right__ | \n",
1105 | " __top1diff__ | \n",
1106 | " __matchtype__ | \n",
1107 | "
\n",
1108 | " \n",
1109 | " \n",
1110 | " \n",
1111 | " 9396 | \n",
1112 | " 2010-01-01 | \n",
1113 | " 1846d424 | \n",
1114 | " 846d42 | \n",
1115 | " 2 | \n",
1116 | " top1 left | \n",
1117 | "
\n",
1118 | " \n",
1119 | " 3915 | \n",
1120 | " 2010-01-01 | \n",
1121 | " 23a7711a | \n",
1122 | " 3a7711 | \n",
1123 | " 2 | \n",
1124 | " top1 left | \n",
1125 | "
\n",
1126 | " \n",
1127 | " 20619 | \n",
1128 | " 2010-01-01 | \n",
1129 | " 259f4329 | \n",
1130 | " 59f432 | \n",
1131 | " 2 | \n",
1132 | " top1 left | \n",
1133 | "
\n",
1134 | " \n",
1135 | " 12528 | \n",
1136 | " 2010-01-01 | \n",
1137 | " b4862b21 | \n",
1138 | " 4862b2 | \n",
1139 | " 2 | \n",
1140 | " top1 left | \n",
1141 | "
\n",
1142 | " \n",
1143 | " 13050 | \n",
1144 | " 2010-01-01 | \n",
1145 | " e3e70682 | \n",
1146 | " 3e7068 | \n",
1147 | " 2 | \n",
1148 | " top1 left | \n",
1149 | "
\n",
1150 | " \n",
1151 | "
\n",
1152 | "
"
1153 | ],
1154 | "text/plain": [
1155 | " __top1right__date __top1left__ __top1right__ __top1diff__ __matchtype__\n",
1156 | "9396 2010-01-01 1846d424 846d42 2 top1 left\n",
1157 | "3915 2010-01-01 23a7711a 3a7711 2 top1 left\n",
1158 | "20619 2010-01-01 259f4329 59f432 2 top1 left\n",
1159 | "12528 2010-01-01 b4862b21 4862b2 2 top1 left\n",
1160 | "13050 2010-01-01 e3e70682 3e7068 2 top1 left"
1161 | ]
1162 | },
1163 | "execution_count": 20,
1164 | "metadata": {},
1165 | "output_type": "execute_result"
1166 | }
1167 | ],
1168 | "source": [
1169 | "result['top1']['id'].head()"
1170 | ]
1171 | },
1172 | {
1173 | "cell_type": "markdown",
1174 | "metadata": {
1175 | "collapsed": true
1176 | },
1177 | "source": [
1178 | "# Advanced Usage Options"
1179 | ]
1180 | },
1181 | {
1182 | "cell_type": "markdown",
1183 | "metadata": {},
1184 | "source": [
1185 | "## Passing a difference limit\n",
1186 | "By default every record in the left dataframe will be matched with a record in the right dataframe. Sometimes the difference is too large though to be considered a match. You can control this by passing the `top_limit` parameter."
1187 | ]
1188 | },
1189 | {
1190 | "cell_type": "code",
1191 | "execution_count": 21,
1192 | "metadata": {},
1193 | "outputs": [],
1194 | "source": [
1195 | "dates2 = pd.bdate_range('1/1/2010','1/1/2011') # business instead of calendar dates\n",
1196 | "df2 = pd.DataFrame(list(itertools.product(dates2,uuid1[:-2])),columns=['date','id'])\n",
1197 | "df2['v']=np.random.sample(df2.shape[0])\n",
1198 | "df2['id'] = df2['id'].str[1:-1]"
1199 | ]
1200 | },
1201 | {
1202 | "cell_type": "code",
1203 | "execution_count": 22,
1204 | "metadata": {},
1205 | "outputs": [
1206 | {
1207 | "data": {
1208 | "text/html": [
1209 | "\n",
1210 | "\n",
1223 | "
\n",
1224 | " \n",
1225 | " \n",
1226 | " | \n",
1227 | " __top1right__date | \n",
1228 | " __top1left__ | \n",
1229 | " __top1right__ | \n",
1230 | " __top1diff__ | \n",
1231 | " __matchtype__ | \n",
1232 | "
\n",
1233 | " \n",
1234 | " \n",
1235 | " \n",
1236 | " 7830 | \n",
1237 | " 2010-01-01 | \n",
1238 | " 1846d424 | \n",
1239 | " 846d42 | \n",
1240 | " 2 | \n",
1241 | " top1 left | \n",
1242 | "
\n",
1243 | " \n",
1244 | " 3393 | \n",
1245 | " 2010-01-01 | \n",
1246 | " 23a7711a | \n",
1247 | " 3a7711 | \n",
1248 | " 2 | \n",
1249 | " top1 left | \n",
1250 | "
\n",
1251 | " \n",
1252 | " 16182 | \n",
1253 | " 2010-01-01 | \n",
1254 | " 259f4329 | \n",
1255 | " 846d42 | \n",
1256 | " 6 | \n",
1257 | " top1 left | \n",
1258 | "
\n",
1259 | " \n",
1260 | " 8874 | \n",
1261 | " 2010-01-01 | \n",
1262 | " b4862b21 | \n",
1263 | " b1167b | \n",
1264 | " 5 | \n",
1265 | " top1 left | \n",
1266 | "
\n",
1267 | " \n",
1268 | " 9918 | \n",
1269 | " 2010-01-01 | \n",
1270 | " b4862b21 | \n",
1271 | " 846d42 | \n",
1272 | " 5 | \n",
1273 | " top1 left | \n",
1274 | "
\n",
1275 | " \n",
1276 | "
\n",
1277 | "
"
1278 | ],
1279 | "text/plain": [
1280 | " __top1right__date __top1left__ __top1right__ __top1diff__ __matchtype__\n",
1281 | "7830 2010-01-01 1846d424 846d42 2 top1 left\n",
1282 | "3393 2010-01-01 23a7711a 3a7711 2 top1 left\n",
1283 | "16182 2010-01-01 259f4329 846d42 6 top1 left\n",
1284 | "8874 2010-01-01 b4862b21 b1167b 5 top1 left\n",
1285 | "9918 2010-01-01 b4862b21 846d42 5 top1 left"
1286 | ]
1287 | },
1288 | "execution_count": 22,
1289 | "metadata": {},
1290 | "output_type": "execute_result"
1291 | }
1292 | ],
1293 | "source": [
1294 | "result = d6tjoin.top1.MergeTop1(df1,df2,['date','id'],['date','id']).merge()\n",
1295 | "result['top1']['id'].head()"
1296 | ]
1297 | },
1298 | {
1299 | "cell_type": "markdown",
1300 | "metadata": {},
1301 | "source": [
1302 | "We have some correct matches but also some bad matches with `__top1diff__`>2. We will restrict `top_limit` to be at most 2."
1303 | ]
1304 | },
1305 | {
1306 | "cell_type": "code",
1307 | "execution_count": 23,
1308 | "metadata": {},
1309 | "outputs": [],
1310 | "source": [
1311 | "result = d6tjoin.top1.MergeTop1(df1,df2,['date','id'],['date','id'], top_limit=[None,2]).merge()"
1312 | ]
1313 | },
1314 | {
1315 | "cell_type": "code",
1316 | "execution_count": 24,
1317 | "metadata": {},
1318 | "outputs": [
1319 | {
1320 | "data": {
1321 | "text/html": [
1322 | "\n",
1323 | "\n",
1336 | "
\n",
1337 | " \n",
1338 | " \n",
1339 | " | \n",
1340 | " __top1right__date | \n",
1341 | " __top1left__ | \n",
1342 | " __top1right__ | \n",
1343 | " __top1diff__ | \n",
1344 | " __matchtype__ | \n",
1345 | "
\n",
1346 | " \n",
1347 | " \n",
1348 | " \n",
1349 | " 7830 | \n",
1350 | " 2010-01-01 | \n",
1351 | " 1846d424 | \n",
1352 | " 846d42 | \n",
1353 | " 2 | \n",
1354 | " top1 left | \n",
1355 | "
\n",
1356 | " \n",
1357 | " 3393 | \n",
1358 | " 2010-01-01 | \n",
1359 | " 23a7711a | \n",
1360 | " 3a7711 | \n",
1361 | " 2 | \n",
1362 | " top1 left | \n",
1363 | "
\n",
1364 | " \n",
1365 | " 10440 | \n",
1366 | " 2010-01-01 | \n",
1367 | " e3e70682 | \n",
1368 | " 3e7068 | \n",
1369 | " 2 | \n",
1370 | " top1 left | \n",
1371 | "
\n",
1372 | " \n",
1373 | " 5220 | \n",
1374 | " 2010-01-01 | \n",
1375 | " e443df78 | \n",
1376 | " 443df7 | \n",
1377 | " 2 | \n",
1378 | " top1 left | \n",
1379 | "
\n",
1380 | " \n",
1381 | " 17226 | \n",
1382 | " 2010-01-01 | \n",
1383 | " eb1167b3 | \n",
1384 | " b1167b | \n",
1385 | " 2 | \n",
1386 | " top1 left | \n",
1387 | "
\n",
1388 | " \n",
1389 | "
\n",
1390 | "
"
1391 | ],
1392 | "text/plain": [
1393 | " __top1right__date __top1left__ __top1right__ __top1diff__ __matchtype__\n",
1394 | "7830 2010-01-01 1846d424 846d42 2 top1 left\n",
1395 | "3393 2010-01-01 23a7711a 3a7711 2 top1 left\n",
1396 | "10440 2010-01-01 e3e70682 3e7068 2 top1 left\n",
1397 | "5220 2010-01-01 e443df78 443df7 2 top1 left\n",
1398 | "17226 2010-01-01 eb1167b3 b1167b 2 top1 left"
1399 | ]
1400 | },
1401 | "execution_count": 24,
1402 | "metadata": {},
1403 | "output_type": "execute_result"
1404 | }
1405 | ],
1406 | "source": [
1407 | "result['top1']['id'].head()"
1408 | ]
1409 | },
1410 | {
1411 | "cell_type": "markdown",
1412 | "metadata": {},
1413 | "source": [
1414 | "## Passing a custom difference function\n",
1415 | "By default string matches are done using Levenstein edit distance. You can pass a custom function using `fun_diff`. For example lets pass Hamming distance."
1416 | ]
1417 | },
1418 | {
1419 | "cell_type": "code",
1420 | "execution_count": 25,
1421 | "metadata": {},
1422 | "outputs": [],
1423 | "source": [
1424 | "import jellyfish\n",
1425 | "result = d6tjoin.top1.MergeTop1(df1,df2,['date','id'],['date','id'], fun_diff=[None,jellyfish.hamming_distance]).merge()"
1426 | ]
1427 | },
1428 | {
1429 | "cell_type": "code",
1430 | "execution_count": 26,
1431 | "metadata": {},
1432 | "outputs": [
1433 | {
1434 | "data": {
1435 | "text/html": [
1436 | "\n",
1437 | "\n",
1450 | "
\n",
1451 | " \n",
1452 | " \n",
1453 | " | \n",
1454 | " __top1right__date | \n",
1455 | " __top1left__ | \n",
1456 | " __top1right__ | \n",
1457 | " __top1diff__ | \n",
1458 | " __matchtype__ | \n",
1459 | "
\n",
1460 | " \n",
1461 | " \n",
1462 | " \n",
1463 | " 6786 | \n",
1464 | " 2010-01-01 | \n",
1465 | " 1846d424 | \n",
1466 | " b1167b | \n",
1467 | " 7 | \n",
1468 | " top1 left | \n",
1469 | "
\n",
1470 | " \n",
1471 | " 7047 | \n",
1472 | " 2010-01-01 | \n",
1473 | " 1846d424 | \n",
1474 | " 7c1bd8 | \n",
1475 | " 7 | \n",
1476 | " top1 left | \n",
1477 | "
\n",
1478 | " \n",
1479 | " 3393 | \n",
1480 | " 2010-01-01 | \n",
1481 | " 23a7711a | \n",
1482 | " 3a7711 | \n",
1483 | " 6 | \n",
1484 | " top1 left | \n",
1485 | "
\n",
1486 | " \n",
1487 | " 14877 | \n",
1488 | " 2010-01-01 | \n",
1489 | " 259f4329 | \n",
1490 | " 728b4f | \n",
1491 | " 7 | \n",
1492 | " top1 left | \n",
1493 | "
\n",
1494 | " \n",
1495 | " 16182 | \n",
1496 | " 2010-01-01 | \n",
1497 | " 259f4329 | \n",
1498 | " 846d42 | \n",
1499 | " 7 | \n",
1500 | " top1 left | \n",
1501 | "
\n",
1502 | " \n",
1503 | "
\n",
1504 | "
"
1505 | ],
1506 | "text/plain": [
1507 | " __top1right__date __top1left__ __top1right__ __top1diff__ __matchtype__\n",
1508 | "6786 2010-01-01 1846d424 b1167b 7 top1 left\n",
1509 | "7047 2010-01-01 1846d424 7c1bd8 7 top1 left\n",
1510 | "3393 2010-01-01 23a7711a 3a7711 6 top1 left\n",
1511 | "14877 2010-01-01 259f4329 728b4f 7 top1 left\n",
1512 | "16182 2010-01-01 259f4329 846d42 7 top1 left"
1513 | ]
1514 | },
1515 | "execution_count": 26,
1516 | "metadata": {},
1517 | "output_type": "execute_result"
1518 | }
1519 | ],
1520 | "source": [
1521 | "result['top1']['id'].head()"
1522 | ]
1523 | },
1524 | {
1525 | "cell_type": "code",
1526 | "execution_count": null,
1527 | "metadata": {},
1528 | "outputs": [],
1529 | "source": []
1530 | }
1531 | ],
1532 | "metadata": {
1533 | "kernelspec": {
1534 | "display_name": "Python 3",
1535 | "language": "python",
1536 | "name": "python3"
1537 | },
1538 | "language_info": {
1539 | "codemirror_mode": {
1540 | "name": "ipython",
1541 | "version": 3
1542 | },
1543 | "file_extension": ".py",
1544 | "mimetype": "text/x-python",
1545 | "name": "python",
1546 | "nbconvert_exporter": "python",
1547 | "pygments_lexer": "ipython3",
1548 | "version": "3.7.6"
1549 | }
1550 | },
1551 | "nbformat": 4,
1552 | "nbformat_minor": 2
1553 | }
1554 |
--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | pytest
2 | sphinx
3 | sphinxcontrib-napoleon
4 | sphinx_rtd_theme
5 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | pandas
3 | jellyfish
4 | d6tstack
5 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 |
3 | setup(
4 | name='d6tjoin',
5 | version='0.2.1',
6 | packages=['d6tjoin'],
7 | url='https://github.com/d6t/d6tjoin',
8 | license='MIT',
9 | author='DataBolt Team',
10 | author_email='support@databolt.tech',
11 | description='Easily join python pandas dataframes',
12 |     long_description='Easily join python pandas dataframes. '
13 | 'See https://github.com/d6t/d6tjoin for details',
14 | install_requires=[
15 | 'numpy',
16 | 'pandas',
17 | 'jellyfish',
18 | 'joblib',
19 | 'd6tstack',
20 | 'affinegap'
21 | ],
22 | include_package_data=True,
23 | python_requires='>=3.6'
24 | )
25 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/d6t/d6tjoin/9618b129601aa0b4a9247d7001da8c2220d36d9c/tests/__init__.py
--------------------------------------------------------------------------------
/tests/test_pre_pd.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 |
4 | import pytest
5 |
6 | import d6tjoin
7 |
8 | def fake_2dfs_identical():
9 | df = pd.DataFrame({'a':range(10)})
10 | df['b'] = ['b']*5+['bb']*5
11 | return [df, df.copy()]
12 |
13 | def fake_2dfs_1missing():
14 | df = pd.DataFrame({'a':range(10)})
15 | df['b'] = ['b']*5+['bb']*5
16 | return [df, df.copy().drop(['b'],1)]
17 |
18 | def test_internals():
19 | dfs = fake_2dfs_identical()
20 |
21 | pdj = d6tjoin.Prejoin(dfs, print_only=False)
22 | assert pdj.keys is None and pdj.keysdf is None
23 | assert all([dfg.shape==dfs[0].shape for dfg in pdj.dfs])
24 | assert all([dfg.shape==(pdj.nrows, dfs[0].shape[1]) for dfg in pdj.dfshead])
25 | dfc = pdj.head()
26 | assert all([dfg.head().equals(dfc[idx]) for idx,dfg in enumerate(dfs)])
27 | dfc = pdj.head(10)
28 | assert all([dfg.head(10).equals(dfc[idx]) for idx,dfg in enumerate(dfs)])
29 |
30 | # single keys param
31 | cfg_keys = ['b']
32 | pdj = d6tjoin.Prejoin(dfs,keys=cfg_keys)
33 | assert pdj.keys == [['b','b']] and pdj.keysdf == [['b'],['b']]
34 | assert all([dfg.shape==dfs[0].shape for dfg in pdj.dfs])
35 | assert all([dfg.shape==(pdj.nrows, len(cfg_keys)) for dfg in pdj.dfshead])
36 |
37 | dfs[1] = dfs[1].rename(columns={'b': 'c'})
38 | with pytest.raises(KeyError, match='Columns missing'):
39 | pdj = d6tjoin.Prejoin(dfs, keys=['b'])
40 |
41 | # different keys for dfs
42 | pdj = d6tjoin.Prejoin(dfs,keys=[['b'],['c']])
43 | assert pdj.keys == [['b','c']] and pdj.keysdf == [['b'],['c']]
44 | assert all([dfg.shape==dfs[0].shape for dfg in pdj.dfs])
45 | assert all([dfg.shape==(pdj.nrows, 1) for dfg in pdj.dfshead])
46 | pdj = d6tjoin.Prejoin(dfs,keys=[['b','c']], keys_bydf=False)
47 | assert pdj.keys == [['b','c']] and pdj.keysdf == [['b'],['c']]
48 |
49 | # multi keys param
50 | dfs[0]['b1']=dfs[0]['b'];dfs[1]['c1']=dfs[1]['c'];
51 | pdj = d6tjoin.Prejoin(dfs,keys=[['b','b1'],['c','c1']])
52 | assert pdj.keys == [['b','c'],['b1','c1']] and pdj.keysdf == [['b','b1'],['c','c1']]
53 | assert all([dfg.shape==dfs[0].shape for dfg in pdj.dfs])
54 | assert all([dfg.shape==(pdj.nrows, 2) for dfg in pdj.dfshead])
55 |
56 | # joins with keys specified
57 | dfs = fake_2dfs_identical()
58 | pdj = d6tjoin.Prejoin(dfs,keys=['b'], print_only=False)
59 | assert pdj.columns_common()==['b']
60 | assert pdj.columns_all()==['b']
61 |
62 | dfs[1] = dfs[1].rename(columns={'b': 'c'})
63 | pdj = d6tjoin.Prejoin(dfs,keys=[['b'],['c']], print_only=False)
64 | assert pdj.columns_all()==['b','c']
65 |
66 |
67 | def test_pre_columns():
68 | dfs = fake_2dfs_identical()
69 | pdj = d6tjoin.Prejoin(dfs,print_only=False)
70 | assert pdj.columns_common()==['a','b']
71 | assert pdj.columns_all()==['a','b']
72 |
73 | pdj.describe()
74 | assert pdj.shape() == {0: (10, 2), 1: (10, 2)}
75 |
76 | dfs = fake_2dfs_1missing()
77 | pdj = d6tjoin.Prejoin(dfs,print_only=False)
78 | assert pdj.columns_common()==['a']
79 | assert pdj.columns_all()==['a','b']
80 |
81 | def test_pre_describe():
82 | # describe_str
83 | chk = {'b': {'median': 1.5, 'min': 1.0, 'max': 2.0, 'nrecords': 10.0}}
84 | dfs = fake_2dfs_identical()
85 | pdj = d6tjoin.Prejoin(dfs,print_only=False)
86 | assert pdj.describe_str()[0].to_dict(orient='index')==chk
87 | pdj = d6tjoin.Prejoin(dfs,keys=['b'],print_only=False)
88 | assert pdj.describe_str()[0].to_dict(orient='index')==chk
89 |
90 |     # describe_data
91 | chk = {'a': {'nrecords': 10, 'unique': 10, 'nan': 0, 'unique rate': 1.0},
92 | 'b': {'nrecords': 10, 'unique': 2, 'nan': 0, 'unique rate': 0.2}}
93 | pdj = d6tjoin.Prejoin(dfs,print_only=False)
94 | assert pdj.describe_data()[0].to_dict(orient='index')==chk
95 | pdj = d6tjoin.Prejoin(dfs,keys=['b'],print_only=False)
96 | assert pdj.describe_data()[0].to_dict(orient='index')==chk
97 |
98 | def test_pre_data_match():
99 | dfs = fake_2dfs_identical()
100 | pdj = d6tjoin.Prejoin(dfs,print_only=False)
101 |
102 | dfc = {'__left__': {0: 'b'},
103 | '__right__': {0: 'b'},
104 | '__similarity__': {0: 1.0},
105 | '__left-sample__': {0: 'bb'},
106 | '__right-sample__': {0: 'bb'},
107 | '__left-nunique__': {0: 2},
108 | '__right-nunique__': {0: 2}}
109 |
110 | assert pd.DataFrame(dfc).equals(pdj.data_match())
111 |
112 | dfc = {0: {'__left__': 'a',
113 | '__right__': 'a',
114 | '__similarity__': 1.0,
115 | '__left-sample__': 0,
116 | '__right-sample__': 0,
117 | '__left-nunique__': 10,
118 | '__right-nunique__': 10},
119 | 1: {'__left__': 'b',
120 | '__right__': 'b',
121 | '__similarity__': 1.0,
122 | '__left-sample__': 'bb',
123 | '__right-sample__': 'bb',
124 | '__left-nunique__': 2,
125 | '__right-nunique__': 2}}
126 |
127 | assert dfc==pdj.data_match(ignore_value_columns=False, max_unique_pct=1.0).to_dict(orient='index')
128 |
129 |
130 |
131 |
132 |
133 |
134 |
--------------------------------------------------------------------------------
/tests/test_smartjoin.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import pandas as pd
3 | import numpy as np
4 |
5 | # fuzzy join
6 | from faker import Faker
7 | import importlib
8 |
9 | import d6tjoin.smart_join
10 | importlib.reload(d6tjoin.smart_join)
11 | cfg_num = 10
12 | cfg_num_unmatched = 2
13 | cfg_num_matched = cfg_num-cfg_num_unmatched
14 |
15 | # d6t
16 | from d6tjoin.utils import df_str_summary, BaseJoin, PreJoin
17 |
18 | # ******************************************
19 | # helpers
20 | # ******************************************
21 | def gen_multikey_simple():
22 | fake = Faker()
23 | fake.seed(1)
24 |
25 | pool_names = [fake.name() for _ in range(cfg_num)]
26 | pool_dates = pd.date_range('1/1/2018', periods=cfg_num)
27 |
28 | # case multikey
29 | df1 = pd.DataFrame({'key': pool_names[:-cfg_num_unmatched], 'date': pool_dates[:-cfg_num_unmatched]})
30 | df2 = pd.DataFrame({'key': pool_names[cfg_num_unmatched:], 'date': pool_dates[cfg_num_unmatched:]})
31 | df1['val1'] = range(df1.shape[0])
32 | df2['val2'] = range(df2.shape[0])
33 |
34 | return df1, df2
35 |
36 | def gen_multikey_complex(unmatched_date=True):
37 |
38 | fake = Faker()
39 | fake.seed(1)
40 |
41 | pool_names = [fake.name() for _ in range(cfg_num)]
42 | cfg_num_per_group = 4
43 | pool_date1 = pd.date_range('1/1/2010', periods=cfg_num_per_group, freq='1M')
44 | if unmatched_date:
45 | pool_date2 = pd.bdate_range('1/1/2010', periods=cfg_num_per_group, freq='1BM')
46 | else:
47 | pool_date2 = pool_date1
48 |
49 | def gen_df(cfg_pool_rates, cfg_offset=0):
50 | dfg = []
51 | for i in range(cfg_num_per_group):
52 | dft = pd.DataFrame({'key': np.roll(pool_names, i + cfg_offset)[:cfg_num_per_group]})
53 | dft['date'] = cfg_pool_rates[i]
54 | dft['value'] = np.random.randn(dft.shape[0])
55 | dfg.append(dft)
56 | return pd.concat(dfg)
57 |
58 | df1 = gen_df(pool_date1)
59 | df2 = gen_df(pool_date2, 2)
60 |
61 | return df1, df2
62 |
63 |
64 | # ******************************************
65 | # utils
66 | # ******************************************
67 |
68 | def test_df_str_summary():
69 | df = pd.DataFrame({'a': ['a', 'aa'] * 2})
70 | df['b'] = ['aa', 'aaa'] * 2
71 |
72 | dft = df_str_summary(df)
73 | assert np.all(dft.values == np.array([[ 1.5, 1.5, 1. , 2. , 4. ],
74 | [ 2.5, 2.5, 2. , 3. , 4. ]]))
75 | dft = df_str_summary(df,['a'])
76 | assert np.all(dft.values == np.array([1.5, 1.5, 1. , 2. , 4.]))
77 |
78 | dft = df_str_summary(df,unique_count=True)
79 | assert np.all(dft.values == np.array([[ 1.5, 1.5, 1. , 2. , 4. , 2. ],
80 | [ 2.5, 2.5, 2. , 3. , 4. , 2. ]]))
81 |
82 |
83 | def test_basejoin():
84 | df1 = pd.DataFrame({'a': range(3), 'b': range(3)})
85 | df2 = pd.DataFrame({'a': range(3), 'b': range(3)})
86 |
87 | with pytest.raises(ValueError) as e:
88 | j = PreJoin([df1], ['a'])
89 | with pytest.raises(NotImplementedError) as e:
90 | j = PreJoin([df1,df2,df1], ['a'])
91 |
92 | j1 = PreJoin([df1,df2], ['a','b'])
93 | j2 = PreJoin([df1,df2], [['a','b'],['a','b']], keys_bydf=True)
94 | j3 = PreJoin([df1,df2], [['a','a'],['b','b']])
95 | assert j1.keys == [['a', 'a'], ['b', 'b']]
96 | assert j1.keys == j2.keys
97 | assert j2.keys == j3.keys
98 | assert j1.keysdf == [['a', 'b'], ['a', 'b']]
99 | assert j1.keysdf == j2.keysdf
100 | assert j3.keysdf == j2.keysdf
101 |
102 | df2 = pd.DataFrame({'a': range(3), 'c': range(3)})
103 |
104 | with pytest.raises(KeyError) as e:
105 | j1 = PreJoin([df1,df2], ['a','c'])
106 |
107 | j2 = PreJoin([df1,df2], [['a','b'],['a','c']], keys_bydf=True)
108 | j3 = PreJoin([df1,df2], [['a','a'],['b','c']])
109 | assert j2.keys == [['a', 'a'], ['b', 'c']]
110 | assert j3.keys == j2.keys
111 | assert j2.keysdf == [['a', 'b'], ['a', 'c']]
112 | assert j3.keysdf == j2.keysdf
113 |
114 | # ******************************************
115 | # prejoin
116 | # ******************************************
117 | def test_prejoin():
118 | df1 = pd.DataFrame({'a': range(3), 'b': range(3)})
119 | df2 = pd.DataFrame({'a': range(3), 'c': range(3)})
120 |
121 | j = PreJoin([df1,df2],['a'])
122 | dfr = j.stats_prejoin(print_only=False)
123 | results = dfr.to_dict()
124 | check = {'all matched': {0: True, 1: True},
125 | 'inner': {0: 3, 1: 3},
126 | 'key left': {0: 'a', 1: '__all__'},
127 | 'key right': {0: 'a', 1: '__all__'},
128 | 'left': {0: 3, 1: 3},
129 | 'outer': {0: 3, 1: 3},
130 | 'right': {0: 3, 1: 3},
131 | 'unmatched left': {0: 0, 1: 0},
132 | 'unmatched right': {0: 0, 1: 0},
133 | 'unmatched total': {0: 0, 1: 0}}
134 | assert results == check
135 | assert j.is_all_matched()
136 | assert j.is_all_matched('a')
137 |
138 | df2 = pd.DataFrame({'a': range(3,6), 'c': range(3)})
139 |
140 | j = PreJoin([df1,df2],['a'])
141 | dfr = j.stats_prejoin(print_only=False)
142 | assert (~dfr['all matched']).all()
143 | assert not j.is_all_matched()
144 | assert not j.is_all_matched('a')
145 |
146 | df2 = pd.DataFrame({'b': range(3,6), 'a': range(3), 'v':range(3)})
147 | cfg_keys = ['a', 'b']
148 | j = PreJoin([df1,df2],cfg_keys)
149 | dfr = j.stats_prejoin(print_only=False)
150 | assert dfr['all matched'].tolist()==[True, False, False]
151 | assert not j.is_all_matched()
152 | assert j.is_all_matched('a')
153 | assert not j.is_all_matched('b')
154 |
155 | # test show_input
156 | dfr = j.show_input(1,keys_only=False)
157 | assert dfr[0].equals(df1.head(1))
158 | assert dfr[1].equals(df2.head(1))
159 | dfr = j.show_input(-1,keys_only=True)
160 | assert dfr[0][cfg_keys].equals(df1[cfg_keys])
161 | assert dfr[1][cfg_keys].equals(df2[cfg_keys])
162 |
163 | # test show_unmatched
164 | j.show_unmatched('b',print_only=True) # just make sure print_only runs without errors
165 | dfr = j.show_unmatched('b',nrecords=-1)
166 | assert dfr['left'].equals(df1['b'])
167 | assert dfr['right'].equals(df2['b'])
168 | dfr = j.show_matched('a',nrecords=-1)
169 | assert dfr['left'].equals(df1['a'])
170 | assert dfr['right'].equals(df2['a'])
171 | dfr = j.show_unmatched('__all__',nrecords=-1)
172 | assert dfr['left'].equals(df1[cfg_keys])
173 | assert dfr['right'].equals(df2[cfg_keys])
174 | dfr = j.show_matched('__all__')
175 | assert dfr['left'].empty
176 | assert dfr['right'].empty
177 |
178 | dfr = j.show_unmatched('b',nrecords=1)
179 | assert dfr['left'].equals(df1['b'].head(1))
180 | assert dfr['right'].equals(df2['b'].head(1))
181 |
182 | dfr = j.show_unmatched('b',keys_only=False,nrecords=-1)
183 | assert dfr['left'].equals(df1)
184 | assert dfr['right'].equals(df2)
185 |
186 | dfr = j.show_unmatched('a')
187 | assert dfr['left'].empty
188 | assert dfr['right'].empty
189 | dfr = j.show_matched('b')
190 | assert dfr['left'].empty
191 | assert dfr['right'].empty
192 |
193 | # test show_unmatched
194 | j = PreJoin([df1,df2],['a'])
195 | with pytest.raises(RuntimeError) as e:
196 | j.show_unmatched('a', print_only=True)
197 | j.stats_prejoin()
198 | dfr = j.show_matched('__all__',nrecords=-1)
199 | assert dfr['left'].equals(df1[['a']])
200 | assert dfr['right'].equals(df2[['a']])
201 | dfr = j.show_unmatched('__all__',nrecords=-1)
202 | assert dfr['left'].empty
203 | assert dfr['right'].empty
204 |
205 |
206 | # ******************************************
207 | # fuzzy join
208 | # ******************************************
209 | def test_fakedata_singlekey_string():
210 |
211 | fake = Faker()
212 | fake.seed(1)
213 |
214 | pool_names = [fake.name() for _ in range(cfg_num)]
215 | pool_names_unmatched_left = pool_names[:cfg_num_unmatched]
216 |
217 | # case single key unmatched
218 | df1=pd.DataFrame({'key':pool_names[:-cfg_num_unmatched]})
219 | df2=pd.DataFrame({'key':pool_names[cfg_num_unmatched:]})
220 | df1['val1']=range(df1.shape[0])
221 | df2['val2']=range(df2.shape[0])
222 |
223 |
224 | with pytest.raises(ValueError) as e_info:
225 | d6tjoin.smart_join.FuzzyJoinTop1([df1, df2], [], [])
226 | with pytest.raises(KeyError) as e_info:
227 | d6tjoin.smart_join.FuzzyJoinTop1([df1,df2], fuzzy_keys=['unmatched'])
228 |
229 | importlib.reload(d6tjoin.smart_join)
230 | sj = d6tjoin.smart_join.FuzzyJoinTop1([df1,df2],fuzzy_keys=['key'])
231 | assert sj.keysdf_fuzzy == [['key']]*2
232 | assert sj.keysdf_exact == []
233 |
234 | import jellyfish
235 | def diff_edit(a, b):
236 | return jellyfish.levenshtein_distance(a, b)
237 | def diff_hamming(a, b):
238 | return jellyfish.hamming_distance(a, b)
239 |
240 | sj = d6tjoin.smart_join.FuzzyJoinTop1([df1,df2],fuzzy_keys=['key'])
241 | dfr = sj._gen_match_top1(0)['table'].copy()
242 | assert sj._gen_match_top1(0)['has duplicates']
243 | assert set(dfr.loc[dfr['__top1diff__']>0,'__top1left__'].unique()) == set(pool_names_unmatched_left)
244 | assert dfr.loc[dfr['__top1diff__']>0,'__top1right__'].values.tolist() == ['Teresa James', 'Rachel Davis', 'Teresa James']
245 | dfr['__top1diff__check'] = dfr.apply(lambda x: diff_edit(x['__top1left__'],x['__top1right__']),1)
246 | assert (dfr['__top1diff__']==dfr['__top1diff__check']).all()
247 |
248 | sj.set_fuzzy_how(0,{'fun_diff':[diff_hamming,diff_edit]})
249 | dfr = sj._gen_match_top1(0)['table'].copy()
250 | assert dfr.loc[dfr['__top1diff__']>0,'__top1right__'].values.tolist() == ['Teresa James', 'Amanda Johnson']
251 | assert not sj._gen_match_top1(0)['has duplicates']
252 |
253 |
254 | sj = d6tjoin.smart_join.FuzzyJoinTop1([df1,df2],fuzzy_keys=['key'])
255 | dfr1 = sj._gen_match_top1(0)['table']
256 | # assert df1.shape[0] == dfr1.shape[0] # todo: deal with duplicates
257 | dfr2 = sj.join(True)
258 | assert np.array_equal(dfr1['__top1diff__'].sort_values().values, dfr2['__top1diff__key'].sort_values().values)
259 |
260 | def test_fakedata_singlekey_number():
261 | pool_dates = pd.date_range('1/1/2018',periods=cfg_num)
262 |
263 | # case single key date
264 | df1=pd.DataFrame({'date':pool_dates[:-cfg_num_unmatched]})
265 | df2=pd.DataFrame({'date':pool_dates[cfg_num_unmatched:]})
266 |
267 | sj = d6tjoin.smart_join.FuzzyJoinTop1([df1,df2],fuzzy_keys=['date'])
268 | dfr = sj._gen_match_top1_left_number([],[],'date','date',None)
269 |
270 | df_check = pd.DataFrame({'__top1left__':pool_dates[:-cfg_num_unmatched],'__top1right__':[pool_dates[cfg_num_unmatched]]*cfg_num_unmatched+pool_dates[cfg_num_unmatched:-cfg_num_unmatched].tolist()})
271 | df_check['__top1diff__'] = (df_check['__top1left__'] - df_check['__top1right__']).abs()
272 |
273 | assert dfr.equals(df_check)
274 |
275 | # apply top_nrecords
276 | sj = d6tjoin.smart_join.FuzzyJoinTop1([df1,df2],fuzzy_keys=['date'],fuzzy_how={0:{'top_limit':1}})
277 | dfr = sj._gen_match_top1_left_number([],[],'date','date',None)
278 |
279 | df_check = pd.DataFrame({'__top1left__':pool_dates[:-cfg_num_unmatched],'__top1right__':[pool_dates[cfg_num_unmatched]]*cfg_num_unmatched+pool_dates[cfg_num_unmatched:-cfg_num_unmatched].tolist()})
280 | df_check['__top1diff__'] = (df_check['__top1left__'] - df_check['__top1right__']).abs()
281 |
282 | assert dfr.equals(df_check)
283 |
284 | # case single key date, with exact keys
285 | pool_dates2 = pd.date_range('12/31/2017',periods=cfg_num)
286 | df1=pd.DataFrame({'grp':['a']*cfg_num_matched+['b']*cfg_num_matched,'date':pool_dates[:-cfg_num_unmatched].tolist()+pool_dates2[:-cfg_num_unmatched].tolist()})
287 | df2=pd.DataFrame({'grp':['a']*cfg_num_matched+['b']*cfg_num_matched,'date2':pool_dates[cfg_num_unmatched:].tolist()+pool_dates2[cfg_num_unmatched:].tolist()})
288 | sj = d6tjoin.smart_join.FuzzyJoinTop1([df1,df2],exact_keys=['grp'],fuzzy_keys=[['date', 'date2']])
289 | dfr = sj._gen_match_top1_left_number(['grp'],['grp'],'date','date2',None)
290 |
291 | dfc0 = pd.merge_asof(df1.sort_values('date'), df2.sort_values('date2'), left_on='date', right_on='date2', by='grp', direction='nearest')
292 | dfc = dfc0.rename(columns={'date':'__top1left__','date2':'__top1right__'})
293 | dfc['__top1diff__'] = (dfc['__top1left__'] - dfc['__top1right__']).abs()
294 | dfc = dfc[dfr.columns.tolist()]
295 |
296 | assert dfr.equals(dfc)
297 |
298 | dfc['__match type__'] = 'exact'
299 | dfc.loc[dfc['__top1diff__'].dt.days>0,'__match type__'] = 'top1 left'
300 |
301 | assert sj._gen_match_top1(0)['table'].equals(dfc)
302 | assert sj.join().sort_values(['date','grp']).reset_index(drop=True).equals(dfc0)
303 |
304 |
305 | def fakedata_multikey():
306 |
307 | df1, df2 = gen_multikey_simple()
308 |
309 | cfg_group_left=['date']
310 | cfg_group_right=cfg_group_left
311 | keyleft='key'
312 | keyright=keyleft
313 |
314 | '''
315 | from d6tjoin.smart_join import apply_gen_candidates_group
316 | df_keys_left = pd.DataFrame(df1.groupby(cfg_group_left)[keyleft].unique())
317 | df_keys_right = pd.DataFrame(df2.groupby(cfg_group_right)[keyright].unique())
318 | df_keysets_groups = df_keys_left.merge(df_keys_right, left_index=True, right_index=True)
319 | df_keysets_groups.columns = ['__top1left__', '__top1right__']
320 | dfg = df_keysets_groups.reset_index().groupby(cfg_group_left).apply(apply_gen_candidates_group)
321 | dfg = dfg.reset_index(-1, drop=True).reset_index()
322 | '''
323 | with pytest.raises(NotImplementedError) as e_info:
324 | d6tjoin.smart_join.FuzzyJoinTop1([df1,df2], fuzzy_keys=['key','date'])
325 |
326 |
327 | '''
328 | df1
329 | df2
330 |
331 |
332 | tests fuzzy string, exact keys
333 | tests fuzzy number int+float
334 | tests with nans
335 | groupby unique deal with nans
336 |
337 | merge just the keys together [often date, key = 1 row...]
338 | => as soon as there is >1 fuzzy key, need to specify whether the match is hierarchical (per-group sketch below)
339 | // does it increase compute complexity? would have to redo the same all-pairs compute for every date!!
340 | => do a global match, then from there find the closest ones by date
341 |
342 | explain: warnings.warn('Multi-key fuzzy joins are currently done globally for each key individually, not hierarchically for each unique fuzzy key value pair')
343 | tests for factor data id vs date, id matching
344 |
345 | '''
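# Illustrative sketch, not part of the original test: the notes above contrast a
# global top-1 match with a hierarchical (per unique group value) match. Assuming
# plain pandas and a string-distance function such as jellyfish.levenshtein_distance,
# a per-group variant could look roughly like this -- the helper name and signature
# are hypothetical, not d6tjoin API.
def top1_per_group(dfl, dfr, group, key, fun_diff):
    out = []
    for grp, dfg_left in dfl.groupby(group):
        dfg_right = dfr[dfr[group] == grp]
        for left in dfg_left[key].unique():
            # compute distances only against right-hand keys within the same group
            diffs = {right: fun_diff(left, right) for right in dfg_right[key].unique()}
            if diffs:
                best, dist = min(diffs.items(), key=lambda kv: kv[1])
                out.append({group: grp, '__top1left__': left,
                            '__top1right__': best, '__top1diff__': dist})
    return pd.DataFrame(out)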
346 | # with pytest.raises(ValueError) as e_info:
347 | # d6tjoin.smart_join.FuzzyJoinTop1([df1,df2], fuzzy_keys=['key','key'], fuzzy_how=[])
348 | #
349 | # importlib.reload(d6tjoin.smart_join)
350 | # sj = d6tjoin.smart_join.FuzzyJoinTop1([df1,df2],fuzzy_keys=['key','date'])
351 | # dfr = sj.join(True)
352 | # assert df1.shape[0] == dfr.shape[0]
353 |
354 | # fakedata_multikey()
355 |
356 |
357 | def test_fakedata_multikey_iddate():
358 | import uuid
359 | import itertools
360 |
361 | nobs = 10
362 | uuid1 = [str(uuid.uuid4()) for _ in range(nobs)]
363 | dates1 = pd.date_range('1/1/2010','1/1/2011')
364 |
365 | dates2 = pd.bdate_range('1/1/2010', '1/1/2011') # business instead of calendar dates
366 |
367 | df1 = pd.DataFrame(list(itertools.product(uuid1, dates1)), columns=['id', 'date'])
368 | df1['v'] = np.random.sample(df1.shape[0])
369 |
370 | df2 = pd.DataFrame(list(itertools.product(uuid1, dates2)), columns=['id', 'date'])
371 | df2['v'] = np.random.sample(df2.shape[0])
372 |
373 | sj = d6tjoin.smart_join.FuzzyJoinTop1([df1, df2], exact_keys=['id'], fuzzy_keys=['date'])
374 | dft = sj.preview_fuzzy(0)
375 |
376 |
377 | df2 = df1.copy()
378 | df2['id'] = df1['id'].str[1:-1]
379 |
380 | sj = d6tjoin.smart_join.FuzzyJoinTop1([df1, df2], exact_keys=['date'], fuzzy_keys=['id'])
381 | dft = sj.preview_fuzzy(0)
382 | dft.shape
383 | dft = sj._gen_match_top1(0)
384 | dft['table'].shape
385 |
386 | print('a')
387 |
388 |
389 | def fiddle():
390 | cfg_path_folder_base = '/mnt/data/data.raw/travelclick/'
391 | from d6tstack.read_excel_adv import read_excel_advanced
392 | cfg_path = cfg_path_folder_base+'predict/STR Rolling Weekly Since 9-11-01 to 4-14-18 values weekly.xlsx'
393 | df_str=read_excel_advanced(cfg_path, header_xls_start="A7", header_xls_end="D7",remove_blank_cols=True,remove_blank_rows=True)
394 | df_str['STAY_WEEK'] = df_str['Date']-pd.DateOffset(days=6)
395 | df_str.head()
396 |
397 | df_alltier2 = pd.read_excel(cfg_path_folder_base + 'predict/travelcity-revpar-unsorted.xlsx')
398 | sj = d6tjoin.smart_join.FuzzyJoinTop1([df_alltier2,df_str],fuzzy_keys=['STAY_WEEK'])
399 | sj._gen_match_top1(0)
400 |
401 | # fiddle()
402 |
403 | # test_fakedata_multikey_iddate()
--------------------------------------------------------------------------------
/tests/test_top1.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | pd.set_option('display.expand_frame_repr', False)
4 | import importlib
5 | import d6tjoin.top1
6 | import jellyfish
7 | from faker import Faker
8 |
9 | import tests.test_smartjoin
10 |
11 | def gen_df2_str():
12 | l1 = ['a', 'b']
13 | l2 = [l1[0], 'ba', 'cd']
14 | df1 = pd.DataFrame({'id':l1*4})
15 | df2 = pd.DataFrame({'id':l2*4})
16 | df1['v1']=range(df1.shape[0])
17 | df2['v2']=range(df2.shape[0])
18 | return df1, df2
19 |
20 | def gen_df2_num():
21 | l1 = [1,2]
22 | l2 = [l1[0],1.1,1.2]
23 | df1 = pd.DataFrame({'id': l1 * 4})
24 | df2 = pd.DataFrame({'id': l2 * 4})
25 | return df1, df2
26 |
27 |
28 | def test_top1_gen_candidates():
29 |
30 | def helper(df1, df2):
31 |
32 | dfr = d6tjoin.top1.MergeTop1Diff(df1, df2,'id','id',jellyfish.levenshtein_distance)._allpairs_candidates()
33 | assert dfr.shape==(4, 3)
34 | assert (dfr['__top1left__'].values[0]==df1['id'].values[0])
35 | assert np.all(dfr['__top1left__'].values[1:]==df1['id'].values[1])
36 | assert (dfr['__top1right__'].values[0]==df1['id'].values[0])
37 | assert (dfr['__top1right__']==df2['id'].values[1]).sum()==1
38 | assert (dfr['__top1right__']==df2['id'].values[2]).sum()==1
39 | assert (dfr['__matchtype__']=='exact').sum()==1
40 | assert (dfr['__matchtype__']=='top1 left').sum()==3
41 |
42 | df1, df2 = gen_df2_str()
43 | helper(df1, df2)
44 |
45 | df1, df2 = gen_df2_num()
46 | helper(df1, df2)
47 |
48 |
49 | def test_top1_str():
50 |
51 | df1, df2 = gen_df2_str()
52 |
53 | r = d6tjoin.top1.MergeTop1Diff(df1, df2,'id','id',jellyfish.levenshtein_distance).merge()
54 | dfr = r['top1']
55 | assert dfr['__top1diff__'].min()==0
56 | assert dfr['__top1diff__'].max()==1
57 | assert dfr.shape==(3, 4)
58 | dfr = r['merged']
59 | assert dfr.shape==(48, 4)
60 | assert np.all(dfr.groupby('id').size().values==np.array([16, 32]))
61 |
62 | df1, df2 = tests.test_smartjoin.gen_multikey_complex(unmatched_date=False)
63 | r = d6tjoin.top1.MergeTop1Diff(df1, df2,'key','key',jellyfish.levenshtein_distance,['date'],['date']).merge()
64 | dfr = r['merged']
65 | assert dfr.shape==(18, 5)
66 | assert np.all(dfr.groupby(['date','key']).size().values==np.array([1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]))
67 |
68 | df1.head()
69 | df1.merge(df2, on=['date','key']).head()
70 | dfr.head()
71 |
72 | def test_top1_num():
73 |
74 | df1, df2 = tests.test_smartjoin.gen_multikey_complex(unmatched_date=True)
75 | r = d6tjoin.top1.MergeTop1Number(df1, df2,'date','date',is_keep_debug=True).merge()
76 | dfr = r['top1']
77 | assert dfr.shape==(4, 4)
78 | assert np.all(dfr.groupby('__matchtype__').size().values==np.array([2, 2]))
79 | assert dfr['__top1diff__'].dt.days.max()==2
80 | assert dfr['__top1diff__'].dt.days.min()==0
81 |
82 | df1, df2 = tests.test_smartjoin.gen_multikey_complex(unmatched_date=True)
83 | r = d6tjoin.top1.MergeTop1Number(df1, df2,'date','date',['key'],['key']).merge()
84 | dfr = r['merged']
85 | dfr.sort_values(['date','key'])
86 | r['top1'].sort_values(['__top1left__','key'])
87 | df1.sort_values(['key','date'])
88 | df2.sort_values(['key','date'])
89 | r['top1']
90 |
91 | def test_top1_multi():
92 |
93 | df1, df2 = tests.test_smartjoin.gen_multikey_complex(unmatched_date=True)
94 | df2['key'] = 'Mr. '+df1['key']
95 |
96 | r = d6tjoin.top1.MergeTop1(df1, df2,['date','key'],['date','key']).merge()
97 |
98 |
99 | assert True
100 |
101 |
102 | def test_top1_examples():
103 | import uuid
104 | import itertools
105 |
106 | # ******************************************
107 | # generate sample data
108 | # ******************************************
109 | nobs = 10
110 | # todo: set uuid seed
111 | # todo: only pick first 2 blocks
112 | f1 = Faker()
113 | f1.seed(0)
114 | uuid1 = [str(f1.uuid4()).split('-')[0] for _ in range(nobs)]
115 | dates1 = pd.date_range('1/1/2010', '1/1/2011')
116 | dates2 = pd.bdate_range('1/1/2010', '1/1/2011') # business instead of calendar dates
117 |
118 | df1 = pd.DataFrame(list(itertools.product(uuid1, dates1)), columns=['id', 'date'])
119 | df1['v'] = np.random.sample(df1.shape[0])
120 | df2 = df1.copy()
121 | df2['id'] = df1['id'].str[1:-1]
122 |
123 | # r = d6tjoin.top1.MergeTop1Number(df1, df2, 'id', 'id', ['date'], ['date']).merge()
124 | # assert raises ValueError => should check the key column is numeric before doing a number join (see guard sketch below)
125 |
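# Illustrative guard only, not the d6tjoin implementation: the note above says a
# number-based merge should verify the key column is numeric (or datetime) before
# joining. A minimal check with plain pandas could look like this; the helper
# name is hypothetical.
def _assert_number_key(df, col):
    if not (pd.api.types.is_numeric_dtype(df[col])
            or pd.api.types.is_datetime64_any_dtype(df[col])):
        raise ValueError("column '%s' must be numeric or datetime for a number join" % col)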
126 | # r = d6tjoin.top1.MergeTop1Diff(df1, df2, 'id', 'id', jellyfish.levenshtein_distance, ['date'], ['date']).merge()
127 | # assert min()==2
128 | # assert diff no duplicates
129 | # assert diff found == substring
130 | # assert only 100 candidates (not 366*100)
131 |
132 | # r = d6tjoin.top1.MergeTop1(df1, df2, ['id'], ['id'], ['date'], ['date']).merge()
133 | # assert merged==merged
134 | # assert diff==diff
135 |
136 | # dates2 = pd.bdate_range('1/1/2010', '1/1/2011') # business instead of calendar dates
137 | # df2 = pd.DataFrame(list(itertools.product(uuid1, dates2)), columns=['id', 'date'])
138 | # df2['v'] = np.random.sample(df2.shape[0])
139 | # r = d6tjoin.top1.MergeTop1(df1, df2, ['date'], ['date'], ['id'], ['id']).merge()
140 | # # why does this cause an error?
141 | # r = d6tjoin.top1.MergeTop1(df1.head(), df2, ['date'], ['date'], ['id'], ['id']).merge()
142 |
143 | df2 = pd.DataFrame(list(itertools.product(uuid1, dates2)), columns=['id', 'date'])
144 | df2['v'] = np.random.sample(df2.shape[0])
145 | df2['id'] = df1['id'].str[1:-1]
146 |
147 | result = d6tjoin.top1.MergeTop1(df1, df2, ['date', 'id'], ['date', 'id']).merge()
148 | result['merged']
149 | # o=d6tjoin.top1.MergeTop1(df1, df2, ['date', 'id'], ['date', 'id'])
150 | # o.cfg_exact_left_on
151 | result = d6tjoin.top1.MergeTop1(df1, df2, ['date', 'id'], ['date', 'id']).merge()
152 |
153 | d6tjoin.utils.PreJoin([df1, df2], ['id', 'date']).stats_prejoin(print_only=False)
154 |
155 | assert True
156 |
157 |
158 | def fiddle_set():
159 |
160 | import pandas as pd
161 | import numpy as np
162 | import importlib
163 | import d6tjoin.top1
164 |
165 | import ciseau
166 | import scipy.spatial.distance
167 |
168 | df_db = pd.read_csv('~/database.csv',index_col=0)
169 |
170 | def diff_jaccard(a, b):
171 | # pad the shorter token vector so both have equal length
172 | a = np.pad(a, (0, max(0, len(b) - len(a))), 'constant', constant_values=(0, 0))
173 | b = np.pad(b, (0, max(0, len(a) - len(b))), 'constant', constant_values=(0, 0))
174 | return scipy.spatial.distance.jaccard(a, b)
175 |
176 | def strsplit(t):
177 | return [s for s in [s.replace(" ", "") for s in ciseau.tokenize(t)] if s not in ['.', ',', '-', ';', '(', ')']]
178 |
179 | importlib.reload(d6tjoin.top1)
180 | j = d6tjoin.top1.MergeTop1Diff(df_db.head(),df_db,'description','description',fun_diff=diff_jaccard,topn=2,fun_preapply=strsplit,fun_postapply=lambda x: ' '.join(x))
181 | j.merge()['merged']
182 |
183 |
184 | def test_multicore():
185 | nobs = 10
186 | f1 = Faker()
187 | f1.seed(0)
188 | uuid1 = [str(f1.uuid4()).split('-')[0] for _ in range(nobs)]
189 |
190 | df1 = pd.DataFrame(uuid1, columns=['id'])
191 | df1['val1'] = np.round(np.random.sample(df1.shape[0]), 3)
192 |
193 | # create mismatch
194 | df2 = df1.copy()
195 | df2['id'] = df1['id'].str[1:-1]
196 | df2['val2'] = np.round(np.random.sample(df2.shape[0]), 3)
197 |
198 |
199 | m = d6tjoin.top1.MergeTop1Diff(df1,df2,'id','id',fun_diff=jellyfish.levenshtein_distance)
200 | df_candidates = m._allpairs_candidates()
201 |
202 | idxSel = df_candidates['__matchtype__'] != 'exact'
203 | dfd2 = df_candidates.copy()
204 | dfd2.loc[idxSel,'__top1diff__'] = d6tjoin.top1._applyFunMulticore(df_candidates.loc[idxSel,'__top1left__'].values, df_candidates.loc[idxSel,'__top1right__'].values,jellyfish.levenshtein_distance)
205 |
206 | dfd1 = df_candidates.copy()
207 | dfd1.loc[idxSel, '__top1diff__'] = df_candidates[idxSel].apply(lambda x: jellyfish.levenshtein_distance(x['__top1left__'], x['__top1right__']), axis=1)
208 | assert dfd2.equals(dfd1)
209 |
210 | assert True
211 |
212 | '''
213 | multicore in caller class
214 | pass multicore on
215 | make ifelse multicore for every apply diff
216 |
217 | default yes?
218 | part of requirements
219 |
220 | update setup.py requirements
221 |
222 |
223 | '''
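# Illustrative sketch, not d6tjoin's actual _applyFunMulticore: the notes above are
# about pushing the pairwise diff computation onto multiple cores. One simple
# approach uses multiprocessing.Pool.starmap over the candidate pairs; fun_diff
# must be a picklable top-level function (e.g. jellyfish.levenshtein_distance).
def apply_diff_multicore(left_values, right_values, fun_diff, processes=4):
    from multiprocessing import Pool
    with Pool(processes=processes) as pool:
        # evaluate fun_diff(left, right) for each candidate pair in parallel
        return pool.starmap(fun_diff, zip(left_values, right_values))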
224 |
225 |
226 | test_top1_gen_candidates()
--------------------------------------------------------------------------------
/tests/tmp.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import importlib
3 |
4 | import d6tjoin
5 | import d6tjoin.utils
6 | importlib.reload(d6tjoin.utils)
7 |
8 | df1=pd.DataFrame({'v':list(range(10))*2,'g':['a']*10+['b']*10})
9 | df2=df1.copy()
10 |
11 | j = d6tjoin.PreJoin([df1,df2])
12 | j.str_describe()
13 | j.data_describe()
14 | j.columns_common()
15 | j.columns_ispresent()
16 | j.data_match()
17 |
18 | j = d6tjoin.PreJoin([df1,df2], print_only=False)
19 | r = j.data_match()
20 | dfc = {'__left__': {0: 'g', 1: 'v'},
21 | '__right__': {0: 'g', 1: 'v'},
22 | '__similarity__': {0: 1.0, 1: 1.0}}
23 | dfc = pd.DataFrame(dfc)
24 | assert r.equals(dfc)
25 | print(r)
26 |
27 | quit()
28 |
29 | df1=pd.DataFrame({'a':range(3),'b':range(3)})
30 | df2=pd.DataFrame({'a':range(3),'c':range(3)})
31 | df2=pd.DataFrame({'a':range(3),'b':range(3,6)})
32 | df2=pd.DataFrame({'a':range(3,6),'c':range(3)})
33 |
34 |
35 | j = d6tjoin.utils.BaseJoin([df1,df2],['a'])
36 |
37 | j = d6tjoin.utils.BaseJoin([df1,df2],['a','b'])
38 | j.keys
39 | dfr = j.stats_prejoin(return_results=True)
40 | dfr
41 | (~dfr['all matched']).all()
42 |
43 | j = d6tjoin.utils.BaseJoin([df1,df2],['a'])
44 | j.stats_prejoin(return_results=True).to_dict()
45 |
46 |
--------------------------------------------------------------------------------