├── .gitignore
├── LICENSE
├── README.md
├── compare.py
├── pyarrow_ops
    ├── __init__.py
    ├── cjoin.c
    ├── cjoin.pyx
    ├── group.py
    ├── helpers.py
    ├── join.py
    ├── jsons.py
    ├── ml.py
    ├── ops.py
    └── table.py
├── pyproject.toml
├── setup.py
├── test_func.py
└── test_ml.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Folders
  2 | .DS_Store
  3 | data/
  4 | logs/
  5 | dev/
  6 | numpy/
  7 | 
  8 | # Byte-compiled / optimized / DLL files
  9 | __pycache__/
 10 | *.py[cod]
 11 | *$py.class
 12 | 
 13 | # C extensions
 14 | *.so
 15 | 
 16 | # Distribution / packaging
 17 | .Python
 18 | build/
 19 | develop-eggs/
 20 | dist/
 21 | downloads/
 22 | eggs/
 23 | .eggs/
 24 | lib/
 25 | lib64/
 26 | parts/
 27 | sdist/
 28 | var/
 29 | wheels/
 30 | pip-wheel-metadata/
 31 | share/python-wheels/
 32 | *.egg-info/
 33 | .installed.cfg
 34 | *.egg
 35 | MANIFEST
 36 | 
 37 | # PyInstaller
 38 | #  Usually these files are written by a python script from a template
 39 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 40 | *.manifest
 41 | *.spec
 42 | 
 43 | # Installer logs
 44 | pip-log.txt
 45 | pip-delete-this-directory.txt
 46 | 
 47 | # Unit test / coverage reports
 48 | htmlcov/
 49 | .tox/
 50 | .nox/
 51 | .coverage
 52 | .coverage.*
 53 | .cache
 54 | nosetests.xml
 55 | coverage.xml
 56 | *.cover
 57 | *.py,cover
 58 | .hypothesis/
 59 | .pytest_cache/
 60 | 
 61 | # Translations
 62 | *.mo
 63 | *.pot
 64 | 
 65 | # Django stuff:
 66 | *.log
 67 | local_settings.py
 68 | db.sqlite3
 69 | db.sqlite3-journal
 70 | 
 71 | # Flask stuff:
 72 | instance/
 73 | .webassets-cache
 74 | 
 75 | # Scrapy stuff:
 76 | .scrapy
 77 | 
 78 | # Sphinx documentation
 79 | docs/_build/
 80 | 
 81 | # PyBuilder
 82 | target/
 83 | 
 84 | # Jupyter Notebook
 85 | .ipynb_checkpoints
 86 | 
 87 | # IPython
 88 | profile_default/
 89 | ipython_config.py
 90 | 
 91 | # pyenv
 92 | .python-version
 93 | 
 94 | # pipenv
 95 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 96 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 97 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 98 | #   install all needed dependencies.
 99 | #Pipfile.lock
100 | 
101 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
102 | __pypackages__/
103 | 
104 | # Celery stuff
105 | celerybeat-schedule
106 | celerybeat.pid
107 | 
108 | # SageMath parsed files
109 | *.sage.py
110 | 
111 | # Environments
112 | .env
113 | .venv
114 | env/
115 | venv/
116 | ENV/
117 | env.bak/
118 | venv.bak/
119 | 
120 | # Spyder project settings
121 | .spyderproject
122 | .spyproject
123 | 
124 | # Rope project settings
125 | .ropeproject
126 | 
127 | # mkdocs documentation
128 | /site
129 | 
130 | # mypy
131 | .mypy_cache/
132 | .dmypy.json
133 | dmypy.json
134 | 
135 | # Pyre type checker
136 | .pyre/
137 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Pyarrow ops
  2 | Pyarrow ops is a Python library for data crunching operations directly on the pyarrow.Table class, implemented in numpy & Cython. For convenience, function naming and behavior try to replicate that of the Pandas API. The Join / Groupby performance is slightly slower than that of pandas, especially on multi column joins.
  3 | 
  4 | Current use cases:
  5 | - Data operations like joins, groupby (aggregations), filters & drop_duplicates
  6 | - (Very fast) reusable pre-processing for ML applications
  7 | 
  8 | ## Installation
  9 | 
 10 | Use the package manager [pip](https://pip.pypa.io/en/stable/) to install pyarrow_ops.
 11 | 
 12 | ```bash
 13 | pip install pyarrow_ops
 14 | ```
 15 | 
 16 | ## Usage
 17 | See test_*.py for runnable test examples
 18 | 
 19 | Data operations:
 20 | ```python
 21 | import pyarrow as pa 
 22 | from pyarrow_ops import join, filters, groupby, head, drop_duplicates
 23 | 
 24 | # Create pyarrow.Table
 25 | t = pa.Table.from_pydict({
 26 |     'Animal': ['Falcon', 'Falcon', 'Parrot', 'Parrot', 'Parrot'],
 27 |     'Max Speed': [380., 370., 24., 26., 24.]
 28 | })
 29 | head(t) # Use head to print, like df.head()
 30 | 
 31 | # Drop duplicates based on column values
 32 | d = drop_duplicates(t, on=['Animal'], keep='first')
 33 | 
 34 | # Groupby iterable
 35 | for key, value in groupby(t, ['Animal']):
 36 |     print(key)
 37 |     head(value)
 38 | 
 39 | # Group by aggregate functions
 40 | g = groupby(t, ['Animal']).sum()
 41 | g = groupby(t, ['Animal']).agg({'Max Speed': 'max'})
 42 | 
 43 | # Use filter predicates using list of tuples (column, operation, value)
 44 | f = filters(t, [('Animal', 'not in', ['Falcon', 'Duck']), ('Max Speed', '<', 25)])
 45 | 
 46 | # Join operations (currently performs inner join)
 47 | t2 = pa.Table.from_pydict({
 48 |     'Animal': ['Falcon', 'Parrot'],
 49 |     'Age': [10, 20]
 50 | })
 51 | j = join(t, t2, on=['Animal'])
 52 | ```
 53 | 
 54 | ML Preprocessing (note: personal tests showed ~5x speed up compared to pandas on large datasets)
 55 | ```python
 56 | import pyarrow as pa 
 57 | from pyarrow_ops import head, TableCleaner
 58 | 
 59 | # Training data
 60 | t1 = pa.Table.from_pydict({
 61 |     'Animal': ['Falcon', 'Falcon', 'Parrot', 'Parrot', 'Parrot'],
 62 |     'Max Speed': [380., 370., None, 26., 24.],
 63 |     'Value': [2000, 1500, 10, 30, 20],
 64 | })
 65 | 
 66 | # Create TableCleaner & register columns to be processed
 67 | cleaner = TableCleaner()
 68 | cleaner.register_numeric('Max Speed', impute='min', clip=True)
 69 | cleaner.register_label('Animal', categories=['Goose', 'Falcon'])
 70 | cleaner.register_one_hot('Animal')
 71 | 
 72 | # Clean table and split into train/test
 73 | X, y = cleaner.clean_table(t1, label='Value')
 74 | X_train, X_test, y_train, y_test = cleaner.split(X, y)
 75 | 
 76 | # Train a model + Save cleaner settings
 77 | cleaner_dict = cleaner.to_dict()
 78 | 
 79 | # Prediction data
 80 | t2 = pa.Table.from_pydict({
 81 |     'Animal': ['Falcon', 'Goose', 'Parrot', 'Parrot'],
 82 |     'Max Speed': [380., 10., None, 26.]
 83 | })
 84 | new_cleaner = TableCleaner().from_dict(cleaner_dict)
 85 | X_pred = new_cleaner.clean_table(t2)
 86 | ```
 87 | 
 88 | ### To Do's
 89 | - [x] Improve groupby speed by not creating copies of the table
 90 | - [x] Add ML cleaning class
 91 | - [x] Improve speed of groupby by avoiding for loops
 92 | - [x] Improve join speed by moving code to C
 93 | - [ ] Add unit tests using pytest
 94 | - [ ] Add window functions on groupby
 95 | - [ ] Add more join options (left, right, outer, full, cross)
 96 | - [ ] Allow for functions to be classmethods of pa.Table* (t.groupby(...))
 97 | 
 98 | *One of the main difficulties is that the pyarrow classes are written in C and do not have a `__dict__` attribute; this hinders inheritance and adding classmethods.
 99 | 
100 | ## Relation to pyarrow
101 | In the future many of these functions might be made obsolete by enhancements in the pyarrow package, but for now it is a convenient alternative to switching back and forth between pyarrow and pandas.
102 | 
103 | ## Contributing
104 | Pull requests are very welcome, however I believe in 80% of the utility in 20% of the code. I personally get lost reading the tranches of the pandas source code. If you would like to seriously improve this work, please let me know!


--------------------------------------------------------------------------------
/compare.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | import numpy as np
 3 | import pyarrow as pa
 4 | from pyarrow_ops import groupby, join, head, drop_duplicates
 5 | 
 6 | # Generate ids
 7 | left_size = int(1e4)
 8 | right_size = int(1e5)
 9 | 
10 | # Create table
11 | ids = np.random.choice(np.arange(left_size), size=left_size, replace=False)
12 | l = pa.Table.from_arrays(
13 |     [ids, np.random.randint(0, 10000, size=(left_size))],
14 |     names=['id', 'salary']
15 | )
16 | head(l)
17 | r = pa.Table.from_arrays(
18 |     [np.random.choice(ids, size=(right_size)), np.random.randint(0, 20, size=(right_size))],
19 |     names=['id', 'age_children']
20 | )
21 | head(r)
22 | 
23 | # Pyarrow ops
24 | ti = time.time()
25 | j = join(l, r, on=['id'])
26 | print("Pyarrow ops join took:", time.time() - ti)
27 | 
28 | ti = time.time()
29 | d = drop_duplicates(j, on=['id'])
30 | print("Pyarrow ops drop_duplicates took:", time.time() - ti)
31 | 
32 | tg = time.time()
33 | g = groupby(j, by=['id']).agg({'age_children': 'mean'})
34 | print("Pyarrow ops groupby took:", time.time() - tg)
35 | 
36 | # Pandas
37 | dfl, dfr = l.to_pandas(), r.to_pandas()
38 | 
39 | ti = time.time()
40 | dfj = dfl.merge(dfr, how='left', left_on='id', right_on='id')
41 | print("Pandas merge took:", time.time() - ti)
42 | 
43 | ti = time.time()
44 | dfj = dfj.drop_duplicates(subset=['id'])
45 | print("Pandas drop_duplicates took:", time.time() - ti)
46 | 
47 | tg = time.time()
48 | dfg = dfj.groupby(['id']).agg({'age_children': 'mean'})
49 | print("Pandas groupby took:", time.time() - tg)


--------------------------------------------------------------------------------
/pyarrow_ops/__init__.py:
--------------------------------------------------------------------------------
1 | from pyarrow_ops.ops import head, filters, drop_duplicates, head
2 | from pyarrow_ops.group import groupby
3 | from pyarrow_ops.ml import TableCleaner
4 | from pyarrow_ops.join import join


--------------------------------------------------------------------------------
/pyarrow_ops/cjoin.pyx:
--------------------------------------------------------------------------------
 1 | import cython
 2 | from cython import Py_ssize_t
 3 | import numpy as np
 4 | 
 5 | cimport numpy as cnp
 6 | from numpy cimport ndarray, int64_t
 7 | cnp.import_array()
 8 | 
# Build the row-index pairs of an inner join from per-key group descriptors.
#
# All six arguments are read-only int64 memoryviews. For key slot ``i``,
# ``left_counts[i]`` / ``right_counts[i]`` give how many rows each side has,
# and ``left_bidxs[i]`` / ``right_bidxs[i]`` give the offset into
# ``left_idxs`` / ``right_idxs`` where that key's run of row indices starts.
#
# Returns two int64 arrays of equal length; position ``p`` pairs one left row
# index with one right row index.
#
# NOTE(review): every loop runs over range(left_counts.shape[0]) and reads the
# right-hand views at the same ``i`` with bounds checking disabled, so the
# count/offset views are presumably all the same length -- confirm at callers.
@cython.boundscheck(False)
def inner_join(
        const int64_t[:] left_idxs, const int64_t[:] right_idxs, 
        const int64_t[:] left_counts, const int64_t[:] right_counts, 
        const int64_t[:] left_bidxs, const int64_t[:] right_bidxs):
    cdef:
        Py_ssize_t i, li, ri, rows = 0, p = 0
        int64_t cats, lbi, rbi, lc, rc, lp, rp
        ndarray[int64_t] left_align, right_align
    
    cats = left_counts.shape[0]
    # First pass: the output size is the sum over keys of lc * rc, so the
    # result arrays can be allocated once up front.
    with nogil:
        for i in range(cats):
            lc = left_counts[i]
            rc = right_counts[i]
            rows += lc * rc

    left_align, right_align = np.empty(rows, dtype=np.int64), np.empty(rows, dtype=np.int64)

    # Second pass: for each key present on both sides, emit every
    # (left row, right row) combination.
    with nogil:
        for i in range(cats):
            lc = left_counts[i]
            rc = right_counts[i]
            if lc > 0 and rc > 0:
                lbi = left_bidxs[i]
                for li in range(lc):
                    # Restart at the beginning of the right-hand run for each left row.
                    rbi = right_bidxs[i]
                    for ri in range(rc):
                        lp = left_idxs[lbi]
                        rp = right_idxs[rbi]
                        left_align[p] = lp
                        right_align[p] = rp
                        rbi += 1
                        p += 1
                    lbi += 1
    return left_align, right_align
45 |                             
46 | 
47 | 


--------------------------------------------------------------------------------
/pyarrow_ops/group.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pyarrow as pa
 3 | from pyarrow_ops.helpers import combine_column, columns_to_array, groupify_array
 4 | 
 5 | # Grouping / groupby methods
 6 | agg_methods = {
 7 |     'sum': np.sum,
 8 |     'max': np.max,
 9 |     'min': np.min,
10 |     'mean': np.mean,
11 |     'median': np.median
12 | }
def add_agg_method(self, name, method):
    """Attach an aggregation shortcut called `name` to the object `self`.

    The attached callable takes an optional `agg_columns` argument (a column
    name or an iterable of column names). When it is omitted or empty, every
    column in `self.table.column_names` is used. Columns listed in
    `self.columns` (the grouping keys) are always skipped. The callable
    returns `self.aggregate(methods=...)`, where each selected column is
    mapped to `method`.
    """
    def f(agg_columns=None):
        # A bare string would otherwise be iterated character by character.
        if isinstance(agg_columns, str):
            agg_columns = [agg_columns]
        candidates = agg_columns if agg_columns else self.table.column_names
        methods = {col: method for col in candidates if col not in self.columns}
        return self.aggregate(methods=methods)
    # Make the generated method identifiable in tracebacks and repr output.
    f.__name__ = name
    setattr(self, name, f)
18 | 
class Grouping():
    """Rows of a pyarrow Table grouped by one or more key columns.

    Iterating yields `(key, sub_table)` pairs, where `key` is a dict mapping
    each grouping column to that group's value and `sub_table` holds the
    group's rows. Each name in `agg_methods` is also exposed as a shortcut
    method (attached per instance by `set_methods`), next to `agg` and
    `aggregate`.
    """
    def __init__(self, table, columns):
        self.table = table
        # Accept a single column name, and drop duplicate names while keeping
        # the caller's order so the key columns of the result are deterministic
        # (a set would reorder them and would split a string into characters).
        self.columns = [columns] if isinstance(columns, str) else list(dict.fromkeys(columns))

        # Initialize array + groupify
        self.arr = columns_to_array(table, self.columns)
        self.dic, self.counts, self.sort_idxs, self.bgn_idxs = groupify_array(self.arr)
        self.set_methods()

    def __iter__(self):
        """Yield `(key dict, sub-table)` for every group."""
        keys = self.table.select(self.columns)
        for bgn, count in zip(self.bgn_idxs, self.counts):
            idxs = self.sort_idxs[bgn : bgn + count]
            # The first row of the group supplies the key values.
            key = {k: v[0] for k, v in keys.take([idxs[0]]).to_pydict().items()}
            yield key, self.table.take(idxs)

    # Aggregation methods
    def set_methods(self):
        """Attach one shortcut method per entry in `agg_methods`."""
        for k, m in agg_methods.items():
            add_agg_method(self, k, m)

    def aggregate(self, methods):
        """Return a table with one row per group.

        `methods` maps a column name to a callable that reduces a 1-D numpy
        array to a single value. The result contains the grouping columns
        followed by one aggregated column per entry in `methods`.
        """
        # Create index columns
        table = self.table.select(self.columns).take(self.sort_idxs[self.bgn_idxs])

        for col, f in methods.items():
            values = self.table.column(col).to_numpy()[self.sort_idxs]
            groups = np.split(values, self.bgn_idxs[1:])
            # Reduce group by group. Handing the list of groups to np.vectorize
            # turns it into a 2-D array whenever all groups have the same size
            # (including a single group), and f would then be applied to each
            # scalar instead of each group.
            table = table.append_column(col, pa.array([f(group) for group in groups]))
        return table

    def agg(self, methods):
        """Aggregate using method names, e.g. `{'Max Speed': 'max'}`.

        Raises KeyError for a name that is not a key of `agg_methods`.
        """
        methods = {col: agg_methods[m] for col, m in methods.items()}
        return self.aggregate(methods=methods)
53 | 
def groupby(table, by):
    """Return a Grouping of `table` keyed on the column(s) given in `by`."""
    grouping = Grouping(table=table, columns=by)
    return grouping


--------------------------------------------------------------------------------
/pyarrow_ops/helpers.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | 
 3 | def groupify_array(arr):
 4 |     # Input: Pyarrow/Numpy array
 5 |     # Output:
 6 |     #   - 1. Unique values
 7 |     #   - 2. Count per unique
 8 |     #   - 3. Sort index
 9 |     #   - 4. Begin index per unique
10 |     dic, counts = np.unique(arr, return_counts=True)
11 |     sort_idx = np.argsort(arr)
12 |     return dic, counts, sort_idx, [0] + np.cumsum(counts)[:-1].tolist()
13 | 
def combine_column(table, name):
    """Fetch column `name` from `table` and return it with its chunks combined."""
    column = table.column(name)
    return column.combine_chunks()
16 | 
# Element-wise hash. `otypes` pins the result to int64: without it
# np.vectorize refuses size-0 input and infers the output type from the
# first call.
f = np.vectorize(hash, otypes=[np.int64])
def columns_to_array(table, columns):
    """Reduce one or more columns of `table` to a single int64 key per row.

    `columns` is a column name or an iterable of names (duplicates are
    ignored). For one column the key is `hash(value)`; for several it is the
    hash of the tuple of that row's values. Equal rows always get equal keys.

    NOTE(review): keys are hashes, so distinct values can share a key (in
    CPython `hash(-1) == hash(-2)`); callers that group or join on the result
    treat such values as equal.
    """
    # dict.fromkeys removes duplicates but keeps the caller's order, so the
    # tuple that is hashed for multi-column keys has a deterministic layout.
    columns = [columns] if isinstance(columns, str) else list(dict.fromkeys(columns))
    if len(columns) == 1:
        #return combine_column(table, columns[0]).to_numpy(zero_copy_only=False)
        return f(combine_column(table, columns[0]).to_numpy(zero_copy_only=False))
    else:
        values = [c.to_numpy() for c in table.select(columns).itercolumns()]
        # An explicit dtype keeps the result int64 on empty input too.
        return np.array(list(map(hash, zip(*values))), dtype=np.int64)
26 | 
27 | # Old helpers
28 | 
29 | # Splitting tables by columns
def split_array(arr):
    """Map every distinct value of `arr` to the row positions where it occurs.

    `arr` must provide `dictionary_encode()` (e.g. a pyarrow Array). Returns a
    dict keyed by the dictionary values. The positions are numpy index arrays
    when there are fewer than 1000 distinct values and plain lists otherwise.
    """
    arr = arr.dictionary_encode()
    ind, dic = arr.indices.to_numpy(zero_copy_only=False), arr.dictionary.to_numpy(zero_copy_only=False)

    if len(dic) < 1000:
        # This method is much faster for small amount of categories, but slower for large ones
        return {v: (ind == i).nonzero()[0] for i, v in enumerate(dic)}

    # Single pass over the rows, appending each row to its category's bucket.
    idxs = [[] for _ in dic]
    for row, code in enumerate(ind):
        idxs[code].append(row)
    return dict(zip(dic, idxs))
41 | 
def split(table, columns, group=(), idx=None):
    """Recursively partition the rows of `table` by the values of `columns`.

    Returns a list of `(group, indices)` pairs: `group` is a tuple with one
    value per column in `columns`, and `indices` holds the positions in the
    original `table` of the rows belonging to that combination.

    `group` and `idx` are used by the recursion; callers normally omit them.
    """
    # idx keeps track of the orginal table index, getting split recurrently
    if not isinstance(idx, np.ndarray):
        idx = np.arange(table.num_rows)
    # Only look at the rows of the current group. Splitting the whole column
    # again would ignore the values matched on the earlier columns.
    val_idxs = split_array(combine_column(table, columns[0]).take(idx))
    if columns[1:]:
        return [s for v, i in val_idxs.items() for s in split(table, columns[1:], group + (v,), idx[i])]
    else:
        # Positions from split_array are relative to the current subset; map
        # them back to positions in the original table.
        return [(group + (v,), idx[i]) for v, i in val_idxs.items()]
51 | 


--------------------------------------------------------------------------------
/pyarrow_ops/join.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | import numpy as np
 3 | import pyarrow as pa
 4 | from pyarrow_ops.helpers import columns_to_array, groupify_array
 5 | from cjoin import inner_join
 6 | 
 7 | def align_tables(t1, t2, l1, l2):
 8 |     # Align tables
 9 |     table = t1.take(l1)
10 |     for c in t2.column_names:
11 |         if c not in t1.column_names:
12 |             table = table.append_column(c, t2.column(c).take(l2))
13 |     return table
14 | 
def _scatter_group_stats(size, slots, counts, begins):
    """Place per-group counts and begin offsets at `slots` in zero-filled int64 arrays of length `size`."""
    counts_full = np.zeros(size, dtype=np.int64)
    begins_full = np.zeros(size, dtype=np.int64)
    counts_full[slots] = counts
    begins_full[slots] = begins
    return counts_full, begins_full

def join(left, right, on):
    """Inner-join the tables `left` and `right` on the column(s) `on`.

    Returns `align_tables(left, right, ...)` applied to the matching row
    pairs: all columns of `left`, plus the columns of `right` whose names do
    not occur in `left`.
    """
    # Gather join columns
    l_arr, r_arr = columns_to_array(left, on), columns_to_array(right, on)

    # Groupify the join array
    ld, lc, lidxs, lbi = groupify_array(l_arr)
    rd, rc, ridxs, rbi = groupify_array(r_arr)

    # Find both dicts: `bd` is the union of keys, `inv` maps each side's keys
    # to their slot in `bd` (left keys first, then right keys).
    bd, inv = np.unique(np.concatenate([ld, rd]), return_inverse=True)
    n_left = ld.shape[0]

    # Align both sides to the shared key slots; a key missing on one side
    # keeps a count of 0 there and so produces no output rows.
    lcc, lbic = _scatter_group_stats(bd.shape[0], inv[:n_left], lc, lbi)
    rcc, rbic = _scatter_group_stats(bd.shape[0], inv[n_left:], rc, rbi)

    # Perform cjoin
    left_align, right_align = inner_join(lidxs.astype(np.int64), ridxs.astype(np.int64), lcc, rcc, lbic, rbic)

    return align_tables(left, right, left_align, right_align)
48 | 
49 | # Old Code:
def single_key_hash_join(t1, t2, key):
    """Inner-join `t1` and `t2` on the single column `key` with a hash table.

    Builds a dict from each distinct key value in `t2` to its row positions,
    then looks up every row of `t1`. Rows of `t1` without a match in `t2`
    produce no output. Returns `align_tables(t1, t2, ...)` for the matches.
    """
    import itertools

    left_keys = t1.column(key).combine_chunks().to_numpy(zero_copy_only=False)
    right_keys = t2.column(key).combine_chunks().to_numpy(zero_copy_only=False)

    # Create idx_maps per distinct value
    ht = {}
    for i, k in enumerate(right_keys):
        ht.setdefault(k, []).append(i)
    # One lookup per left row. A list (rather than operator.itemgetter) also
    # works when t1 has zero rows or exactly one row.
    idx_maps = [ht.get(k, ()) for k in left_keys]

    # Gather indices
    l1 = [i1 for i1, idx_map in enumerate(idx_maps) for _ in idx_map]
    l2 = list(itertools.chain.from_iterable(idx_maps))
    return align_tables(t1, t2, l1, l2)
62 | 
def multi_key_hash_join(t1, t2, on):
    """Inner-join `t1` and `t2` on the columns `on` with a hash table.

    Builds a dict from each distinct tuple of key values in `t2` to its row
    positions, then looks up every row of `t1`. Rows of `t1` without a match
    in `t2` produce no output. Returns `align_tables(t1, t2, ...)`.
    """
    import itertools

    # List of tuples of columns
    on1 = [c.to_numpy() for c in t1.select(on).itercolumns()]
    on2 = [c.to_numpy() for c in t2.select(on).itercolumns()]

    # Hash smaller table into dict {(on):[idx1, idx2, ...]}
    # The value tuples themselves are the dict keys, so two different key
    # combinations can never be merged by a hash collision.
    ht = {}
    for i, k in enumerate(zip(*on2)):
        ht.setdefault(k, []).append(i)
    # One lookup per left row. A list (rather than operator.itemgetter) also
    # works when t1 has zero rows or exactly one row.
    idx_maps = [ht.get(k, ()) for k in zip(*on1)]

    # Gather indices
    l1 = [i1 for i1, idx_map in enumerate(idx_maps) for _ in idx_map]
    l2 = list(itertools.chain.from_iterable(idx_maps))
    return align_tables(t1, t2, l1, l2)
81 | 
def join_old(left, right, on):
    """Join two tables with the legacy hash-join helpers.

    The table with fewer rows is passed as the second (hashed) argument; on a
    tie the original left/right order is kept. A single key column goes to
    ``single_key_hash_join``, anything else to ``multi_key_hash_join``.
    """
    # Order the pair so the smaller table ends up second
    swap = left.num_rows < right.num_rows
    big, small = (right, left) if swap else (left, right)

    # Dispatch on the number of key columns
    if len(on) != 1:
        return multi_key_hash_join(big, small, on)
    return single_key_hash_join(big, small, on[0])


--------------------------------------------------------------------------------
/pyarrow_ops/jsons.py:
--------------------------------------------------------------------------------
1 | import pyarrow as pa
2 | import json
3 | import numpy as np
4 | 
def str_to_table(arr):
    """Parse a column of JSON object strings into a table.

    Args:
        arr: Arrow array-like of JSON strings, each decoding to a dict.

    Returns:
        A ``pa.Table`` with one column per key of the FIRST record; records
        lacking a key contribute ``None``. Empty input yields a table with no
        columns.
    """
    # A list comprehension is used instead of np.vectorize, which refuses
    # size-0 input unless otypes is given.
    records = [json.loads(s) for s in arr.to_numpy()]
    if not records:
        return pa.Table.from_pydict({})
    return pa.Table.from_pydict({k: [rec.get(k, None) for rec in records] for k in records[0]})


--------------------------------------------------------------------------------
/pyarrow_ops/ml.py:
--------------------------------------------------------------------------------
  1 | import pyarrow as pa
  2 | import numpy as np
  3 | import pyarrow.compute as c
  4 | 
  5 | # Cleaning functions
  6 | def clean_num(arr, impute=0.0, clip_min=None, clip_max=None):
  7 |     return (pa.array(np.nan_to_num(arr.to_numpy(zero_copy_only=False).astype(np.float64), nan=impute).clip(clip_min, clip_max)), )
  8 | 
  9 | def clean_cat(arr, categories=[]):
 10 |     arr = arr.cast(pa.string()).dictionary_encode()
 11 |     dic = arr.dictionary.to_pylist()
 12 |     if categories:
 13 |         d = {i:(categories.index(v) + 1 if v in categories else 0) for i, v in enumerate(dic)}
 14 |         d[-1] = 0 # NULLs -> 0
 15 |         return (pa.array(np.vectorize(d.get)(arr.indices.fill_null(-1).to_numpy())), ['Unknown'] + categories)
 16 |     else:
 17 |         return (c.add(arr.indices, pa.array([1], type=pa.int32())[0]).fill_null(0), ['Unknown'] + dic)
 18 | 
 19 | def clean_hot(arr, categories=[], drop_first=False):
 20 |     arr = arr.cast(pa.string())
 21 |     if categories:
 22 |         clns =[c.equal(arr, v).fill_null(False) for v in categories]
 23 |     else:
 24 |         categories = [u for u in arr.unique().to_pylist() if u]
 25 |         clns = [c.equal(arr, v).fill_null(False) for v in categories]
 26 |     return clns[(1 if drop_first else 0):], categories[(1 if drop_first else 0):]
 27 | 
 28 | # Cleaning Classes
 29 | class NumericalColumn():
 30 |     def __init__(self, name, impute='mean', clip=True, v_min=None, v_mean=None, v_max=None):
 31 |         self.name, self.impute, self.clip = name, impute, clip
 32 |         self.measured = any([v_min, v_mean, v_max])
 33 |         self.mean, self.min, self.max = (v_mean or 0), (v_min or 0), (v_max or 0)
 34 | 
 35 |     def to_dict(self):
 36 |         return {"name": self.name, "type": "numerical", "impute": self.impute, "clip": self.clip, "v_min": self.min, "v_mean": self.mean, "v_max": self.max}
 37 | 
 38 |     def update(self, arr):
 39 |         self.mean = float(c.mean(arr).as_py())
 40 |         minmax = c.min_max(arr)
 41 |         self.min, self.max = float(minmax['min'].as_py()), float(minmax['max'].as_py())
 42 | 
 43 |     def value(self):
 44 |         if self.impute == 'mean':
 45 |             return self.mean
 46 |         elif self.impute == 'min':
 47 |             return self.min
 48 |         elif self.impute == 'max':
 49 |             return self.max
 50 |         else:
 51 |             raise Exception("{} is not a valid impute method".format(self.impute))
 52 |     
 53 |     def clean(self, arr):
 54 |         if not self.measured:
 55 |             self.update(arr)
 56 |         cln, = clean_num(arr, impute=self.value(), clip_min=(self.min if self.clip else None), clip_max=(self.max if self.clip else None))
 57 |         return cln, None
 58 | 
 59 | class CategoricalColumn():
 60 |     def __init__(self, name, method, categories=[]):
 61 |         self.name, self.method, self.categories = name, method, categories
 62 |         self.measured = (True if categories else False)
 63 | 
 64 |     def to_dict(self):
 65 |         return {"name": self.name, "type": "categorical", "method": self.method, "categories": self.categories}
 66 | 
 67 |     def update(self, categories):
 68 |         self.categories = self.categories + [c for c in categories if c not in self.categories]
 69 | 
 70 |     def clean(self, arr):
 71 |         if self.method == 'one_hot':
 72 |             cln, cats = clean_hot(arr, categories=self.categories)
 73 |         else:
 74 |             cln, cats = clean_cat(arr, categories=self.categories)
 75 |         if not self.measured:
 76 |             self.categories = cats
 77 |         return cln, cats
 78 | 
 79 | class TableCleaner():
 80 |     def __init__(self):
 81 |         self.columns = []
 82 | 
 83 |     def to_dict(self):
 84 |         return [column.to_dict() for column in self.columns]
 85 | 
 86 |     def from_dict(self, columns):
 87 |         for column in columns:
 88 |             t = column.pop('type')
 89 |             if t == 'numerical':
 90 |                 self.columns.append(NumericalColumn(**column))
 91 |             else:
 92 |                 self.columns.append(CategoricalColumn(**column))  
 93 |         return self
 94 |     
 95 |     def register_numeric(self, name, impute='mean', clip=True):
 96 |         self.columns.append(NumericalColumn(name, impute, clip))
 97 | 
 98 |     def register_label(self, name, categories=[]):
 99 |         self.columns.append(CategoricalColumn(name, method='label', categories=categories))
100 |     
101 |     def register_one_hot(self, name, categories=[]):
102 |         self.columns.append(CategoricalColumn(name, method='one_hot', categories=categories)) 
103 | 
104 |     def clean_column(self, table, column):
105 |         arr = table.column(column.name).combine_chunks()
106 |         cln, cats = column.clean(arr)
107 |         if column.__dict__.get('method', '') == 'one_hot':
108 |             return [column.name + '_' + cat for cat in cats], cln
109 |         else:
110 |             return [column.name], [cln]
111 | 
112 |     def clean_table(self, table, label=None):
113 |         keys, arrays = [], []
114 |         for column in self.columns:
115 |             k, a = self.clean_column(table, column)
116 |             keys.extend(k)
117 |             arrays.extend(a)
118 |         if label:
119 |             return pa.Table.from_arrays(arrays, names=keys), table.column(label)
120 |         else:
121 |             return pa.Table.from_arrays(arrays, names=keys)
122 | 
123 |     def split(self, X, y=None, test_size=0.2):
124 |         mask = np.random.rand(X.num_rows) > test_size
125 |         while np.all(mask): # [True, True, True] is invalid
126 |             mask = np.random.rand(X.num_rows) > test_size
127 |         idxs, not_idxs = np.where(mask)[0], np.where(~mask)[0]
128 |         return X.take(idxs), X.take(not_idxs), y.take(idxs), y.take(not_idxs) # X_train, X_test, y_train, y_test
129 | 
130 |     


--------------------------------------------------------------------------------
/pyarrow_ops/ops.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pyarrow as pa
 3 | from pyarrow_ops.helpers import columns_to_array, groupify_array
 4 | 
 5 | # Filter functionality
 6 | def arr_op_to_idxs(arr, op, value):
 7 |     # Cast value to type arr
 8 |     try:
 9 |         value = np.array(value, dtype=arr.dtype)
10 |     except:
11 |         raise Exception("Cannot downcast {} to data type {}".format(value, arr.dtype))
12 | 
13 |     if op in ['=', '==']:
14 |         return np.where(arr == value)
15 |     elif op == '!=':
16 |         return np.where(arr != value)
17 |     elif op == '<':
18 |         return np.where(arr < value)
19 |     elif op == '>':
20 |         return np.where(arr > value)
21 |     elif op == '<=':
22 |         return np.where(arr <= value)
23 |     elif op == '>=':
24 |         return np.where(arr >= value)
25 |     elif op == 'in':
26 |         mask = np.isin(arr, value)
27 |         return np.arange(len(arr))[mask]
28 |     elif op == 'not in':
29 |         mask = np.invert(np.isin(arr, value))
30 |         return np.arange(len(arr))[mask]
31 |     else:
32 |         raise Exception("Operand {} is not implemented!".format(op))
33 | 
def filters(table, filters):
    """Return the rows of ``table`` that satisfy every given condition.

    ``filters`` is a single ``(col, op, value)`` tuple or a list of them;
    supported operators: = or ==, !=, <, >, <=, >=, in and not in. Conditions
    are applied in order, each one only to the rows that survived so far.
    """
    # A lone tuple is treated as a one-element list of conditions
    conditions = [filters] if isinstance(filters, tuple) else filters

    surviving = np.arange(table.num_rows)
    for col, op, value in conditions:
        values = table.column(col).to_numpy()
        matched = arr_op_to_idxs(values[surviving], op, value)
        surviving = surviving[matched]
    return table.take(surviving)
43 | 
44 | # Drop duplicates
def drop_duplicates(table, on=None, keep='first'):
    """Remove rows that are duplicated with respect to the ``on`` columns.

    Args:
        table: Table to deduplicate.
        on: Column names defining a duplicate; all columns when empty or None.
        keep: 'first' or 'last' to keep one row per group, 'drop' to keep only
            the rows whose key occurs exactly once.

    Returns:
        A table with the selected rows.

    Raises:
        ValueError: If ``keep`` is not one of the supported options.
    """
    if keep not in ('first', 'last', 'drop'):
        raise ValueError("keep must be 'first', 'last' or 'drop', got {!r}".format(keep))
    if table.num_rows == 0:
        # Nothing to deduplicate; the 'last' branch below would otherwise
        # index position -1 of an empty array.
        return table

    # Gather columns to arr
    arr = columns_to_array(table, (on if on else table.column_names))

    # Groupify
    _, counts, sort_idxs, bgn_idxs = groupify_array(arr)

    # Gather idxs
    if keep == 'last':
        # The last row of a group sits just before the next group's start
        idxs = (np.array(bgn_idxs) - 1)[1:].tolist() + [len(sort_idxs) - 1]
    elif keep == 'first':
        idxs = bgn_idxs
    else:
        idxs = [i for i, c in zip(bgn_idxs, counts) if c == 1]
    return table.take(sort_idxs[idxs])
60 | 
61 | # Show for easier printing
def head(table, n=5, max_width=100):
    """Print the first ``n`` rows of ``table`` as left-aligned text columns.

    Args:
        table: Table to display.
        n: Number of rows to show.
        max_width: Maximum number of characters printed per line after the
            5-character row label.

    Returns:
        None; output goes to stdout.
    """
    if table.num_rows == 0:
        print("No data in table")
        return

    # Extract head data as strings
    preview = table.slice(length=n)
    columns = {name: [str(v) for v in values] for name, values in preview.to_pydict().items()}

    # Column width: widest of header and cells, plus two spaces of padding.
    # default=0 keeps n=0 (no cells) from failing on max() of an empty sequence.
    widths = [max(len(name), max(map(len, cells), default=0)) + 2 for name, cells in columns.items()]

    # Print the header followed by the data rows
    names = list(columns)
    rows = [names] + [[columns[name][i] for name in names] for i in range(preview.num_rows)]
    for i, row in enumerate(rows):
        label = 'Row  ' if i == 0 else str(i - 1).ljust(5)
        line = "".join(cell.ljust(width) for cell, width in zip(row, widths))
        print(label + line[:max_width])
    print('\n')


--------------------------------------------------------------------------------
/pyarrow_ops/table.py:
--------------------------------------------------------------------------------
 1 | import pyarrow as pa
 2 | from pyarrow_ops import join, filters, groupby, drop_duplicates, head
 3 | 
 4 | # Table wrapper: does not work because pa.Table.from_pandas/from_arrays/from_pydict always returns pa.Table
 5 | class Table(pa.Table):
 6 |     def __init__(*args, **kwargs):
 7 |         super(Table, self).__init__(*args, **kwargs)
 8 |     
 9 |     def join(self, right, on):
10 |         return join(self, right, on)
11 |     
12 |     def filters(self, filters):
13 |         return filters(self, filters)
14 |     
15 |     def groupby(self, by):
16 |         return groupby(self, by)
17 | 
18 |     def drop_duplicates(self, on=[], keep='last'):
19 |         return drop_duplicates(self, on, keep)
20 | 
21 |     def head(self, n=5):
22 |         return head(self, n)
23 | 
# Add methods to class pa.Table or instances of pa.Table: does not work because pyarrow.lib.Table is built in C
def add_table_methods(table):
    """Attach join / filters / groupby / drop_duplicates / head to ``table``.

    Args:
        table: Class or object that should receive the attributes.

    The wrappers use private names: defining them as ``join``, ``head`` and so
    on inside this function would shadow the module-level functions they are
    meant to call, so each wrapper would recurse into itself.
    """
    # Alias needed because the wrapper's `filters` parameter hides the function
    apply_filters = filters

    def _join(self, right, on):
        return join(self, right, on)
    table.join = _join

    def _filters(self, filters):
        return apply_filters(self, filters)
    table.filters = _filters

    def _groupby(self, by):
        return groupby(self, by)
    table.groupby = _groupby

    def _drop_duplicates(self, on=[], keep='last'):
        return drop_duplicates(self, on, keep)
    table.drop_duplicates = _drop_duplicates

    def _head(self, n=5):
        return head(self, n)
    table.head = _head


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
# Build-time requirements: numpy and Cython are listed because setup.py
# imports both (np.get_include() and cythonize) before the build can start.
[build-system]
requires = ["setuptools", "wheel", "numpy>=1.19.0", "Cython>=0.29.21"]
build-backend = "setuptools.build_meta"


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import find_packages, setup
 2 | from setuptools import Extension
 3 | import numpy as np
 4 | from Cython.Build import cythonize
 5 | 
 6 | __version__ = "0.0.8"
 7 | 
 8 | extensions = [
 9 |     Extension(
10 |         name="cjoin", 
11 |         sources=["pyarrow_ops/cjoin.pyx"], 
12 |         include_dirs=[np.get_include()]
13 |     )
14 | ]
15 | 
16 | with open('README.md') as readme_file:
17 |     README = readme_file.read()
18 | 
19 | setup(
20 |     name='pyarrow_ops',
21 |     version=__version__,
22 |     description='Useful data crunching tools for pyarrow',
23 |     long_description_content_type="text/markdown",
24 |     long_description=README,
25 |     license='APACHE',
26 |     packages=find_packages(),
27 |     author='Tom Scheffers',
28 |     author_email='tom@youngbulls.nl ',
29 |     keywords=['arrow', 'pyarrow', 'data'],
30 |     url='https://github.com/TomScheffers/pyarrow_ops',
31 |     download_url='https://pypi.org/project/pyarrow-ops/',
32 | 
33 |     ext_modules=cythonize(extensions),
34 |     install_requires=[
35 |         'numpy>=1.19.2',
36 |         'pyarrow>=3.0'
37 |     ],
38 | )
39 | 


--------------------------------------------------------------------------------
/test_func.py:
--------------------------------------------------------------------------------
 1 | import pyarrow as pa 
 2 | from pyarrow_ops import join, filters, groupby, head, drop_duplicates
 3 | 
 4 | # Create data
 5 | t = pa.Table.from_pydict({
 6 |     'Animal': ['Falcon', 'Falcon', 'Parrot', 'Parrot', 'Parrot'],
 7 |     'Max Speed': [380., 370., 24., 26., 24.]
 8 | })
 9 | print("Source:")
10 | head(t)
11 | 
12 | # Drop duplicates
13 | print("Drop duplicates:")
14 | d = drop_duplicates(t, on=['Animal'], keep='first')
15 | head(d)
16 | 
17 | # Groupby aggregations
18 | print("Groupby loop:")
19 | for key, value in groupby(t, ['Animal']):
20 |     print(key)
21 |     head(value)
22 | 
23 | print("Aggregrations:")
24 | g = groupby(t, ['Animal']).median()
25 | g = groupby(t, ['Animal']).sum()
26 | g = groupby(t, ['Animal']).min()
27 | g = groupby(t, ['Animal']).agg({'Max Speed': 'max'})
28 | head(g)
29 | 
30 | # Filters
31 | print("Filters:")
32 | f = filters(t, ('Animal', '=', 'Falcon'))
33 | f = filters(t, [('Animal', 'not in', ['Falcon', 'Duck']), ('Max Speed', '<', 25)])
34 | head(f)
35 | 
36 | # Join operations
37 | print("Join:")
38 | t2 = pa.Table.from_pydict({
39 |     'Animal': ['Falcon', 'Parrot'],
40 |     'Age': [10, 20]
41 | })
42 | j = join(t, t2, on=['Animal'])
43 | head(j)


--------------------------------------------------------------------------------
/test_ml.py:
--------------------------------------------------------------------------------
 1 | import pyarrow as pa 
 2 | from pyarrow_ops import head, TableCleaner
 3 | 
# Training data ('Max Speed' contains a null that the cleaner has to impute)
t1 = pa.Table.from_pydict({
    'Animal': ['Falcon', 'Falcon', 'Parrot', 'Parrot', 'Parrot'],
    'Max Speed': [380., 370., None, 26., 24.],
    'Value': [2000, 1500, 10, 30, 20],
})

# Create the TableCleaner and register one rule per output feature
cleaner = TableCleaner()
cleaner.register_numeric('Max Speed', impute='min', clip=True)
cleaner.register_label('Animal', categories=['Goose', 'Falcon']) # categories is optional; values outside the list are encoded as 0
cleaner.register_one_hot('Animal')

# Clean the table and split it into train/test parts
X, y = cleaner.clean_table(t1, label='Value')
head(X)
X_train, X_test, y_train, y_test = cleaner.split(X, y)


# Train a model here, then save the cleaner dictionary for reuse (serialize to JSON or pickle)
cleaner_dict = cleaner.to_dict()
for c in cleaner_dict:
    print(c)

# Prediction data: rebuild the cleaner from the saved dictionary and apply it
t2 = pa.Table.from_pydict({
    'Animal': ['Falcon', 'Goose', 'Parrot', 'Parrot'],
    'Max Speed': [380., 10., None, 26.]
})
new_cleaner = TableCleaner().from_dict(cleaner_dict)
X_pred = new_cleaner.clean_table(t2)
head(X_pred)


--------------------------------------------------------------------------------