├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yaml ├── .vscode └── settings.json ├── CITATION.cff ├── LICENSE ├── README.md ├── docs ├── Makefile ├── make.bat └── source │ ├── _static │ ├── css │ │ └── custom.css │ ├── graphext.png │ └── gx_logo_sq_blue.jpg │ ├── _templates │ └── python │ │ └── module.rst │ ├── conf.py │ ├── index.rst │ ├── quick.rst │ ├── reader.rst │ └── types.rst ├── environment.yml ├── lector ├── __init__.py ├── cli.py ├── csv │ ├── __init__.py │ ├── abc.py │ ├── arrow.py │ ├── dialects.py │ ├── encodings.py │ ├── pandas.py │ └── preambles.py ├── log.py ├── types │ ├── __init__.py │ ├── abc.py │ ├── bools.py │ ├── cast.py │ ├── lists.py │ ├── numbers.py │ ├── regex.py │ ├── strings.py │ └── timestamps.py └── utils.py ├── notebooks ├── example.ipynb └── usage.ipynb ├── pyproject.toml ├── recipe └── meta.yaml ├── setup.cfg ├── setup.py └── test ├── __init__.py ├── conftest.py ├── test_dialects.py ├── test_encodings.py ├── test_formats.py ├── test_preambles.py ├── test_reader.py ├── test_types.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | notebooks/*.csv 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | cover/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | .pybuilder/ 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | # For a library or package, you might want to ignore these files since the code is 89 | # intended to run in multiple environments; otherwise, check them in: 90 | # .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # poetry 100 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 101 | # This is especially recommended for binary packages to ensure reproducibility, and is more 102 | # commonly ignored for libraries. 
103 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 104 | #poetry.lock 105 | 106 | # pdm 107 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 108 | #pdm.lock 109 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 110 | # in version control. 111 | # https://pdm.fming.dev/#use-with-ide 112 | .pdm.toml 113 | 114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 162 | #.idea/ 163 | 164 | .ruff_cache/ 165 | .DS_Store 166 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | default_language_version: 2 | python: python3.9 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v4.5.0 6 | hooks: 7 | - id: check-yaml 8 | exclude: recipe/meta.yaml 9 | args: 10 | - --allow-multiple-documents 11 | - id: end-of-file-fixer 12 | - id: trailing-whitespace 13 | 14 | - repo: https://github.com/astral-sh/ruff-pre-commit 15 | rev: 'v0.5.4' 16 | hooks: 17 | - id: ruff 18 | args: [--fix, --exit-non-zero-on-fix] 19 | # Respect `exclude` and `extend-exclude` settings. 20 | # args: ["--force-exclude"] 21 | - id: ruff-format 22 | args: [--check] 23 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: "ubuntu-20.04" 5 | tools: 6 | python: "3.10" 7 | 8 | python: 9 | install: 10 | - method: pip 11 | path: . 
12 | extra_requirements: 13 | - docs 14 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | // Default (langugage-independent) settings 3 | "editor.tabSize": 4, 4 | "editor.trimAutoWhitespace": true, 5 | "editor.insertSpaces": true, 6 | "editor.autoIndent": "full", 7 | "editor.formatOnSave": false, 8 | "editor.formatOnPaste": false, 9 | // "workbench.colorCustomizations": { 10 | // "editorRuler.foreground": "#4093ff" 11 | // }, 12 | "files.insertFinalNewline": true, 13 | "files.trimFinalNewlines": true, 14 | "files.autoSave": "afterDelay", 15 | "files.autoSaveDelay": 500, 16 | // Python 17 | "[python]": { 18 | "editor.formatOnSave": true, 19 | "editor.formatOnPaste": true, 20 | "editor.tabSize": 4, 21 | "editor.insertSpaces": true, 22 | "editor.codeActionsOnSave": { 23 | "source.organizeImports": "explicit", 24 | "source.fixAll": "explicit" 25 | } 26 | }, 27 | } 28 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 3 | authors: 4 | - family-names: "Buhrmann" 5 | given-names: "Thomas" 6 | title: "Lector" 7 | version: 0.3.3 8 | date-released: 2023-12-07 9 | url: "https://github.com/graphext/lector" 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2022 Graphext Labs S.L. 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/graphext/lector/HEAD?labpath=notebooks%2Fusage.ipynb) 2 | 3 | # Lector 4 | 5 | [Lector](https://github.com/graphext/lector) aims to be a fast reader for potentially messy CSV files with configurable column type inference. It combines automatic detection of file encodings, CSV dialects (separator, escaping etc.) 
and preambles (initial lines containing metadata or junk unrelated to the actual tabular data). Its goal is to just-read-the-effing-CSV file without manual configuration in most cases. Each of the detection components is configurable and can be swapped out easily with custom implementations. 6 | 7 | Also, since both [pandas](https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html) and Apache [Arrow](https://arrow.apache.org/docs/python/generated/pyarrow.csv.read_csv.html) will destructively cast columns to the wrong type in some cases (e.g. large ID-like integer strings to floats), it provides an alternative and customisable inference and casting mechanism. 8 | 9 | Under the hood it uses pyarrow's [CSV parser](https://arrow.apache.org/docs/python/generated/pyarrow.csv.read_csv.html) for reading, and its [compute functions](https://arrow.apache.org/docs/python/api/compute.html) for optional type inference. 10 | 11 | Lector is used at [Graphext](https://www.graphext.com) behind the scenes whenever a user uploads a new dataset, and so implicitly has been validated across 1000s of different CSV files from all kinds of sources. Note, however, that this is Graphext's first foray into open-sourcing our code and still _work-in-progress_. So at least initially we won't provide any guarantees as to support of this library. 12 | 13 | For quick usage examples see the [Usage](#usage) section below or the [notebook](notebooks/usage.ipynb) in this repo. 14 | 15 | For detailed documentation visit https://lector.readthedocs.io/. 16 | 17 | ## Installing 18 | 19 | While this library is not available yet on pypi, you can easily install it from Github with 20 | 21 | ``` 22 | pip install git+https://github.com/graphext/lector 23 | ``` 24 | 25 | ## Usage 26 | 27 | Let's assume we receive a CSV file containing some initial metadata, using the semicolon as separator, having some missing fields, and being encoded in Latin-1 (you'd be surprised how common such files are in the real world). 28 | 29 |
30 | Create example CSV file 31 | 32 | ``` python 33 | csv = """ 34 | Some preamble content here 35 | This is still "part of the metadata preamble" 36 | id;genre;metric;count;content;website;tags;vecs;date 37 | 1234982348728374;a;0.1;1;; http://www.graphext.com;"[a,b,c]";"[1.3, 1.4, 1.67]";11/10/2022 38 | ;b;0.12;;"Natural language text is different from categorical data."; https://www.twitter.com;[d];"[0, 1.9423]";01/10/2022 39 | 9007199254740993;a;3.14;3;"The Project · Gutenberg » EBook « of Die Fürstin.";http://www.google.com;"['e', 'f']";["84.234, 12509.99"];13/10/2021 40 | """.encode("ISO-8859-1") 41 | 42 | with open("example.csv", "wb") as fp: 43 | fp.write(csv) 44 | ``` 45 |
46 |
47 | 48 | To read this with lector into a pandas DataFrame, simply use 49 | 50 | ``` python 51 | df = lector.read_csv("example.csv", to_pandas=True) 52 | ``` 53 | 54 | Printing the DataFrame and its column types produces the following output: 55 | 56 | ``` 57 | id genre metric count \ 58 | 0 1234982348728374 a 0.10 1 59 | 1 b 0.12 60 | 2 9007199254740993 a 3.14 3 61 | 62 | content website \ 63 | 0 http://www.graphext.com 64 | 1 Natural language text is different from catego... https://www.twitter.com 65 | 2 The Project · Gutenberg » EBook « of Die Fürstin. http://www.google.com 66 | 67 | tags vecs date 68 | 0 [a, b, c] [1.3, 1.4, 1.67] 2022-10-11 69 | 1 [d] [0.0, 1.9423] 2022-10-01 70 | 2 [e, f] [84.234, 12509.99] 2021-10-13 71 | 72 | id Int64 73 | genre category 74 | metric float64 75 | count UInt8 76 | content string 77 | website category 78 | tags object 79 | vecs object 80 | date datetime64[ns] 81 | dtype: object 82 | ``` 83 | 84 | This is pretty sweet, because 85 | 86 | - we didn't have to tell lector _how_ to read this file (text encoding, lines to skip, separator etc.) 87 | - we didn't have to tell lector the _data types_ of the columns, but it inferred the correct and most efficient ones automatically, e.g.: 88 | - a nullable `Int64` extension type was necessary to correctly represent values in the `id` column 89 | - the `genre` column was automatically converted to the efficient `dictionary` (categorical) type 90 | - the `count` column uses the _smallest_ integer type necessary 91 | - the `content` column, containing natural language text, has _not_ been converted to a categorical type, but kept as string values (it is unlikely to benefit from dictionary-encoding) 92 | - the `date` column was converted to datetimes correctly, even though the original 93 | strings are not in an ISO format 94 | - the `tags` and `vecs` columns have been imported with `object` dtype (since pandas 95 | doesn't officially support iterables as elements in a column), but their values are in fact numpy arrays of the correct dtype! 96 | 97 | Neither pandas nor arrow will do this. In fact, they cannot even import this data correctly, even _without_ attempting any smart type inference. Compare e.g. with pandas' attempt to read the same CSV file: 98 | 99 |
100 | Pandas and Arrow fail 101 | Firstly, to get something close to the above, you'll have to spend a good amount of time manually inspecting the CSV file and come up with the following verbose pandas call: 102 | 103 | ``` python 104 | dtypes = { 105 | "id": "Int64", 106 | "genre": "category", 107 | "metric": "float", 108 | "count": "UInt8", 109 | "content": "string", 110 | "website": "category", 111 | "tags": "object", 112 | "vecs": "object" 113 | } 114 | 115 | df = pd.read_csv( 116 | fp, 117 | encoding="ISO-8859-1", 118 | skiprows=3, 119 | sep=";", 120 | dtype=dtypes, 121 | parse_dates=["date"], 122 | infer_datetime_format=True 123 | ) 124 | 125 | ``` 126 | 127 | While this _parses_ the CSV file alright, the result is, urm, lacking. Let's see: 128 | 129 | ``` 130 | id genre metric count \ 131 | 0 1234982348728374 a 0.10 1 132 | 1 b 0.12 133 | 2 9007199254740992 a 3.14 3 134 | 135 | content \ 136 | 0 137 | 1 Natural language text is different from catego... 138 | 2 The Project · Gutenberg » EBook « of Die Fürstin. 139 | 140 | website tags vecs date 141 | 0 http://www.graphext.com [a,b,c] [1.3, 1.4, 1.67] 2022-11-10 142 | 1 https://www.twitter.com [d] [0, 1.9423] 2022-01-10 143 | 2 http://www.google.com ['e', 'f'] ["84.234, 12509.99"] 2021-10-13 144 | 145 | id Int64 146 | genre category 147 | metric float64 148 | count UInt8 149 | content string 150 | website category 151 | tags object 152 | vecs object 153 | date datetime64[ns] 154 | dtype: object 155 | ``` 156 | 157 | A couple of observations: 158 | 159 | - Pandas _will_ cast numeric columns with missing data to the float type always, before any of our custom types are applied. This is a big problem, as we can see in the `id` column, since not all integers can be represented exactly by a 64 bit floating type (the correct value in our file is `9007199254740993` 👀). It is also a sneaky problem, because this happens silently, and so you may not realize you've got wrong IDs, and may produce totally wrong analyses if you use them down the line for joins etc. The only way to import CSV files like this with pandas correctly is to inspect the actual data in a text editor, guess the best data type, import the data without any type inference, and then individually cast to the correct types. There is no way to configure pandas to import the data correctly. 160 | - Pandas has messed up the dates. While at least warning us about it, pandas doesn't try to infer a consistent date format across all rows. While the CSV file contains all dates in a single consistent format (`%d/%m/%Y`), pandas has used mixed formats and so imported some dates wrongly. 161 | - The `category` and `text` columns have been imported with the `object` dtype, which is not particularly useful, but not necessarily a problem either. 162 | - Since pandas doesn't support iterable dtypes, the tags and vecs columns haven't been parsed into any useful structures 163 | 164 | Note that Arrow doesn't fare much better. It doesn't parse and infer its own `list` data type, it doesn't know how to parse dates in any format other than ISO 8601, and commits the same integer-as-float conversion error. 165 |
166 |
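The integer-precision issue above is easy to verify in plain Python, independently of any CSV reader. The following is just an illustrative sketch (not part of lector): integers larger than 2^53 have no exact 64-bit float representation, which is why a silent int-to-float cast corrupts ID columns.

``` python
big_id = 9007199254740993  # 2**53 + 1, the value used in the example CSV above

# Round-tripping through a 64-bit float silently changes the value,
# which is what happens when an integer column with missing values
# is cast to float by pandas or arrow.
assert float(big_id) == 9007199254740992.0
assert int(float(big_id)) != big_id
```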
167 | 168 | ## Development 169 | 170 | To install a local copy for development, including all dependencies for test, documentation and code quality, use the following commands: 171 | 172 | ``` bash 173 | git clone https://github.com/graphext/lector 174 | cd lector 175 | pip install -v -e ".[dev]" 176 | pre-commit install 177 | ``` 178 | 179 | The [pre-commit](https://pre-commit.com/) command will make sure that whenever you try to commit changes to this repo, code quality and formatting tools will be executed. This ensures e.g. a common coding style, such that any changes to be committed are functional changes only, not changes due to different personal coding style preferences. This in turn makes it easier to collaborate via pull requests etc. 180 | 181 | To test the installation you may execute the [pytest](https://docs.pytest.org/) suite to make sure everything's set up correctly, e.g.: 182 | 183 | ``` bash 184 | pytest -v . 185 | ``` 186 | 187 | ## Documentation 188 | 189 | The documentation is created using Sphinx and is available here: https://lector.readthedocs.io/. 190 | 191 | You can build and view the static HTML locally like any other Sphinx project: 192 | 193 | ``` bash 194 | (cd docs && make clean html) 195 | (cd docs/build/html && python -m http.server) 196 | ``` 197 | 198 | 199 | ## To Do 200 | 201 | - _Parallelize type inference_? While type inference is already pretty fast, it can potentially be sped up by processing columns in parallel. 202 | - _Testing_. The current pytest setup is terrible. I've given `hypothesis_csv` a try here, 203 | but I'm probably making bad use of it. Tests are convoluted and probably not even good at catching corner cases. 204 | 205 | ## License 206 | 207 | This project is licensed under the terms of the Apache License 2.0. 208 | 209 | ## Links 210 | 211 | - Documentation: https://lector.readthedocs.io/ 212 | - Source: https://github.com/graphext/lector 213 | - Graphext: https://www.graphext.com 214 | - Graphext on Twitter: https://twitter.com/graphext 215 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | rm -rf source/_api 21 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 22 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found.
Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/_static/css/custom.css: -------------------------------------------------------------------------------- 1 | /* Visually separate classes a bit more. */ 2 | .py.class > dt { 3 | border-top: solid 3px; 4 | margin-top: 50px; 5 | padding-left: 5px 6 | } 7 | 8 | .py.method > dt { 9 | border-left: solid 3px; 10 | display: inline; 11 | padding: 5px 10px 5px 10px; 12 | font-size: 0.9rem; 13 | } 14 | 15 | .py.function > dt { 16 | border-left: solid 3px; 17 | /* display: inline; */ 18 | padding: 5px 10px 5px 10px; 19 | font-size: 0.9rem; 20 | } 21 | 22 | .py.function > dt:before { 23 | content: "func"; 24 | font-style: italic; 25 | } 26 | 27 | .py.attribute > dt, .py.data > dt{ 28 | font-size: 0.9rem; 29 | } 30 | 31 | html[data-theme="dark"], html[data-theme="light"] { 32 | --pst-color-inline-code: var(--pst-color-primary); 33 | } 34 | 35 | html[data-theme="light"] .py.class > dt, 36 | html[data-theme="light"] .py.method > dt, 37 | html[data-theme="light"] .py.function > dt { 38 | border-color: lightgrey; 39 | background: #f0f0f0; 40 | } 41 | 42 | html[data-theme="light"] .autosummary.table .row-odd { 43 | background: #f0f0f0; 44 | } 45 | 46 | html[data-theme="dark"] .py.class > dt, 47 | html[data-theme="dark"] .py.method > dt, 48 | html[data-theme="dark"] .py.function > dt { 49 | /* border-color: #82265b; 50 | background: #372b32; */ 51 | border-color: var(--pst-color-primary); 52 | background: #263033; 53 | } 54 | 55 | html[data-theme="dark"] .autosummary.table .row-odd { 56 | background: #263033; 57 | } 58 | -------------------------------------------------------------------------------- /docs/source/_static/graphext.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphext/lector/fb76591acd69aa01ee37a03f08fc8e68fd5f8aa8/docs/source/_static/graphext.png -------------------------------------------------------------------------------- /docs/source/_static/gx_logo_sq_blue.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphext/lector/fb76591acd69aa01ee37a03f08fc8e68fd5f8aa8/docs/source/_static/gx_logo_sq_blue.jpg -------------------------------------------------------------------------------- /docs/source/_templates/python/module.rst: -------------------------------------------------------------------------------- 1 | {% if not obj.display %} 2 | :orphan: 3 | 4 | {% endif %} 5 | :py:mod:`{{ obj.name }}` 6 | =========={{ "=" * obj.name|length }} 7 | 8 | .. py:module:: {{ obj.name }} 9 | 10 | {% if obj.docstring %} 11 | .. 
autoapi-nested-parse:: 12 | 13 | {{ obj.docstring|indent(3) }} 14 | 15 | {% endif %} 16 | 17 | {% block subpackages %} 18 | {% set visible_subpackages = obj.subpackages|selectattr("display")|list %} 19 | {% if visible_subpackages %} 20 | Subpackages 21 | ----------- 22 | .. toctree:: 23 | :titlesonly: 24 | :maxdepth: 3 25 | 26 | {% for subpackage in visible_subpackages %} 27 | {{ subpackage.short_name }}/index.rst 28 | {% endfor %} 29 | 30 | 31 | {% endif %} 32 | {% endblock %} 33 | {% block submodules %} 34 | {% set visible_submodules = obj.submodules|selectattr("display")|list %} 35 | {% if visible_submodules %} 36 | Submodules 37 | ---------- 38 | .. toctree:: 39 | :titlesonly: 40 | :maxdepth: 1 41 | 42 | {% for submodule in visible_submodules %} 43 | {{ submodule.short_name }}/index.rst 44 | {% endfor %} 45 | 46 | 47 | {% endif %} 48 | {% endblock %} 49 | {% block content %} 50 | {% if obj.all is not none %} 51 | {% set visible_children = obj.children|selectattr("short_name", "in", obj.all)|list %} 52 | {% elif obj.type is equalto("package") %} 53 | {% set visible_children = obj.children|selectattr("display")|list %} 54 | {% else %} 55 | {% set visible_children = obj.children|selectattr("display")|rejectattr("imported")|list %} 56 | {% endif %} 57 | {% if visible_children %} 58 | 59 | {% set visible_classes = visible_children|selectattr("type", "equalto", "class")|list %} 60 | {% set visible_functions = visible_children|selectattr("type", "equalto", "function")|list %} 61 | {% set visible_attributes = visible_children|selectattr("type", "equalto", "data")|list %} 62 | {% if "show-module-summary" in autoapi_options and (visible_classes or visible_functions) %} 63 | {% block classes scoped %} 64 | {% if visible_classes %} 65 | Classes 66 | ~~~~~~~ 67 | 68 | .. autoapisummary:: 69 | 70 | {% for klass in visible_classes %} 71 | {{ klass.id }} 72 | {% endfor %} 73 | 74 | 75 | {% endif %} 76 | {% endblock %} 77 | 78 | {% block functions scoped %} 79 | {% if visible_functions %} 80 | Functions 81 | ~~~~~~~~~ 82 | 83 | .. autoapisummary:: 84 | 85 | {% for function in visible_functions %} 86 | {{ function.id }} 87 | {% endfor %} 88 | 89 | 90 | {% endif %} 91 | {% endblock %} 92 | 93 | {% block attributes scoped %} 94 | {% if visible_attributes %} 95 | Attributes 96 | ~~~~~~~~~~ 97 | 98 | .. 
autoapisummary:: 99 | 100 | {% for attribute in visible_attributes %} 101 | {{ attribute.id }} 102 | {% endfor %} 103 | 104 | 105 | {% endif %} 106 | {% endblock %} 107 | {% endif %} 108 | {% for obj_item in visible_children %} 109 | {{ obj_item.render()|indent(0) }} 110 | {% endfor %} 111 | {% endif %} 112 | {% endblock %} 113 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | sys.path.insert(0, os.path.abspath("../..")) 5 | 6 | from lector import __version__ # noqa 7 | 8 | project = "lector" 9 | copyright = "2022, Thomas Buhrmann" 10 | author = "Thomas Buhrmann" 11 | version = __version__ 12 | release = version 13 | 14 | extensions = [ 15 | # "sphinx.ext.autodoc", 16 | "sphinx.ext.autosectionlabel", 17 | # "sphinx.ext.autosummary", 18 | "sphinx.ext.napoleon", 19 | "sphinx.ext.viewcode", 20 | # "sphinx_autodoc_typehints", 21 | "sphinx.ext.autodoc.typehints", 22 | "sphinx.ext.todo", 23 | "autoapi.extension", 24 | ] 25 | 26 | autodoc_typehints = "description" 27 | 28 | autoapi_dirs = ["../../lector"] 29 | autoapi_member_order = "groupwise" 30 | autoapi_add_toctree_entry = False 31 | autoapi_template_dir = "_templates" 32 | 33 | autosectionlabel_prefix_document = True 34 | 35 | 36 | templates_path = ["_templates"] 37 | exclude_patterns = [] 38 | 39 | html_theme = "pydata_sphinx_theme" 40 | html_static_path = ["_static"] 41 | html_css_files = ["css/custom.css"] 42 | 43 | html_theme_options = { 44 | "icon_links": [ 45 | { 46 | "name": "Github", 47 | "url": "https://github.com/graphext/lector", 48 | "icon": "fab fa-github-square", 49 | "type": "fontawesome", 50 | }, 51 | { 52 | "name": "Graphext", 53 | "url": "https://www.graphext.com", 54 | "icon": "_static/gx_logo_sq_blue.jpg", 55 | "type": "local", 56 | }, 57 | ] 58 | } 59 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | Lector 2 | ====== 3 | 4 | `Lector `_ aims to be a fast reader for potentially 5 | messy CSV files with configurable column type inference. It combines automatic detection 6 | of :ref:`reader:File encodings` , :ref:`CSV dialects ` (separator, 7 | escaping etc.) and :ref:`reader:preambles` (initial lines 8 | containing metadata or junk unrelated to the actual tabular data). Its goal is to 9 | just-read-the-effing-CSV file without manual configuration in most cases. Each of the 10 | detection components is configurable and can be swapped out easily with custom implementations. 11 | 12 | Also, since both `pandas `_ 13 | and Apache's `arrow `_ 14 | will destructively cast columns to the wrong type in some cases (e.g. large ID-like integer 15 | strings to floats), it provides an alternative and customisable column type :doc:`inference and 16 | casting ` mechanism. 17 | 18 | Under the hood it uses pyarrow's CSV parser for reading, and its compute functions for optional 19 | type inference. 20 | 21 | Lector is used at `Graphext `_ behind the scenes whenever a user 22 | uploads a new dataset, and so implicitly has been validated across 1000s of different CSV 23 | files from all kinds of sources. 24 | 25 | Note, however, that this is Graphext's first foray into open-sourcing our code and still 26 | *work-in-progress*. So at least initially we won't provide any guarantees as to support 27 | of this library. 
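In the simplest case, reading a messy CSV is a one-liner. The following is a minimal sketch
using only the high-level entry point documented in the quickstart and README:

.. code-block:: python

    import lector

    # Detects encoding, preamble and dialect, then infers column types
    tbl = lector.read_csv("example.csv")

    # Or convert directly to a pandas DataFrame
    df = lector.read_csv("example.csv", to_pandas=True)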
28 | 29 | For a quick illustration of how to use ``lector``, see the :doc:`quickstart guide `. 30 | 31 | 32 | .. toctree:: 33 | :hidden: 34 | 35 | Home 36 | Quickstart 37 | CSV Reader 38 | Types 39 | API 40 | -------------------------------------------------------------------------------- /docs/source/quick.rst: -------------------------------------------------------------------------------- 1 | Quickstart 2 | ========== 3 | 4 | 5 | Installation 6 | ------------ 7 | 8 | While this library is not available yet on pypi, you can easily install it from Github with 9 | 10 | .. code-block:: bash 11 | 12 | pip install git+https://github.com/graphext/lector 13 | 14 | The project depends on ``cchardet`` for encoding detection, ``clevercsv`` for advanced 15 | dialect detection, ``pyarrow`` for CSV parsing and type inference/conversion, as well as 16 | ``rich`` and ``typer`` for pretty output and the command-line interface. 17 | 18 | Quickstart 19 | ---------- 20 | 21 | The below examples illustrate lector's default behaviour when reading CSV files. For 22 | customization options, check the :doc:`reader` and :doc:`types` sections as well as the 23 | :doc:`API reference `. 24 | 25 | Let's assume we receive the following CSV file, containing some initial metadata, using 26 | the semicolon as separator, having some missing fields, and being encoded in Latin-1: 27 | 28 | .. code-block:: python 29 | 30 | csv = """ 31 | Some metadata 32 | Some more metadata 33 | id;category;metric;count;text 34 | 1234982348728374;a;0.1;1; 35 | ;b;0.12;;"Natural language text is different from categorical data." 36 | 18446744073709551615;a;3.14;3;"The Project · Gutenberg » EBook « of Die Fürstin." 37 | """.encode("ISO-8859-1") 38 | 39 | with open("example.csv", "wb") as fp: 40 | fp.write(csv) 41 | 42 | The recommended way to use lector for reading this CSV (without type-inference) would be 43 | 44 | .. code-block:: python 45 | 46 | import lector 47 | 48 | tbl = lector.read_csv("example.csv", types="string", log=True) 49 | 50 | which produces something like the following output:: 51 | 52 | 'Fieldless' matches CSV buffer: detected 3 rows to skip. 53 | 54 | ─────────── CSV Format ──────────── 55 | { 56 | 'encoding': 'ISO-8859-1', 57 | 'preamble': 3, 58 | 'dialect': Dialect( 59 | delimiter=';', 60 | quote_char='"', 61 | escape_char=None, 62 | double_quote=True, 63 | skip_initial_space=False, 64 | line_terminator='\r\n', 65 | quoting=0 66 | ) 67 | } 68 | ─────────────────────────────────── 69 | 70 | The log provides some feedback about properties of the CSV that lector has detected 71 | automatically, namely: 72 | 73 | - It has found a *preamble* pattern named ``Fieldless`` that matches the beginning of the 74 | CSV file and indicates that the first 3 rows should be skipped (lector has an extensible 75 | list of such patterns which are tried in order until a match is found) 76 | - It has detected the *encoding* correctly as ``ISO-8859-1`` (this cannot be guaranteed in all 77 | cases, but the CSV will always be read with a fallback encoding, usually ``utf-8``, and 78 | characters that cannot be decoded will be represented by �) 79 | - It has correctly detected the CSV *dialect* (the delimiter used etc.) 80 | - The encoding, preamble and dialect together are stored in a ``Format`` object, which holds 81 | all the necessary parameters to parse the CSV file correctly with pandas or arrow 82 | 83 | Using the detected CSV format, the data is parsed (using pyarrow's ``csv.read_csv()`` under 84 | the hood).
Note we have instructed arrow to parse all columns using the ``string`` type, 85 | effectively turning *off* its internal type inference. We can use lector's type inference by 86 | not specifying the ``types`` argument or selecting it explicitly: 87 | 88 | .. code-block:: python 89 | 90 | tbl = lector.read_csv("example.csv") 91 | tbl = lector.read_csv("example.csv", types=lector.Inference.Auto) # equivalent 92 | print(tbl.schema) 93 | 94 | We see this results in the most appropriate type for each column: 95 | 96 | .. code-block:: 97 | 98 | pyarrow.Table 99 | id: uint64 100 | category: dictionary 101 | metric: double 102 | count: uint8 103 | text: string 104 | 105 | Notice that: 106 | 107 | - An unsigned int (``uint64``) was necessary to correctly represent all values in the id 108 | column. Had values been even larger than the maximum of the ``uint64`` type, the values 109 | would have been converted to a categorical type (strings), rather than floats 110 | - The category column was automatically converted to the memory-efficient ``dictionary`` 111 | (categorical) type 112 | - The count column uses the smallest integer type necessary (``uint8``, unsigned since all 113 | values are positive) 114 | - The text column, containing natural language text, has *not* been converted to a categorical 115 | type, but kept as simple ``string`` values (it is unlikely to benefit from dictionary-encoding) 116 | 117 | We could have relied on arrow's built-in type inference instead, like so: 118 | 119 | .. code-block:: python 120 | 121 | tbl = lector.read_csv("example.csv", types=lector.Inference.Native) 122 | 123 | but this would result in less memory-efficient and even erroneous data types (see the 124 | pandas and pure arrow comparisons below). 125 | 126 | Finally, if you need the CSV table in pandas, lector provides a little helper for correct 127 | conversion (again, pure arrow's ``to_pandas(...)`` isn't smart or flexible enough to use pandas 128 | extension dtypes for correct conversion). Use it as an argument to ``read_csv(...)`` or explicitly: 129 | 130 | .. code-block:: python 131 | 132 | from lector.utils import to_pandas 133 | 134 | df = lector.read_csv("example.csv", to_pandas=True) 135 | 136 | # equivalent: 137 | tbl = lector.read_csv("example.csv") 138 | df = to_pandas(tbl) 139 | print(df) 140 | print(df.dtypes) 141 | 142 | Which outputs:: 143 | 144 | id category metric count \ 145 | 0 1234982348728374 a 0.10 1 146 | 1 b 0.12 147 | 2 18446744073709551615 a 3.14 3 148 | 149 | text 150 | 0 151 | 1 Natural language text is different from catego... 152 | 2 The Project · Gutenberg » EBook « of Die Fürstin. 153 | 154 | id UInt64 155 | category category 156 | metric float64 157 | count UInt8 158 | text string 159 | dtype: object 160 | 161 | Note how nullable pandas extension dtypes are used to preserve correct integer values, where pure arrow would have used the unsafe float type instead. 162 | 163 | Compared with pandas 164 | -------------------- 165 | 166 | Trying to read CSV files like the above using ``pandas.read_csv(...)`` and default arguments 167 | only will fail (at least in pandas < 2.0). To find the correct arguments, you'll have to open the CSV in a text editor 168 | and manually identify the separator and the initial lines to skip, and then try different 169 | encodings until you find one that seems to decode all characters correctly. But even if you 170 | then manage to read the CSV, the result may not be what you expected: 171 | 172 | ..
code-block:: python 173 | 174 | csv = """ 175 | Some metadata 176 | Some more metadata 177 | id;category;metric;count;text 178 | 1234982348728374;a;0.1;1;"This is a text." 179 | ;b;0.12;;"Natural language text is different from categorical data." 180 | 9007199254740993;a;3.14;3;"The Project · Gutenberg » EBook « of Die Fürstin." 181 | """.encode("ISO-8859-1") 182 | 183 | df = pd.read_csv( 184 | io.BytesIO(csv), 185 | encoding="ISO-8859-1", 186 | skiprows=3, 187 | sep=";", 188 | index_col=False 189 | ) 190 | print(df) 191 | print(df.dtypes) 192 | 193 | results in:: 194 | 195 | id category metric count \ 196 | 0 1.234982e+15 a 0.10 1.0 197 | 1 NaN b 0.12 NaN 198 | 2 9.007199e+15 a 3.14 3.0 199 | 200 | text 201 | 0 This is a text. 202 | 1 Natural language text is different from catego... 203 | 2 The Project · Gutenberg » EBook « of Die Fürstin. 204 | 205 | 206 | id float64 207 | category object 208 | metric float64 209 | count float64 210 | text object 211 | 212 | 213 | The ``category`` and ``text`` columns have been imported with the ``object`` dtype, 214 | which is not particularly useful, but not necessarily a problem either. 215 | 216 | Note, however, that numeric-like columns with missing data have been cast to the ``float`` 217 | type. This may seem merely a nuisance in the case of the ``count`` column, which could easily 218 | be cast to a (nullable) integer type. It is, however, a big problem for the ``id`` column, 219 | since not all integers can be represented exactly by a 64 bit floating type: 220 | 221 | .. code-block:: 222 | 223 | >>> print(df.id.iloc[2]) 224 | 9007199254740992.0 225 | 226 | which is not the value ``"9007199254740993"`` contained in our CSV file! We cannot cast 227 | this column to the correct type anymore either (e.g. ``int64`` or ``string``), because 228 | the original value is lost. It is also a sneaky problem, because you may not realize 229 | you've got wrong IDs, and may produce totally wrong analyses if you use them down the 230 | line for joins etc. The only way to import CSV files like this correctly is to inspect 231 | essentially all columns and all rows manually in a text editor, choose the best data type 232 | for each, and then provide these types via pandas' ``dtype`` argument. This may be feasible 233 | if you work with CSVs only sporadically, but quickly becomes cumbersome otherwise. 234 | 235 | 236 | Compared with arrow 237 | ------------------- 238 | 239 | The arrow CSV reader unfortunately faces exactly the same limitations as pandas: 240 | 241 | .. code-block:: python 242 | 243 | import pyarrow as pa 244 | import pyarrow.csv 245 | 246 | 247 | csv = """ 248 | Some metadata 249 | Some more metadata 250 | id;category;metric;count;text 251 | 1234982348728374;a;0.1;1; 252 | ;b;0.12;;"Natural language text is different from categorical data." 253 | 18446744073709551615;a;3.14;3;"The Project · Gutenberg » EBook « of Die Fürstin."
254 | """.encode("ISO-8859-1") 255 | 256 | tbl = pa.csv.read_csv( 257 | io.BytesIO(csv), 258 | read_options=pa.csv.ReadOptions(encoding="ISO-8859-1", skip_rows=3), 259 | parse_options=pa.csv.ParseOptions(delimiter=";"), 260 | convert_options=pa.csv.ConvertOptions(strings_can_be_null=True) 261 | ) 262 | 263 | print(tbl) 264 | int(tbl.column("id")[2].as_py()) 265 | 266 | It needs the same level of human inspection to identify the correct arguments to read the CSV, 267 | and destructively casts IDs to floats (but at least uses a more efficient string type where 268 | applicable, in contrast to pandas object dtype):: 269 | 270 | pyarrow.Table 271 | id: double 272 | category: string 273 | metric: double 274 | count: int64 275 | text: string 276 | ---- 277 | id: [[1.234982348728374e+15,null,1.8446744073709552e+19]] 278 | category: [["a","b","a"]] 279 | metric: [[0.1,0.12,3.14]] 280 | count: [[1,null,3]] 281 | text: [[null,"Natural language text is different from categorical data.","The Project · Gutenberg » EBook « of Die Fürstin."]] 282 | 283 | 18446744073709551616 284 | 285 | Again, the only way to ensure correctness of the parsed CSV is to not use arrow's built-in 286 | type inference, but provide the desired type for each column manually. 287 | -------------------------------------------------------------------------------- /docs/source/reader.rst: -------------------------------------------------------------------------------- 1 | CSV Reader 2 | ========== 3 | 4 | The :doc:`CSV Reader ` has the simple task of detecting 3 5 | properties of a CSV file: 6 | 7 | 1. The text encoding (utf-8, latin-1 etc.) 8 | 2. A potential preamble (initial lines to skip) 9 | 3. The CSV dialect (delimiter etc.) 10 | 11 | Lector provides an abstract base class and default implementations for each of 12 | the three detectors (see below). 13 | 14 | A reader itself then simply receives instances of these detectors (or the results 15 | of the detection), and configures the parameters of a CSV parser accordingly. The 16 | main CSV parser in lector is pyarrow's `csv.read_csv() `_, 17 | as used in the :doc:`ArrowReader `. As an example 18 | of using alternative parsers, we also include a :doc:`PandasReader `. 19 | Both implement the abstract :doc:`Reader ` class. 20 | 21 | File encodings 22 | -------------- 23 | 24 | An encoding detector in lector is any class having a ``detect()`` method that 25 | accepts a binary (bytes) buffer, and returns a string indicating the name of 26 | a `Python codec `_, as the 27 | :class:`abstract base class ` requires: 28 | 29 | .. code-block:: python 30 | 31 | @dataclass 32 | class EncodingDetector(ABC): 33 | """Base class specifying interface for all encoding detectors.""" 34 | 35 | @abstractmethod 36 | def detect(self, buffer: BinaryIO) -> str: 37 | """Implement me.""" 38 | 39 | 40 | The :class:`default implementation ` uses the 41 | `cchardet `_ library internally and has the following 42 | interface: 43 | 44 | ..
code-block:: python 45 | 46 | @dataclass 47 | class Chardet(EncodingDetector): 48 | """An encoding detector using cchardet if the default utf-8 generates too many errors.""" 49 | 50 | n_bytes: int = int(1e7) # 10 MB 51 | """Use this many bytes to detect encoding.""" 52 | error_threshold: float = 0.0 53 | """A greater proportion of decoding errors than this will be considered a failed encoding.""" 54 | confidence_threshold: float = 0.6 55 | """Minimum level of confidence to accept an encoding automatically detected by cchardet.""" 56 | 57 | It reads a maximum of ``n_bytes`` bytes from the received buffer, and then in the following 58 | order: 59 | 60 | - Tries to identify an initial byte-order mark (`BOM `_) 61 | indicating the file's codec 62 | - Checks whether assuming ``utf-8`` produces less than ``error_threshold`` decoding errors 63 | (and returns this codec if true) 64 | - Uses ``cchardet`` to detect the encoding. If cchardet's confidence is greater than the 65 | ``confidence_threshold``, returns the detected encoding. Otherwise it falls back on the 66 | ``windows-1250`` codec, as the windows/latin-like codec that comes closest to being a superset of 67 | special characters amongst related codecs. 68 | 69 | 70 | Preambles 71 | --------- 72 | 73 | By "preamble" lector means initial lines in CSV files that should be skipped, e.g. metadata 74 | that should not be interpreted as part of the tabular data itself. 75 | 76 | It is impossible to always detect arbitrary preambles from the CSV data itself. There are, 77 | however, common patterns amongst preambles written to CSV by certain sources. E.g. 78 | some exporters may separate the metadata from actual data by a line of delimiters only. 79 | Others may write metadata only that does not itself contain the delimiter used otherwise 80 | to separate fields in the tabular part. 81 | 82 | Since it is essentially an open-ended exercise to detect arbitrary preambles, lector was 83 | designed to allow easy extension of the patterns to be detected. One simply implements 84 | a new subclass of :class:`PreambleDetector `, and 85 | uses a decorator to register it with the :class:`preamble registry `. 86 | Like so: 87 | 88 | .. code-block:: python 89 | 90 | @Preambles.register 91 | @dataclass 92 | class MyPreamble(PreambleDetector): 93 | 94 | def detect(self, buffer: TextIO) -> int: 95 | ... 96 | 97 | In this case the detector will receive an already decoded *text* buffer, and should 98 | return an integer indicating the number of lines to skip. 99 | 100 | :class:`lector.csv.preambles.Brandwatch` and :class:`lector.csv.preambles.Fieldless` 101 | are two detectors provided out of the box. The former checks for initial lines followed 102 | by a single line of commas only. The second checks for N initial lines containing a single 103 | field only, followed by at least one line containing multiple fields. It then returns N as 104 | the number of rows to skip. 105 | 106 | :meth:`lector.csv.preambles.Preambles.detect` is responsible for trying all 107 | implemented detectors in the order they have been registered and returns the first match 108 | (returning N > 0 lines to skip). This may prove too constraining in the long run and 109 | may change in the future so that the order is more easily configurable. 110 | 111 | Dialects 112 | -------- 113 | 114 | The CSV format is not in fact a strict standard, and there are a number of differences 115 | in how CSV files can be generated. E.g.
while the delimiter is usually the comma, it may
116 | also be a semicolon, a tab or almost any other character. To handle the delimiter
117 | appearing *within* fields, one may choose to quote such fields, or use a special escape
118 | character etc.
119 | 
120 | A `CSV dialect `_
121 | is a set of parameters describing how to parse a CSV file, i.e. identifying the delimiter,
122 | quote character and so on. In Python's `csv` module, unfortunately, such dialects are used
123 | by passing around subclasses rather than instances. Since
124 | this is somewhat awkward, lector implements its own :class:`lector.csv.dialects.Dialect`.
125 | 
126 | Instances of dialects are used as return values by dialect detectors in lector, the abstract
127 | base class of which is simply
128 | 
129 | .. code-block:: python
130 | 
131 |     @dataclass
132 |     class DialectDetector(ABC):
133 |         """Base class for all dialect detectors."""
134 | 
135 |         @abstractmethod
136 |         def detect(self, buffer: TextIO) -> Dialect:
137 |             ...
138 | 
139 | Lector provides two default implementations. :class:`lector.csv.dialects.PySniffer` uses the
140 | Python standard library's `CSV Sniffer `_
141 | internally and fixes up the result specifically for more robust *parsing* of CSVs.
142 | 
143 | Alternatively, if `clevercsv `_ has
144 | been installed as an optional dependency, lector wraps it inside the
145 | :class:`lector.csv.dialects.CleverCSV` detector class. It can be used to trade off speed
146 | against more robust dialect inference.
147 | 
148 | Readers
149 | -------
150 | 
151 | Finally, a CSV Reader in lector simply receives an encoding (or encoding detector),
152 | a preamble (or preamble detector) and a dialect (or, wait, a dialect detector). The
153 | abstract base class for readers, :class:`lector.csv.abc.Reader`, is essentially
154 | 
155 | .. code-block:: python
156 | 
157 |     class Reader(ABC):
158 |         """Base class for CSV readers."""
159 | 
160 |         def __init__(
161 |             self,
162 |             fp: FileLike,
163 |             encoding: str | EncodingDetector | None = None,
164 |             dialect: dict | DialectDetector | None = None,
165 |             preamble: int | PreambleRegistry | None = None,
166 |             log: bool = True,
167 |         ) -> None:
168 |             self.fp = fp
169 |             self.encoding = encoding or encodings.Chardet()
170 |             self.dialect = dialect or dialects.PySniffer()
171 |             self.preamble = preamble or Preambles
172 |             self.log = log
173 | 
174 |         def read(self, *args, **kwds) -> Any:
175 |             try:
176 |                 self.analyze()
177 |                 result = self.parse(*args, **kwds)
178 |                 self.buffer.close()
179 |                 return result
180 |             except Exception:
181 |                 raise
182 | 
183 |         @abstractmethod
184 |         def parse(self, *args, **kwds) -> Any:
185 |             """Parse the file pointer or text buffer. Args are forwarded to read()."""
186 |             ...
187 | 
188 | The base class uses the provided detectors to infer (if necessary) all the information
189 | required to call a CSV parser. It wraps all inferred information in a
190 | :class:`lector.csv.abc.Format` object, which Reader subclasses can then translate
191 | into a specific parser's own parameters. E.g., the only thing the :class:`lector.csv.arrow.ArrowReader`
192 | does is translate a CSV Format to arrow's own ``csv.ReadOptions``, ``csv.ParseOptions``
193 | and ``csv.ConvertOptions`` objects.
194 | 
195 | If no parameters (other than a file pointer) are passed, a reader uses the default
196 | implementations of all detectors, which means that if no customization is needed,
197 | reading almost any CSV becomes simply:
198 | 
199 | .. 
code-block:: python 200 | 201 | from lector import ArrowReader 202 | 203 | tbl = ArrowReader("/path/to/file.csv").read() 204 | -------------------------------------------------------------------------------- /docs/source/types.rst: -------------------------------------------------------------------------------- 1 | Types 2 | ===== 3 | 4 | Introduction 5 | ------------ 6 | 7 | Lector implements its own column type inference. It can be used by parsing a CSV file 8 | with ``string`` types only (preserving the original fields without modification), 9 | and then auto-casting all columns to the most appropriate and efficient data type: 10 | 11 | .. code-block:: python 12 | 13 | import io 14 | import lector 15 | from lector import ArrowReader, Autocast 16 | from lector.log import schema_view 17 | 18 | csv = """id,genre,metric,count,content,website,tags 19 | 1234982348728374,a,0.1,1,, http://www.graphext.com,"[a,b,c]" 20 | ,b,0.12,,"Natural language text is different from categorical data.", https://www.twitter.com,[d] 21 | 18446744073709551615,a,3.14,3,"The Project · Gutenberg » EBook « of Die Fürstin.",http://www.google.com,"['e', 'f']" 22 | """.encode() 23 | 24 | # High-level API uses automatic type-inference and casting by default 25 | tbl = lector.read_csv(io.BytesIO(csv)) 26 | 27 | # Equivalent low-level API 28 | tbl = ArrowReader(io.BytesIO(csv)).read(types="string") 29 | tbl = Autocast().cast(tbl) 30 | 31 | schema_view(tbl.schema) 32 | 33 | Printing the table schema this way will produce the following output: 34 | 35 | .. code-block:: 36 | 37 | Schema 38 | ───────────────────────────────────────────────────────── 39 | Column Type Meta 40 | ───────────────────────────────────────────────────────── 41 | id uint64 {'semantic': 'number[UInt64]'} 42 | genre dict {'semantic': 'category'} 43 | metric double {'semantic': 'number[double]'} 44 | count uint8 {'semantic': 'number[UInt8]'} 45 | content string {'semantic': 'text'} 46 | website dict {'semantic': 'url'} 47 | tags list {'semantic': 'list[category]'} 48 | ───────────────────────────────────────────────────────── 49 | 50 | The schema view contains for each column in the table the arrow type it has been 51 | cast to, as well as some metadata about the *semantic* type lector has inferred. 52 | By semantic type we mean the kind of content the column contains, which may be 53 | different from (more specific than) the "physical" (arrow) type used to store it. 54 | 55 | For example, the "website" column has been cast to arrow using a dictionary type with 56 | string values (``dict``). However, lector has in fact inferred that the column 57 | contains URLs, and simply selected the dictionary type as the best storage type for URLs. 58 | Equally, the "content" column has been inferred to contain natural language text, and in 59 | this case arrow's ``string`` type is used for storage. Also note that lector handles 60 | types that have no equivalent in `pandas`. The "tags" column contains lists of strings, for 61 | example, which lector has automatically parsed and cast to arrow's ``list`` 62 | type. 63 | 64 | For numeric columns lector has automatically identified the most efficient (least 65 | memory-hungry) data types. The semantic metadata here is used to indicate pandas' 66 | corresponding (potentially nullable extension) ``dtype``. 
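The semantic annotation itself lives in each Arrow field's metadata. As a minimal sketch
(assuming the annotation is stored under a ``semantic`` key, as the schema view above suggests),
it can be inspected with the ``decode_metadata`` helper from ``lector.utils``, the same helper
``schema_view`` uses internally:

.. code-block:: python

    from lector.utils import decode_metadata

    for field in tbl.schema:
        # Arrow stores field metadata as raw bytes; decode_metadata returns a plain dict
        meta = decode_metadata(field.metadata or {})
        print(field.name, field.type, meta.get("semantic"))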
67 | 
68 | Using lector's ``to_pandas()`` function we can convert the arrow table to a pandas DataFrame
69 | ensuring that all data is converted correctly, even when there is no corresponding
70 | pandas type:
71 | 
72 | .. code-block:: python
73 | 
74 |     from lector.utils import to_pandas
75 | 
76 |     df = to_pandas(tbl)
77 |     print(df)
78 |     print(df.dtypes)
79 | 
80 | produces
81 | 
82 | .. code-block::
83 | 
84 |                           id genre  metric  count  \
85 |     0       1234982348728374     a    0.10      1
86 |     1                              b    0.12
87 |     2   18446744073709551615     a    3.14      3
88 | 
89 |                                                   content                  website  \
90 |     0                                                       http://www.graphext.com
91 |     1  Natural language text is different from catego...    https://www.twitter.com
92 |     2  The Project · Gutenberg » EBook « of Die Fürstin.     http://www.google.com
93 | 
94 |             tags
95 |     0  [a, b, c]
96 |     1        [d]
97 |     2     [e, f]
98 | 
99 | 
100 |     id           UInt64
101 |     genre      category
102 |     metric      float64
103 |     count         UInt8
104 |     content      string
105 |     website    category
106 |     tags         object
107 |     dtype: object
108 | 
109 | Note that arrow's ``tbl.to_pandas()`` would have converted integer columns with
110 | missing data to the float type, which is not safe and may introduce erroneous data
111 | (because of insufficient float precision when representing large integers). Lector
112 | uses extension dtypes where necessary. Also note how all tag lists have been parsed
113 | correctly, despite having various different representations in the CSV data (use of
114 | quotes etc.). In pandas, the lists are represented by a column of numpy arrays.
115 | 
116 | Array Converters
117 | ----------------
118 | 
119 | ``Converters`` in Lector are responsible for inferring the semantic type of a column
120 | (i.e. an arrow Array or ChunkedArray), identifying the corresponding storage type,
121 | and potentially generating some useful metadata.
122 | 
123 | Lector implements one subclass of :class:`lector.types.abc.Converter` for each semantic
124 | type. At the moment there are:
125 | 
126 | - :class:`lector.types.numbers.Number` (``number``)
127 | - :class:`lector.types.lists.List` (``list[number]`` or ``list[category]``)
128 | - :class:`lector.types.strings.Text` (``text``)
129 | - :class:`lector.types.strings.Category` (``category``)
130 | - :class:`lector.types.strings.Url` (``url``)
131 | - :class:`lector.types.timestamps.Timestamp` (``date``)
132 | 
133 | Their interface is simple. Minimally they have to accept a ``threshold`` as a
134 | parameter, and must implement a ``convert()`` method:
135 | 
136 | .. code-block:: python
137 | 
138 |     @dataclass
139 |     class Converter(ABC):
140 | 
141 |         threshold: float = 1.0
142 | 
143 |         @abstractmethod
144 |         def convert(self, arr: Array) -> Conversion | None:
145 |             ...
146 | 
147 |     @dataclass
148 |     class Conversion:
149 | 
150 |         result: Array
151 |         meta: dict = field(default_factory=dict)
152 | 
153 | A specific converter (e.g. ``Number``) returns ``None`` if the passed
154 | array data is not compatible with the type (e.g. the values are not numeric).
155 | Otherwise it will return a ``Conversion`` object containing the correctly
156 | cast array and potential metadata.
157 | 
158 | The converters can be used in two ways for casting a table of raw data: using
159 | :class:`lector.types.cast.Autocast` to infer the best type automatically, or
160 | using :class:`lector.types.cast.Cast` to specify the (semantic) type for each
161 | column explicitly (see below for more information).
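Because converters are registered in lector's converter registry (see ``lector/types/abc.py``),
the set of semantic types can also be extended from user code. The sketch below is purely
illustrative: the ``CountryCode`` class, its ``max_length`` parameter and the
``category[country]`` label are hypothetical and not part of lector, but the
``Converter``/``Conversion`` interface and the ``Registry.register`` decorator are the ones
shown above and used by lector's built-in converters:

.. code-block:: python

    from __future__ import annotations

    from dataclasses import dataclass

    import pyarrow as pa
    import pyarrow.compute as pc

    from lector.types.abc import Conversion, Converter, Registry


    @dataclass
    @Registry.register
    class CountryCode(Converter):
        """Hypothetical converter treating short, all-uppercase strings as categorical codes."""

        max_length: int = 3  # illustrative parameter, not part of lector

        def convert(self, arr: pa.Array) -> Conversion | None:
            if not pa.types.is_string(arr.type):
                return None

            # Which non-null values look like short, all-uppercase codes?
            looks_like_code = pc.and_(
                pc.utf8_is_upper(arr),
                pc.less_equal(pc.utf8_length(arr), self.max_length),
            )
            n_valid = pc.sum(pc.cast(looks_like_code, pa.int32())).as_py() or 0
            n_total = len(arr) - arr.null_count

            if n_total == 0 or n_valid / n_total < self.threshold:
                return None

            # Dictionary-encode as the storage type and attach an illustrative semantic label
            return Conversion(arr.dictionary_encode(), meta={"semantic": "category[country]"})

Registering by (lowercased) class name is what allows configuration dicts such as
``DEFAULT_CONVERTERS`` to refer to converters by name. The two built-in cast strategies that
consume such converters are described next.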
162 | 
163 | In both cases, exactly *how* types are used for inference and casting can be
164 | configured by
165 | 
166 | - limiting or extending the list of allowed semantic types/converters
167 | - configuring each converter via parameters
168 | 
169 | The single common parameter for all converters is the ``threshold``. This
170 | is used to identify the proportion of values in an array that have to be
171 | valid according to the given type for it to return a cast result. I.e.
172 | a converter (e.g. ``Number``) should return ``None`` if the proportion
173 | of valid (e.g. numeric-like) values is less than ``threshold``.
174 | 
175 | Automatic Table Cast
176 | --------------------
177 | 
178 | For each column, the :class:`lector.types.cast.Autocast` simply tries each semantic
179 | type (i.e. ``Converter``) in an ordered list. The first ``Converter`` returning a
180 | ``Conversion`` (rather than ``None``) is used to cast the column. If no list of
181 | converters is specified explicitly (as in the first example above), a default
182 | order of all implemented converters with default parameters is used
183 | (:data:`lector.types.cast.DEFAULT_CONVERTERS`).
184 | 
185 | As mentioned above, the ``Autocast`` can be configured by passing an ordered list
186 | of allowed converters, and by configuring the parameters of individual converters.
187 | If that is not enough, and a more complicated cast strategy is required, one can
188 | also implement a new subclass of :class:`lector.types.cast.CastStrategy`. The base
189 | class takes care of iteration over columns and some other minor boilerplate so
190 | that subclasses only have to implement the conversion of a single Array.
191 | 
192 | Explicit Table Cast
193 | -------------------
194 | 
195 | :class:`lector.types.cast.Cast` is even simpler. Given a mapping of column names to
196 | converters, it applies a specific converter to a specific column. If the conversion
197 | is successful, that column is cast; otherwise it is left as is. For example, given the CSV
198 | data above:
199 | 
200 | .. code-block:: python
201 | 
202 |     from lector import Cast
203 |     from lector.types import Category, Number
204 | 
205 |     strategy = Cast({
206 |         "id": Number(),
207 |         "genre": Category(max_cardinality=None),
208 |         # ...
209 |     })
210 | 
211 |     # High-level API
212 |     tbl = lector.read_csv(io.BytesIO(csv), strategy=strategy)
213 | 
214 |     # Low-level API
215 |     tbl = ArrowReader(io.BytesIO(csv)).read(types="string")
216 |     tbl = strategy.cast(tbl)
217 |     schema_view(tbl.schema)
218 | 
219 | produces
220 | 
221 | .. code-block::
222 | 
223 |     Schema
224 |     ─────────────────────────────────────────────────────────
225 |     Column    Type      Meta
226 |     ─────────────────────────────────────────────────────────
227 |     id        uint64    {'semantic': 'number[UInt64]'}
228 |     genre     dict      {'semantic': 'category'}
229 |     metric    string
230 |     count     string
231 |     content   string
232 |     website   string
233 |     tags      string
234 |     ─────────────────────────────────────────────────────────
235 | 
236 | I.e., only the two specified columns have been converted using the configured
237 | types.
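Finally, to make the configuration options of the automatic cast concrete, the sketch below
(continuing the example above) restricts the ``Autocast`` to a short, ordered list of converters
with custom parameters. It assumes that ``Autocast`` accepts such a list via a ``converters``
argument, as suggested by the ``Converters`` type alias and ``ensure_converters`` helper in
``lector/types/cast.py``; treat the argument name as an assumption rather than documented API:

.. code-block:: python

    from lector import ArrowReader, Autocast
    from lector.types import Category, Number, Timestamp

    # Assumed keyword `converters`: an ordered list where the first successful converter wins.
    strategy = Autocast(
        converters=[
            Number(threshold=0.99),          # stricter numeric parsing
            Timestamp(),                     # default settings
            Category(max_cardinality=None),  # unlimited cardinality, as in the Cast example above
        ],
    )

    tbl = ArrowReader(io.BytesIO(csv)).read(types="string")
    tbl = strategy.cast(tbl)

Columns that none of the allowed converters accept are simply left as strings, just like the
unspecified columns in the explicit cast above.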
238 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: lectorpy310 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - cchardet 6 | - gdown 7 | - humanize 8 | - rich 9 | - typer 10 | - pandas 11 | - pyarrow>=8.0.0 12 | - python=3.10 13 | - pip 14 | - pip: 15 | - clevercsv<0.8.1 16 | - lector @ git+https://github.com/graphext/lector@main 17 | -------------------------------------------------------------------------------- /lector/__init__.py: -------------------------------------------------------------------------------- 1 | """A package for fast parsing of messy CSV files and smart-ish type inference.""" 2 | 3 | from __future__ import annotations 4 | 5 | from enum import Enum 6 | 7 | from . import utils 8 | from .csv import ArrowReader, Dialect, EmptyFileError, Format, Preambles 9 | from .csv.abc import FileLike, PreambleRegistry 10 | from .csv.dialects import DialectDetector 11 | from .csv.encodings import EncodingDetector 12 | from .log import CONSOLE, LOG, schema_view, table_view 13 | from .types import Autocast, Cast, Converter, Registry 14 | from .types.cast import CastStrategy 15 | 16 | 17 | class Inference(str, Enum): 18 | Native = "Native" 19 | Auto = "Auto" 20 | Disable = "Disable" 21 | 22 | 23 | def read_csv( 24 | fp: FileLike, 25 | encoding: str | EncodingDetector | None = None, 26 | dialect: dict | DialectDetector | None = None, 27 | preamble: int | PreambleRegistry | None = None, 28 | types: str | dict | Inference = Inference.Auto, 29 | strategy: CastStrategy | None = None, 30 | to_pandas: bool = False, 31 | log: bool = False, 32 | ): 33 | """Thin wrapper around class-based reader interface.""" 34 | 35 | reader = ArrowReader(fp, encoding=encoding, dialect=dialect, preamble=preamble, log=log) 36 | 37 | dtypes = types 38 | if isinstance(types, Inference): 39 | dtypes = None if types == Inference.Native else "string" 40 | 41 | tbl = reader.read(types=dtypes) 42 | 43 | if types == Inference.Auto: 44 | strategy = strategy or Autocast(log=log) 45 | tbl = strategy.cast(tbl) 46 | 47 | if to_pandas: 48 | if utils.PANDAS_INSTALLED: 49 | return utils.to_pandas(tbl) 50 | 51 | raise Exception("It seems pandas isn't installed in this environment!") 52 | 53 | return tbl 54 | 55 | 56 | __all__ = [ 57 | "Autocast", 58 | "ArrowReader", 59 | "Cast", 60 | "CONSOLE", 61 | "Converter", 62 | "EmptyFileError", 63 | "Dialect", 64 | "Format", 65 | "LOG", 66 | "Preambles", 67 | "Registry", 68 | "schema_view", 69 | "table_view", 70 | ] 71 | 72 | __version__ = "0.3.5" 73 | -------------------------------------------------------------------------------- /lector/cli.py: -------------------------------------------------------------------------------- 1 | """Command-line interface.""" 2 | from pathlib import Path 3 | from typing import Optional 4 | 5 | import typer 6 | 7 | from . 
import Inference, read_csv 8 | from .log import LOG, pformat, schema_view, table_view 9 | from .utils import Timer 10 | 11 | CLI = typer.Typer() 12 | 13 | 14 | @CLI.command() 15 | def read( 16 | fp: Path = typer.Argument( # noqa: B008 17 | ..., exists=True, file_okay=True, dir_okay=False, resolve_path=True 18 | ), 19 | types: Optional[Inference] = typer.Option(Inference.Auto), 20 | log: Optional[bool] = typer.Option(False), 21 | ): 22 | """Read a CSV file into an Arrow table.""" 23 | with Timer() as t: 24 | tbl = read_csv(fp, types=types, log=log) 25 | 26 | LOG.info(pformat(table_view(tbl, title="Final table"))) 27 | LOG.info(pformat(schema_view(tbl.schema, title="Schema"))) 28 | LOG.info(f"Import took {t.elapsed:.2f} seconds.") 29 | -------------------------------------------------------------------------------- /lector/csv/__init__.py: -------------------------------------------------------------------------------- 1 | """Subpackage for smart parsing of CSV files. 2 | 3 | Helps deteting encoding, preambles (initial junk to skip), CSV dialects etc. 4 | """ 5 | from .abc import EmptyFileError, Format, Reader 6 | from .arrow import ArrowReader 7 | from .dialects import Dialect, PySniffer 8 | from .encodings import Chardet 9 | from .preambles import Preambles 10 | 11 | __all__ = [ 12 | "ArrowReader", 13 | "Chardet", 14 | "Dialect", 15 | "EmptyFileError", 16 | "Format", 17 | "Preambles", 18 | "PySniffer", 19 | "Reader", 20 | ] 21 | -------------------------------------------------------------------------------- /lector/csv/abc.py: -------------------------------------------------------------------------------- 1 | """Fast and robust parser for potentially messy CSV data.""" 2 | from __future__ import annotations 3 | 4 | import io 5 | from abc import ABC, abstractmethod 6 | from contextlib import suppress 7 | from csv import DictReader 8 | from dataclasses import dataclass, field 9 | from pathlib import Path 10 | from typing import IO, Any, TextIO, Union 11 | 12 | from rich.table import Table as RichTable 13 | 14 | from ..log import LOG, dict_view, pformat 15 | from ..utils import reset_buffer 16 | from . 
import dialects, encodings 17 | from .dialects import Dialect, DialectDetector 18 | from .encodings import EncodingDetector 19 | from .preambles import Preambles 20 | 21 | FileLike = Union[str, Path, IO] 22 | 23 | PreambleRegistry = type(Preambles) 24 | 25 | 26 | class EmptyFileError(Exception): 27 | """Raised when a binary file read() returns 0 bytes.""" 28 | 29 | 30 | def is_empty(buffer: IO) -> bool: 31 | """Check if a binary or text buffer is empty (from current position onwards).""" 32 | pos = buffer.tell() 33 | empty = len(buffer.read(1)) == 0 34 | buffer.seek(pos) 35 | return empty 36 | 37 | 38 | class CleanTextBuffer(io.TextIOWrapper): 39 | """Remove null bytes on the fly.""" 40 | 41 | def read(self, *args): 42 | return super().read(*args).replace("\x00", "") 43 | 44 | def readline(self, *args): 45 | return super().readline(*args).replace("\x00", "") 46 | 47 | 48 | @dataclass 49 | class Format: 50 | """Holds all parameters needed to successfully read a CSV file.""" 51 | 52 | encoding: str | None = "utf-8" 53 | preamble: int | None = 0 54 | dialect: Dialect | None = field(default_factory=lambda: Dialect()) 55 | columns: list[str] | None = None 56 | 57 | def __rich__(self) -> RichTable: 58 | return dict_view( 59 | {k: v for k, v in self.__dict__.items() if k != "columns"}, 60 | title="CSV Format", 61 | width=120, 62 | ) 63 | 64 | 65 | class Reader(ABC): 66 | """Base class for CSV readers.""" 67 | 68 | def __init__( 69 | self, 70 | fp: FileLike, 71 | encoding: str | EncodingDetector | None = None, 72 | dialect: dict | Dialect | DialectDetector | None = None, 73 | preamble: int | PreambleRegistry | None = None, 74 | log: bool = True, 75 | ) -> None: 76 | self.fp = fp 77 | self.encoding = encoding or encodings.Chardet() 78 | self.dialect = dialect or dialects.CleverCSV() 79 | self.preamble = preamble if preamble is not None else Preambles 80 | self.log = log 81 | 82 | def decode(self, fp: FileLike) -> TextIO: 83 | """Make sure we have a text buffer.""" 84 | buffer = fp 85 | 86 | if isinstance(buffer, (str, Path)): 87 | if isinstance(self.encoding, str): 88 | buffer = open(buffer, encoding=self.encoding, errors="replace") # noqa: SIM115 89 | else: 90 | buffer = open(buffer, "rb") # noqa: SIM115 91 | 92 | if is_empty(buffer): 93 | raise EmptyFileError(f"The passed object ({buffer}) contained 0 bytes of data.") 94 | 95 | if isinstance(buffer, io.BufferedIOBase): 96 | if isinstance(self.encoding, EncodingDetector): 97 | with reset_buffer(buffer): 98 | self.encoding = self.encoding.detect(buffer) 99 | 100 | buffer = CleanTextBuffer(buffer, encoding=self.encoding, errors="replace") 101 | else: 102 | self.encoding = buffer.encoding or "UTF-8" 103 | 104 | return buffer 105 | 106 | def detect_preamble(self, buffer: TextIO) -> int: 107 | """Detect the number of junk lines at the start of the file.""" 108 | if self.preamble is None: 109 | return 0 110 | if isinstance(self.preamble, (int, float)): 111 | return self.preamble 112 | if issubclass(self.preamble, Preambles): 113 | return Preambles.detect(buffer, log=self.log) or 0 114 | 115 | return 0 116 | 117 | def detect_dialect(self, buffer: TextIO) -> dict: 118 | """Detect separator, quote character etc.""" 119 | if isinstance(self.dialect, DialectDetector): 120 | return self.dialect.detect(buffer) 121 | 122 | if isinstance(self.dialect, dict): 123 | return Dialect(**self.dialect) 124 | 125 | return self.dialect 126 | 127 | @classmethod 128 | def detect_columns(cls, buffer: TextIO, dialect: Dialect) -> list[str]: 129 | """Extract column names from 
buffer pointing at header row.""" 130 | reader = DictReader(buffer, dialect=dialect.to_builtin()) 131 | with suppress(StopIteration): 132 | _ = next(reader) 133 | 134 | return reader.fieldnames 135 | 136 | def analyze(self): 137 | """Infer all parameters required for reading a csv file.""" 138 | self.buffer = self.decode(self.fp) 139 | cursor = self.buffer.tell() 140 | 141 | with reset_buffer(self.buffer): 142 | self.preamble = self.detect_preamble(self.buffer) 143 | 144 | for _ in range(self.preamble): 145 | self.buffer.readline() 146 | 147 | with reset_buffer(self.buffer): 148 | self.dialect = self.detect_dialect(self.buffer) 149 | 150 | with reset_buffer(self.buffer): 151 | self.columns = self.detect_columns(self.buffer, self.dialect) 152 | 153 | self.format = Format( 154 | encoding=self.encoding, 155 | preamble=self.preamble, 156 | dialect=self.dialect, 157 | columns=self.columns, 158 | ) 159 | 160 | if self.log: 161 | LOG.info(pformat(self.format)) 162 | 163 | self.buffer.seek(cursor) 164 | 165 | @abstractmethod 166 | def parse(self, *args, **kwds) -> Any: 167 | """Parse the file pointer or text buffer. Args are forwarded to read().""" 168 | 169 | def read(self, *args, **kwds) -> Any: 170 | try: 171 | self.analyze() 172 | result = self.parse(*args, **kwds) 173 | self.buffer.close() 174 | return result 175 | except Exception: 176 | raise 177 | 178 | __call__ = read 179 | -------------------------------------------------------------------------------- /lector/csv/arrow.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import codecs 4 | from codecs import StreamRecoder 5 | from collections.abc import Iterable 6 | from io import SEEK_CUR, BufferedIOBase, TextIOBase 7 | from pathlib import Path 8 | from typing import Union 9 | 10 | import pyarrow as pa 11 | import pyarrow.csv as pacsv 12 | from pyarrow import DataType 13 | from pyarrow.csv import InvalidRow 14 | 15 | from ..log import LOG 16 | from ..utils import MISSING_STRINGS, ensure_type, uniquify 17 | from .abc import EmptyFileError, FileLike, Format, Reader 18 | 19 | TypeDict = dict[str, Union[str, DataType]] 20 | 21 | MAX_MSG_LEN = 200 # characters 22 | SKIPPED_MSG_N_MAX = 20 23 | 24 | 25 | def clean_column_names(names: list[str]) -> list[str]: 26 | """Handle empty and duplicate column names.""" 27 | 28 | # Arrow doesn't (yet?) have support for CSV dialect "skipinitialspace" option 29 | names = [name.strip() for name in names] 30 | unnamed = [i for i, x in enumerate(names) if not x] 31 | for i, col_idx in enumerate(unnamed): 32 | names[col_idx] = f"Unnamed_{i}" 33 | 34 | return uniquify(names) 35 | 36 | 37 | def transcode( 38 | fp: FileLike, 39 | codec_in: str = "utf-8", 40 | codec_out: str = "utf-8", 41 | errors="replace", 42 | ) -> StreamRecoder: 43 | """Safely transcode any readable byte stream from decoder to encoder codecs. 44 | 45 | Arrow only accepts byte streams and optional encoding, but has no option to 46 | automatically handle codec errors. It also doesn't seem to like the interface 47 | of a Python recoder when the encoding is "utf-16" (rather than more specific 48 | "utf-16-le" or "utf-16-be"). 49 | """ 50 | if isinstance(fp, (str, Path)): 51 | fp = open(fp, "rb") # noqa: SIM115 52 | elif isinstance(fp, TextIOBase): 53 | # Not a no-operation! If we read 3 characteres from a text buffer, the underlying binary 54 | # buffer might actually read more, since it reads in batches. 
Which means its internal 55 | # cursor might be in advance of the current position in the text buffer read so far. 56 | fp.seek(0, SEEK_CUR) 57 | fp = fp.buffer 58 | 59 | if not isinstance(fp, BufferedIOBase): 60 | raise ValueError(f"Have unsupported input: {type(fp)}") 61 | 62 | return codecs.EncodedFile(fp, data_encoding=codec_out, file_encoding=codec_in, errors=errors) 63 | 64 | 65 | class ArrowReader(Reader): 66 | """Use base class detection methods to configure a pyarrow.csv.read_csv() call.""" 67 | 68 | def skip_invalid_row(self, row: InvalidRow) -> str: 69 | self.n_skipped += 1 70 | 71 | if self.n_skipped < SKIPPED_MSG_N_MAX: 72 | if row.text and len(row.text) > MAX_MSG_LEN: 73 | row = row._replace(text=row.text[:MAX_MSG_LEN]) 74 | LOG.warning(f"Skipping row:\n{row}") 75 | 76 | elif self.n_skipped == SKIPPED_MSG_N_MAX: 77 | LOG.warning("Won't show more skipped row messages.") 78 | 79 | return "skip" 80 | 81 | def configure(self, format: Format) -> dict: 82 | return { 83 | "read_options": { 84 | "encoding": format.encoding, 85 | "skip_rows": format.preamble, 86 | "block_size": 2 << 20, # 2 MiB, twice arrow's default of 1 MiB (1 << 20) 87 | }, 88 | "parse_options": { 89 | "delimiter": format.dialect.delimiter, 90 | "quote_char": format.dialect.quote_char, 91 | "double_quote": format.dialect.double_quote, 92 | "escape_char": format.dialect.escape_char, 93 | "newlines_in_values": True, 94 | "invalid_row_handler": self.skip_invalid_row, 95 | }, 96 | "convert_options": { 97 | "check_utf8": False, 98 | "strings_can_be_null": True, 99 | "quoted_strings_can_be_null": True, 100 | }, 101 | } 102 | 103 | def parse( # noqa: PLR0912 104 | self, 105 | types: str | TypeDict | None = None, 106 | timestamp_formats: str | list[str] | None = None, 107 | null_values: str | Iterable[str] | None = None, 108 | ) -> pa.Table: 109 | """Invoke Arrow's parser with inferred CSV format.""" 110 | self.n_skipped = 0 111 | 112 | config = self.configure(self.format) 113 | 114 | ro = config["read_options"] 115 | po = config["parse_options"] 116 | co = config["convert_options"] 117 | 118 | if types is not None: 119 | if isinstance(types, (str, DataType)): 120 | types = {col: ensure_type(types) for col in self.columns} 121 | elif isinstance(types, dict): 122 | types = {col: ensure_type(type) for col, type in types.items()} 123 | 124 | co["column_types"] = types 125 | 126 | if timestamp_formats is not None: 127 | if not isinstance(timestamp_formats, list): 128 | timestamp_formats = [timestamp_formats] 129 | 130 | co["timestamp_parsers"] = timestamp_formats 131 | 132 | if null_values is not None: 133 | if isinstance(null_values, str): 134 | null_values = [null_values] 135 | else: 136 | null_values = list(null_values) 137 | 138 | co["null_values"] = null_values 139 | else: 140 | co["null_values"] = MISSING_STRINGS 141 | 142 | try: 143 | fp = transcode(self.fp, codec_in=self.encoding, codec_out="utf-8") 144 | ro["encoding"] = "utf-8" 145 | 146 | tbl = pacsv.read_csv( 147 | fp, 148 | read_options=pa.csv.ReadOptions(**ro), 149 | parse_options=pa.csv.ParseOptions(**po), 150 | convert_options=pa.csv.ConvertOptions(**co), 151 | ) 152 | 153 | column_names = list(clean_column_names(tbl.column_names)) 154 | tbl = tbl.rename_columns(column_names) 155 | return tbl 156 | except pa.ArrowInvalid as exc: 157 | if "Empty CSV file or block" in (msg := str(exc)): 158 | raise EmptyFileError(msg) from None 159 | 160 | raise 161 | -------------------------------------------------------------------------------- /lector/csv/dialects.py: 
-------------------------------------------------------------------------------- 1 | """Detectors of CSV dialects (separator, quoting etc.). 2 | 3 | Note that python.csv is not even internally consistent. E.g. although the dialect used to produce a 4 | CSV may specify ``\\n`` as the line terminator, the python sniffer is hard-coded to return 5 | ``\\r\\n`` (it doesn't actually support detecting it). It's own reader (and others hopefully) deal 6 | internally with different line breaks, but it means one cannot compare a dialect used to generate a 7 | CSV and a dialect created by sniffing the same (quoting is equally hard-coded to ``QUOTE_MINIMAL``). 8 | 9 | Python quoting levels: 10 | 11 | - ``QUOTE_ALL``: 1 12 | - ``QUOTE_MINIMAL``: 0 13 | - ``QUOTE_NONE``: 3 14 | - ``QUOTE_NONNUMERIC``: 2 15 | 16 | """ 17 | from __future__ import annotations 18 | 19 | from abc import ABC, abstractmethod 20 | from collections.abc import Iterable 21 | from contextlib import suppress 22 | from csv import QUOTE_MINIMAL, QUOTE_NONE, Sniffer, get_dialect 23 | from csv import Dialect as PyDialect 24 | from dataclasses import dataclass 25 | from itertools import islice 26 | from typing import TextIO 27 | 28 | from ..log import LOG 29 | 30 | try: 31 | import clevercsv as ccsv 32 | 33 | CLEVER_CSV = True 34 | except Exception: 35 | CLEVER_CSV = False 36 | 37 | PyDialectT = type(PyDialect) 38 | 39 | N_ROWS_DFAULT: int = 100 40 | """How many rows to use for dialect detection.""" 41 | 42 | DELIMITER_OPTIONS: tuple[str] = (",", ";", "\t", "|") 43 | """Allowed delimiters for dialect detection.""" 44 | 45 | 46 | @dataclass 47 | class Dialect: 48 | """A more convenient class for dialects than Python's built-in. 49 | 50 | The built-in Dialect is a class with class attributes only, and so instead of instances 51 | of that class, Python wants you to send references to subclasses around, which is, uhm, 52 | awkward to say the least (see below _to_builtin() for an example). 53 | """ 54 | 55 | delimiter: str = "," 56 | quote_char: str = '"' 57 | escape_char: str | None = None 58 | double_quote: bool = True 59 | skip_initial_space: bool = False 60 | line_terminator: str = "\r\n" # Default in Python and correct according to official spec 61 | quoting: int = QUOTE_MINIMAL 62 | 63 | @classmethod 64 | def from_builtin(cls, dialect: str | PyDialectT) -> Dialect: 65 | """Make instance from built-in dialect class configured for reliable reading(!).""" 66 | if isinstance(dialect, str): 67 | dialect = get_dialect(dialect) 68 | 69 | # A dialect without delimiter doesn't make sense, though CleverCSV may return one, 70 | # e.g. 
when a CSV file contains a single column only 71 | delimiter = dialect.delimiter or "," 72 | 73 | # To read reliably we need one of escape_char or double quote defined 74 | double_quote = dialect.doublequote or (dialect.escapechar is None) 75 | 76 | # Although most parsers ignore this, Python's csv module complains when its missing 77 | line_terminator = dialect.lineterminator or "\r\n" 78 | 79 | # Minimal quoting won't hurt and is sensible if we already know how quoting is used 80 | quoting = dialect.quoting 81 | if quoting == QUOTE_NONE and (dialect.quotechar is not None or dialect.doublequote): 82 | quoting = QUOTE_MINIMAL 83 | 84 | return Dialect( 85 | delimiter=delimiter, 86 | quote_char=dialect.quotechar, 87 | escape_char=dialect.escapechar, 88 | double_quote=double_quote, 89 | skip_initial_space=dialect.skipinitialspace, 90 | line_terminator=line_terminator, 91 | quoting=quoting, 92 | ) 93 | 94 | def to_builtin(self) -> PyDialectT: 95 | """Make a subclass of built-in Dialect from this instance.""" 96 | 97 | class _Dialect(PyDialect): 98 | _name = "generated" 99 | lineterminator = self.line_terminator 100 | quoting = self.quoting 101 | escapechar = self.escape_char or None 102 | doublequote = self.double_quote 103 | delimiter = self.delimiter 104 | quotechar = self.quote_char 105 | skipinitialspace = self.skip_initial_space 106 | strict = False 107 | 108 | return _Dialect 109 | 110 | 111 | @dataclass 112 | class DialectDetector(ABC): 113 | """Base class for all dialect detectors.""" 114 | 115 | @abstractmethod 116 | def detect(self, buffer: TextIO) -> Dialect: 117 | ... 118 | 119 | 120 | @dataclass 121 | class PySniffer(DialectDetector): 122 | """Use Python's built-in csv sniffer.""" 123 | 124 | delimiters: Iterable[str] = DELIMITER_OPTIONS 125 | n_rows: int = N_ROWS_DFAULT 126 | log: bool = False 127 | 128 | def detect(self, buffer: TextIO) -> Dialect: 129 | """Detect a dialect we can read(!) a CSV with using the python sniffer. 130 | 131 | Note that the sniffer is not reliable for detecting quoting, quotechar etc., but reasonable 132 | defaults are almost guaranteed to work with most parsers. E.g. the lineterminator is not 133 | even configurable in pyarrow's csv reader, nor in pandas (python engine). 134 | """ 135 | 136 | pos = buffer.tell() 137 | sniffer = Sniffer() 138 | sniffer.preferred = [] 139 | 140 | for n_rows in (self.n_rows, 1): 141 | with suppress(Exception): 142 | buffer.seek(pos) 143 | sample = "\n".join(islice(buffer, n_rows)) 144 | dialect = sniffer.sniff(sample, delimiters=self.delimiters) 145 | 146 | # To read(!) a CSV reliably, we must have either doublequote=True or an escapechar, 147 | # yet Python's sniffer may return doublequote=False and no escapechar if nothing 148 | # was escaped in any way in the given CSV. 149 | dialect.doublequote = dialect.escapechar is None 150 | 151 | # The lineterminator is always returned as "\r\n", but that's ok since parsers 152 | # tend to ignore it anyways 153 | # dialect.lineterminator = ... # noqa 154 | 155 | # May detect that sample has no quotes, but if correct, parsing with minimal quote 156 | # option will still work, and if detection was erroneous, assuming minimal quoting 157 | # is more robust. It's also the default in pandas (=0) and arrow ignores it. 
158 | if dialect.quoting == QUOTE_NONE: 159 | dialect.quoting = QUOTE_MINIMAL 160 | 161 | return Dialect.from_builtin(dialect) 162 | 163 | if self.log: 164 | LOG.info("Falling back to default dialect...") 165 | 166 | return Dialect() 167 | 168 | 169 | if CLEVER_CSV: 170 | # CleverCSV may return non-sensical characters as escapechar. 171 | # Monkey-patch to at least limit to ASCII chars. 172 | is_potential_escapechar_orig = ccsv.escape.is_potential_escapechar 173 | 174 | def is_potential_escapechar(char, encoding, block_char=None): 175 | if not char.isascii(): 176 | return False 177 | 178 | return is_potential_escapechar_orig(char, encoding, block_char) 179 | 180 | ccsv.escape.is_potential_escapechar = is_potential_escapechar 181 | ccsv.potential_dialects.is_potential_escapechar = is_potential_escapechar 182 | ccsv.normal_form.is_potential_escapechar = is_potential_escapechar 183 | 184 | @dataclass 185 | class CleverCSV(DialectDetector): 186 | """A more advanced dialect detector using CleverCsv.""" 187 | 188 | num_chars: int = int(1e6) 189 | skip: bool = True 190 | method: str = "auto" 191 | verbose: bool = False 192 | 193 | def detect(self, buffer: TextIO) -> Dialect: 194 | text = buffer.read(self.num_chars) 195 | dialect = ccsv.Detector().detect( 196 | text, 197 | delimiters=DELIMITER_OPTIONS, 198 | verbose=self.verbose, 199 | method=self.method, 200 | skip=self.skip, 201 | ) 202 | return Dialect.from_builtin(dialect.to_csv_dialect()) 203 | -------------------------------------------------------------------------------- /lector/csv/encodings.py: -------------------------------------------------------------------------------- 1 | """Helpers to detecting character encodings in binary buffers.""" 2 | from __future__ import annotations 3 | 4 | import codecs 5 | from abc import ABC, abstractmethod 6 | from dataclasses import dataclass 7 | from typing import BinaryIO, Literal 8 | 9 | import cchardet as cdet 10 | 11 | BOMS: dict[str, tuple[Literal, ...]] = { 12 | "utf-8-sig": (codecs.BOM_UTF8,), 13 | "utf-16": (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE), 14 | "utf-32": (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE), 15 | } 16 | """Map BOM (Byte-order mark) to encoding.""" 17 | 18 | 19 | MAX_INT32: int = 2_147_483_647 20 | """Cannot read more than this number of bytes at once to detect encoding.""" 21 | 22 | CODEC_ERR_CHAR = "�" 23 | """Character representing non-codable bytes.""" 24 | 25 | 26 | def detect_bom(bs: bytes): 27 | """Detect encoding by looking for a BOM at the start of the file.""" 28 | for enc, boms in BOMS.items(): 29 | if any(bs.startswith(bom) for bom in boms): 30 | return enc 31 | 32 | return None 33 | 34 | 35 | def decoding_errors(bs: bytes, encoding: str, prop: bool = True) -> float: 36 | """The proportion of characters that couldn't be decoded correctly.""" 37 | string = bytes.decode(bs, encoding, errors="replace") 38 | err = string.count(CODEC_ERR_CHAR) / (len(string) if prop else 1.0) 39 | return err 40 | 41 | 42 | @dataclass 43 | class EncodingDetector(ABC): 44 | """Base class specifying interface for all encoding detetors.""" 45 | 46 | @abstractmethod 47 | def detect(self, buffer: BinaryIO) -> str: 48 | """Implement me.""" 49 | 50 | 51 | @dataclass 52 | class Chardet(EncodingDetector): 53 | """An encoding detector using cchardet if the default utf-8 generates too many errors.""" 54 | 55 | n_bytes: int = int(1e7) # 10 MB 56 | """Use this many bytes to detect encoding.""" 57 | error_threshold: float = 0.001 58 | """A greater proportion of decoding errors than this will be 
considered a failed encoding.""" 59 | confidence_threshold: float = 0.6 60 | """Minimum level of confidence to accept an encoding automatically detected by cchardet.""" 61 | 62 | def detect(self, buffer: BinaryIO) -> str: 63 | """Somewhat 'opinionated' encoding detection. 64 | 65 | Assumes utf-8 as most common encoding, falling back on cchardet detection, and 66 | if all else fails on windows-1250 if encoding is latin-like. 67 | """ 68 | head: bytes = buffer.read(min(self.n_bytes, MAX_INT32)) 69 | 70 | bom_encoding = detect_bom(head) 71 | if bom_encoding: 72 | return bom_encoding 73 | 74 | if decoding_errors(head, "utf-8", prop=True) <= self.error_threshold: 75 | return "utf-8" 76 | 77 | detected = cdet.detect(head) 78 | encoding, confidence = detected["encoding"], detected["confidence"] 79 | 80 | if encoding: 81 | if confidence > self.confidence_threshold: 82 | return encoding 83 | 84 | if any(label in encoding.lower() for label in ("windows", "iso-8859")): 85 | # Iso-like, will use windows-1250 as super set for special chars 86 | return "windows-1250" 87 | 88 | return "windows-1250" 89 | -------------------------------------------------------------------------------- /lector/csv/pandas.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from .abc import Reader 4 | 5 | 6 | class PandasReader(Reader): 7 | """Use base class detection methods to configure a pandas.read_csv() call.""" 8 | 9 | def parse(self, *args, **kwds): 10 | """Invoke Pandas' parser with inferred CSV format.""" 11 | cfg = { 12 | "encoding": self.format.encoding, 13 | "skiprows": self.format.preamble, 14 | "on_bad_lines": "warn", 15 | "engine": "python", 16 | } 17 | cfg.update(self.format.dialect) 18 | 19 | # Or "\n"? Pandas doesn't allow "\r\n"... 20 | cfg["lineterminator"] = None 21 | 22 | kwds = {**cfg, **kwds} 23 | result = pd.read_csv(self.buffer, *args, **kwds) 24 | return result 25 | -------------------------------------------------------------------------------- /lector/csv/preambles.py: -------------------------------------------------------------------------------- 1 | """Detectors of preambles in CSV files. 2 | 3 | This is generally a chicken-and-egg-type situation. Do detect generic preambles robustly and 4 | efficiently, it would really help to know the CSV dialect, or at least the delimiter. But to detect 5 | the dialect/delimiter correctly, we need to ignore/(skip) the preamble. Detectors may therefore 6 | rely on (somtimes) overly simplistic heuristics implicitly assuming a certain dialect. 
7 | """ 8 | from __future__ import annotations 9 | 10 | import csv 11 | from abc import ABC, abstractmethod 12 | from collections.abc import Iterable 13 | from dataclasses import dataclass, field 14 | from itertools import islice 15 | from typing import TextIO 16 | 17 | from ..log import LOG 18 | 19 | 20 | @dataclass 21 | class PreambleDetector(ABC): 22 | """Base class for detecting preambles (initial junk) in a CSV buffer.""" 23 | 24 | n_rows: int = 100 25 | 26 | @abstractmethod 27 | def detect(self, buffer: TextIO) -> int: 28 | """Detect preamble and return number of lines to skip.""" 29 | 30 | 31 | class Preambles: 32 | """Registry to manage preamble detectors.""" 33 | 34 | DETECTORS = {} 35 | 36 | @classmethod 37 | def register(cls, registered: type) -> type: 38 | cls.DETECTORS[registered.__name__] = registered 39 | return registered 40 | 41 | @classmethod 42 | def detect( 43 | cls, 44 | buffer: TextIO, 45 | detectors: Iterable[PreambleDetector] | None = None, 46 | log: bool = False, 47 | ) -> int: 48 | """Get result of first preamble detector matching the csv buffer. 49 | 50 | Matching here means detecting more than 0 rows of preamble text, and result 51 | is the number of rows to skip. 52 | 53 | If no detectors are provided (as ordered sequence), all registered 54 | detector classes are tried in registered order and using default parameters. 55 | """ 56 | cursor = buffer.tell() 57 | 58 | if detectors is None: 59 | detectors = (det() for det in cls.DETECTORS.values()) 60 | 61 | for detector in detectors: 62 | skiprows = detector.detect(buffer) 63 | if skiprows: 64 | if log: 65 | name = detector.__class__.__name__ 66 | msg = f"'{name}' matches CSV buffer: detected {skiprows} rows to skip." 67 | LOG.info(msg) 68 | return skiprows 69 | 70 | buffer.seek(cursor) 71 | 72 | return 0 73 | 74 | 75 | @Preambles.register 76 | @dataclass 77 | class Brandwatch(PreambleDetector): 78 | """Detect CSV files exported from Brandwatch. 79 | 80 | Brandwatch uses the comma as separator and includes a row of commas only 81 | to separate preamble texts from the CSV table as such. 82 | """ 83 | 84 | def detect(self, buffer: TextIO) -> int: 85 | rows = [row.strip() for row in islice(buffer, self.n_rows)] 86 | 87 | for i, row in enumerate(rows): 88 | if len(row) > 0 and all(x == "," for x in row): 89 | return i + 1 90 | 91 | return 0 92 | 93 | 94 | @Preambles.register 95 | @dataclass 96 | class Fieldless(PreambleDetector): 97 | """Detects initial rows that don't contain any delimited fields. 98 | 99 | Tries parsing buffer using Python's built-in csv functionality, assuming as delimiter the most 100 | frequent character amongst those configured via ``delimiters``. Given this delimiter, the parser 101 | detects N initial lines containing a single field only, followed by at least one line containing 102 | multiple fields. N is then the number of rows to skip. 
103 | """ 104 | 105 | delimiters: str | list[str] = field(default_factory=lambda: [",", ";", "\t"]) 106 | 107 | def detect_with_delimiter(self, buffer: TextIO, delimiter: str) -> int: 108 | """Count how many consecutive initial fieldless rows we have given specific delimiter.""" 109 | 110 | reader = csv.reader( 111 | islice(buffer, self.n_rows), 112 | delimiter=delimiter, 113 | quotechar='"', 114 | quoting=csv.QUOTE_MINIMAL, 115 | doublequote=True, 116 | skipinitialspace=True, 117 | ) 118 | 119 | for row in reader: 120 | if len(row) > 1: 121 | return reader.line_num - 1 122 | 123 | return 0 124 | 125 | def detect(self, buffer: TextIO) -> int: 126 | """Count consecutive initial fieldless rows given the most frequent delimiter.""" 127 | 128 | cursor = buffer.tell() 129 | delimiters = [self.delimiters] if isinstance(self.delimiters, str) else self.delimiters 130 | 131 | text = "".join(islice(buffer, self.n_rows)) 132 | counts = {delim: text.count(delim) for delim in delimiters} 133 | delimiter = max(counts.items(), key=lambda item: item[1])[0] 134 | 135 | buffer.seek(cursor) 136 | return self.detect_with_delimiter(buffer, delimiter) 137 | 138 | 139 | @Preambles.register 140 | @dataclass 141 | class GoogleAds(Fieldless): 142 | """In GoogleAds CSVs the garbage lines don't contain the separator (comma or tab). 143 | 144 | The only complications are that 1) GoogleAds has two CSV export formats: 'Excel' using tabs 145 | as separators and normal 'CSV' the comma; 2) A single column CSV wouldn't have the 146 | separator either. 147 | 148 | GoogleAds also seems to include two "totals" rows at the end, which we exclude here. 149 | """ 150 | 151 | def detect(self, buffer: TextIO) -> int: 152 | cursor = buffer.tell() 153 | skip = super().detect(buffer) 154 | 155 | if skip: 156 | buffer.seek(cursor) 157 | rows = [row.strip() for row in islice(buffer, self.n_rows)] 158 | 159 | is_report = any("informe de" in row.lower() for row in rows[0:skip]) 160 | has_campaign_col = any("Campaña" in col for col in rows[skip].split(",")) 161 | 162 | if is_report and has_campaign_col: 163 | self.skipfooter = 2 164 | else: 165 | skip = 0 166 | self.skipfooter = 0 167 | 168 | return skip 169 | -------------------------------------------------------------------------------- /lector/log.py: -------------------------------------------------------------------------------- 1 | """Helpers to pretty print/log objects using Rich.""" 2 | from __future__ import annotations 3 | 4 | import logging 5 | import sys 6 | from collections.abc import Iterable, Sequence 7 | from functools import partial 8 | from typing import TypeVar 9 | 10 | import pyarrow as pa 11 | import pyarrow.types as pat 12 | from pyarrow import DataType, Schema 13 | from pyarrow import Table as PaTable 14 | from rich import box, get_console 15 | from rich.padding import Padding 16 | from rich.panel import Panel 17 | from rich.pretty import Pretty 18 | from rich.progress import Progress, TimeElapsedColumn 19 | from rich.table import Column, Table 20 | from rich.text import Text 21 | 22 | from .utils import decode_metadata 23 | 24 | CONSOLE = get_console() 25 | 26 | BOX = box.HORIZONTALS 27 | 28 | Item = TypeVar("Item") 29 | 30 | 31 | class ColoredFormatter(logging.Formatter): 32 | """A custom formatter controlling message color.""" 33 | 34 | RESET = "\x1b[0m" 35 | 36 | FORMAT = "{asctime} {levelname} | {name} | {module}.{funcName}:{lineno} \n{message}" 37 | 38 | COLORS = { 39 | logging.DEBUG: "\x1b[38;20m", # grey 40 | logging.INFO: "\x1b[38;20m", # grey 41 | 
logging.WARNING: "\x1b[33;1m", # bold yellow 42 | logging.ERROR: "\x1b[31;1m", # bold red 43 | logging.CRITICAL: "\x1b[31;1m", # bold red 44 | } 45 | 46 | def __init__(self, datefmt=None, validate=True): 47 | super().__init__(self.FORMAT, style="{", datefmt=datefmt, validate=validate) 48 | 49 | def format(self, record): 50 | msg = super().format(record) 51 | col = self.COLORS.get(record.levelno) 52 | return msg.replace("", col).replace("", self.RESET) 53 | 54 | 55 | def setup_logging(level=logging.DEBUG, color=True): 56 | """Ensure logging handler is only added once.""" 57 | date_fmt = "%H:%M:%S" 58 | if color: 59 | fmt = ColoredFormatter(datefmt=date_fmt) 60 | else: 61 | fmt = logging.Formatter( 62 | "{asctime} {levelname} | {name} | {module}.{funcName}:{lineno} \n{message}", 63 | datefmt=date_fmt, 64 | style="{", 65 | ) 66 | 67 | logger = logging.getLogger("lector") 68 | logger.setLevel(level) 69 | 70 | _sh = logging.StreamHandler(sys.stdout) 71 | _sh.setFormatter(fmt) 72 | logger.addHandler(_sh) 73 | 74 | return logger 75 | 76 | 77 | LOG = setup_logging(level=logging.INFO, color=True) 78 | 79 | 80 | def pformat(obj, console=None, markup=True, end="", strip=False, **kwargs): 81 | """Pretty format any object, if possible with Rich.""" 82 | console = console or CONSOLE 83 | 84 | with console.capture() as capture: 85 | console.print(obj, markup=markup, end=end) 86 | 87 | result = capture.get() 88 | 89 | if strip: 90 | result = result.strip() 91 | 92 | return result 93 | 94 | 95 | iformat = partial(pformat, strip=True) 96 | """Shortcut for inline formatting, avoiding strings terminating in line breaks.""" 97 | 98 | 99 | def track( 100 | items: Iterable[Item] | Sequence[Item], 101 | columns: Iterable[Column] | None = None, 102 | total: float | None = None, 103 | desc: str = "Processing", 104 | update_period: float = 0.1, 105 | **kwds, 106 | ) -> Iterable[Item]: 107 | """Rich track with elapsed time by default.""" 108 | if columns is None: 109 | columns = (*Progress.get_default_columns(), TimeElapsedColumn()) 110 | 111 | with Progress(*columns, **kwds) as progress: 112 | yield from progress.track( 113 | items, 114 | total=total, 115 | description=desc, 116 | update_period=update_period, 117 | ) 118 | 119 | 120 | def type_view(type: DataType) -> str: 121 | """More compact strinf represenation of arrow data types.""" 122 | if pat.is_list(type): 123 | return f"list<{type.value_type}>" 124 | if pat.is_dictionary(type): 125 | if type.ordered: 126 | return f"dict<{type.value_type}, ordered>" 127 | return f"dict<{type.value_type}>" 128 | return str(type) 129 | 130 | 131 | def dict_view( 132 | d: dict, title: str = "", expand: bool = False, width=None, padding=1, **kwds 133 | ) -> Panel: 134 | dv = Pretty(d, **kwds) 135 | p = Panel(dv, expand=expand, title=title, width=width, box=BOX) 136 | return Padding(p, padding) 137 | 138 | 139 | def schema_view(schema: Schema, title: str | None = "Schema", padding: int = 1) -> Table: 140 | """Make a rich view for arrow schema.""" 141 | 142 | meta = {field.name: decode_metadata(field.metadata or {}) for field in schema} 143 | have_meta = any(meta.values()) 144 | 145 | rt = Table(title=title, title_justify="left", box=BOX) 146 | rt.add_column("Column", justify="left", style="indian_red1", no_wrap=True) 147 | rt.add_column("Type", style="yellow3") 148 | if have_meta: 149 | rt.add_column("Meta") 150 | 151 | for field in schema: 152 | if have_meta: 153 | field_meta = meta.get(field.name) 154 | field_meta = Pretty(field_meta) if field_meta else None 155 | 
rt.add_row(field.name, type_view(field.type), field_meta) 156 | else: 157 | rt.add_row(field.name, type_view(field.type)) 158 | 159 | return Padding(rt, padding) 160 | 161 | 162 | def schema_comparison( 163 | s1: Schema, 164 | s2: Schema, 165 | title: str | None = None, 166 | padding: int = 1, 167 | left: str = "Before", 168 | right: str = "After", 169 | ): 170 | meta = {field.name: decode_metadata(field.metadata or {}) for field in s2} 171 | have_meta = any(meta.values()) 172 | 173 | t = Table(title=title, title_justify="left", box=BOX) 174 | t.add_column("Column", justify="left", style="indian_red1", no_wrap=True) 175 | t.add_column(left, style="orange1") 176 | t.add_column(right, style="yellow3") 177 | if have_meta: 178 | t.add_column("Meta") 179 | 180 | for field in s2: 181 | if have_meta: 182 | field_meta = meta.get(field.name) 183 | field_meta = Pretty(field_meta) if field_meta else "" 184 | 185 | other = s1.field(field.name) 186 | if field.type != other.type: 187 | orig_type = type_view(other.type) 188 | else: 189 | orig_type = "" 190 | 191 | t.add_row(field.name, orig_type, type_view(field.type), field_meta) 192 | 193 | return Padding(t, padding) 194 | 195 | 196 | def schema_diff_view(diff: dict, title: str | None = None, padding: int = 1) -> Table: 197 | """Make a rich view for an arrow schema diff.""" 198 | 199 | t = Table(title=title, title_justify="left", box=BOX) 200 | t.add_column("Column", justify="left", style="indian_red1", no_wrap=True) 201 | t.add_column("Before", style="orange1") 202 | t.add_column("After", style="yellow3") 203 | 204 | for col, (before, after) in diff.items(): 205 | t.add_row(col, type_view(before), type_view(after)) 206 | 207 | return Padding(t, padding) 208 | 209 | 210 | def table_view( 211 | tbl: PaTable, 212 | title: str | None = None, 213 | n_rows_max: int = 10, 214 | n_columns_max: int = 6, 215 | max_column_width: int = 20, 216 | padding: int = 1, 217 | ) -> Table: 218 | """Pyarrow table to rich table.""" 219 | 220 | sample = tbl 221 | 222 | if sample.num_rows > n_rows_max: 223 | sample = sample.slice(0, n_rows_max) 224 | 225 | if sample.num_columns > n_columns_max: 226 | sample = sample.select(range(n_columns_max)) 227 | rest = pa.array(["..."] * len(sample)) 228 | sample = sample.append_column(field_="...", column=rest) 229 | 230 | style = "bold indian_red1" 231 | caption = Text.from_markup( 232 | f"[{style}]{tbl.num_rows:,}[/] rows ✕ [{style}]{tbl.num_columns}[/] columns" 233 | ) 234 | 235 | table = Table( 236 | title=title, 237 | caption=caption, 238 | title_justify="left", 239 | caption_justify="left", 240 | box=BOX, 241 | ) 242 | 243 | for field in sample.schema: 244 | name = field.name 245 | table.add_column( 246 | name, 247 | max_width=max_column_width, 248 | overflow="crop", 249 | no_wrap=True, 250 | ) 251 | 252 | rows = sample.to_pylist() 253 | ellipses = len(rows) < tbl.num_rows 254 | 255 | def value_repr(x): 256 | if x is None: 257 | return None 258 | if x == "...": 259 | return x 260 | return Pretty(x, max_length=max_column_width, max_string=max_column_width) 261 | 262 | for i, row in enumerate(rows): 263 | row = [value_repr(x) for x in row.values()] 264 | end_section = False if ellipses else i == len(rows) - 1 265 | table.add_row(*row, end_section=end_section) 266 | 267 | if ellipses: 268 | table.add_row(*["..."] * len(rows[0]), end_section=True) 269 | 270 | def type_repr(table, column): 271 | if column == "...": 272 | return "" 273 | 274 | style = "italic yellow3" 275 | type_ = table.schema.field(column).type 276 | return 
Text.from_markup(f"[{style}]{type_view(type_)}[/]") 277 | 278 | def null_repr(table, column): 279 | if column == "...": 280 | return "" 281 | 282 | style = "italic" 283 | n_nulls = table.column(column).null_count 284 | if n_nulls: 285 | return Text.from_markup(f"[{style} bold]nulls {n_nulls}[/]") 286 | 287 | return Text.from_markup(f"[{style}]nulls 0[/]") 288 | 289 | types = [type_repr(sample, column) for column in sample.column_names] 290 | nulls = [null_repr(sample, column) for column in sample.column_names] 291 | table.add_row(*nulls) 292 | table.add_row(*types) 293 | 294 | return Padding(table, padding) 295 | -------------------------------------------------------------------------------- /lector/types/__init__.py: -------------------------------------------------------------------------------- 1 | """Subpackage for inferring column types in CSV files. 2 | 3 | This is instead or on top of Arrow's built-in inference, which currently doesn't detect 4 | list columns, timestamps in non-ISO formats, or semantic types such as URLs, natural language 5 | text etc. 6 | """ 7 | from .abc import Converter, Registry 8 | from .bools import Boolean 9 | from .cast import Autocast, Cast 10 | from .lists import List 11 | from .numbers import Number 12 | from .strings import Category, Text, Url 13 | from .timestamps import Timestamp 14 | 15 | """Note, we need to import the types here, otherwise they won't be registered.""" 16 | 17 | __all__ = [ 18 | "Autocast", 19 | "Boolean", 20 | "Cast", 21 | "Converter", 22 | "Registry", 23 | "Category", 24 | "List", 25 | "Number", 26 | "Text", 27 | "Timestamp", 28 | "Url", 29 | ] 30 | -------------------------------------------------------------------------------- /lector/types/abc.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from abc import ABC, abstractmethod 4 | from dataclasses import dataclass, field 5 | 6 | import rich.repr 7 | from pyarrow import Array 8 | 9 | 10 | @dataclass 11 | class Conversion: 12 | """Return type of a conversion/cast.""" 13 | 14 | result: Array 15 | meta: dict = field(default_factory=dict) 16 | 17 | 18 | @dataclass 19 | @rich.repr.auto 20 | class Converter(ABC): 21 | """Simple base class for dependency injection of new custom data types. 22 | 23 | If a proportion of values smaller than `threshold` can be successfully 24 | converted, the converter should return None. 
25 | """ 26 | 27 | threshold: float = 1.0 28 | 29 | @abstractmethod 30 | def convert(self, arr: Array) -> Conversion | None: 31 | """To be implemented in subclasses.""" 32 | 33 | 34 | @dataclass 35 | class ConverterRegistry: 36 | """Registry to manage converters.""" 37 | 38 | convs: dict[str, Converter] = field(default_factory=dict) 39 | 40 | def register(self, registered: type) -> type: 41 | self.convs[registered.__name__.lower()] = registered 42 | return registered 43 | 44 | def __getitem__(self, item: str) -> Converter: 45 | return self.convs[item.lower()] 46 | 47 | 48 | Registry = ConverterRegistry() 49 | """'Singleton' conversion registry.""" 50 | -------------------------------------------------------------------------------- /lector/types/bools.py: -------------------------------------------------------------------------------- 1 | """Classes for converting arrays to the boolean type.""" 2 | from __future__ import annotations 3 | 4 | from contextlib import suppress 5 | from dataclasses import dataclass 6 | 7 | import pyarrow as pa 8 | from pyarrow import Array 9 | 10 | from ..utils import is_stringy 11 | from .abc import Conversion, Converter, Registry 12 | 13 | 14 | @dataclass 15 | @Registry.register 16 | class Boolean(Converter): 17 | """Converts stringy booleans ("true" / "False"), and ints (0/1) to the boolean type.""" 18 | 19 | def convert(self, array: Array) -> Conversion | None: 20 | if not is_stringy(array.type) or array.null_count == len(array): 21 | return None 22 | 23 | meta = {"semantic": "boolean"} 24 | 25 | with suppress(pa.ArrowInvalid): 26 | converted = array.cast(pa.bool_()) 27 | 28 | n = len(array) 29 | valid_before = n - array.null_count 30 | valid_after = n - converted.null_count 31 | proportion_valid = valid_after / valid_before 32 | 33 | if proportion_valid >= self.threshold: 34 | return Conversion(converted, meta=meta) 35 | 36 | return None 37 | -------------------------------------------------------------------------------- /lector/types/cast.py: -------------------------------------------------------------------------------- 1 | """Helpers to easily cast columns to their most appropriate/efficient type.""" 2 | from __future__ import annotations 3 | 4 | from abc import ABC, abstractmethod 5 | from collections.abc import Iterable 6 | from dataclasses import dataclass, field 7 | from typing import Union 8 | 9 | import pyarrow as pa 10 | from pyarrow import Array, ChunkedArray, Table 11 | from tqdm.auto import tqdm 12 | 13 | from ..log import LOG, iformat, pformat, schema_diff_view 14 | from ..utils import encode_metadata, schema_diff 15 | from .abc import Conversion, Converter, Registry 16 | from .numbers import DecimalMode 17 | from .strings import Category 18 | 19 | Config = dict[str, dict] 20 | """An (ordered) dict of converter class names and corresponding parameters.""" 21 | 22 | Converters = Union[Config, Iterable[Converter], None] 23 | """Accepted argument type where converters are expected.""" 24 | 25 | DEFAULT_CONVERTERS: Config = { 26 | "number": {"threshold": 0.95, "allow_unsigned_int": True, "decimal": DecimalMode.INFER}, 27 | "boolean": {"threshold": 1.0}, 28 | "list": {"threshold": 0.95, "threshold_urls": 0.8}, 29 | "timestamp": {"threshold": 0.95}, 30 | "text": {"threshold": 0.8, "min_unique": 0.1}, 31 | "url": {"threshold": 0.8}, 32 | "category": {"threshold": 0.0, "max_cardinality": None}, 33 | } 34 | 35 | 36 | def ensure_converters(converters: Converters = None) -> list[Converter]: 37 | """Turn a type conversion config into a list of converter 
instances.""" 38 | if converters is None: 39 | converters = DEFAULT_CONVERTERS.copy() 40 | 41 | if isinstance(converters, dict): 42 | return [Registry[name](**params) for name, params in converters.items()] 43 | 44 | if isinstance(converters, list) and converters and isinstance(converters[0], Converter): 45 | return converters 46 | 47 | raise ValueError(f"Object cannot be made into type converters: {converters}") 48 | 49 | 50 | @dataclass 51 | class CastStrategy(ABC): 52 | """Base class for autocasting implementations.""" 53 | 54 | converters: Converters | None = None 55 | columns: list[str] | None = None 56 | log: bool = False 57 | 58 | def __post_init__(self): 59 | self.converters = ensure_converters(self.converters) 60 | 61 | @abstractmethod 62 | def cast_array(self, array: Array, name: str | None = None) -> Conversion: 63 | """Only need to override this.""" 64 | 65 | def cast_table(self, table: Table) -> Table: 66 | """Takes care of updating fields, including metadata etc.""" 67 | schema = table.schema 68 | columns = self.columns or table.column_names 69 | 70 | for name in tqdm(columns, desc="Autocasting", disable=not self.log): 71 | array = table.column(name) 72 | conv = self.cast_array(array, name=name) 73 | 74 | if conv is not None: 75 | result = conv.result 76 | meta = conv.meta or {} 77 | meta = encode_metadata(meta) if meta else None 78 | field = pa.field(name, type=result.type, metadata=meta) 79 | table = table.set_column(table.column_names.index(name), field, result) 80 | 81 | if self.log: 82 | diff = schema_diff(schema, table.schema) 83 | if diff: 84 | LOG.info(pformat(schema_diff_view(diff, title="Changed types"))) 85 | 86 | return table 87 | 88 | def cast(self, data: Array | ChunkedArray | Table) -> Conversion | Table: 89 | """Shouldn't be necessary, but @singledispatchmethod doesn't work with inheritance.""" 90 | if isinstance(data, (Array, ChunkedArray)): 91 | return self.cast_array(data) 92 | 93 | if isinstance(data, Table): 94 | return self.cast_table(data) 95 | 96 | raise ValueError(f"Can only cast arrays or tables, got {type(data)}!") 97 | 98 | 99 | @dataclass 100 | class Autocast(CastStrategy): 101 | """Simple cast trying each registered type in order. 102 | 103 | As a little performance optimization (having a huge effect on execution time), 104 | types are first tested on a sample for fast rejection of non-matching types. 105 | """ 106 | 107 | n_samples: int = 100 108 | fallback: Converter | None = field( 109 | default_factory=lambda: Category(threshold=0.0, max_cardinality=None) 110 | ) 111 | 112 | def cast_array(self, array: Array | ChunkedArray, name: str | None = None) -> Conversion: 113 | name = name or "" 114 | 115 | if array.null_count == len(array): 116 | if self.fallback: 117 | LOG.info(f"Column '{name}' is all null, trying fallback {iformat(self.fallback)}") 118 | return self.fallback.convert(array) 119 | 120 | LOG.debug(f"Column '{name}' is all null, skipping.") 121 | return None 122 | 123 | for converter in self.converters: 124 | sample = array.drop_null().slice(length=self.n_samples) 125 | if ( 126 | len(sample) > 0 127 | and converter.convert(sample) 128 | and (result := converter.convert(array)) 129 | ): 130 | if self.log: 131 | LOG.debug(f'Converted column "{name}" with converter\n{iformat(converter)}') 132 | return result 133 | 134 | if self.fallback and pa.types.is_string(array.type) or pa.types.is_null(array.type): 135 | LOG.debug( 136 | f"Got no matching converter for string column '{name}'. " 137 | f"Will try fallback {iformat(self.fallback)}." 
138 | ) 139 | return self.fallback.convert(array) 140 | 141 | return None 142 | 143 | 144 | @dataclass 145 | class Cast: 146 | """Tries a specific cast for each column.""" 147 | 148 | converters: dict[str, Converter] 149 | log: bool = False 150 | 151 | def cast(self, table: Table) -> Table: 152 | schema = table.schema 153 | 154 | for _, (name, converter) in tqdm( 155 | enumerate(self.converters.items()), 156 | total=len(self.converters), 157 | desc="Explicit casting", 158 | disable=not self.log, 159 | ): 160 | array = table.column(name) 161 | try: 162 | conv = converter.convert(array) 163 | except Exception: 164 | LOG.error(f"Couldn't convert column {name} with converter {iformat(converter)}!") 165 | raise 166 | 167 | if conv is not None: 168 | result = conv.result 169 | meta = conv.meta or {} 170 | meta = encode_metadata(meta) if meta else None 171 | field = pa.field(name, type=result.type, metadata=meta) 172 | idx = table.schema.get_field_index(name) 173 | table = table.set_column(idx, field, result) 174 | else: 175 | LOG.error( 176 | f"Conversion of columns '{name}' with converter '{iformat(converter)}' failed!" 177 | ) 178 | LOG.error(f"Original column ({array.type}):\n{array}") 179 | 180 | if self.log: 181 | diff = schema_diff(schema, table.schema) 182 | if diff: 183 | LOG.info(pformat(schema_diff_view(diff, title="Changed types"))) 184 | 185 | return table 186 | -------------------------------------------------------------------------------- /lector/types/lists.py: -------------------------------------------------------------------------------- 1 | """List parsing and casting. 2 | 3 | Currently NOT supported in CSV strings: 4 | 5 | - floats with comma as the decimal delimiter (must be the period character) 6 | - floats with thousands separator 7 | """ 8 | from __future__ import annotations 9 | 10 | from collections.abc import Iterable 11 | from contextlib import suppress 12 | from csv import reader as csvreader 13 | from dataclasses import dataclass 14 | 15 | import msgspec 16 | import pyarrow as pa 17 | import pyarrow.compute as pac 18 | import pyarrow.types as pat 19 | from pyarrow import Array, DataType 20 | 21 | from ..log import LOG 22 | from ..utils import ensure_type, min_max, proportion_trueish, smallest_int_type 23 | from .abc import Conversion, Converter, Registry 24 | from .regex import RE_LIST_CLEAN, RE_LIST_LIKE 25 | from .strings import proportion_url 26 | 27 | LIST_TYPES: tuple[str] = (pa.int64(), pa.float64(), pa.timestamp(unit="ms")) 28 | 29 | JSON_DECODE = msgspec.json.Decoder(type=list).decode 30 | 31 | SAFE_CSV_PARSING = False 32 | 33 | 34 | def parse_csvs(strings: Iterable[str], safe=SAFE_CSV_PARSING, **kwds) -> Iterable[list]: 35 | """Parse a list of strings as CSV, to separate it into individual fields. 36 | 37 | The non-safe option uses python's built-in reader. But it either raises on invalid 38 | rows, or silently returns fewer parsed rows than original rows, depending on the 39 | "strict" parameter. The safe option will always return the expected number of rows, 40 | with values being None where a string couldn't be parsed. 41 | """ 42 | if safe: 43 | for s in strings: 44 | try: 45 | yield next(csvreader([s], **kwds)) 46 | except Exception: 47 | yield None 48 | else: 49 | yield from csvreader(strings, **kwds) 50 | 51 | 52 | def parse_lists_csv(arr: Array, **kwds) -> Array: 53 | """Parse strings as lines of CSV, to separate it into individual fields. 54 | 55 | Respects the separator being escaped when enclosed in (double) quotes etc. 
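    For instance, with the default delimiter "," and quote char '"' used by the List
    converter further below, a raw cell such as '[a, "b, c", d]' parses to the three
    elements ['a', 'b, c', 'd']; the comma inside the quotes is kept as part of the
    element rather than treated as a separator.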
56 | """ 57 | content = pac.replace_substring_regex(arr, pattern=RE_LIST_CLEAN, replacement="") 58 | strings = (s.as_py() if s.is_valid else "" for s in content) 59 | lists = parse_csvs(strings, **kwds) 60 | lists = ([elem.strip("' ") for elem in l] if l is not None else l for l in lists) 61 | result = pa.array(lists) 62 | result = pac.if_else(arr.is_null(), pa.NA, result) # Restore original nulls 63 | return result 64 | 65 | 66 | def parse_json(s: str): 67 | """Parse a single string as json.""" 68 | l = JSON_DECODE(s) 69 | 70 | if l and any(isinstance(x, (list, dict)) for x in l): 71 | l = [str(x) for x in l] 72 | 73 | return l 74 | 75 | 76 | def parse_lists_json(arr: Array) -> Array: 77 | """Parse strings as lists using the significantly faster msgspec.""" 78 | parsed = (parse_json(s.as_py()) if s.is_valid else None for s in arr) 79 | return pa.array(parsed) 80 | 81 | 82 | def proportion_listlike(arr: Array) -> float: 83 | """Calculate proportion of non-null strings that could be lists.""" 84 | valid = arr.drop_null() 85 | is_list = pac.match_substring_regex(valid, RE_LIST_LIKE) 86 | return proportion_trueish(is_list) 87 | 88 | 89 | def maybe_cast_lists( 90 | arr: Array, 91 | types: list[DataType | str] = LIST_TYPES, 92 | downcast: bool = True, 93 | ) -> Array | None: 94 | """Cast lists (of strings) to first valid type, if any.""" 95 | 96 | for type in types: 97 | type = ensure_type(type) 98 | 99 | if arr.type == type: 100 | return arr 101 | 102 | with suppress(Exception): 103 | result = pac.cast(arr, pa.list_(type)) 104 | 105 | if type == "int64" and downcast: 106 | vmin, vmax = min_max(pac.list_flatten(result)) 107 | itype = smallest_int_type(vmin, vmax) 108 | 109 | if itype is not None: 110 | try: 111 | itype = ensure_type(itype) 112 | result = pac.cast(result, pa.list_(itype)) 113 | except Exception as exc: 114 | LOG.error(exc) 115 | LOG.error("Will not downcast lists of int64.") 116 | 117 | return result 118 | 119 | return None 120 | 121 | 122 | def maybe_parse_lists( 123 | arr: Array, 124 | type: str | DataType | None = None, 125 | threshold: float = 1.0, 126 | quote_char: str = '"', 127 | delimiter: str = ",", 128 | ) -> Array | None: 129 | """Parse strings into list, optionally with (inferrable) element type.""" 130 | if proportion_listlike(arr.drop_null()) < threshold: 131 | return None 132 | 133 | try: 134 | result = parse_lists_json(arr) 135 | LOG.debug("[List] Was able to fast-parse as json") 136 | except Exception: 137 | try: 138 | result = parse_lists_csv( 139 | arr, skipinitialspace=True, quotechar=quote_char, delimiter=delimiter 140 | ) 141 | except Exception as exc: 142 | LOG.error(f"Cannot parse lists as CSV: {exc}") 143 | return None 144 | 145 | if type is not None: 146 | return result.cast(pa.list_(ensure_type(type))) 147 | 148 | return maybe_cast_lists(result, types=LIST_TYPES) or result 149 | 150 | 151 | @dataclass 152 | @Registry.register 153 | class List(Converter): 154 | type: str | DataType | None = None 155 | infer_urls: bool = True 156 | threshold_urls: float = 1.0 157 | quote_char: str = '"' 158 | delimiter: str = "," 159 | 160 | def convert(self, array: Array) -> Conversion | None: 161 | result = None 162 | 163 | if pat.is_string(array.type): 164 | result = maybe_parse_lists( 165 | array, 166 | type=self.type, 167 | threshold=self.threshold, 168 | quote_char=self.quote_char, 169 | delimiter=self.delimiter, 170 | ) 171 | elif pat.is_list(array.type): 172 | result = array 173 | 174 | if result is None: 175 | return None 176 | 177 | vtype = 
result.type.value_type 178 | 179 | if pat.is_string(vtype): 180 | if self.infer_urls and proportion_url(pac.list_flatten(result)) >= self.threshold_urls: 181 | semantic = "list[url]" 182 | else: 183 | semantic = "list[category]" 184 | elif pat.is_timestamp(vtype): 185 | semantic = "list[date]" 186 | elif pat.is_integer(vtype): 187 | semantic = f"list[number[{vtype}]]" 188 | else: 189 | if str(vtype) == "float": 190 | # pyarrow's "float" means float32, but pandas would interpret it as float64 191 | vtype = "float32" 192 | 193 | semantic = f"list[number[{vtype}]]" 194 | 195 | return Conversion(result, meta={"semantic": semantic}) 196 | -------------------------------------------------------------------------------- /lector/types/numbers.py: -------------------------------------------------------------------------------- 1 | """Helpers for parsing and downcasting numeric data. 2 | 3 | Note: Arrow uses Google's RE2 to implement regex functionality: 4 | https://github.com/google/re2/wiki/Syntax 5 | 6 | """ 7 | from __future__ import annotations 8 | 9 | from collections import Counter 10 | from dataclasses import dataclass 11 | from enum import Enum 12 | 13 | import pyarrow as pa 14 | import pyarrow.compute as pac 15 | import pyarrow.types as pat 16 | from pyarrow import Array 17 | 18 | from ..log import LOG 19 | from ..utils import ( 20 | dtype_name, 21 | empty_to_null, 22 | min_max, 23 | proportion_equal, 24 | smallest_int_type, 25 | ) 26 | from .abc import Conversion, Converter, Registry 27 | from .regex import RE_IS_FLOAT, RE_IS_INT 28 | 29 | DECIMAL_SUPPORT_MIN = 0.2 # 20% 30 | DECIMAL_CONFIDENCE_MIN = 1.5 # 150% 31 | 32 | 33 | class DecimalMode(str, Enum): 34 | INFER = "INFER" 35 | COMPARE = "COMPARE" 36 | 37 | 38 | def clean_float_pattern(thousands: str = ",") -> str: 39 | """Removes characters in number strings that Arrow cannot parse.""" 40 | if thousands == ",": 41 | # Match a "+" at the beginning and commas anywhere 42 | return r"^\+|," 43 | 44 | # Match a "+" at the beginning and a period anywhere 45 | return r"^\+|\." 46 | 47 | 48 | def decimal_delimiter( # noqa: PLR0911, PLR0912 49 | s: str, 50 | n_chars_max: int = 20, 51 | ) -> str | None: 52 | """Infer decimal delimiter from string representation s of an input number. 53 | 54 | Returns None if not unambiguously inferrable. 55 | """ 56 | n_commas = n_dots = n_delims = 0 57 | first_comma_idx = first_dot_idx = None 58 | n = len(s) 59 | 60 | for i, c in enumerate(s): 61 | if i > n_chars_max and n_delims == 0: # noqa: PLR2004 62 | return None # Early out for long strings that are unlikely to represent numbers 63 | 64 | if c in ".,": 65 | if i == 0 or (i == 1 and s[0] == "0"): 66 | return c # ".123" or "0.123": can only be decimal 67 | 68 | if i >= 4 and n_delims == 0: # noqa: PLR2004 69 | return c # First delim at 5th position: cannot be thousands (1234.00) 70 | 71 | if i + 3 >= n: 72 | return c # Less than 3 characters after delim: cannot be thousands (1.12) 73 | 74 | n_delims += 1 75 | 76 | if c == ".": 77 | n_dots += 1 78 | if first_dot_idx is None: 79 | first_dot_idx = i 80 | else: 81 | n_commas += 1 82 | if first_comma_idx is None: 83 | first_comma_idx = i 84 | 85 | if n_dots == 1 and n_commas == 0: 86 | return "." 87 | if n_dots > 0 and n_commas > 0: 88 | return "." if first_comma_idx < first_dot_idx else "," 89 | if n_commas == 1 and n_dots == 0: 90 | return "," 91 | if n_commas > 1: 92 | return "." 
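    # A few concrete inputs and the delimiter these rules infer for them (illustrative):
    #   "0.5"       -> "."  (delimiter right after a leading zero)
    #   "1234.56"   -> "."  (first delimiter at the 5th position cannot be thousands)
    #   "1,234.56"  -> "."  and  "1.234,56" -> ","  (fewer than 3 digits follow the decimal)
    #   "1,234,567" -> "."  (repeated commas can only be thousands separators)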
93 | if n_dots > 1: 94 | return "," 95 | 96 | return None 97 | 98 | 99 | def infer_decimal_delimiter(arr: Array) -> str | None: 100 | """Get most frequent decimal delimiter in array. 101 | 102 | If most frequent delimiter doesn't occur in sufficient proportion (support), 103 | or not significantly more often than other delimiters (confidence), returns 104 | None. 105 | """ 106 | n = len(arr) 107 | counts = Counter(decimal_delimiter(s.as_py()) for s in arr) 108 | counts.update({".": 0, ",": 0}) 109 | ranked = [d for d in counts.most_common(3) if d[0]] 110 | 111 | if all(delim[1] == 0 for delim in ranked): 112 | return None 113 | 114 | if ranked[1][1] > 0: 115 | # If ambiguous 116 | 117 | if (ranked[0][1] / n) < DECIMAL_SUPPORT_MIN: 118 | # Most frequent delimiter should occur in at least 30% of rows 119 | return None 120 | 121 | if (ranked[0][1] / ranked[1][1]) < DECIMAL_CONFIDENCE_MIN: 122 | # Most frequent delimiter should occur at least 50% more often than next delimiter 123 | return None 124 | 125 | return ranked[0][0] 126 | 127 | 128 | def clean_float_strings(arr: Array, decimal: str) -> tuple[Array, Array, float]: 129 | """Prepare an array of strings so that Arrow can cast the result to floats. 130 | 131 | Arrow allows exponential syntax and omission of 0s before and after the decimal point, 132 | i.e. the following are all valid string representations of floating point numbers: 133 | "-1e10", "1e10", "1e-2", "1.2e3", "-1.2e3", "1." ".12", "-1.", "-.1". 134 | 135 | Arrow doesn't allow prefix of a positive sign indicator, nor thousands separator, i.e. 136 | the following are not(!) valid: 137 | "+1e10", "+1.", "+.1", "123,456.0" 138 | 139 | We hence remove occurrences of both the thousands character and the positive sign 140 | before extracting the floating point part of strings using regex. 141 | 142 | Also see following for more regex parsing options: 143 | https://stackoverflow.com/questions/12643009/regular-expression-for-floating-point-numbers 144 | 145 | Note, we don't parse as float if there isn't a single value with decimals. If this is 146 | the case they should be integers really, and if they haven't been parsed as ints before, 147 | that's because the values didn't fit into Arrow's largesy integer type, in which case it 148 | isn't safe to parse as float, which Arrow would otherwise do unsafely(!) and silently. 149 | """ 150 | thousands = "," if decimal == "." else "." 151 | pattern = clean_float_pattern(thousands) 152 | clean = pac.replace_substring_regex(arr, pattern=pattern, replacement="") 153 | if decimal == ",": 154 | clean = pac.replace_substring(clean, pattern=",", replacement=".", max_replacements=1) 155 | 156 | # Arrow doesn't recognize upper case exponential ("1.03481E-11") 157 | clean = pac.utf8_lower(clean) 158 | is_float = pac.match_substring_regex(clean, pattern=RE_IS_FLOAT) 159 | 160 | if is_float.null_count == len(is_float): 161 | prop_valid = 0.0 162 | else: 163 | prop_valid = pac.sum(is_float).as_py() / (len(arr) - arr.null_count) 164 | 165 | return clean, is_float, prop_valid 166 | 167 | 168 | def maybe_parse_ints( 169 | arr: Array, 170 | threshold: float = 1.0, 171 | allow_unsigned: bool = False, 172 | ) -> Array | None: 173 | """Use regex to extract castable ints. 174 | 175 | Arrow's internal casting from string to int doesn't allow for an 176 | initial positive sign character, so we have to handle that separately. 
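    For example, a string column like ["+1", "2", "-3"] matches RE_IS_INT, the leading
    "+" is stripped via regex replacement, and the cleaned values then cast to int64
    (with uint64 attempted as a fallback only when values overflow and `allow_unsigned`
    is set).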
177 | """ 178 | is_int = pac.match_substring_regex(arr, pattern=RE_IS_INT) 179 | if is_int.null_count == len(is_int): 180 | return None 181 | 182 | valid_prop = pac.sum(is_int).as_py() / (len(arr) - arr.null_count) 183 | if valid_prop < threshold: 184 | return None 185 | 186 | clean = pac.if_else(is_int, arr, None) 187 | clean = pac.replace_substring_regex(clean, r"^\+", "") 188 | 189 | try: 190 | return pac.cast(clean, pa.int64()) 191 | except Exception: 192 | if allow_unsigned: 193 | try: 194 | return pac.cast(clean, pa.uint64()) 195 | except Exception as exc: 196 | LOG.error(exc) 197 | 198 | return None 199 | 200 | 201 | def maybe_parse_floats( 202 | arr: Array, 203 | threshold: float = 0.5, 204 | decimal: str | DecimalMode = DecimalMode.INFER, 205 | ) -> Array | None: 206 | """Parse valid string representations of floating point numbers.""" 207 | if decimal == DecimalMode.INFER: 208 | decimal = infer_decimal_delimiter(arr.drop_null()) 209 | if decimal is None: 210 | return None 211 | 212 | if isinstance(decimal, str) and decimal in ".,": 213 | clean, is_float, prop_valid = clean_float_strings(arr, decimal=decimal) 214 | elif decimal == DecimalMode.COMPARE: 215 | result_dot = clean_float_strings(arr, decimal=".") 216 | result_comma = clean_float_strings(arr, decimal=",") 217 | if result_dot[2] >= result_comma[2]: 218 | clean, is_float, prop_valid = result_dot 219 | else: 220 | clean, is_float, prop_valid = result_comma 221 | else: 222 | raise ValueError(f"Must have decimal char or one of ['infer', 'compare']! Got '{decimal}'.") 223 | 224 | if prop_valid < threshold: 225 | return None 226 | 227 | valid = pac.if_else(is_float, clean, None) # non-floats -> null 228 | valid = empty_to_null(valid) 229 | 230 | try: 231 | return pac.cast(valid, pa.float64()) 232 | except Exception as exc: 233 | LOG.error(exc) 234 | 235 | return None 236 | 237 | 238 | def maybe_truncate_floats(arr: Array, threshold: float = 1.0) -> Array | None: 239 | """Float to int conversion if sufficient values are kept unchanged.""" 240 | trunc = pac.trunc(arr) 241 | 242 | if proportion_equal(arr, trunc) < threshold: 243 | return None 244 | 245 | try: 246 | if pac.min(arr).as_py() >= 0: 247 | return pac.cast(trunc, pa.uint64()) 248 | 249 | return pac.cast(trunc, pa.int64()) 250 | except pa.ArrowInvalid as exc: 251 | LOG.error("Failed to convert floats to ints: " + str(exc)) 252 | return None 253 | 254 | 255 | def maybe_downcast_ints(arr: Array) -> Array | None: 256 | """Convert to smallest applicable int type.""" 257 | vmin, vmax = min_max(arr, skip_nulls=True) 258 | if (vmin is None) or (vmax is None): 259 | return None 260 | 261 | type = smallest_int_type(vmin, vmax) 262 | 263 | if type is not None: 264 | return pac.cast(arr, type) 265 | 266 | return None 267 | 268 | 269 | @dataclass 270 | @Registry.register 271 | class Downcast(Converter): 272 | """Attempts truncation of floats to ints and then downcasting of ints.""" 273 | 274 | def convert(self, array: Array) -> Conversion | None: 275 | if pat.is_floating(array.type): 276 | array = maybe_truncate_floats(array, self.threshold) 277 | if array is None: 278 | return None 279 | 280 | if pat.is_integer(array.type): 281 | result = maybe_downcast_ints(array) 282 | return Conversion(result) if result is not None else Conversion(array) 283 | 284 | return None 285 | 286 | 287 | @dataclass 288 | @Registry.register 289 | class Number(Converter): 290 | """Attempts to parse strings into floats or ints followed by downcasting.""" 291 | 292 | decimal: str | DecimalMode = 
DecimalMode.INFER 293 | allow_unsigned_int: bool = True 294 | max_int: int | None = None 295 | 296 | def convert(self, array: Array) -> Conversion | None: 297 | if pat.is_string(array.type): 298 | converted = maybe_parse_ints( 299 | array, 300 | threshold=self.threshold, 301 | allow_unsigned=self.allow_unsigned_int, 302 | ) 303 | 304 | if converted is None: 305 | converted = maybe_parse_floats( 306 | array, 307 | threshold=self.threshold, 308 | decimal=self.decimal, 309 | ) 310 | 311 | if converted is not None: 312 | downcast = Downcast().convert(converted) 313 | converted = downcast if downcast is not None else Conversion(converted) 314 | else: 315 | converted = Downcast().convert(array) 316 | 317 | if converted is None: 318 | return None 319 | 320 | if ( 321 | pat.is_integer(converted.result.type) 322 | and self.max_int is not None 323 | and (pac.max(converted.result).as_py() or 0) > self.max_int 324 | ): 325 | return None 326 | 327 | converted.meta = {"semantic": f"number[{dtype_name(converted.result)}]"} 328 | return converted 329 | -------------------------------------------------------------------------------- /lector/types/regex.py: -------------------------------------------------------------------------------- 1 | """Common regex patterns used in mutiple modules.""" 2 | 3 | RE_INT_SIGN = "^(?P[+-])?(?P[0-9]+)$" 4 | """Capture optional sign and numeric parts in integer strings.""" 5 | 6 | RE_IS_INT = r"^\+?\-?[0-9]+$" 7 | """Strings matching int representations we're able to parse.""" 8 | 9 | RE_IS_FLOAT = "^[-]?[0-9]*[.]?[0-9]*(?:[e][+-]?[0-9]+)?$" 10 | """Strings matching float representations convertable by Arrow. Allows ints too, 11 | but those should have been inferred before trying floats. 12 | """ 13 | 14 | RE_LIST_LIKE: str = r"^[\(\[\|][\s\S]*[\)\]\|]$" 15 | """Lists start and end with parenthesis-like characters.""" 16 | 17 | RE_LIST_CLEAN: str = r"^[\[\{\(\|<]|[\]\}\)\|>]$|\r?\n" 18 | """Remove all parenthesis-like characters from start and end as well as line breaks.""" 19 | 20 | 21 | RE_URL = ( 22 | r"^(http://www\.|https://www\.|http://|https://)?" # http:// or https:// 23 | r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|" # domain... 24 | r"localhost|" # localhost... 25 | r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})" # ...or ip 26 | r"(?::\d+)?" # optional port 27 | r"(?:/?|[/?]\S+)$" 28 | ) 29 | 30 | RE_TRAILING_DECIMALS: str = r"\.(\d+)$" 31 | """Strictly trailing, i.e. nothing after the decimals.""" 32 | 33 | RE_FRATIONAL_SECONDS: str = r"(?P\.\d+)" 34 | """Allows for timezone after fractional seconds, capturing part to be replaced.""" 35 | 36 | RE_TZ_OFFSET: str = r"(?PZ|[+-]\d{2}:?\d{2}$)" 37 | """Detect 4 digit timezone offsets, with or without colon.""" 38 | -------------------------------------------------------------------------------- /lector/types/strings.py: -------------------------------------------------------------------------------- 1 | """Helpers to convert to types that logically remain strings (e.g. categoricals). 
2 | 3 | TODO: 4 | 5 | - Find a fast way to recognize whitespaces with regex (see is_text) 6 | - Try faster early out for text recognition using sufficient_texts() 7 | - Make sure Sex creates an ordinal column with correct Female > Male order 8 | 9 | """ 10 | from __future__ import annotations 11 | 12 | from dataclasses import dataclass 13 | from enum import Enum 14 | from math import inf as INF 15 | 16 | import pyarrow.compute as pac 17 | import pyarrow.types as pat 18 | from pyarrow import Array 19 | 20 | from ..log import LOG 21 | from ..utils import Number, map_values, proportion_trueish, proportion_unique, sorted_value_counts 22 | from .abc import Conversion, Converter, Registry 23 | from .regex import RE_LIST_LIKE, RE_URL 24 | 25 | MAX_CARDINALITY: Number = 0.1 26 | """Maximum cardinalty for categoricals (arrow's default is 50 in ConvertOptions).""" 27 | 28 | TEXT_MIN_SPACES: Number = 2 29 | """Strings need to have this many spaces to be considered text.""" 30 | 31 | TEXT_MIN_LENGTH: Number = 15 32 | """Strings need to be this long to be considered text.""" 33 | 34 | TEXT_REJECT_LISTS: bool = True 35 | """Whether to count list-like strings as texts.""" 36 | 37 | TEXT_PROPORTION_THRESHOLD: float = 0.8 38 | """Infer text type if a proportion or values greater than this is text-like. """ 39 | 40 | 41 | def is_text( 42 | arr: Array, 43 | min_spaces: int = TEXT_MIN_SPACES, 44 | min_length: int = TEXT_MIN_LENGTH, 45 | reject_lists: bool = TEXT_REJECT_LISTS, 46 | ) -> bool: 47 | """Check for natural language-like texts using criteria like lengths, number of spaces.""" 48 | is_long = pac.greater_equal(pac.utf8_length(arr), min_length) 49 | # This regex seems to be very slow 50 | # has_spaces = pac.greater_equal(pac.count_substring_regex(arr, pattern=r"\s"), min_spaces) # noqa 51 | has_spaces = pac.greater_equal(pac.count_substring(arr, pattern=" "), min_spaces) 52 | textlike = pac.and_(is_long, has_spaces) 53 | 54 | if reject_lists: 55 | listlike = pac.match_substring_regex(arr, RE_LIST_LIKE) 56 | return pac.and_not(textlike, listlike) 57 | 58 | return textlike 59 | 60 | 61 | def proportion_text( 62 | arr: Array, 63 | min_spaces: int = TEXT_MIN_SPACES, 64 | min_length: int = TEXT_MIN_LENGTH, 65 | reject_lists: bool = TEXT_REJECT_LISTS, 66 | ) -> float: 67 | """Calculate proportion of natural language-like texts given criteria.""" 68 | is_txt = is_text(arr.drop_null(), min_spaces, min_length, reject_lists) 69 | return proportion_trueish(is_txt) 70 | 71 | 72 | def sufficient_texts( 73 | arr: Array, 74 | min_spaces: int = TEXT_MIN_SPACES, 75 | min_length: int = TEXT_MIN_LENGTH, 76 | reject_lists: bool = TEXT_REJECT_LISTS, 77 | threshold: float = 1.0, 78 | ) -> bool: 79 | """Check for natural language-like texts using criteria like lengths, number of spaces. 80 | 81 | This is different from above in that for each text condition, we can early out if the 82 | condition is not met, without evaluating the remaining conditions. I.e., should be faster. 
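    With the default criteria, a value such as "Natural language text is different
    from categorical data." counts as text (long enough and containing at least two
    spaces), while short labels like "red" or list-like strings such as "[a, b, c]"
    do not.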
83 | """ 84 | is_long = pac.greater_equal(pac.utf8_length(arr), min_length) 85 | if proportion_trueish(is_long) < threshold: 86 | return False 87 | 88 | # This regex seems to be very slow 89 | # has_spaces = pac.greater_equal(pac.count_substring_regex(arr, pattern=r"\s"), min_spaces) # noqa 90 | has_spaces = pac.greater_equal(pac.count_substring(arr, pattern=" "), min_spaces) 91 | if proportion_trueish(has_spaces) < threshold: 92 | return False 93 | 94 | if reject_lists: 95 | is_listlike = pac.match_substring_regex(arr, RE_LIST_LIKE) 96 | if proportion_trueish(is_listlike) > (1.0 - threshold): 97 | return False 98 | 99 | return True 100 | 101 | 102 | def proportion_url(arr: Array) -> float: 103 | """Use regex to find proportion of strings that are (web) URL-like.""" 104 | is_url = pac.match_substring_regex(arr.drop_null(), RE_URL, ignore_case=True) 105 | return proportion_trueish(is_url) 106 | 107 | 108 | def maybe_cast_category( 109 | arr: Array, 110 | max_cardinality: Number | None = MAX_CARDINALITY, 111 | ) -> Array | None: 112 | """Cast to categorical depending on cardinality and whether strings are text-like.""" 113 | 114 | if max_cardinality is None or max_cardinality == INF: 115 | return pac.dictionary_encode(arr) 116 | 117 | n_unique = pac.count_distinct(arr, mode="only_valid").as_py() 118 | 119 | if max_cardinality > 1: 120 | do_cast = n_unique <= max_cardinality 121 | elif max_cardinality > 0: 122 | n_valid = len(arr) - arr.null_count 123 | if n_valid == 0: 124 | do_cast = True 125 | else: 126 | do_cast = (n_unique / n_valid) <= max_cardinality 127 | else: 128 | do_cast = False 129 | 130 | if max_cardinality is None or do_cast: 131 | return pac.dictionary_encode(arr) 132 | 133 | return None 134 | 135 | 136 | @dataclass 137 | @Registry.register 138 | class Text(Converter): 139 | """Anything could be text, but we can enforce text-likeness and uniqueness.""" 140 | 141 | min_unique: float = 0.1 142 | 143 | def convert(self, array: Array) -> Conversion | None: 144 | if not pat.is_string(array.type): 145 | return None 146 | 147 | if proportion_unique(array) >= self.min_unique and proportion_text(array) >= self.threshold: 148 | # if sufficient_texts(array, self.threshold): 149 | return Conversion(array, meta={"semantic": "text"}) 150 | 151 | return None 152 | 153 | 154 | @dataclass 155 | @Registry.register 156 | class Url(Converter): 157 | """Anything could be text, but we can enforce text-likeness and uniqueness.""" 158 | 159 | def convert(self, array: Array) -> Conversion | None: 160 | if not pat.is_string(array.type): 161 | return None 162 | 163 | array = pac.utf8_trim_whitespace(array) 164 | if proportion_url(array) >= self.threshold: 165 | result = pac.dictionary_encode(array) 166 | return Conversion(result, meta={"semantic": "url"}) 167 | 168 | return None 169 | 170 | 171 | @dataclass 172 | @Registry.register 173 | class Category(Converter): 174 | """Anything could be text, but we can enforce text-likeness and uniqueness.""" 175 | 176 | max_cardinality: Number | None = MAX_CARDINALITY 177 | 178 | def convert(self, array: Array) -> Conversion | None: 179 | if not pat.is_string(array.type): 180 | return None 181 | 182 | result = maybe_cast_category(array, self.max_cardinality) 183 | return Conversion(result, meta={"semantic": "category"}) if result is not None else None 184 | 185 | 186 | class Sex(Enum): 187 | Female = 0 188 | Male = 1 189 | 190 | 191 | class SexMapper: 192 | """Infer values encoding a person's sex in a column and map to configurable labels.""" 193 | 194 | 
DEFAULT_VALUES = { 195 | Sex.Female: ["female", "f", "femenino", "mujer", "m"], 196 | Sex.Male: ["male", "m", "masculino", "hombre", "varón", "varon", "h", "v"], 197 | } 198 | 199 | def __init__(self, values: tuple[str, str], labels: dict[Sex, str] | None = None): 200 | self.labels = labels or {Sex.Female: "Female", Sex.Male: "Male"} 201 | self.infer_values(values) 202 | self.make_mapping() 203 | 204 | def infer_values(self, values: tuple[str, str]) -> dict: 205 | """Infer which values encode female/male categories.""" 206 | if len(values[0]) == 1 and len(values[1]) == 1 and "m" in values: 207 | f_label, m_label = self.labels[Sex.Female], self.labels[Sex.Male] 208 | 209 | if "f" in values: 210 | # male/female or masculino/femenino 211 | self.values = {Sex.Female: ["f", f_label], Sex.Male: ["m", m_label]} 212 | elif "v" in values: 213 | # mujer/varon 214 | self.values = {Sex.Female: ["m", f_label], Sex.Male: ["v", m_label]} 215 | elif "h" in values: 216 | # mujer/hombre 217 | self.values = {Sex.Female: ["m", f_label], Sex.Male: ["h", m_label]} 218 | 219 | self.values = self.DEFAULT_VALUES 220 | 221 | def make_mapping(self) -> dict[str, str]: 222 | """Create a mapping from inferred values to desired labels.""" 223 | ensure_list = lambda x: x if isinstance(x, list) else [x] 224 | self.map = {val: self.labels[sex] for sex in Sex for val in ensure_list(self.values[sex])} 225 | 226 | 227 | def maybe_sex(arr: Array) -> tuple[str, str] | None: 228 | """Check if the two most common values are sex-like and return them.""" 229 | top_n = 2 230 | lower = pac.utf8_lower(arr) 231 | top2 = sorted_value_counts(lower, top_n=top_n) 232 | values = top2.field("values").to_pylist() 233 | 234 | if len(values) == top_n: 235 | mapper = SexMapper(values) 236 | LOG.debug(f"Sex mapping: {mapper.map}") 237 | mapped = map_values(lower, mapper.map) 238 | return mapped.dictionary_encode() 239 | 240 | return arr 241 | -------------------------------------------------------------------------------- /lector/types/timestamps.py: -------------------------------------------------------------------------------- 1 | """Helpers to convert timestamp strings or time-like columns to timestamps. 2 | 3 | Arrow seems to be using this parser under the hood: 4 | https://pubs.opengroup.org/onlinepubs/009695399/functions/strptime.html 5 | 6 | in its compute.strptime function, which doesn't support timezone offsets via 7 | the %z or %Z directives. Though they do support timezones when importing CSVs 8 | or casting... 9 | 10 | For arrow internals relating to timestamps also see: 11 | 12 | - Timezone internals: 13 | https://arrow.apache.org/docs/cpp/api/datatype.html#_CPPv4N5arrow13TimestampTypeE 14 | - CSV parsing: 15 | https://arrow.apache.org/docs/cpp/csv.html#timestamp-inference-parsing 16 | - Timestamp umbrella issue: 17 | https://github.com/apache/arrow/issues/31324 18 | 19 | TODO: 20 | - Fractional seconds are handled manually, also see 21 | https://github.com/apache/arrow/issues/20146. They are first removed via regex, 22 | converted to a pyarrow duration type and later added to parsed timestamps. 23 | - Timezones are only supported in format "+0100", but not e.g. 
"+01:00" 24 | - What to do with mixed timezones: 25 | https://stackoverflow.com/questions/75656639/computing-date-features-using-pyarrow-on-mixed-timezone-data 26 | 27 | """ 28 | from __future__ import annotations 29 | 30 | from dataclasses import dataclass 31 | from functools import lru_cache 32 | from typing import ClassVar 33 | 34 | import pyarrow as pa 35 | import pyarrow.compute as pac 36 | import pyarrow.types as pat 37 | from pyarrow import Array, TimestampArray, TimestampScalar, TimestampType 38 | 39 | from ..log import LOG 40 | from ..utils import proportion_trueish 41 | from .abc import Conversion, Converter, Registry 42 | from .regex import RE_FRATIONAL_SECONDS, RE_TZ_OFFSET 43 | 44 | TIMESTAMP_FORMATS: list[str] = [ 45 | "%Y-%m-%dT%H:%M:%S", 46 | "%Y-%m-%dT%H:%M", 47 | "%Y-%m-%dT%I:%M:%S %p", 48 | "%Y-%m-%dT%I:%M %p", 49 | "%Y-%m-%d%n%H:%M:%S", 50 | "%Y-%m-%d%n%I:%M:%S %p", 51 | "%a %b %d %H:%M:%S %Y", 52 | "%a %b %d %I:%M:%S %p %Y", 53 | "%a %d %b %H:%M:%S %Y", 54 | "%a %d %b %I:%M:%S %p %Y", 55 | "%a, %b %d %H:%M:%S %Y", 56 | "%a, %b %d %I:%M:%S %p %Y", 57 | "%a, %d %b %H:%M:%S %Y", 58 | "%a, %d %b %I:%M:%S %p %Y", 59 | "%a %d %b %Y %H:%M:%S", 60 | "%a %d %b %Y %I:%M:%S %p", 61 | "%a, %d %b %Y %H:%M:%S", 62 | "%a, %d %b %Y %I:%M:%S %p", 63 | "%a %b %d %H:%M:%S %z %Y", 64 | ] 65 | 66 | DATE_FORMATS: list[str] = [ 67 | "%d-%m-%y", # %y first since it will fail with 4 digit years, 68 | "%d/%m/%y", # while %Y will not fail on 2 digit years(!) 69 | "%Y-%m-%d", 70 | "%d-%m-%Y", 71 | "%Y/%m/%d", 72 | "%d/%m/%Y", 73 | "%m/%d/%Y", 74 | "%a %d %b %Y", 75 | "%a, %d %b %Y", 76 | ] 77 | 78 | ISO_FORMAT: str = "ISO8601()" 79 | """String Arrow recognizes as meaning the ISO format.""" 80 | 81 | UNIT = "ns" 82 | """Note that pandas internal unit is fixed to nanoseconds, and with that resolution it can 83 | represent a much smaller period of dates only.""" 84 | 85 | 86 | def timestamp_formats(tz: bool = True) -> list[str]: 87 | formats = TIMESTAMP_FORMATS 88 | if tz: 89 | with_tz = lambda fmt: (fmt, fmt + " %z", fmt + " Z", fmt + " UTC") 90 | formats = [ext for fmt in formats for ext in with_tz(fmt)] 91 | formats.extend(DATE_FORMATS) 92 | return [ISO_FORMAT] + formats 93 | 94 | 95 | ALL_FORMATS: list[str] = timestamp_formats() 96 | """All formats tried by default if None is explicitly provided when converting.""" 97 | 98 | 99 | def proportion_fractional_seconds(arr: Array) -> float: 100 | """Proportion of non-null dates in arr having fractional seconds.""" 101 | valid = arr.drop_null() 102 | has_frac = pac.match_substring_regex(valid, RE_FRATIONAL_SECONDS) 103 | return proportion_trueish(has_frac) 104 | 105 | 106 | def fraction_as_duration(arr: Array) -> Array: 107 | """Convert an array (of strings) representing fractional seconds to duration type.""" 108 | 109 | if pat.is_string(arr.type): 110 | arr = pac.cast(arr, pa.float64()) 111 | 112 | if pat.is_floating(arr.type): 113 | # Assume values in [0,1]: convert to nanoseconds 114 | arr = pac.multiply(arr, 1e9) 115 | arr = pac.trunc(arr) 116 | arr = pac.cast(arr, pa.int64()) 117 | 118 | return pac.cast(arr, pa.duration("ns")) 119 | 120 | 121 | def extract_timezone(timestamps: pa.Array): 122 | """Extract timezone from a list of string timestamps. 123 | 124 | Currently, the only supported format is +/-HH[:]MM, e.g. +0100. 125 | 126 | Also, returns None if there are multiple different offsets, after 127 | some basic cleaning. E.g. Z and +0000 are considered the same. 
128 | """ 129 | res = pac.extract_regex(timestamps, RE_TZ_OFFSET) 130 | res = res.drop_null() 131 | 132 | if not len(res): 133 | return None 134 | 135 | offsets = pac.struct_field(res, indices=0) 136 | offsets = pac.replace_substring(offsets, ":", "") 137 | offsets = pac.replace_substring(offsets, "Z", "+0000") 138 | offsets = offsets.unique() 139 | 140 | if len(offsets) > 1: 141 | return None 142 | 143 | offset = offsets[0].as_py() 144 | return f"{offset[:-2]}:{offset[-2:]}" 145 | 146 | 147 | @lru_cache(maxsize=128, typed=False) 148 | def find_format(ts: TimestampScalar) -> str | None: 149 | """Try to find the first format that can parse given date.""" 150 | if pac.is_null(ts).as_py(): 151 | return None 152 | 153 | for fmt in ALL_FORMATS: 154 | try: 155 | pac.strptime(ts, format=fmt, unit="s") 156 | return fmt 157 | except Exception: # noqa: S112 158 | continue 159 | 160 | return None 161 | 162 | 163 | def maybe_parse_known_timestamps( 164 | arr: Array, 165 | format: str, 166 | unit: str = UNIT, 167 | threshold: float = 1.0, 168 | ) -> Array | None: 169 | """Helper for parsing with known format and no fractional seconds.""" 170 | 171 | if threshold == 1.0: # noqa: PLR2004 172 | try: 173 | return pac.strptime(arr, format=format, unit=unit) 174 | except Exception: 175 | return None 176 | 177 | valid_before = len(arr) - arr.null_count 178 | result = pac.strptime(arr, format=format, unit=unit, error_is_null=True) 179 | valid_after = len(result) - result.null_count 180 | 181 | if (valid_after / valid_before) < threshold: 182 | return None 183 | 184 | return result 185 | 186 | 187 | def maybe_parse_timestamps( 188 | arr: Array, 189 | format: str | None = None, 190 | unit: str = UNIT, 191 | threshold: float = 1.0, 192 | return_format: bool = False, 193 | ) -> Array | None: 194 | """Parse lists of strings as dates with format inference.""" 195 | min_prop_frac_secs = 0.1 196 | 197 | if proportion_fractional_seconds(arr) > min_prop_frac_secs: 198 | frac = pac.extract_regex(arr, RE_FRATIONAL_SECONDS) 199 | frac = pac.struct_field(frac, indices=[0]) 200 | frac = fraction_as_duration(frac) 201 | arr = pac.replace_substring_regex(arr, RE_FRATIONAL_SECONDS, "") 202 | else: 203 | frac = None 204 | 205 | if format is None: 206 | formats = ALL_FORMATS 207 | valid = arr.drop_null() 208 | 209 | if len(valid) > 0: 210 | first_date = valid[0] 211 | first_format = find_format(first_date) 212 | if first_format is not None: 213 | LOG.info(f"Found date format '{first_format}'") 214 | formats = ALL_FORMATS.copy() 215 | formats.remove(first_format) 216 | formats.insert(0, first_format) 217 | 218 | else: 219 | formats = [format] 220 | 221 | for fmt in formats: 222 | result = maybe_parse_known_timestamps(arr, format=fmt, unit=unit, threshold=threshold) 223 | if result is not None: 224 | if frac is not None: 225 | result = pac.add(result, frac) 226 | return (result, fmt) if return_format else result 227 | 228 | return None 229 | 230 | 231 | @dataclass 232 | @Registry.register 233 | class Timestamp(Converter): 234 | """Convert string or time/date-like arrays to timestamp type. 235 | 236 | Note: Arrow will always _parse_ either into UTC or timezone-naive 237 | timestamps, but never into specific timezones other than UTC 238 | by default. Also, internally all timestamps are represented as UTC. 239 | The timezone metadata is then used by other functions to correctly 240 | extract for example the local day of the week, time etc. 
241 | 242 | Non-UTC timestamps can only be created by specifying the TimestampType 243 | explicitly, or using the assume_timezone function. 244 | 245 | When converting to pandas, the timezone is handled correctly. 246 | 247 | When input strings have no explicit timezone information, uses `tz` 248 | parameter to interpret them as local to that tz. If tz=None, keeps 249 | them as timezone-naive timestamps. If input strings do have explicit 250 | timezone information, will be represented internally as UTC (as always), 251 | and simply set the tz metadata so that component extraction etc. will 252 | use correctly localized moments in time. 253 | 254 | TZ-naive timestamps ["2013-07-17 05:00", "2013-07-17 02:00"]: 255 | 256 | - assume_timezone(NY): interprets input timestamps as local to tz, 257 | converts and stores them as UTC, and keeps tz metadata for 258 | correct localization when printing/extracting components. I.e., 259 | will convert to [2013-07-17 09:00:00, 2013-07-17 06:00:00] UTC, 260 | but when needed, will localize on demand to 261 | [2013-07-17 05:00:00-04:00 2013-07-17 02:00:00-04:00]. 262 | 263 | - cast with timezone(NY): interprets input timestamps as local to UTC, 264 | and stores the tz as metadata for on-demand localization. I.e., 265 | timestamps will be [2013-07-17 05:00:00, 2013-07-17 02:00:00] UTC, 266 | and when needed will localize on demand to 267 | [2013-07-17 01:00:00-04:00 2013-07-16 22:00:00-04:00]. 268 | 269 | TZ-aware timestamps ["2013-07-17 05:00", "2013-07-17 02:00"] UTC: 270 | 271 | - cast with timezone(NY): since input timestamps internally are already 272 | always in UTC, keeps them as UTC ["2013-07-17 05:00", "2013-07-17 02:00"], 273 | but localizes to cast tz on demand, i.e. to 274 | [2013-07-17 01:00:00-04:00 2013-07-16 22:00:00-04:00]. 275 | """ 276 | 277 | format: str | None = None 278 | """When None, default formats are tried in order.""" 279 | unit: str = UNIT 280 | """Resolution the timestamps are stored with internally.""" 281 | tz: str | None = None 282 | """The desired timezone of the timestamps.""" 283 | convert_temporal: bool = True 284 | """Whether time/date-only arrays should be converted to timestamps.""" 285 | 286 | DEFAULT_TZ: ClassVar[str] = "UTC" 287 | 288 | @staticmethod 289 | def meta(dt: TimestampType) -> dict[str, str]: 290 | tz = f", {dt.tz}" if dt.tz is not None else "" 291 | return {"semantic": f"date[{dt.unit}{tz}]"} 292 | 293 | @staticmethod 294 | def to_timezone(array: TimestampArray, tz: str | None) -> TimestampArray: 295 | if tz is not None: 296 | if array.type.tz is None: 297 | # Interpret as local moments in given timezone to convert to UTC equivalent 298 | return pac.assume_timezone( 299 | array, timezone=tz, ambiguous="earliest", nonexistent="earliest" 300 | ) 301 | 302 | # Keep UTC internally, simply change what local time is assumed in temporal functions 303 | return array.cast(pa.timestamp(unit=array.type.unit, tz=tz)) 304 | 305 | if array.type.tz is not None: 306 | # Make local timezone-naive. Careful: the following will make the timestamps 307 | # naive, but with local time in UTC, not using the existing timezone metadata! 
308 | # return array.cast(pa.timestamp(unit=array.type.unit, tz=None)) # noqa: ERA001 309 | raise NotImplementedError("Pyarrow's to_local() will not be implemented until v12.0!") 310 | 311 | # Keep as timezone-naive timestamps 312 | return array 313 | 314 | def convert_date_time(self, array: Array) -> Conversion | None: 315 | try: 316 | result = array.cast(pa.timestamp(unit=self.unit), safe=False) 317 | result = self.to_timezone(result, self.tz or self.DEFAULT_TZ) 318 | return Conversion(result, self.meta(result.type)) 319 | except pa.ArrowNotImplementedError: 320 | LOG.error(f"Pyarrow cannot convert {array.type} to timestamp!") 321 | return None 322 | 323 | def convert_timestamp(self, array: Array) -> Conversion | None: 324 | result = array 325 | if array.type.unit != self.unit: 326 | result = array.cast(pa.timestamp(unit=self.unit), safe=False) 327 | 328 | result = self.to_timezone(result, self.tz or self.DEFAULT_TZ) 329 | return Conversion(result, self.meta(result.type)) 330 | 331 | def convert_strings(self, array: Array) -> Conversion | None: 332 | try: 333 | # Pyarrow's strptime behaves different from its internal cast and inference. Only the 334 | # latter support timezone offset. So try cast first, and then strptime-based conversion. 335 | result = array.cast(pa.timestamp(unit=self.unit)) 336 | except pa.ArrowInvalid: 337 | try: 338 | result = array.cast(pa.timestamp(unit=self.unit, tz="UTC")) 339 | except pa.ArrowInvalid: 340 | result = None 341 | 342 | if result is not None: 343 | tz = self.tz or extract_timezone(array) 344 | result = self.to_timezone(result, tz or self.DEFAULT_TZ) 345 | return Conversion(result, self.meta(result.type) | {"format": "arrow"}) 346 | 347 | result = maybe_parse_timestamps( 348 | array, 349 | format=self.format, 350 | unit=self.unit, 351 | threshold=self.threshold, 352 | return_format=True, 353 | ) 354 | 355 | if result is not None: 356 | result, format = result 357 | result = self.to_timezone(result, self.tz or self.DEFAULT_TZ) 358 | return Conversion(result, self.meta(result.type) | {"format": format}) 359 | 360 | return None 361 | 362 | def convert(self, array: Array) -> Conversion | None: 363 | if (pat.is_time(array.type) or pat.is_date(array.type)) and self.convert_temporal: 364 | return self.convert_date_time(array) 365 | 366 | if pat.is_timestamp(array.type): 367 | return self.convert_timestamp(array) 368 | 369 | return self.convert_strings(array) if pat.is_string(array.type) else None 370 | -------------------------------------------------------------------------------- /lector/utils.py: -------------------------------------------------------------------------------- 1 | """Common helpers to work with pyarrow objects.""" 2 | 3 | from __future__ import annotations 4 | 5 | import json 6 | from collections import namedtuple 7 | from collections.abc import Callable, Iterator, Sequence 8 | from contextlib import contextmanager 9 | from functools import singledispatch 10 | from time import perf_counter 11 | from typing import Any, Union 12 | 13 | import pyarrow as pa 14 | from pyarrow import ( 15 | Array, 16 | ChunkedArray, 17 | DataType, 18 | Schema, 19 | Table, 20 | type_for_alias, # noqa: F401 21 | ) 22 | from pyarrow import compute as pac 23 | from pyarrow import types as pat 24 | from pyarrow.lib import ensure_type # noqa: F401 25 | 26 | try: 27 | import pandas as pd 28 | 29 | PANDAS_INSTALLED = True 30 | except Exception: 31 | PANDAS_INSTALLED = False 32 | 33 | 34 | Number = Union[int, float] 35 | 36 | Limit = namedtuple("Limit", "min,max") 37 
| 38 | INT_LIMITS: dict[str, Limit] = { 39 | "int8": Limit(-128, 127), 40 | "int16": Limit(-32_768, 32_767), 41 | "int32": Limit(-2_147_483_648, 2_147_483_647), 42 | "int64": Limit(-9_223_372_036_854_775_808, 9_223_372_036_854_775_807), 43 | "uint8": Limit(0, 255), 44 | "uint16": Limit(0, 65_535), 45 | "uint32": Limit(0, 4_294_967_295), 46 | "uint64": Limit(0, 18_446_744_073_709_551_615), 47 | } 48 | """Minimum and maximum for each integer subtype.""" 49 | 50 | MISSING_STRINGS: set[str] = { 51 | "#N/A", 52 | "#N/A N/A", 53 | "#NA", 54 | "-1.#IND", 55 | "-1.#INF", 56 | "-1.#QNAN", 57 | "1.#IND", 58 | "1.#INF", 59 | "1.#INF000000", 60 | "1.#QNAN", 61 | "-NaN", 62 | "-nan", 63 | "", 64 | "N/A", 65 | "n/a", 66 | "NA", 67 | "NAN", 68 | "NaN", 69 | "nan", 70 | "NULL", 71 | "Null", 72 | "null", 73 | # Would expect this to happen automatically, but not the case 74 | # (at least when Arrow reads CSV with types="string") 75 | "", 76 | } 77 | """Extension of pandas and arrow default missing values.""" 78 | 79 | 80 | @contextmanager 81 | def reset_buffer(buffer): 82 | """Caches and resets buffer position.""" 83 | cursor = buffer.tell() 84 | yield 85 | buffer.seek(cursor) 86 | 87 | 88 | def smallest_int_type(vmin: Number, vmax: Number) -> str | None: 89 | """Find the smallest int type able to hold vmin and vmax.""" 90 | 91 | if vmin >= 0: 92 | types = ["uint8", "uint16", "uint32"] 93 | else: 94 | types = ["int8", "int16", "int32"] 95 | 96 | for type in types: 97 | limits = INT_LIMITS[type] 98 | if vmin >= limits.min and vmax <= limits.max: 99 | return type 100 | 101 | return None 102 | 103 | 104 | def dtype_name(arr: Array): 105 | """Return a pandas-compatible type name including extension types where possible.""" 106 | type = arr.type 107 | name = str(type) 108 | 109 | if pat.is_integer(type) and arr.null_count > 0: 110 | name = name.replace("i", "I").replace("u", "U") 111 | 112 | return name 113 | 114 | 115 | def min_max(arr: Array, skip_nulls: bool = True) -> tuple[Number, Number]: 116 | """Wrapper to get minimum and maximum in arrow array as python tuple.""" 117 | mm = pac.min_max(arr, skip_nulls=skip_nulls).as_py() 118 | return mm["min"], mm["max"] 119 | 120 | 121 | def proportion_valid(arr: Array) -> float: 122 | """Proportion of non-null values in array.""" 123 | size = len(arr) 124 | return (size - arr.null_count) / size 125 | 126 | 127 | def proportion_unique(arr: Array) -> float: 128 | """Proportion of non-null values that are unique in array.""" 129 | n_valid = len(arr) - arr.null_count 130 | 131 | if n_valid == 0: 132 | return 0 133 | 134 | n_unique = pac.count_distinct(arr, mode="only_valid").as_py() 135 | return n_unique / n_valid 136 | 137 | 138 | def proportion_trueish(arr: Array) -> float: 139 | if len(arr) == 0: 140 | # Still means we had no trueish values 141 | return 0 142 | 143 | n_trueish = pac.sum(arr).as_py() or 0 # may return None otherwise, which we consider falsish 144 | return n_trueish / len(arr) 145 | 146 | 147 | def proportion_equal(arr1: Array, arr2: Array, ignore_nulls=True) -> float: 148 | """Proportion of equal values, optionally ignoring nulls (which otherwise compare falsish.""" 149 | equal = pac.equal(arr1, arr2) 150 | if ignore_nulls: 151 | equal = equal.drop_null() 152 | 153 | return proportion_trueish(equal) 154 | 155 | 156 | def empty_to_null(arr: Array) -> Array: 157 | """Convert empty strings to null values.""" 158 | is_empty = pac.equal(arr, "") 159 | return pac.if_else(is_empty, None, arr) 160 | 161 | 162 | def sorted_value_counts(arr: Array, order: str = 
"descending", top_n: int | None = None) -> Array: 163 | """Arrow's built-in value count doesn't allow sorting.""" 164 | valcnt = arr.value_counts() 165 | counts = valcnt.field("counts") 166 | order = pac.array_sort_indices(counts, order="descending") 167 | if top_n is None: 168 | return valcnt.take(order) 169 | 170 | return valcnt.take(order[:top_n]) 171 | 172 | 173 | def map_values(arr: Array, map: dict, unknown: str = "keep") -> Array: 174 | """Slow value mapping in pure Python while Arrow doesn't have a native compute function. 175 | 176 | For now assumes type can be left unchanged. 177 | """ 178 | values = arr.to_pylist() 179 | 180 | if unknown == "keep": 181 | values = [map.get(val, val) for val in values] 182 | else: 183 | values = [map.get(val) for val in values] 184 | 185 | return pa.array(values, type=arr.type) 186 | 187 | 188 | def categories(array: Array | ChunkedArray) -> Array: 189 | """Returns an array containing categories in input array of dictionary type.""" 190 | 191 | if not pat.is_dictionary(array.type): 192 | raise TypeError("Must have an array with dictionary type!") 193 | 194 | if isinstance(array, ChunkedArray): 195 | array = array.unify_dictionaries() 196 | return array.chunk(0).dictionary 197 | 198 | return array.dictionary 199 | 200 | 201 | def is_stringy(type: DataType) -> bool: 202 | """Check if array is stringy (string or dictionary of strings).""" 203 | if pat.is_string(type): 204 | return True 205 | 206 | return pat.is_dictionary(type) and pat.is_string(type.value_type) 207 | 208 | 209 | def with_flatten(arr: Array, func: Callable): 210 | """Apply a compute function to all elements of flattened (and restored) lists.""" 211 | isna = pac.is_null(arr) 212 | flat = pac.list_flatten(arr) 213 | transformed = func(flat) 214 | nested = pa.ListArray.from_arrays(arr.offsets, transformed) 215 | return pac.if_else(isna, None, nested) 216 | 217 | 218 | def schema_diff(s1: Schema, s2: Schema) -> dict[str, tuple[DataType, DataType]]: 219 | """Check differences in schema's column types.""" 220 | diff = {} 221 | 222 | for field in s1: 223 | other = s2.field(field.name) 224 | if field.type != other.type: 225 | diff[field.name] = (field.type, other.type) 226 | 227 | return diff 228 | 229 | 230 | def encode_metadata(d: dict): 231 | """Json-byte-encode a dict, like Arrow expects its metadata.""" 232 | return {k.encode("utf-8"): json.dumps(v).encode("utf-8") for k, v in d.items()} 233 | 234 | 235 | def maybe_load_json(s: str) -> Any: 236 | """Try to load a string as json, returning the original string if it fails.""" 237 | try: 238 | return json.loads(s) 239 | except (json.JSONDecodeError, TypeError): 240 | return s 241 | 242 | 243 | def decode_metadata(d: dict): 244 | """Decode Arrow metadata to dict.""" 245 | return {k.decode("utf-8"): maybe_load_json(v.decode("utf-8")) for k, v in d.items()} 246 | 247 | 248 | class Timer: 249 | def __enter__(self): 250 | self.start = perf_counter() 251 | return self 252 | 253 | def __exit__(self, type, value, traceback): 254 | self.end = perf_counter() 255 | self.elapsed = self.end - self.start 256 | 257 | 258 | if PANDAS_INSTALLED: 259 | # Arrow currently doesn't have any way to map its integer types to pandas 260 | # extension dtypes depending on whether a columns has missing values or not 261 | 262 | @singledispatch 263 | def to_pandas(array: Array): 264 | """Proper conversion allowing pandas extension types.""" 265 | 266 | atype = array.type 267 | 268 | if pat.is_string(atype): 269 | return array.to_pandas().astype("string") 270 | 271 | if 
pat.is_boolean(atype): 272 | return array.to_pandas().astype("boolean") 273 | 274 | if pat.is_integer(atype) and array.null_count > 0: 275 | dtype_name = str(atype).replace("i", "I").replace("u", "U") 276 | return array.to_pandas(integer_object_nulls=True).astype(dtype=dtype_name) 277 | 278 | return array.to_pandas() 279 | 280 | @to_pandas.register 281 | def _(table: Table): 282 | columns = [to_pandas(array) for array in table] 283 | df = pd.concat(columns, axis=1) 284 | df.columns = table.column_names 285 | return df 286 | 287 | 288 | def uniquify(items: Sequence[str]) -> Iterator[str]: 289 | """Add suffixes to inputs strings if necessary to ensure is item is unique.""" 290 | seen = set() 291 | 292 | for item in items: 293 | newitem = item 294 | 295 | suffix = 0 296 | while newitem in seen: 297 | suffix += 1 298 | newitem = f"{item}_{suffix}" 299 | 300 | seen.add(newitem) 301 | yield newitem 302 | -------------------------------------------------------------------------------- /notebooks/example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 22, 6 | "id": "58eb08f3", 7 | "metadata": { 8 | "ExecuteTime": { 9 | "end_time": "2022-12-12T16:38:57.788998Z", 10 | "start_time": "2022-12-12T16:38:57.782691Z" 11 | } 12 | }, 13 | "outputs": [], 14 | "source": [ 15 | "import lector\n", 16 | "import pandas as pd" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 23, 22 | "id": "5230ccf5", 23 | "metadata": { 24 | "ExecuteTime": { 25 | "end_time": "2022-12-12T16:38:59.183049Z", 26 | "start_time": "2022-12-12T16:38:59.177918Z" 27 | } 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "csv = \"\"\"\n", 32 | "Some preamble content here\n", 33 | "This is still \"part of the metadata preamble\"\n", 34 | "id;genre;metric;count;content;website;tags;vecs;date\n", 35 | "1234982348728374;a;0.1;1;; http://www.graphext.com;\"[a,b,c]\";\"[1.3, 1.4, 1.67]\";11/10/2022\n", 36 | ";b;0.12;;\"Natural language text is different from categorical data.\"; https://www.twitter.com;[d];\"[0, 1.9423]\";01/10/2022\n", 37 | "9007199254740993;a;3.14;3;\"The Project · Gutenberg » EBook « of Die Fürstin.\";http://www.google.com;\"['e', 'f']\";[\"84.234, 12509.99\"];13/10/2021\n", 38 | "\"\"\".encode(\"ISO-8859-1\")\n", 39 | "\n", 40 | "with open(\"example.csv\", \"wb\") as fp:\n", 41 | " fp.write(csv)" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 24, 47 | "id": "91850fdc", 48 | "metadata": { 49 | "ExecuteTime": { 50 | "end_time": "2022-12-12T16:39:01.422026Z", 51 | "start_time": "2022-12-12T16:39:01.366891Z" 52 | } 53 | }, 54 | "outputs": [ 55 | { 56 | "data": { 57 | "text/html": [ 58 | "
\n",
 59 |        "
\n" 60 | ], 61 | "text/plain": [ 62 | "\n" 63 | ] 64 | }, 65 | "metadata": {}, 66 | "output_type": "display_data" 67 | }, 68 | { 69 | "name": "stdout", 70 | "output_type": "stream", 71 | "text": [ 72 | " id genre metric count \\\n", 73 | "0 1234982348728374 a 0.10 1 \n", 74 | "1 b 0.12 \n", 75 | "2 9007199254740993 a 3.14 3 \n", 76 | "\n", 77 | " content website \\\n", 78 | "0 http://www.graphext.com \n", 79 | "1 Natural language text is different from catego... https://www.twitter.com \n", 80 | "2 The Project · Gutenberg » EBook « of Die Fürstin. http://www.google.com \n", 81 | "\n", 82 | " tags vecs date \n", 83 | "0 [a, b, c] [1.3, 1.4, 1.67] 2022-10-11 \n", 84 | "1 [d] [0.0, 1.9423] 2022-10-01 \n", 85 | "2 [e, f] [84.234, 12509.99] 2021-10-13 \n", 86 | "\n", 87 | " id Int64\n", 88 | "genre category\n", 89 | "metric float64\n", 90 | "count UInt8\n", 91 | "content string\n", 92 | "website category\n", 93 | "tags object\n", 94 | "vecs object\n", 95 | "date datetime64[ns]\n", 96 | "dtype: object\n" 97 | ] 98 | } 99 | ], 100 | "source": [ 101 | "df = lector.read_csv(\"example.csv\", to_pandas=True)\n", 102 | "print(df)\n", 103 | "print(\"\\n\", df.dtypes)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 26, 109 | "id": "b8c6261e", 110 | "metadata": { 111 | "ExecuteTime": { 112 | "end_time": "2022-12-12T16:39:16.280641Z", 113 | "start_time": "2022-12-12T16:39:16.262108Z" 114 | } 115 | }, 116 | "outputs": [ 117 | { 118 | "name": "stdout", 119 | "output_type": "stream", 120 | "text": [ 121 | " id genre metric count \\\n", 122 | "0 1234982348728374 a 0.10 1 \n", 123 | "1 b 0.12 \n", 124 | "2 9007199254740992 a 3.14 3 \n", 125 | "\n", 126 | " content \\\n", 127 | "0 \n", 128 | "1 Natural language text is different from catego... \n", 129 | "2 The Project · Gutenberg » EBook « of Die Fürstin. \n", 130 | "\n", 131 | " website tags vecs date \n", 132 | "0 http://www.graphext.com [a,b,c] [1.3, 1.4, 1.67] 2022-11-10 \n", 133 | "1 https://www.twitter.com [d] [0, 1.9423] 2022-01-10 \n", 134 | "2 http://www.google.com ['e', 'f'] [\"84.234, 12509.99\"] 2021-10-13 \n", 135 | "\n", 136 | " id Int64\n", 137 | "genre category\n", 138 | "metric float64\n", 139 | "count UInt8\n", 140 | "content string\n", 141 | "website category\n", 142 | "tags object\n", 143 | "vecs object\n", 144 | "date datetime64[ns]\n", 145 | "dtype: object\n" 146 | ] 147 | }, 148 | { 149 | "name": "stderr", 150 | "output_type": "stream", 151 | "text": [ 152 | "/var/folders/b2/mplwbs6j15976g80v9bw7c4r0000gn/T/ipykernel_80150/366501397.py:14: UserWarning: Parsing dates in DD/MM/YYYY format when dayfirst=False (the default) was specified. This may lead to inconsistently parsed dates! 
Specify a format to ensure consistent parsing.\n", 153 | " df = pd.read_csv(\n" 154 | ] 155 | } 156 | ], 157 | "source": [ 158 | "import pandas as pd\n", 159 | "\n", 160 | "dtypes = {\n", 161 | " \"id\": \"Int64\",\n", 162 | " \"genre\": \"category\",\n", 163 | " \"metric\": \"float\",\n", 164 | " \"count\": \"UInt8\", \n", 165 | " \"content\": \"string\",\n", 166 | " \"website\": \"category\",\n", 167 | " \"tags\": \"object\",\n", 168 | " \"vecs\": \"object\"\n", 169 | "}\n", 170 | "\n", 171 | "df = pd.read_csv(\n", 172 | " \"example.csv\",\n", 173 | " encoding=\"ISO-8859-1\",\n", 174 | " skiprows=3,\n", 175 | " sep=\";\",\n", 176 | " dtype=dtypes,\n", 177 | " parse_dates=[\"date\"],\n", 178 | " infer_datetime_format=True\n", 179 | ")\n", 180 | "\n", 181 | "print(df)\n", 182 | "print(\"\\n\", df.dtypes)" 183 | ] 184 | } 185 | ], 186 | "metadata": { 187 | "kernelspec": { 188 | "display_name": "lector", 189 | "language": "python", 190 | "name": "lector" 191 | }, 192 | "language_info": { 193 | "codemirror_mode": { 194 | "name": "ipython", 195 | "version": 3 196 | }, 197 | "file_extension": ".py", 198 | "mimetype": "text/x-python", 199 | "name": "python", 200 | "nbconvert_exporter": "python", 201 | "pygments_lexer": "ipython3", 202 | "version": "3.10.8" 203 | } 204 | }, 205 | "nbformat": 4, 206 | "nbformat_minor": 5 207 | } 208 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.coverage.run] 6 | omit = [ 7 | "*test/*", 8 | ] 9 | 10 | [tool.ruff] 11 | target-version = "py39" 12 | line-length = 100 13 | 14 | src = ["libs/*"] 15 | 16 | select = [ 17 | "E", # Pycodestyle errors 18 | "W", # Pycodestyle warnings 19 | "F", # Pyflakes 20 | "UP", # Pyupgrade 21 | "S", # Bandit (security), 22 | "B", # Bugbear 23 | "C4", # Comprehensions 24 | "ISC", # Implicit string concat 25 | "ICN", # Import conventions 26 | "I001", # isort 27 | "Q", # Quotes 28 | "SIM", # Simplify 29 | "ERA", # eradicate (commented out code) 30 | "PIE", 31 | "PD", # Pandas 32 | "PLC", 33 | "PLE", 34 | "PLR", 35 | "PLW", # Pylint extensions 36 | "RET", # Return statements 37 | ] 38 | 39 | # Never enforce these 40 | ignore = [ 41 | "E731", # Do not assign a lambda expression, use a def 42 | "E741", # Ambiguous variable names, e.g. l/1, O/0. Just use appropriate font. 43 | "D", # All docstrings for now. 
44 | "S101", # Don't warn for asserts in pytest 45 | "S105", # HardcodedPasswordString 46 | "S106", # HardcodedPasswordFuncArg 47 | "S107", # HardcodedPasswordDefault 48 | "PD901", # df as variable name 49 | "PD011", # Use .to_numpy() instead of .values 50 | "PD015", # Use .merge instead of pd.merge 51 | "PLR0913", # Too many arguments (in function signatures) 52 | "RET501", # explicit `return None` if only possible return value 53 | "RET504", # Unnecessary variable assignment before `return` statement 54 | "SIM108", # Ternary instead of if/else (doesn't take into account line length) 55 | ] 56 | 57 | # Never try to fix these 58 | unfixable = [ 59 | "ERA001", # Don't automatically delete commented out code 60 | ] 61 | 62 | exclude = [ 63 | ".bzr", 64 | ".direnv", 65 | ".eggs", 66 | ".git", 67 | ".hg", 68 | ".mypy_cache", 69 | ".nox", 70 | ".pants.d", 71 | ".ruff_cache", 72 | ".svn", 73 | ".tox", 74 | ".venv", 75 | "__pypackages__", 76 | "_build", 77 | "buck-out", 78 | "build", 79 | "dist", 80 | "node_modules", 81 | "venv", 82 | ] 83 | 84 | # Allow unused variables when underscore-prefixed. 85 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" 86 | 87 | [tool.ruff.per-file-ignores] 88 | # "__init__.py" = ["E402"] 89 | # "test/" = ["S101"] # Don't warn for asserts in pytest, doesn't work for paths 90 | 91 | [tool.ruff.flake8-annotations] 92 | mypy-init-return = true 93 | 94 | [tool.ruff.flake8-quotes] 95 | docstring-quotes = "double" 96 | 97 | [tool.ruff.flake8-import-conventions] 98 | [tool.ruff.flake8-import-conventions.extend-aliases] 99 | "dask.dataframe" = "dd" 100 | "pyarrow" = "pa" 101 | 102 | [tool.ruff.mccabe] 103 | # Unlike Flake8, default to a complexity level of 10. 104 | max-complexity = 10 105 | 106 | [tool.ruff.isort] 107 | known-first-party = ["lector"] 108 | 109 | [tool.ruff.flake8-bugbear] 110 | extend-immutable-calls = ["typer.Option"] 111 | 112 | [tool.ruff.format] 113 | quote-style = "double" 114 | indent-style = "space" 115 | -------------------------------------------------------------------------------- /recipe/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set name = "lector" %} 2 | {% set version = "0.3.5" %} 3 | 4 | package: 5 | name: {{ name|lower }} 6 | version: {{ version }} 7 | 8 | source: 9 | path: .. 10 | 11 | build: 12 | number: 0 13 | skip: True # [win] 14 | script: "{{ PYTHON }} -m pip install . --no-deps --ignore-installed -vv " 15 | 16 | requirements: 17 | host: 18 | - python >=3.9,{{PY_VER}}* 19 | - pip 20 | - setuptools 21 | - wheel 22 | run: 23 | - python {{PY_VER}}* 24 | - cchardet 25 | - clevercsv <0.8.1 26 | - msgspec 27 | - pyarrow>=15.0.0 28 | - rich 29 | - tqdm 30 | - typer 31 | 32 | test: 33 | imports: 34 | - lector 35 | requires: 36 | - hypothesis 37 | - hypothesis-csv @ git+https://github.com/jeffbr13/hypothesis-csv.git@master 38 | - pytest 39 | - pytest-cov 40 | commands: 41 | - pip check 42 | - pytest -v 43 | 44 | about: 45 | home: https://github.com/graphext/lector 46 | summary: A fast and flexible CSV reader 47 | description: | 48 | Lector aims to be a fast reader for potentially messy CSV files with configurable column type inference. 49 | It combines automatic detection of file encodings, CSV dialects (separator, escaping etc.) and preambles (initial lines containing metadata or 50 | junk unrelated to the actual tabular data). Its goal is to just-read-the-effing-CSV file without manual configuration in most cases. 
Each of the 51 | detection components is configurable and can be swapped out easily with custom implementations. 52 | doc_url: https://lector.readthedocs.io/en/latest/ 53 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = lector 3 | version = attr: lector.__version__ 4 | description = A fast reader for messy CSV files and type inference. 5 | long_description = file: README.md 6 | long_description_content_type = text/markdown; charset=UTF-8 7 | url = https://github.com/graphext/lector 8 | author = Graphext 9 | author_email = thomasb@hey.com 10 | keywords = 11 | csv 12 | json 13 | parser 14 | reader 15 | type inference 16 | license = Apache License 2.0 17 | license_file = LICENSE 18 | classifiers = 19 | Development Status :: 4 - Beta 20 | Intended Audience :: Developers 21 | Topic :: Scientific/Engineering 22 | License :: OSI Approved :: Apache Software License 23 | Programming Language :: Python :: 3 24 | project_urls = 25 | Source = https://github.com/graphext/lector 26 | 27 | [options] 28 | packages = find: 29 | platforms = any 30 | python_requires = >=3.8 31 | install_requires = 32 | cchardet 33 | clevercsv<0.8.1 34 | msgspec 35 | pyarrow>=15.0.0 36 | pyarrow-hotfix 37 | rich 38 | tqdm 39 | typer 40 | 41 | [aliases] 42 | test=pytest 43 | 44 | [options.entry_points] 45 | console_scripts = 46 | lector = lector.cli:CLI 47 | 48 | [options.extras_require] 49 | test = 50 | hypothesis 51 | pytest 52 | pytest-cov 53 | docs = 54 | pydata-sphinx-theme<0.10 55 | sphinx 56 | sphinx-autoapi 57 | sphinx-autobuild 58 | sphinx-autodoc-typehints 59 | dev = 60 | ipywidgets 61 | jupyter_contrib_nbextensions 62 | jupyter_nbextensions_configurator 63 | notebook 64 | pandas 65 | pre-commit 66 | ruff 67 | lector[test] 68 | lector[docs] 69 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | if __name__ == "__main__": 4 | setuptools.setup() 5 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graphext/lector/fb76591acd69aa01ee37a03f08fc8e68fd5f8aa8/test/__init__.py -------------------------------------------------------------------------------- /test/conftest.py: -------------------------------------------------------------------------------- 1 | """For future reference, if we want to make tests extensible to future implementations 2 | of some of the interfaces (ABCs) in this package, we can use parameterized fixtures, 3 | like this: 4 | 5 | - https://github.com/pytest-dev/pytest/issues/421 6 | - https://stackoverflow.com/q/26266481/3519145 7 | """ 8 | -------------------------------------------------------------------------------- /test/test_dialects.py: -------------------------------------------------------------------------------- 1 | """Test detection of dialects of otherwise valid CSV files.""" 2 | import io 3 | from csv import QUOTE_MINIMAL, get_dialect 4 | 5 | import pytest 6 | from hypothesis import given 7 | from hypothesis.strategies import data 8 | from hypothesis_csv.strategies import csv as csv_strat 9 | 10 | from lector.csv.dialects import Dialect, PySniffer 11 | 12 | from .utils import equal 13 | 14 | 15 | def 
fix_expected_dialect(dialect): 16 | dialect.line_terminator = "\r\n" # Hardcoded in sniffer (not detectable) 17 | dialect.quoting = QUOTE_MINIMAL # Hardcoded in sniffer (not detectable) 18 | return dialect 19 | 20 | 21 | @pytest.mark.parametrize("dialect_name", ["excel", "excel-tab", "unix"]) 22 | def test_pydialect_roundtrip(dialect_name: str): 23 | attrs = [ 24 | "delimiter", 25 | "doublequote", 26 | "escapechar", 27 | "lineterminator", 28 | "quotechar", 29 | "quoting", 30 | "skipinitialspace", 31 | "strict", 32 | ] 33 | d1 = get_dialect(dialect_name) 34 | d2 = Dialect.from_builtin(d1).to_builtin() 35 | assert all(getattr(d1, a, None) == getattr(d2, a, None) for a in attrs) 36 | 37 | 38 | @pytest.mark.parametrize("dialect_name", ["excel", "excel-tab", "unix"]) 39 | def test_dialect_roundtrip(dialect_name: str): 40 | b = get_dialect(dialect_name) 41 | d1 = Dialect.from_builtin(b) 42 | d2 = Dialect.from_builtin(d1.to_builtin()) 43 | assert equal(d1, d2) 44 | 45 | 46 | @given(data=data()) 47 | @pytest.mark.parametrize("dialect", ["excel", "excel-tab", "unix"]) 48 | def test_dialects(dialect, data): 49 | strategy = csv_strat(dialect=dialect, lines=3, header=2) 50 | csv = data.draw(strategy) 51 | expected = Dialect.from_builtin(get_dialect(dialect)) 52 | expected = fix_expected_dialect(expected) 53 | detected = PySniffer().detect(io.StringIO(csv)) 54 | assert equal(expected, detected) 55 | -------------------------------------------------------------------------------- /test/test_encodings.py: -------------------------------------------------------------------------------- 1 | """Test encoding detectors. 2 | 3 | Notes: 4 | 5 | - ISO-8859-1 (Latin-1) is mostly identical to Windows-1252 (CP1252): 6 | https://www.i18nqa.com/debug/table-iso8859-1-vs-windows-1252.html 7 | 8 | """ 9 | import io 10 | 11 | import pytest 12 | 13 | from lector.csv.encodings import Chardet 14 | 15 | CODEC_ERR = "�" 16 | 17 | CODEC_SAMPLES = [ 18 | ("刺靑 谷崎潤一郞著", "utf-8", "windows-1250", 2), 19 | ("顏是家訓 北齊 顏之推", "BIG5", "utf-8", 12), 20 | ("The Project · Gutenberg » EBook « of Die Fürstin.", "ISO-8859-1", "utf-8", 4), 21 | ("Той и сам не знае кога е роден, но като го запитат.", "windows-1251", "utf-8", 38), 22 | ("première is first", "utf-8", "ascii", 2), 23 | ("première is first", "utf-16", "utf-8", 3), 24 | ("première is first", "windows-1252", "utf-8", 1), 25 | ("première is first", "ISO-8859-1", "utf-16", 1), 26 | ("𐐀 am Deseret", "utf-8", "windows-1250", 2), 27 | ("𐐀 am Deseret", "utf-8", "windows-1252", 2), 28 | ("𐐀 am Deseret", "utf-16", "utf-8", 4), 29 | ] 30 | """When encoded with first then decoded with second codecs, n unknown chars are produced.""" 31 | 32 | CODECS = [ 33 | "utf-8", 34 | "utf-16", 35 | "utf-8-sig", 36 | "windows-1250", 37 | "windows-1251", 38 | "windows-1252", 39 | "iso-8859-1", 40 | "ascii", 41 | ] 42 | 43 | 44 | def codecs_equal(c1, c2): 45 | c1 = c1.lower() 46 | c2 = c2.lower() 47 | if c1 == c2: 48 | return True 49 | 50 | # Allow some variations where small differences between codes cannot be inferred 51 | # from small sample 52 | equivalent = ("iso-8859-1", "windows-1252") 53 | return c1 in equivalent and c2 in equivalent 54 | 55 | 56 | def codecs_compatible(c1, c2, encoded, n_err_max=0): 57 | """Check if c1 and c2 are equivalent, or otherwise that c2 can decode string without error.""" 58 | if codecs_equal(c1, c2): 59 | return True 60 | 61 | return encoded.decode(c2, errors="replace").count(CODEC_ERR) <= n_err_max 62 | 63 | 64 | @pytest.mark.parametrize("codec", CODECS) 65 | def 
test_simple_roundtrip(codec): 66 | """Test correct detection if string can be encoded given codec.""" 67 | s = "première is first" 68 | 69 | try: 70 | encoded = s.encode(codec) 71 | detected = Chardet().detect(io.BytesIO(encoded)) 72 | assert codecs_equal(codec, detected) 73 | except Exception: 74 | pass # noqa 75 | 76 | 77 | @pytest.mark.parametrize("example", CODEC_SAMPLES) 78 | def test_roundtrips(example): 79 | text, codec, _, _ = example 80 | encoded = text.encode(codec) 81 | detected = Chardet().detect(io.BytesIO(encoded)) 82 | decoded = encoded.decode(detected, errors="replace") 83 | 84 | assert codecs_equal(codec, detected) 85 | assert decoded.count(CODEC_ERR) == 0 86 | -------------------------------------------------------------------------------- /test/test_formats.py: -------------------------------------------------------------------------------- 1 | """Test integration/interactions between encoding, preambles, dialect.""" 2 | from __future__ import annotations 3 | 4 | import io 5 | from csv import get_dialect 6 | 7 | import pytest 8 | from hypothesis import given 9 | from hypothesis.strategies import data 10 | from hypothesis_csv.strategies import csv as csv_strat 11 | 12 | from lector.csv import Dialect, EmptyFileError, Reader 13 | 14 | from .test_dialects import fix_expected_dialect 15 | from .test_encodings import CODECS, codecs_compatible 16 | from .utils import equal 17 | 18 | PREAMBLES = [ 19 | # Brandwatch, the delimiter should be ignored in preamble, as long as a row of commas 20 | # separates the preamble from CSV table 21 | ("a{d}b{d}c\n,,", 2), 22 | ("some text\n,,", 2), 23 | ("a{d}b{d}c\nsome text\n,,", 3), 24 | # Fieldless 25 | # Start with header line having multiple (two) fields (delimited) 26 | ("abc{d}def", 0), 27 | ('"abc"{d}"def"', 0), 28 | ('"abc"{d}def', 0), 29 | ('abc{d} "123,456"', 0), 30 | ('"[cat1, cat2]"{d} "123,67"', 0), 31 | # Since the line isn't quoted as a whole, the delimiter creates two fields 32 | ('A line that has "something{d}in quotes" and then some', 0), 33 | ('A line that has ""something{d} in quotes"" and then some', 0), 34 | # Start with line having single field 35 | ("abc", 1), 36 | ('"abc, def"', 1), 37 | ('"The ""text,"" is double-quoted, and contains a comma"', 1), 38 | ('"A line that has ""something, in quotes"" and then some"', 1), 39 | ('"A line that has something, and then some more"', 1), 40 | ('"One line, spreading over\n multiple, lines"', 2), 41 | ("One line\nAnother line", 2), 42 | ('One line\nAnother line\n"And, a\nthird"', 4), 43 | ] 44 | 45 | 46 | def with_delimiter(s, delim): 47 | if "{d}" in s: 48 | return s.format(d=delim) 49 | return s 50 | 51 | 52 | class NonParser(Reader): 53 | """Simple implementation of abstract class that only infers format.""" 54 | 55 | def parse(self, *args, **kwds) -> None: 56 | return None 57 | 58 | 59 | def detect_format(csv: str | bytes): 60 | if isinstance(csv, str): 61 | buffer = io.StringIO(csv) 62 | else: 63 | buffer = io.BytesIO(csv) 64 | 65 | reader = NonParser(buffer, log=False) 66 | reader.read() 67 | return reader.format 68 | 69 | 70 | def test_empty_file(): 71 | with pytest.raises(EmptyFileError): 72 | detect_format("") 73 | 74 | 75 | @given(data=data()) 76 | @pytest.mark.parametrize("codec", CODECS) 77 | @pytest.mark.parametrize("preamble", PREAMBLES) 78 | @pytest.mark.parametrize("dialect", ["excel", "excel-tab", "unix"]) 79 | def test_formats(codec, preamble, dialect, data): 80 | """Test parsing with different encoding, dialect and preamble.""" 81 | 82 | pydialect = 
get_dialect(dialect) 83 | 84 | # Make premable compatible with dialect & generated csv 85 | preamble, skiprows = preamble 86 | preamble = with_delimiter(preamble, pydialect.delimiter) 87 | preamble = preamble.replace("\n", pydialect.lineterminator) 88 | if not pydialect.skipinitialspace: 89 | preamble = preamble.replace(f"{pydialect.delimiter} ", f"{pydialect.delimiter}") 90 | 91 | # Create the full, encoded CSV "file" 92 | strategy = csv_strat(dialect=pydialect, lines=3, header=2) 93 | csv = data.draw(strategy) 94 | csv = preamble + pydialect.lineterminator + csv 95 | 96 | try: 97 | encoded = csv.encode(codec) 98 | format = detect_format(encoded) 99 | 100 | # We have very short CSVs with preambles including the ñ, whose encoding 101 | # cannot always be detected correctly (would be easier with larger examples) 102 | assert codecs_compatible(codec, format.encoding, encoded, n_err_max=1) 103 | assert equal(skiprows, format.preamble, extra=csv) 104 | 105 | exp_dialect = fix_expected_dialect(Dialect.from_builtin(pydialect)) 106 | assert equal(exp_dialect, format.dialect, extra=csv) 107 | 108 | except UnicodeEncodeError: 109 | pass 110 | -------------------------------------------------------------------------------- /test/test_preambles.py: -------------------------------------------------------------------------------- 1 | import io 2 | 3 | import pytest 4 | from hypothesis import given 5 | from hypothesis.strategies import data 6 | from hypothesis_csv.strategies import csv as csv_strat 7 | 8 | from lector.csv.preambles import Brandwatch, Fieldless, GoogleAds, Preambles 9 | 10 | from .utils import equal 11 | 12 | BRANDWATCH_PREAMBLES = [ 13 | ("a,b,c", 0), 14 | ("a,b,c\n,,", 2), 15 | ("some text\n,,", 2), 16 | ("a,b,c\nsome text\n,,", 3), 17 | ] 18 | """Brandwatch uses a line of delimiters only to mark end of preamble.""" 19 | 20 | FIELDLESS_PREAMBLES = [ 21 | # Start with line having multiple fields 22 | ("abc,def", 0), 23 | ('"abc","def"', 0), 24 | ('"abc",def', 0), 25 | ('abc, "123,456"', 0), 26 | ('"[cat1, cat2]", "123,67"', 0), 27 | ('A line that has "something,in quotes" and then some', 0), 28 | ('A line that has ""something, in quotes"" and then some', 0), 29 | # Start with line having single field 30 | ("abc", 1), 31 | ('"abc, def"', 1), 32 | ('"The ""text,"" is double-quoted, and contains a comma"', 1), 33 | ('"A line that has ""something, in quotes"" and then some"', 1), 34 | ('"A line that has something, and then some more"', 1), 35 | ('"One line, spreading over\n multiple, lines"', 2), 36 | ("One line\nAnother line", 2), 37 | ('One line\nAnother line\n"And, a\nthird"', 4), 38 | ] 39 | """Test cases for Fieldless detector, ints indicating how many lines to skip for preamble.""" 40 | 41 | 42 | GOOGLEADS_PREAMBLES = [ 43 | ("Informe de something something\nSome other info\nCampaña, Column", 2), 44 | ("Informe de something something\nCampaña, Column", 1), 45 | ("Something something\nCampaña,Column", 0), 46 | ("Informe de something something\nCampana,Column", 0), 47 | ] 48 | 49 | 50 | @given(data=data()) 51 | @pytest.mark.parametrize("preamble", BRANDWATCH_PREAMBLES) 52 | def test_brandwatch(preamble, data): 53 | preamble, skip_n_exp = preamble 54 | csv = preamble + "\n" + data.draw(csv_strat(header=3)) 55 | skip_n_det = Brandwatch().detect(io.StringIO(csv)) 56 | assert equal(skip_n_exp, skip_n_det) 57 | 58 | 59 | @given(data=data()) 60 | @pytest.mark.parametrize("preamble", FIELDLESS_PREAMBLES) 61 | def test_fieldless(preamble, data): 62 | preamble, skip_n_exp = preamble 63 | csv = 
preamble + "\n" + data.draw(csv_strat(header=2)) 64 | skip_n_det = Fieldless().detect(io.StringIO(csv)) 65 | assert equal(skip_n_exp, skip_n_det) 66 | 67 | 68 | @given(data=data()) 69 | @pytest.mark.parametrize("preamble", GOOGLEADS_PREAMBLES) 70 | def test_googleads(preamble, data): 71 | preamble, skip_n_exp = preamble 72 | csv = preamble + "\n" + data.draw(csv_strat(header=2)) 73 | skip_n_det = GoogleAds().detect(io.StringIO(csv)) 74 | assert equal(skip_n_exp, skip_n_det) 75 | 76 | 77 | @given(data=data()) 78 | @pytest.mark.parametrize("preamble", BRANDWATCH_PREAMBLES + FIELDLESS_PREAMBLES) 79 | def test_preambles(preamble, data): 80 | preamble, skip_n_exp = preamble 81 | csv = preamble + "\n" + data.draw(csv_strat(header=3)) 82 | skip_n_det = Preambles.detect(io.StringIO(csv)) 83 | assert equal(skip_n_exp, skip_n_det) 84 | -------------------------------------------------------------------------------- /test/test_reader.py: -------------------------------------------------------------------------------- 1 | """Test CSV readers.""" 2 | import csv 3 | import io 4 | import sys 5 | from csv import get_dialect 6 | 7 | import pytest 8 | from hypothesis import given 9 | from hypothesis.strategies import data 10 | from hypothesis_csv.strategies import csv as csv_strat 11 | 12 | import lector 13 | from lector.csv import ArrowReader, Dialect, EmptyFileError 14 | 15 | from .test_dialects import fix_expected_dialect 16 | from .test_encodings import CODECS, codecs_compatible 17 | from .test_formats import PREAMBLES, with_delimiter 18 | from .utils import equal 19 | 20 | SHAPES = [ 21 | # (0, 1), # hypothesis_csv cannot generate a 0 row file # noqa 22 | # (1, 0), # This produces an empty file ("\r\n\r\n") # noqa 23 | (1, 1), 24 | (1, 3), 25 | (3, 1), 26 | (10, 5), 27 | ] 28 | """Different combinations of n_rows and n_cols.""" 29 | 30 | EMPTY = ["", "\n", "\n\n", "\r\n"] 31 | 32 | NULL_BYTES_CSV = b""" 33 | col1,col_\0_2,col3 34 | 1,abc,x 35 | 2,de\0fg,y 36 | """ 37 | 38 | UNNAMED = b""" 39 | col1,,col3,,col5 40 | 1,2,3,4,5 41 | 6,7,8,9,10 42 | """ 43 | 44 | 45 | @pytest.mark.parametrize("csv", EMPTY) 46 | def test_empty(csv: str): 47 | fp = io.BytesIO(csv.encode("utf-8")) 48 | with pytest.raises(EmptyFileError): 49 | ArrowReader(fp, log=False).read() 50 | 51 | 52 | def test_unnamed(): 53 | """Automatic names for unnnamed columns""" 54 | fp = io.BytesIO(UNNAMED) 55 | tbl = lector.read_csv(fp) 56 | assert tbl.column_names == ["col1", "Unnamed_0", "col3", "Unnamed_1", "col5"] 57 | 58 | 59 | def test_null_bytes(): 60 | """For now, null bytes don't throw error, but are also not removed automatically!""" 61 | 62 | with pytest.raises(csv.Error): 63 | # python's csv reader throws error on null byte 64 | s = io.StringIO(NULL_BYTES_CSV.decode("utf-8")) 65 | print("Null-byte CSV:", list(csv.reader(s))) 66 | 67 | tbl = lector.read_csv(io.BytesIO(NULL_BYTES_CSV)) 68 | assert tbl.column_names == ["col1", "col_\x00_2", "col3"] 69 | assert tbl.column("col_\x00_2").to_pylist() == ["abc", "de\x00fg"] 70 | 71 | 72 | @given(data=data()) 73 | @pytest.mark.parametrize("codec", CODECS) 74 | @pytest.mark.parametrize("preamble", PREAMBLES) 75 | @pytest.mark.parametrize("dialect", ["excel", "excel-tab", "unix"]) 76 | def test_all(codec, preamble, dialect, data): 77 | """Test parsing with different encoding, dialect and preamble.""" 78 | 79 | pydialect = get_dialect(dialect) 80 | 81 | # Make premable compatible with dialect & generated csv 82 | preamble, skiprows = preamble 83 | preamble = with_delimiter(preamble, 
pydialect.delimiter) 84 | preamble = preamble.replace("\n", pydialect.lineterminator) 85 | if not pydialect.skipinitialspace: 86 | preamble = preamble.replace(f"{pydialect.delimiter} ", f"{pydialect.delimiter}") 87 | 88 | # Create the full, encoded CSV "file" 89 | n_lines = 3 90 | n_columns = 2 91 | strategy = csv_strat(dialect=pydialect, lines=n_lines, header=n_columns) 92 | csv = data.draw(strategy) 93 | csv = preamble + pydialect.lineterminator + csv 94 | 95 | try: 96 | encoded = csv.encode(codec) 97 | reader = ArrowReader(io.BytesIO(encoded), log=False) 98 | tbl = reader.read() 99 | 100 | # If a preamble test case specifies 0 as the number of skiprows, it means the preamble text 101 | # consists of a single line that should be interpreted as the header. In this case, the 102 | # header (first line) of the csv created automatically will effectively become another row 103 | # in the table. 104 | exp_num_rows = n_lines if skiprows > 0 else n_lines + 1 105 | assert equal(exp_num_rows, tbl.num_rows, extra=csv) 106 | assert equal(2, tbl.num_columns, extra=csv) 107 | 108 | # We have very short CSVs with preambles including the ñ, whose encoding 109 | # cannot always be detected correctly (would be easier with larger examples) 110 | assert codecs_compatible(codec, reader.format.encoding, encoded, n_err_max=1) 111 | assert equal(skiprows, reader.format.preamble, extra=csv) 112 | 113 | exp_dialect = fix_expected_dialect(Dialect.from_builtin(pydialect)) 114 | assert equal(exp_dialect, reader.format.dialect, extra=csv) 115 | 116 | except UnicodeEncodeError: 117 | print(f"FAILED ON CSV:\n{csv}") 118 | sys.exit() 119 | raise 120 | -------------------------------------------------------------------------------- /test/test_types.py: -------------------------------------------------------------------------------- 1 | import io 2 | from collections import namedtuple 3 | 4 | import pyarrow as pa 5 | import pyarrow.types as pat 6 | 7 | import lector 8 | from lector import ArrowReader, Autocast 9 | 10 | from .utils import equal 11 | 12 | TC = namedtuple("TC", "min,max,extra,null") 13 | """Test Case definition.""" 14 | 15 | TYPE_COLUMNS = { 16 | "num_int8": TC("-128", "127", "0", "NA"), 17 | "num_int16": TC("-32768", "32767", "0", "NA"), 18 | "num_int32": TC("-2147483648", "2147483647", "0", "NA"), 19 | "num_int64": TC("-9223372036854775808", "9223372036854775807", "0", "NA"), 20 | "num_uint8": TC("0", "255", "1", "NA"), 21 | "num_uint16": TC("0", "65535", "1", "NA"), 22 | "num_uint32": TC("0", "4294967295", "1", "NA"), 23 | # "num_uint64": TS("0", "18446744073709551615", "", "NA"), # noqa 24 | "list_uint8": TC('"[0,1,2]"', '"[7,8,9]"', '"[]"', "NA"), 25 | "list_uint8_2": TC('"[0,1,2]"', '"[7,8,9]"', '"[4]"', "NA"), 26 | "list_str": TC('"[a,b,c]"', '"[x,y,z]"', '"[]"', "NA"), 27 | "list_str_2": TC('"[a,b,c]"', '"[x,y,z]"', '"[test]"', "NA"), 28 | "date_iso": TC( 29 | "2022-06-17T10:31:40.000746", 30 | "2022-06-17T10:31:40.000746", 31 | "2022-06-17T10:31:40.000000", 32 | "NA", 33 | ), 34 | "date_custom": TC( 35 | "2022-06-17", 36 | "2022-06-17", 37 | "2022-05-03", 38 | "NA", 39 | ), 40 | "text": TC( 41 | '"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor."', 42 | '"No one rejects, dislikes, or avoids pleasure itself, because it is pleasure."', 43 | '"Nor again is there anyone who loves or pursues or desires to obtain pain of itself."', 44 | "NA", 45 | ), 46 | "cat": TC("cat1", "cat2", "", "NA"), 47 | } 48 | 49 | TYPE_CSV = ",".join(TYPE_COLUMNS) 50 | for row in map(list, 
zip(*TYPE_COLUMNS.values())): 51 | TYPE_CSV += "\n" + ",".join(row) 52 | 53 | ARROW_TYPES = { 54 | "num_int8": pa.int64(), 55 | "num_int16": pa.int64(), 56 | "num_int32": pa.int64(), 57 | "num_int64": pa.int64(), 58 | "num_uint8": pa.int64(), 59 | "num_uint16": pa.int64(), 60 | "num_uint32": pa.int64(), 61 | "list_uint8": pa.string(), 62 | "list_uint8_2": pa.string(), 63 | "list_str": pa.string(), 64 | "list_str_2": pa.string(), 65 | "date_iso": pa.timestamp(unit="ns"), 66 | "date_custom": pa.date32(), 67 | "text": pa.string(), 68 | "cat": pa.string(), 69 | } 70 | 71 | LECTOR_TYPES = { 72 | "num_int8": pa.int8(), 73 | "num_int16": pa.int16(), 74 | "num_int32": pa.int32(), 75 | "num_int64": pa.int64(), 76 | "num_uint8": pa.uint8(), 77 | "num_uint16": pa.uint16(), 78 | "num_uint32": pa.uint32(), 79 | "list_uint8": pa.list_(pa.uint8()), 80 | "list_uint8_2": pa.list_(pa.uint8()), 81 | "list_str": pa.list_(pa.string()), 82 | "list_str_2": pa.list_(pa.string()), 83 | "date_iso": pa.timestamp(unit="ns", tz="UTC"), 84 | "date_custom": pa.timestamp(unit="ns", tz="UTC"), 85 | "text": pa.string(), 86 | "cat": pa.dictionary(index_type=pa.int32(), value_type=pa.string()), 87 | } 88 | 89 | DELIM_CSV = b""" 90 | dot_delim, comma_delim, mixed_delim_dot, mixed_delim_comma, mixed_delim 91 | "1,234.0","1.234,0","1,234.0","1.234,0","1.234,0" 92 | "1,234,456",1.234.456,"1,234,456",1.234.456,1.234.456 93 | NA,NA,NA,NA,NA 94 | "1,234,456.987","1.234.456,987","1,234,456.987","1.234.456,987","1.234.456,987" 95 | 0.1,"0,1","0.1","0,1",0.1 96 | .1,",1",",1",.1,.1 97 | 98765.123,"98765,123","98765,123",98765.123,98765.123 98 | """ 99 | 100 | # ruff: noqa: E501 101 | LIST_CSV = """ 102 | lnum1,lnum2,lnum_NA_3,lnum4,lcat5,lfloat6,lfloat7,lfloat8,lfloat_DEL_9 103 | "[0,1,2]","[0,1,2]","['123', '456', NA, '789']","[123, 456, 789]","[a,b,c]","[123.45, 678.90]","[""123.45"", ""678.90""]","['123.45', '678.90']","['123,45', '678,90']" 104 | "[7,8,9]","[7,8,9]","['123', '456', NA, '789']","[123, 456, 789]","(d,e)","[123, 678]","[""123"", ""678""]","['123', '678']","['123', '678']" 105 | "[]","[4]","[123, 456, NA, 789]","[123, 456, 789]","|f|","[123.45, 678.90]","[""123.45"", ""678.90""]","['123.45', '678.90', '0.0']","['123,45', '678,90', '0,0']" 106 | "NA","NA",NA,NA,NA,NA,NA,NA,NA 107 | """ 108 | 109 | 110 | def test_decimals(): 111 | """Based on inferred decimal delimiter, thousands delimiter gets removed. 112 | 113 | If delimiter is ambiguous, result will be dict. 114 | """ 115 | tbl = lector.read_csv(io.BytesIO(DELIM_CSV)) 116 | 117 | for i in range(4): 118 | assert pat.is_floating(tbl.column(i).type) 119 | 120 | assert pat.is_dictionary(tbl.column(4).type) 121 | 122 | 123 | def test_list(): 124 | """List parsing. NAs are not allowed in float lists. 
Also, decimal delimiter must be the period character!""" 125 | fp = io.BytesIO(LIST_CSV.encode("utf-8")) 126 | tbl = lector.read_csv(fp) 127 | 128 | exp_types = { 129 | "lnum1": pa.list_(pa.uint8()), 130 | "lnum2": pa.list_(pa.uint8()), 131 | "lnum_NA_3": pa.list_(pa.string()), # NA not supported in numeric lists 132 | "lnum4": pa.list_(pa.uint16()), 133 | "lcat5": pa.list_(pa.string()), 134 | "lfloat6": pa.list_(pa.float64()), 135 | "lfloat7": pa.list_(pa.float64()), 136 | "lfloat8": pa.list_(pa.float64()), 137 | "lfloat_DEL_9": pa.list_( 138 | pa.uint16() 139 | ), # comma as decimal delimiter not supported (interpreted as csv delimter) 140 | } 141 | 142 | for col in tbl.column_names: 143 | assert equal(tbl.column(col).type, exp_types[col], col) 144 | 145 | 146 | def test_inference(): 147 | """Compare Arrow and Lector inferred types against reference. 148 | 149 | TODO: 150 | - Deal correctly with ns/ms in timestamps. 151 | - Default dictionary encoding uses int32 always. Downcast? 152 | """ 153 | file = io.BytesIO(TYPE_CSV.encode("utf-8")) 154 | tbl = ArrowReader(file, log=False).read() 155 | schema = tbl.schema 156 | 157 | for name, type in ARROW_TYPES.items(): 158 | assert equal(type, schema.field(name).type, extra=name) 159 | 160 | tbl = Autocast(log=False).cast(tbl) 161 | schema = tbl.schema 162 | 163 | for name, type in LECTOR_TYPES.items(): 164 | assert equal(type, schema.field(name).type, extra=name) 165 | -------------------------------------------------------------------------------- /test/utils.py: -------------------------------------------------------------------------------- 1 | from lector.log import CONSOLE 2 | 3 | 4 | def equal(obj1, obj2, extra=None): 5 | """Helper to print useful info if result is unexpected.""" 6 | eq = obj1 == obj2 7 | 8 | if not eq: 9 | CONSOLE.print(obj1) 10 | CONSOLE.print(obj2) 11 | 12 | if extra is not None: 13 | CONSOLE.print(extra) 14 | 15 | return False 16 | 17 | return True 18 | --------------------------------------------------------------------------------