├── .gitignore
├── .pre-commit-config.yaml
├── .readthedocs.yaml
├── .vscode
│   └── settings.json
├── CITATION.cff
├── LICENSE
├── README.md
├── docs
│   ├── Makefile
│   ├── make.bat
│   └── source
│       ├── _static
│       │   ├── css
│       │   │   └── custom.css
│       │   ├── graphext.png
│       │   └── gx_logo_sq_blue.jpg
│       ├── _templates
│       │   └── python
│       │       └── module.rst
│       ├── conf.py
│       ├── index.rst
│       ├── quick.rst
│       ├── reader.rst
│       └── types.rst
├── environment.yml
├── lector
│   ├── __init__.py
│   ├── cli.py
│   ├── csv
│   │   ├── __init__.py
│   │   ├── abc.py
│   │   ├── arrow.py
│   │   ├── dialects.py
│   │   ├── encodings.py
│   │   ├── pandas.py
│   │   └── preambles.py
│   ├── log.py
│   ├── types
│   │   ├── __init__.py
│   │   ├── abc.py
│   │   ├── bools.py
│   │   ├── cast.py
│   │   ├── lists.py
│   │   ├── numbers.py
│   │   ├── regex.py
│   │   ├── strings.py
│   │   └── timestamps.py
│   └── utils.py
├── notebooks
│   ├── example.ipynb
│   └── usage.ipynb
├── pyproject.toml
├── recipe
│   └── meta.yaml
├── setup.cfg
├── setup.py
└── test
    ├── __init__.py
    ├── conftest.py
    ├── test_dialects.py
    ├── test_encodings.py
    ├── test_formats.py
    ├── test_preambles.py
    ├── test_reader.py
    ├── test_types.py
    └── utils.py
/.gitignore:
--------------------------------------------------------------------------------
1 | notebooks/*.csv
2 |
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | share/python-wheels/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 | MANIFEST
30 |
31 | # PyInstaller
32 | # Usually these files are written by a python script from a template
33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest
35 | *.spec
36 |
37 | # Installer logs
38 | pip-log.txt
39 | pip-delete-this-directory.txt
40 |
41 | # Unit test / coverage reports
42 | htmlcov/
43 | .tox/
44 | .nox/
45 | .coverage
46 | .coverage.*
47 | .cache
48 | nosetests.xml
49 | coverage.xml
50 | *.cover
51 | *.py,cover
52 | .hypothesis/
53 | .pytest_cache/
54 | cover/
55 |
56 | # Translations
57 | *.mo
58 | *.pot
59 |
60 | # Django stuff:
61 | *.log
62 | local_settings.py
63 | db.sqlite3
64 | db.sqlite3-journal
65 |
66 | # Flask stuff:
67 | instance/
68 | .webassets-cache
69 |
70 | # Scrapy stuff:
71 | .scrapy
72 |
73 | # Sphinx documentation
74 | docs/_build/
75 |
76 | # PyBuilder
77 | .pybuilder/
78 | target/
79 |
80 | # Jupyter Notebook
81 | .ipynb_checkpoints
82 |
83 | # IPython
84 | profile_default/
85 | ipython_config.py
86 |
87 | # pyenv
88 | # For a library or package, you might want to ignore these files since the code is
89 | # intended to run in multiple environments; otherwise, check them in:
90 | # .python-version
91 |
92 | # pipenv
93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
96 | # install all needed dependencies.
97 | #Pipfile.lock
98 |
99 | # poetry
100 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
101 | # This is especially recommended for binary packages to ensure reproducibility, and is more
102 | # commonly ignored for libraries.
103 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
104 | #poetry.lock
105 |
106 | # pdm
107 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
108 | #pdm.lock
109 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
110 | # in version control.
111 | # https://pdm.fming.dev/#use-with-ide
112 | .pdm.toml
113 |
114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115 | __pypackages__/
116 |
117 | # Celery stuff
118 | celerybeat-schedule
119 | celerybeat.pid
120 |
121 | # SageMath parsed files
122 | *.sage.py
123 |
124 | # Environments
125 | .env
126 | .venv
127 | env/
128 | venv/
129 | ENV/
130 | env.bak/
131 | venv.bak/
132 |
133 | # Spyder project settings
134 | .spyderproject
135 | .spyproject
136 |
137 | # Rope project settings
138 | .ropeproject
139 |
140 | # mkdocs documentation
141 | /site
142 |
143 | # mypy
144 | .mypy_cache/
145 | .dmypy.json
146 | dmypy.json
147 |
148 | # Pyre type checker
149 | .pyre/
150 |
151 | # pytype static type analyzer
152 | .pytype/
153 |
154 | # Cython debug symbols
155 | cython_debug/
156 |
157 | # PyCharm
158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160 | # and can be added to the global gitignore or merged into this file. For a more nuclear
161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162 | #.idea/
163 |
164 | .ruff_cache/
165 | .DS_Store
166 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | default_language_version:
2 | python: python3.9
3 | repos:
4 | - repo: https://github.com/pre-commit/pre-commit-hooks
5 | rev: v4.5.0
6 | hooks:
7 | - id: check-yaml
8 | exclude: recipe/meta.yaml
9 | args:
10 | - --allow-multiple-documents
11 | - id: end-of-file-fixer
12 | - id: trailing-whitespace
13 |
14 | - repo: https://github.com/astral-sh/ruff-pre-commit
15 | rev: 'v0.5.4'
16 | hooks:
17 | - id: ruff
18 | args: [--fix, --exit-non-zero-on-fix]
19 | # Respect `exclude` and `extend-exclude` settings.
20 | # args: ["--force-exclude"]
21 | - id: ruff-format
22 | args: [--check]
23 |
--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | build:
4 | os: "ubuntu-20.04"
5 | tools:
6 | python: "3.10"
7 |
8 | python:
9 | install:
10 | - method: pip
11 | path: .
12 | extra_requirements:
13 | - docs
14 |
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 |   // Default (language-independent) settings
3 | "editor.tabSize": 4,
4 | "editor.trimAutoWhitespace": true,
5 | "editor.insertSpaces": true,
6 | "editor.autoIndent": "full",
7 | "editor.formatOnSave": false,
8 | "editor.formatOnPaste": false,
9 | // "workbench.colorCustomizations": {
10 | // "editorRuler.foreground": "#4093ff"
11 | // },
12 | "files.insertFinalNewline": true,
13 | "files.trimFinalNewlines": true,
14 | "files.autoSave": "afterDelay",
15 | "files.autoSaveDelay": 500,
16 | // Python
17 | "[python]": {
18 | "editor.formatOnSave": true,
19 | "editor.formatOnPaste": true,
20 | "editor.tabSize": 4,
21 | "editor.insertSpaces": true,
22 | "editor.codeActionsOnSave": {
23 | "source.organizeImports": "explicit",
24 | "source.fixAll": "explicit"
25 | }
26 | },
27 | }
28 |
--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
1 | cff-version: 1.2.0
2 | message: "If you use this software, please cite it as below."
3 | authors:
4 | - family-names: "Buhrmann"
5 | given-names: "Thomas"
6 | title: "Lector"
7 | version: 0.3.3
8 | date-released: 2023-12-07
9 | url: "https://github.com/graphext/lector"
10 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright 2022 Graphext Labs S.L.
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [](https://mybinder.org/v2/gh/graphext/lector/HEAD?labpath=notebooks%2Fusage.ipynb)
2 |
3 | # Lector
4 |
5 | [Lector](https://github.com/graphext/lector) aims to be a fast reader for potentially messy CSV files with configurable column type inference. It combines automatic detection of file encodings, CSV dialects (separator, escaping etc.) and preambles (initial lines containing metadata or junk unrelated to the actual tabular data). Its goal is to just-read-the-effing-CSV file without manual configuration in most cases. Each of the detection components is configurable and can be swapped out easily with custom implementations.
6 |
7 | Also, since both [pandas](https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html) and Apache [Arrow](https://arrow.apache.org/docs/python/generated/pyarrow.csv.read_csv.html) will destructively cast columns to the wrong type in some cases (e.g. large ID-like integer strings to floats), it provides an alternative and customisable inference and casting mechanism.
8 |
9 | Under the hood it uses pyarrow's [CSV parser](https://arrow.apache.org/docs/python/generated/pyarrow.csv.read_csv.html) for reading, and its [compute functions](https://arrow.apache.org/docs/python/api/compute.html) for optional type inference.
10 |
11 | Lector is used at [Graphext](https://www.graphext.com) behind the scenes whenever a user uploads a new dataset, and so implicitly has been validated across 1000s of different CSV files from all kinds of sources. Note, however, that this is Graphext's first foray into open-sourcing our code and still _work-in-progress_. So at least initially we won't provide any guarantees as to support of this library.
12 |
13 | For quick usage examples see the [Usage](#usage) section below or the [notebook](notebooks/usage.ipynb) in this repo.
14 |
15 | For detailed documentation visit https://lector.readthedocs.io/.
16 |
17 | ## Installing
18 |
19 | While this library is not yet available on PyPI, you can easily install it from GitHub with
20 |
21 | ```
22 | pip install git+https://github.com/graphext/lector
23 | ```
24 |
25 | ## Usage
26 |
27 | Let's assume we receive a CSV file containing some initial metadata, using the semicolon as separator, having some missing fields, and being encoded in Latin-1 (you'd be surprised how common such files are in the real world).
28 |
29 |
30 | **Create example CSV file**
31 |
32 | ``` python
33 | csv = """
34 | Some preamble content here
35 | This is still "part of the metadata preamble"
36 | id;genre;metric;count;content;website;tags;vecs;date
37 | 1234982348728374;a;0.1;1;; http://www.graphext.com;"[a,b,c]";"[1.3, 1.4, 1.67]";11/10/2022
38 | ;b;0.12;;"Natural language text is different from categorical data."; https://www.twitter.com;[d];"[0, 1.9423]";01/10/2022
39 | 9007199254740993;a;3.14;3;"The Project · Gutenberg » EBook « of Die Fürstin.";http://www.google.com;"['e', 'f']";["84.234, 12509.99"];13/10/2021
40 | """.encode("ISO-8859-1")
41 |
42 | with open("example.csv", "wb") as fp:
43 | fp.write(csv)
44 | ```
45 |
46 |
47 |
48 | To read this with lector into a pandas DataFrame, simply use
49 |
50 | ``` python
51 | df = lector.read_csv("example.csv", to_pandas=True)
52 | ```
53 |
54 | Printing the DataFrame and its column types produces the following output:
55 |
56 | ```
57 | id genre metric count \
58 | 0 1234982348728374 a 0.10 1
59 | 1 b 0.12
60 | 2 9007199254740993 a 3.14 3
61 |
62 | content website \
63 | 0 http://www.graphext.com
64 | 1 Natural language text is different from catego... https://www.twitter.com
65 | 2 The Project · Gutenberg » EBook « of Die Fürstin. http://www.google.com
66 |
67 | tags vecs date
68 | 0 [a, b, c] [1.3, 1.4, 1.67] 2022-10-11
69 | 1 [d] [0.0, 1.9423] 2022-10-01
70 | 2 [e, f] [84.234, 12509.99] 2021-10-13
71 |
72 | id Int64
73 | genre category
74 | metric float64
75 | count UInt8
76 | content string
77 | website category
78 | tags object
79 | vecs object
80 | date datetime64[ns]
81 | dtype: object
82 | ```
83 |
84 | This is pretty sweet, because
85 |
86 | - we didn't have to tell lector _how_ to read this file (text encoding, lines to skip, separator etc.)
87 | - we didn't have to tell lector the _data types_ of the columns, but it inferred the correct and most efficient ones automatically, e.g.:
88 |   - a nullable `Int64` extension type was necessary to correctly represent values in the `id` column
89 |   - the `genre` column was automatically converted to the efficient `category` (dictionary-encoded) type
90 |   - the `count` column uses the _smallest_ integer type necessary
91 |   - the `content` column, containing natural language text, has _not_ been converted to a categorical type, but kept as string values (it is unlikely to benefit from dictionary-encoding)
92 |   - the `date` column was converted to datetimes correctly, even though the original
93 |     strings are not in an ISO format
94 |   - the `tags` and `vecs` columns have been imported with `object` dtype (since pandas
95 |     doesn't officially support iterables as elements in a column), but their values are in fact numpy arrays of the correct dtype (see the quick check below)!
96 |
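A quick way to verify the last point (illustrative; uses the `df` from the example above):

``` python
vec = df["vecs"].iloc[0]
print(type(vec), vec.dtype)
# <class 'numpy.ndarray'> float64
```
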
97 | Neither pandas nor arrow will do this. In fact, they cannot even import this data correctly _without_ attempting any smart type inference. Compare e.g. with pandas' attempt to read the same CSV file:
98 |
99 | **Pandas and Arrow fail**
100 |
101 | Firstly, to get something close to the above, you'll have to spend a good amount of time manually inspecting the CSV file and come up with the following verbose pandas call:
102 |
103 | ``` python
104 | dtypes = {
105 | "id": "Int64",
106 | "genre": "category",
107 | "metric": "float",
108 | "count": "UInt8",
109 | "content": "string",
110 | "website": "category",
111 | "tags": "object",
112 | "vecs": "object"
113 | }
114 |
115 | df = pd.read_csv(
116 | fp,
117 | encoding="ISO-8859-1",
118 | skiprows=3,
119 | sep=";",
120 | dtype=dtypes,
121 | parse_dates=["date"],
122 | infer_datetime_format=True
123 | )
124 |
125 | ```
126 |
127 | While this _parses_ the CSV file alright, the result is, urm, lacking. Let's see:
128 |
129 | ```
130 | id genre metric count \
131 | 0 1234982348728374 a 0.10 1
132 | 1 b 0.12
133 | 2 9007199254740992 a 3.14 3
134 |
135 | content \
136 | 0
137 | 1 Natural language text is different from catego...
138 | 2 The Project · Gutenberg » EBook « of Die Fürstin.
139 |
140 | website tags vecs date
141 | 0 http://www.graphext.com [a,b,c] [1.3, 1.4, 1.67] 2022-11-10
142 | 1 https://www.twitter.com [d] [0, 1.9423] 2022-01-10
143 | 2 http://www.google.com ['e', 'f'] ["84.234, 12509.99"] 2021-10-13
144 |
145 | id Int64
146 | genre category
147 | metric float64
148 | count UInt8
149 | content string
150 | website category
151 | tags object
152 | vecs object
153 | date datetime64[ns]
154 | dtype: object
155 | ```
156 |
157 | A couple of observations:
158 |
159 | - Pandas _will_ always cast numeric columns with missing data to the float type, before any of our custom types are applied. This is a big problem, as we can see in the `id` column, since not all integers can be represented exactly by a 64-bit floating point type (the correct value in our file is `9007199254740993` 👀). It is also a sneaky problem, because it happens silently, so you may not realize you've got wrong IDs, and may produce totally wrong analyses if you use them down the line for joins etc. The only way to import CSV files like this with pandas correctly is to inspect the actual data in a text editor, guess the best data type, import the data without any type inference, and then individually cast each column to the correct type. There is no way to configure pandas to import the data correctly in one go.
160 | - Pandas has messed up the dates. Though it at least warns us about it, pandas doesn't try to infer a consistent date format across all rows. Although the CSV file contains all dates in a single consistent format (`%d/%m/%Y`), pandas has used mixed formats and so imported some dates incorrectly.
161 | - Without the explicit `dtype` argument, the `genre` and `content` columns would have been imported with the `object` dtype, which is not particularly useful, but not necessarily a problem either.
162 | - Since pandas doesn't support iterable dtypes, the `tags` and `vecs` columns haven't been parsed into any useful structure.
163 |
164 | Note that Arrow doesn't fare much better. It doesn't parse or infer its own `list` data type, it doesn't know how to parse dates in any format other than ISO 8601, and it commits the same integer-as-float conversion error.
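
For reference, a minimal pyarrow attempt might look something like the following sketch (same manually determined encoding, rows to skip and separator as above):

``` python
import pyarrow.csv as pacsv

tbl = pacsv.read_csv(
    "example.csv",
    read_options=pacsv.ReadOptions(encoding="ISO-8859-1", skip_rows=3),
    parse_options=pacsv.ParseOptions(delimiter=";"),
    convert_options=pacsv.ConvertOptions(strings_can_be_null=True),
)

# The "id" column comes back as double, the non-ISO dates stay strings,
# and "tags"/"vecs" remain plain strings rather than lists.
print(tbl.schema)
```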
165 |
166 |
167 |
168 | ## Development
169 |
170 | To install a local copy for development, including all dependencies for test, documentation and code quality, use the following commands:
171 |
172 | ``` bash
173 | git clone https://github.com/graphext/lector
174 | cd lector
175 | pip install -v -e ".[dev]"
176 | pre-commit install
177 | ```
178 |
179 | The [pre-commit](https://pre-commit.com/) command will make sure that whenever you try to commit changes to this repo, code quality and formatting tools will be executed. This ensures e.g. a common coding style, such that any changes to be committed are functional changes only, not changes due to different personal coding style preferences. This in turn makes it easier to collaborate via pull requests etc.
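
You can also run all hooks on demand, e.g. against the whole repo rather than only staged files:

``` bash
pre-commit run --all-files
```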
180 |
181 | To test the installation you may execute the [pytest](https://docs.pytest.org/) suite to make sure everything's set up correctly, e.g.:
182 |
183 | ``` bash
184 | pytest -v .
185 | ```
186 |
187 | ## Documentation
188 |
189 | The documentation is created using Sphinx and is available here: https://lector.readthedocs.io/.
190 |
191 | You can build and view the static html locally like any other Sphinx project:
192 |
193 | ``` bash
194 | (cd docs && make clean html)
195 | (cd docs/build/html && python -m http.server)
196 | ```
197 |
198 |
199 | ## To Do
200 |
201 | - _Parallelize type inference_? While type inference is already pretty fast, it can potentially be sped up by processing columns in parallel.
202 | - _Testing_. The current pytest setup is terrible. I've given `hypothesis_csv` a try here,
203 |   but I'm probably making bad use of it. Tests are convoluted and probably not even good at catching corner cases.
204 |
205 | ## License
206 |
207 | This project is licensed under the terms of the Apache License 2.0.
208 |
209 | ## Links
210 |
211 | - Documentation: https://lector.readthedocs.io/
212 | - Source: https://github.com/graphext/lector
213 | - Graphext: https://www.graphext.com
214 | - Graphext on Twitter: https://twitter.com/graphext
215 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | rm -rf source/_api
21 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
22 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 |
13 | %SPHINXBUILD% >NUL 2>NUL
14 | if errorlevel 9009 (
15 | echo.
16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
17 | echo.installed, then set the SPHINXBUILD environment variable to point
18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
19 | echo.may add the Sphinx directory to PATH.
20 | echo.
21 | echo.If you don't have Sphinx installed, grab it from
22 | echo.https://www.sphinx-doc.org/
23 | exit /b 1
24 | )
25 |
26 | if "%1" == "" goto help
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/docs/source/_static/css/custom.css:
--------------------------------------------------------------------------------
1 | /* Visually separate classes a bit more. */
2 | .py.class > dt {
3 | border-top: solid 3px;
4 | margin-top: 50px;
5 | padding-left: 5px
6 | }
7 |
8 | .py.method > dt {
9 | border-left: solid 3px;
10 | display: inline;
11 | padding: 5px 10px 5px 10px;
12 | font-size: 0.9rem;
13 | }
14 |
15 | .py.function > dt {
16 | border-left: solid 3px;
17 | /* display: inline; */
18 | padding: 5px 10px 5px 10px;
19 | font-size: 0.9rem;
20 | }
21 |
22 | .py.function > dt:before {
23 | content: "func";
24 | font-style: italic;
25 | }
26 |
27 | .py.attribute > dt, .py.data > dt{
28 | font-size: 0.9rem;
29 | }
30 |
31 | html[data-theme="dark"], html[data-theme="light"] {
32 | --pst-color-inline-code: var(--pst-color-primary);
33 | }
34 |
35 | html[data-theme="light"] .py.class > dt,
36 | html[data-theme="light"] .py.method > dt,
37 | html[data-theme="light"] .py.function > dt {
38 | border-color: lightgrey;
39 | background: #f0f0f0;
40 | }
41 |
42 | html[data-theme="light"] .autosummary.table .row-odd {
43 | background: #f0f0f0;
44 | }
45 |
46 | html[data-theme="dark"] .py.class > dt,
47 | html[data-theme="dark"] .py.method > dt,
48 | html[data-theme="dark"] .py.function > dt {
49 | /* border-color: #82265b;
50 | background: #372b32; */
51 | border-color: var(--pst-color-primary);
52 | background: #263033;
53 | }
54 |
55 | html[data-theme="dark"] .autosummary.table .row-odd {
56 | background: #263033;
57 | }
58 |
--------------------------------------------------------------------------------
/docs/source/_static/graphext.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graphext/lector/fb76591acd69aa01ee37a03f08fc8e68fd5f8aa8/docs/source/_static/graphext.png
--------------------------------------------------------------------------------
/docs/source/_static/gx_logo_sq_blue.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graphext/lector/fb76591acd69aa01ee37a03f08fc8e68fd5f8aa8/docs/source/_static/gx_logo_sq_blue.jpg
--------------------------------------------------------------------------------
/docs/source/_templates/python/module.rst:
--------------------------------------------------------------------------------
1 | {% if not obj.display %}
2 | :orphan:
3 |
4 | {% endif %}
5 | :py:mod:`{{ obj.name }}`
6 | =========={{ "=" * obj.name|length }}
7 |
8 | .. py:module:: {{ obj.name }}
9 |
10 | {% if obj.docstring %}
11 | .. autoapi-nested-parse::
12 |
13 | {{ obj.docstring|indent(3) }}
14 |
15 | {% endif %}
16 |
17 | {% block subpackages %}
18 | {% set visible_subpackages = obj.subpackages|selectattr("display")|list %}
19 | {% if visible_subpackages %}
20 | Subpackages
21 | -----------
22 | .. toctree::
23 | :titlesonly:
24 | :maxdepth: 3
25 |
26 | {% for subpackage in visible_subpackages %}
27 | {{ subpackage.short_name }}/index.rst
28 | {% endfor %}
29 |
30 |
31 | {% endif %}
32 | {% endblock %}
33 | {% block submodules %}
34 | {% set visible_submodules = obj.submodules|selectattr("display")|list %}
35 | {% if visible_submodules %}
36 | Submodules
37 | ----------
38 | .. toctree::
39 | :titlesonly:
40 | :maxdepth: 1
41 |
42 | {% for submodule in visible_submodules %}
43 | {{ submodule.short_name }}/index.rst
44 | {% endfor %}
45 |
46 |
47 | {% endif %}
48 | {% endblock %}
49 | {% block content %}
50 | {% if obj.all is not none %}
51 | {% set visible_children = obj.children|selectattr("short_name", "in", obj.all)|list %}
52 | {% elif obj.type is equalto("package") %}
53 | {% set visible_children = obj.children|selectattr("display")|list %}
54 | {% else %}
55 | {% set visible_children = obj.children|selectattr("display")|rejectattr("imported")|list %}
56 | {% endif %}
57 | {% if visible_children %}
58 |
59 | {% set visible_classes = visible_children|selectattr("type", "equalto", "class")|list %}
60 | {% set visible_functions = visible_children|selectattr("type", "equalto", "function")|list %}
61 | {% set visible_attributes = visible_children|selectattr("type", "equalto", "data")|list %}
62 | {% if "show-module-summary" in autoapi_options and (visible_classes or visible_functions) %}
63 | {% block classes scoped %}
64 | {% if visible_classes %}
65 | Classes
66 | ~~~~~~~
67 |
68 | .. autoapisummary::
69 |
70 | {% for klass in visible_classes %}
71 | {{ klass.id }}
72 | {% endfor %}
73 |
74 |
75 | {% endif %}
76 | {% endblock %}
77 |
78 | {% block functions scoped %}
79 | {% if visible_functions %}
80 | Functions
81 | ~~~~~~~~~
82 |
83 | .. autoapisummary::
84 |
85 | {% for function in visible_functions %}
86 | {{ function.id }}
87 | {% endfor %}
88 |
89 |
90 | {% endif %}
91 | {% endblock %}
92 |
93 | {% block attributes scoped %}
94 | {% if visible_attributes %}
95 | Attributes
96 | ~~~~~~~~~~
97 |
98 | .. autoapisummary::
99 |
100 | {% for attribute in visible_attributes %}
101 | {{ attribute.id }}
102 | {% endfor %}
103 |
104 |
105 | {% endif %}
106 | {% endblock %}
107 | {% endif %}
108 | {% for obj_item in visible_children %}
109 | {{ obj_item.render()|indent(0) }}
110 | {% endfor %}
111 | {% endif %}
112 | {% endblock %}
113 |
--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | sys.path.insert(0, os.path.abspath("../.."))
5 |
6 | from lector import __version__ # noqa
7 |
8 | project = "lector"
9 | copyright = "2022, Thomas Buhrmann"
10 | author = "Thomas Buhrmann"
11 | version = __version__
12 | release = version
13 |
14 | extensions = [
15 | # "sphinx.ext.autodoc",
16 | "sphinx.ext.autosectionlabel",
17 | # "sphinx.ext.autosummary",
18 | "sphinx.ext.napoleon",
19 | "sphinx.ext.viewcode",
20 | # "sphinx_autodoc_typehints",
21 | "sphinx.ext.autodoc.typehints",
22 | "sphinx.ext.todo",
23 | "autoapi.extension",
24 | ]
25 |
26 | autodoc_typehints = "description"
27 |
28 | autoapi_dirs = ["../../lector"]
29 | autoapi_member_order = "groupwise"
30 | autoapi_add_toctree_entry = False
31 | autoapi_template_dir = "_templates"
32 |
33 | autosectionlabel_prefix_document = True
34 |
35 |
36 | templates_path = ["_templates"]
37 | exclude_patterns = []
38 |
39 | html_theme = "pydata_sphinx_theme"
40 | html_static_path = ["_static"]
41 | html_css_files = ["css/custom.css"]
42 |
43 | html_theme_options = {
44 | "icon_links": [
45 | {
46 | "name": "Github",
47 | "url": "https://github.com/graphext/lector",
48 | "icon": "fab fa-github-square",
49 | "type": "fontawesome",
50 | },
51 | {
52 | "name": "Graphext",
53 | "url": "https://www.graphext.com",
54 | "icon": "_static/gx_logo_sq_blue.jpg",
55 | "type": "local",
56 | },
57 | ]
58 | }
59 |
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | Lector
2 | ======
3 |
4 | `Lector <https://github.com/graphext/lector>`_ aims to be a fast reader for potentially
5 | messy CSV files with configurable column type inference. It combines automatic detection
6 | of :ref:`reader:File encodings`, :ref:`CSV dialects <reader:Dialects>` (separator,
7 | escaping etc.) and :ref:`reader:preambles` (initial lines
8 | containing metadata or junk unrelated to the actual tabular data). Its goal is to
9 | just-read-the-effing-CSV file without manual configuration in most cases. Each of the
10 | detection components is configurable and can be swapped out easily with custom implementations.
11 |
12 | Also, since both `pandas <https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html>`_
13 | and Apache's `arrow <https://arrow.apache.org/docs/python/generated/pyarrow.csv.read_csv.html>`_
14 | will destructively cast columns to the wrong type in some cases (e.g. large ID-like integer
15 | strings to floats), it provides an alternative and customisable column type :doc:`inference and
16 | casting <types>` mechanism.
17 |
18 | Under the hood it uses pyarrow's CSV parser for reading, and its compute functions for optional
19 | type inference.
20 |
21 | Lector is used at `Graphext <https://www.graphext.com>`_ behind the scenes whenever a user
22 | uploads a new dataset, and so implicitly has been validated across 1000s of different CSV
23 | files from all kinds of sources.
24 |
25 | Note, however, that this is Graphext's first foray into open-sourcing our code and still
26 | *work-in-progress*. So at least initially we won't provide any guarantees as to support
27 | of this library.
28 |
29 | For a quick illustration of how to use ``lector``, see the :doc:`quickstart guide <quick>`.
30 |
31 |
32 | .. toctree::
33 | :hidden:
34 |
35 |    Home <self>
36 |    Quickstart <quick>
37 |    CSV Reader <reader>
38 |    Types <types>
39 |    API
40 |
--------------------------------------------------------------------------------
/docs/source/quick.rst:
--------------------------------------------------------------------------------
1 | Quickstart
2 | ==========
3 |
4 |
5 | Installation
6 | ------------
7 |
8 | While this library is not yet available on PyPI, you can easily install it from GitHub with
9 |
10 | .. code-block:: bash
11 |
12 | pip install git+https://github.com/graphext/lector
13 |
14 | The project depends on ``cchardet`` for encoding detection, ``clevercsv`` for advanced
15 | dialect detection, ``pyarrow`` for CSV parsing and type inference/conversion, as well as
16 | ``rich`` and ``typer`` for pretty output and the command-line interface.
17 |
18 | Quickstart
19 | ----------
20 |
21 | The below examples illustrate lector's default behaviour when reading CSV files. For
22 | customization options, check the :doc:`reader` and :doc:`types` sections as well as the
23 | API reference.
24 |
25 | Let's assume we receive the following CSV file, containing some initial metadata, using
26 | the semicolon as separator, having some missing fields, and being encoded in Latin-1:
27 |
28 | .. code-block:: python
29 |
30 | csv = """
31 | Some metadata
32 | Some more metadata
33 | id;category;metric;count;text
34 | 1234982348728374;a;0.1;1;
35 | ;b;0.12;;"Natural language text is different from categorical data."
36 | 18446744073709551615;a;3.14;3;"The Project · Gutenberg » EBook « of Die Fürstin."
37 | """.encode("ISO-8859-1")
38 |
39 | with open("example.csv", "wb") as fp:
40 | fp.write(csv)
41 |
42 | The recommended way to use lector for reading this CSV (without type-inference) would be
43 |
44 | .. code-block:: python
45 |
46 | import lector
47 |
48 | tbl = lector.read_csv("example.csv", types="string", log=True)
49 |
50 | which produces something like the following output::
51 |
52 | 'Fieldless' matches CSV buffer: detected 3 rows to skip.
53 |
54 | ─────────── CSV Format ────────────
55 | {
56 | 'encoding': 'ISO-8859-1',
57 | 'preamble': 3,
58 | 'dialect': Dialect(
59 | delimiter=';',
60 | quote_char='"',
61 | escape_char=None,
62 | double_quote=True,
63 | skip_initial_space=False,
64 | line_terminator='\r\n',
65 | quoting=0
66 | )
67 | }
68 | ───────────────────────────────────
69 |
70 | The log provides some feedback about properties of the CSV that lector has detected
71 | automatically, namely:
72 |
73 | - It has found a *preamble* pattern named ``Fieldless`` that matches the beginning of the
74 | CSV file and indicates that the first 3 rows should be skipped (lector has an extensible
75 | list of such patterns which are tried in order until a match is found)
76 | - It has detected the *encoding* correctly as ``ISO-8859-1`` (this cannot be guaranteed in all
77 |   cases, but the CSV will always be read with a fallback encoding, usually ``utf-8``, and
78 | characters that cannot be decoded will be represented by �)
79 | - It has correctly detected the CSV *dialect* (the delimiter used etc.)
80 | - The encoding, preamble and dialect together are stored in a ``Format`` object, which holds
81 | all the necessary parameters to parse the CSV file correctly with pandas or arrow
82 |
83 | Using the detected CSV format, the data is parsed (using pyarrow's ``csv.read_csv()`` under
84 | the hood). Note we have indicated to arrow to parse all columns using the ``string`` type,
85 | effectively turning *off* its internal type inference. We can use lector's type inference by
86 | not specifying the ``types`` argument or selecting it explicitly:
87 |
88 | .. code-block:: python
89 |
90 | tbl = lector.read_csv("example.csv")
91 | tbl = lector.read_csv("example.csv", types=lector.Inference.Auto) # equivalent
92 | print(tbl.schema)
93 |
94 | We see this results in the most appropriate type for each column:
95 |
96 | .. code-block::
97 |
98 | pyarrow.Table
99 | id: uint64
100 | category: dictionary
101 | metric: double
102 | count: uint8
103 | text: string
104 |
105 | Notice that:
106 |
107 | - An unsigned int (``uint64``) was necessary to correctly represent all values in the id
108 | column. Had values been even larger than the maximum of the ``uint64`` type, the values
109 | would have been converted to a categorical type (strings), rather than floats
110 | - The category column was automatically converted to the memory-efficient ``dictionary``
111 | (categorical) type
112 | - The count column uses the smallest integer type necessary (``uint8``, unsigned since all
113 | values are positive)
114 | - The text column, containing natural language text, has *not* been converted to a categorical
115 | type, but kept as simple ``string`` values (it is unlikely to benefit from dictionary-encoding)
116 |
117 | We could have relied on arrow's built-in type inference instead, like so:
118 |
119 | .. code-block:: python
120 |
121 | tbl = lector.read_csv("example.csv", types=lector.Inference.Native)
122 |
123 | but this would result in less memory-efficient and even erroneous data types (see the
124 | pandas and pure arrow comparisons below).
125 |
126 | Finally, if you need the CSV table in pandas, lector provides a little helper for correct
127 | conversion (again, pure arrow's ``to_pandas(...)`` isn't smart or flexible enough to use pandas
128 | extension dtypes for correct conversion). Use it as an argument to ``read_csv(...)`` or explicitly:
129 |
130 | .. code-block:: python
131 |
132 | from lector.utils import to_pandas
133 |
134 | df = lector.read_csv("example.csv", to_pandas=True)
135 |
136 | # equivalent:
137 | tbl = lector.read_csv("example.csv")
138 | df = to_pandas(tbl)
139 | print(df)
140 |     print(df.dtypes)
141 |
142 | Which outputs::
143 |
144 | id category metric count \
145 | 0 1234982348728374 a 0.10 1
146 | 1 b 0.12
147 | 2 18446744073709551615 a 3.14 3
148 |
149 | text
150 | 0
151 | 1 Natural language text is different from catego...
152 | 2 The Project · Gutenberg » EBook « of Die Fürstin.
153 |
154 | id UInt64
155 | category category
156 | metric float64
157 | count UInt8
158 | text string
159 | dtype: object
160 |
161 | Note how nullable pandas extension dtypes are used to preserve correct integer values, where pure arrow would have used the unsafe float type instead.
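
A quick check confirms the large id survived the round trip exactly (illustrative):

.. code-block:: python

    print(df["id"].iloc[2])  # 18446744073709551615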
162 |
163 | Compared with pandas
164 | --------------------
165 |
166 | Trying to read CSV files like the above using ``pandas.read_csv(...)`` and default arguments
167 | only will fail (at least in pandas < 2.0). To find the correct arguments, you'll have to open the CSV in a text editor
168 | and manually identify the separator and the initial lines to skip, and then try different
169 | encodings until you find one that seems to decode all characters correctly. But even if you
170 | then manage to read the CSV, the result may not be what you expected:
171 |
172 | .. code-block:: python
173 |
174 | csv = """
175 | Some metadata
176 | Some more metadata
177 | id;category;metric;count;text
178 | 1234982348728374;a;0.1;1;"This is a text."
179 | ;b;0.12;;"Natural language text is different from categorical data."
180 | 9007199254740993;a;3.14;3;"The Project · Gutenberg » EBook « of Die Fürstin."
181 | """.encode("ISO-8859-1")
182 |
183 | df = pd.read_csv(
184 | io.BytesIO(csv),
185 | encoding="ISO-8859-1",
186 | skiprows=3,
187 | sep=";",
188 | index_col=False
189 | )
190 | print(df)
191 | print(df.dtypes)
192 |
193 | results in::
194 |
195 | id category metric count \
196 | 0 1.234982e+15 a 0.10 1.0
197 | 1 NaN b 0.12 NaN
198 | 2 9.007199e+15 a 3.14 3.0
199 |
200 | text
201 | 0 This is a text.
202 | 1 Natural language text is different from catego...
203 | 2 The Project · Gutenberg » EBook « of Die Fürstin.
204 |
205 |
206 | id float64
207 | category object
208 | metric float64
209 | count float64
210 | text object
211 |
212 |
213 | The ``category`` and ``text`` columns have been imported with the ``object`` dtype,
214 | which is not particularly useful, but not necessarily a problem either.
215 |
216 | Note, however, that numeric-like columns with missing data have been cast to the ``float``
217 | type. This may seem merely a nuisance in the case of the ``count`` column, which could easily
218 | be cast to a (nullable) integer type. It is, however, a big problem for the ``id`` column,
219 | since not all integers can be represented exactly by a 64 bit floating type:
220 |
221 | .. code-block::
222 |
223 | >>> print(df.id.iloc[2])
224 | 9007199254740992.0
225 |
226 | which is not the value ``"9007199254740993"`` contained in our CSV file! We cannot cast
227 | this column to the correct type anymore either (e.g. ``int64`` or ``string``), because
228 | the original value is lost. It is also a sneaky problem, because you may not realize
229 | you've got wrong IDs, and may produce totally wrong analyses if you use them down the
230 | line for joins etc. The only way to import CSV files like this correctly is to inspect
231 | essentially all columns and all rows manually in a text editor, choose the best data type
232 | manually, and then provide these types via pandas ``dtype`` argument. This may be feasible
233 | if you work with CSVs only sporadically, but quickly becomes cumbersome otherwise.
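
Casting after the fact does not help either; the original value is already gone (a sketch):

.. code-block:: python

    print(int(df.id.iloc[2]))              # 9007199254740992, not ...993
    print(df.id.astype("Int64").iloc[2])   # 9007199254740992 as well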
234 |
235 |
236 | Compared with arrow
237 | -------------------
238 |
239 | The arrow CSV reader unfortunately faces exactly the same limitations as pandas:
240 |
241 | .. code-block:: python
242 |
243 | import pyarrow as pa
244 | import pyarrow.csv
245 |
246 |
247 | csv = """
248 | Some metadata
249 | Some more metadata
250 | id;category;metric;count;text
251 | 1234982348728374;a;0.1;1;
252 | ;b;0.12;;"Natural language text is different from categorical data."
253 | 18446744073709551615;a;3.14;3;"The Project · Gutenberg » EBook « of Die Fürstin."
254 | """.encode("ISO-8859-1")
255 |
256 | tbl = pa.csv.read_csv(
257 | io.BytesIO(csv),
258 | read_options=pa.csv.ReadOptions(encoding="ISO-8859-1", skip_rows=3),
259 | parse_options=pa.csv.ParseOptions(delimiter=";"),
260 | convert_options=pa.csv.ConvertOptions(strings_can_be_null=True)
261 | )
262 |
263 | print(tbl)
264 | int(tbl.column("id")[2].as_py())
265 |
266 | It needs the same level of human inspection to identify the correct arguments to read the CSV,
267 | and destructively casts IDs to floats (but at least uses a more efficient string type where
268 | applicable, in contrast to pandas object dtype)::
269 |
270 | pyarrow.Table
271 | id: double
272 | category: string
273 | metric: double
274 | count: int64
275 | text: string
276 | ----
277 | id: [[1.234982348728374e+15,null,1.8446744073709552e+19]]
278 | category: [["a","b","a"]]
279 | metric: [[0.1,0.12,3.14]]
280 | count: [[1,null,3]]
281 | text: [[null,"Natural language text is different from categorical data.","The Project · Gutenberg » EBook « of Die Fürstin."]]
282 |
283 | 18446744073709551616
284 |
285 | Again, the only way to ensure correctness of the parsed CSV is to not use arrow's built-in
286 | type inference, but provide the desired type for each column manually.
287 |
--------------------------------------------------------------------------------
/docs/source/reader.rst:
--------------------------------------------------------------------------------
1 | CSV Reader
2 | ==========
3 |
4 | The CSV reader has the simple task of detecting 3
5 | properties of a CSV file:
6 |
7 | 1. The text encoding (utf-8, latin-1 etc.)
8 | 2. A potential preamble (initial lines to skip)
9 | 3. The CSV dialect (delimiter etc.)
10 |
11 | Lector provides an abstract base class and default implementations for each of
12 | the three detectors (see below).
13 |
14 | A reader itself then simply receives instances of these detectors (or the results
15 | of the detection), and configures the parameters of a CSV parser accordingly. The
16 | main CSV parser in lector is pyarrow's `csv.read_csv() <https://arrow.apache.org/docs/python/generated/pyarrow.csv.read_csv.html>`_,
17 | as used in the :class:`lector.csv.arrow.ArrowReader`. As an example
18 | for using alternative parsers we also include a :class:`lector.csv.pandas.PandasReader`.
19 | Both implement the abstract :class:`lector.csv.abc.Reader` class.
20 |
21 | File encodings
22 | --------------
23 |
24 | An encoding detector in lector is any class having a ``detect()`` method that
25 | accepts a binary (bytes) buffer, and returns a string indicating the name of
26 | a `Python codec <https://docs.python.org/3/library/codecs.html#standard-encodings>`_, as the
27 | :class:`abstract base class <lector.csv.encodings.EncodingDetector>` requires:
28 |
29 | .. code-block:: python
30 |
31 | @dataclass
32 | class EncodingDetector(ABC):
33 |         """Base class specifying interface for all encoding detectors."""
34 |
35 | @abstractmethod
36 | def detect(self, buffer: BinaryIO) -> str:
37 |             """Implement me."""
38 |
39 |
40 | The :class:`default implementation <lector.csv.encodings.Chardet>` uses the
41 | ``cchardet`` library internally and has the following
42 | interface:
43 |
44 | .. code-block:: python
45 |
46 | @dataclass
47 | class Chardet(EncodingDetector):
48 | """An encoding detector using cchardet if the default utf-8 generates too many errors."""
49 |
50 | n_bytes: int = int(1e7) # 10 MB
51 | """Use this many bytes to detect encoding."""
52 | error_threshold: float = 0.0
53 | """A greater proportion of decoding errors than this will be considered a failed encoding."""
54 | confidence_threshold: float = 0.6
55 | """Minimum level of confidence to accept an encoding automatically detected by cchardet."""
56 |
57 | It reads a maximum of ``n_bytes`` bytes from the received buffer, and then in the following
58 | order:
59 |
60 | - Tries to identify an initial byte-order mark (`BOM <https://en.wikipedia.org/wiki/Byte_order_mark>`_)
61 | indicating the file's codec
62 | - Checks whether assuming ``utf-8`` produces less than ``error_threshold`` decoding errors
63 | (and returns this codec if true)
64 | - Uses ``cchardet`` to detect the encoding. If cchardet's confidence is greater than the
65 | ``confidence_threshold``, returns the detected encoding. Otherwise it falls back on the
66 | ``windows-1250`` codec as the windows/latin-like codec that most acts as a superset of
67 | special characters amongst related codecs.
68 |
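Used on its own, the default detector looks roughly like this (a minimal sketch; assumes
an ``example.csv`` file on disk):

.. code-block:: python

    from lector.csv.encodings import Chardet

    with open("example.csv", "rb") as fp:
        encoding = Chardet().detect(fp)

    print(encoding)  # e.g. "ISO-8859-1"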
69 |
70 | Preambles
71 | ---------
72 |
73 | By "preamble" lector understands initial lines in CSV files to be skipped, e.g. metadata
74 | that should not be interpreted as part of the tabular data itself.
75 |
76 | It is impossible to always detect arbitrary preambles from the CSV data itself. There are,
77 | however, common patterns amongst preambles written to CSV by certain sources. E.g.
78 | some exporters may separate the metadata from actual data by a line of delimiters only.
79 | Others may write only metadata that does not itself contain the delimiter otherwise used
80 | to separate fields in the tabular part.
81 |
82 | Since it is essentially an open-ended exercise to detect arbitrary preambles, lector was
83 | designed to allow easy extension of the patterns to be detected. One simply implements
84 | a new subclass of ``PreambleDetector``, and
85 | uses a decorator to register it with the :class:`preamble registry <lector.csv.preambles.Preambles>`.
86 | Like so:
87 |
88 | .. code-block:: python
89 |
90 | @Preambles.register
91 | @dataclass
92 | class MyPreamble(PreambleDetector):
93 |
94 | def detect(self, buffer: TextIO) -> int:
95 | ...
96 |
97 | In this case the detector will receive an already decoded *text* buffer, and should
98 | return an integer indicating the number of lines to skip.
99 |
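As a (hypothetical) illustration, a detector that skips initial comment lines starting
with ``#`` might look something like this:

.. code-block:: python

    from dataclasses import dataclass
    from typing import TextIO

    # Import paths assumed here; see the API reference for the exact locations.
    from lector.csv.preambles import Preambles, PreambleDetector


    @Preambles.register
    @dataclass
    class CommentPreamble(PreambleDetector):
        """Hypothetical detector: skip leading lines starting with a comment character."""

        comment_char: str = "#"

        def detect(self, buffer: TextIO) -> int:
            n_skip = 0
            for line in buffer:
                if line.lstrip().startswith(self.comment_char):
                    n_skip += 1
                else:
                    break
            return n_skip
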
100 | :class:`lector.csv.preambles.Brandwatch`, and :class:`lector.csv.preambles.Fieldless`
101 | are two detectors provided out of the box. The former checks for initial lines followed
102 | by a single line of commas only. The latter checks for N initial lines containing a single
103 | field only, followed by at least one line containing multiple fields. It then returns N as
104 | the number of rows to skip.
105 |
106 | :meth:`lector.csv.preambles.Preambles.detect` is responsible for trying all
107 | implemented detectors in the order they have been registered and returns the first match
108 | (returning N > 0 lines to skip). This may prove too constraining in the long run and
109 | may change in the future so that the order is more easily configurable.
110 |
111 | Dialects
112 | --------
113 |
114 | The CSV format is not in fact a strict standard, and there are a number of differences
115 | in how CSV files can be generated. E.g. while the delimiter is usually the comma, it may
116 | also be a semi-colon, the tab or any other arbitrary character. To handle the delimiter
117 | appearing *within* fields, one may choose to quote such fields, or use a special escape
118 | character etc.
119 |
120 | A `CSV dialect `_
121 | is a set of parameters describing how to parse a CSV file, i.e. identifying the delimiter,
122 | quote character and so on. In Python's `csv` module, it was unfortunately decided that
123 | to use such dialects one has to pass around subclasses, rather than instances. Since
124 | this is somewhat awkward, lector implements its own :class:`lector.csv.dialects.Dialect`.
125 |
126 | Instances of dialects are used as return values by dialect detectors in lector, the abstract
127 | base class of which is simply
128 |
129 | .. code-block:: python
130 |
131 | @dataclass
132 | class DialectDetector(ABC):
133 | """Base class for all dialect detectors."""
134 |
135 | @abstractmethod
136 | def detect(self, buffer: TextIO) -> Dialect:
137 | ...
138 |
139 | Lector provides two default implementations. :class:`lector.csv.dialects.PySniffer` uses the
140 | Python standard library's `CSV Sniffer `_
141 | internally and fixes up the result specifically for more robust *parsing* of CSVs.
142 |
143 | Alternatively, if `clevercsv `_ has
144 | been installed as an optional dependency, lector wraps it inside the
145 | :class:`lector.csv.dialects.CleverCSV` detector class. It can be used to trade off speed
146 | for more robust dialect inference.
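
If the dialect is already known, or a specific detector configuration is preferred, either
can be passed to a reader directly. A small sketch (the file path is a placeholder):

.. code-block:: python

    from lector import ArrowReader
    from lector.csv.dialects import PySniffer

    # Explicit dialect parameters, no detection needed ...
    tbl = ArrowReader("/path/to/file.csv", dialect={"delimiter": ";"}).read()

    # ... or an explicitly configured detector
    tbl = ArrowReader("/path/to/file.csv", dialect=PySniffer(n_rows=200)).read()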
147 |
148 | Readers
149 | -------
150 |
151 | Finally, a CSV Reader in lector simply receives an encoding (or encoding detector),
152 | a preamble (or preamble detector) and a dialect (or, wait, a dialect detector). The
153 | abstract base class for readers, :class:`lector.csv.abc.Reader`, is essentially
154 |
155 | .. code-block:: python
156 |
157 | class Reader(ABC):
158 | """Base class for CSV readers."""
159 |
160 | def __init__(
161 | self,
162 | fp: FileLike,
163 | encoding: str | EncodingDetector | None = None,
164 | dialect: dict | DialectDetector | None = None,
165 | preamble: int | PreambleRegistry | None = None,
166 | log: bool = True,
167 | ) -> None:
168 | self.fp = fp
169 | self.encoding = encoding or encodings.Chardet()
170 | self.dialect = dialect or dialects.PySniffer()
171 | self.preamble = preamble or Preambles
172 | self.log = log
173 |
174 | def read(self, *args, **kwds) -> Any:
175 | try:
176 | self.analyze()
177 | result = self.parse(*args, **kwds)
178 | self.buffer.close()
179 | return result
180 | except Exception:
181 | raise
182 |
183 | @abstractmethod
184 | def parse(self, *args, **kwds) -> Any:
185 | """Parse the file pointer or text buffer. Args are forwarded to read()."""
186 | ...
187 |
188 | The base class uses the provided detectors to infer (if necessary) all the information
189 | required to call a CSV parser. It wraps all inferred information in a
190 | :class:`lector.csv.abc.Format` object, which Reader subclasses can then translate
191 | to a specific parser's own parameters. E.g., the only thing the :class:`lector.csv.arrow.ArrowReader`
192 | does is translate a CSV Format to arrow's own ``csv.ReadOptions``, ``csv.ParseOptions``
193 | and ``csv.ConvertOptions`` objects.
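
As a rough sketch of what a subclass has to provide (lector ships a more complete
``PandasReader`` in ``lector.csv.pandas``), a reader forwarding the inferred format to
pandas might look like this:

.. code-block:: python

    import pandas as pd

    from lector.csv.abc import Reader


    class SimplePandasReader(Reader):
        """Minimal sketch: forward the inferred CSV format to pandas.read_csv()."""

        def parse(self, **kwds):
            # After analyze(), self.buffer is a decoded text buffer and self.format
            # holds the inferred encoding, preamble and dialect
            return pd.read_csv(
                self.buffer,
                skiprows=self.format.preamble,
                sep=self.format.dialect.delimiter,
                quotechar=self.format.dialect.quote_char,
                **kwds,
            )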
194 |
195 | If no parameters (other than a file pointer) are passed, a reader uses the default
196 | implementations of all detectors, which means that if no customization is needed,
197 | reading almost any CSV becomes simply:
198 |
199 | .. code-block:: python
200 |
201 | from lector import ArrowReader
202 |
203 | tbl = ArrowReader("/path/to/file.csv").read()
204 |
--------------------------------------------------------------------------------
/docs/source/types.rst:
--------------------------------------------------------------------------------
1 | Types
2 | =====
3 |
4 | Introduction
5 | ------------
6 |
7 | Lector implements its own column type inference. It can be used by parsing a CSV file
8 | with ``string`` types only (preserving the original fields without modification),
9 | and then auto-casting all columns to the most appropriate and efficient data type:
10 |
11 | .. code-block:: python
12 |
13 | import io
14 | import lector
15 | from lector import ArrowReader, Autocast
16 | from lector.log import schema_view
17 |
18 | csv = """id,genre,metric,count,content,website,tags
19 | 1234982348728374,a,0.1,1,, http://www.graphext.com,"[a,b,c]"
20 | ,b,0.12,,"Natural language text is different from categorical data.", https://www.twitter.com,[d]
21 | 18446744073709551615,a,3.14,3,"The Project · Gutenberg » EBook « of Die Fürstin.",http://www.google.com,"['e', 'f']"
22 | """.encode()
23 |
24 | # High-level API uses automatic type-inference and casting by default
25 | tbl = lector.read_csv(io.BytesIO(csv))
26 |
27 | # Equivalent low-level API
28 | tbl = ArrowReader(io.BytesIO(csv)).read(types="string")
29 | tbl = Autocast().cast(tbl)
30 |
31 | schema_view(tbl.schema)
32 |
33 | Printing the table schema this way will produce the following output:
34 |
35 | .. code-block::
36 |
37 | Schema
38 | ─────────────────────────────────────────────────────────
39 | Column Type Meta
40 | ─────────────────────────────────────────────────────────
41 | id uint64 {'semantic': 'number[UInt64]'}
42 | genre dict {'semantic': 'category'}
43 | metric double {'semantic': 'number[double]'}
44 | count uint8 {'semantic': 'number[UInt8]'}
45 | content string {'semantic': 'text'}
46 | website dict {'semantic': 'url'}
47 | tags list {'semantic': 'list[category]'}
48 | ─────────────────────────────────────────────────────────
49 |
50 | The schema view contains, for each column in the table, the arrow type it has been
51 | cast to, as well as some metadata about the *semantic* type lector has inferred.
52 | By semantic type we mean the kind of content the column contains, which may be
53 | different from (more specific than) the "physical" (arrow) type used to store it.
54 |
55 | For example, the "website" column has been cast to arrow using a dictionary type with
56 | string values (``dict``). However, lector has in fact inferred that the column
57 | contains URLs, and simply selected the dictionary type as the best storage type for URLs.
58 | Equally, the "content" column has been inferred to contain natural language text, and in
59 | this case arrow's ``string`` type is used for storage. Also note that lector handles
60 | types that have no equivalent in `pandas`. The "tags" column contains lists of strings, for
61 | example, which lector has automatically parsed and cast to arrow's ``list``
62 | type.
63 |
64 | For numeric columns lector has automatically identified the most efficient (least
65 | memory-hungry) data types. The semantic metadata here is used to indicate pandas'
66 | corresponding (potentially nullable extension) ``dtype``.
67 |
68 | Using lector's ``to_pandas()`` function we can convert the arrow table to a pandas DataFrame
69 | ensuring that all data is converted correctly, even when there is no corresponding
70 | pandas type:
71 |
72 | .. code-block:: python
73 |
74 | from lector.utils import to_pandas
75 |
76 | df = to_pandas(tbl)
77 | print(df)
78 | print(df.dtypes)
79 |
80 | produces
81 |
82 | .. code-block::
83 |
84 | id genre metric count \
85 | 0 1234982348728374 a 0.10 1
86 | 1 b 0.12
87 | 2 18446744073709551615 a 3.14 3
88 |
89 | content website \
90 | 0 http://www.graphext.com
91 | 1 Natural language text is different from catego... https://www.twitter.com
92 | 2 The Project · Gutenberg » EBook « of Die Fürstin. http://www.google.com
93 |
94 | tags
95 | 0 [a, b, c]
96 | 1 [d]
97 | 2 [e, f]
98 |
99 |
100 | id UInt64
101 | genre category
102 | metric float64
103 | count UInt8
104 | content string
105 | website category
106 | tags object
107 | dtype: object
108 |
109 | Note that arrow's ``tbl.to_pandas()`` would have converted integer columns with
110 | missing data to the float type, which is not safe and may introduce erroneous data
111 | (because of insufficient float precision when representing large integers). Lector
112 | uses extension dtypes where necessary. Also note how all tag lists have been parsed
113 | correctly, despite having various different representations in the CSV data (use of
114 | quotes etc.). In pandas, the lists are represented by a column of numpy arrays.
115 |
116 | Array Converters
117 | ----------------
118 |
119 | ``Converters`` in Lector are responsible for inferring the semantic type of a column
120 | (i.e. an arrow Array or ChunkedArray), identifying the corresponding storage type,
121 | and potentially generating some useful metadata.
122 |
123 | Lector implements one subclass of :class:`lector.types.abc.Converter` for each semantic
124 | type. At the moment there are:
125 |
126 | - :class:`lector.types.numbers.Number` (``number``)
127 | - :class:`lector.types.lists.List` (``list[number]`` or ``list[category]``)
128 | - :class:`lector.types.strings.Text` (``text``)
129 | - :class:`lector.types.strings.Category` (``category``)
130 | - :class:`lector.types.strings.Url` (``url``)
131 | - :class:`lector.types.timestamps.Timestamp` (``date``)
132 | - :class:`lector.types.bools.Boolean` (``boolean``)
132 |
133 | Their interface is simple. Minimally they have to accept a ``threshold`` as a
134 | parameter, and must implement a ``convert()`` method:
135 |
136 | .. code-block:: python
137 |
138 | @dataclass
139 | class Converter(ABC):
140 |
141 | threshold: float = 1.0
142 |
143 | @abstractmethod
144 | def convert(self, arr: Array) -> Conversion | None:
145 | ...
146 |
147 | @dataclass
148 | class Conversion:
149 |
150 | result: Array
151 | meta: dict = field(default_factory=dict)
152 |
153 | A specific converter (e.g. ``Number``) returns ``None`` if the passed
154 | array data is not compatible with the type (e.g. the values are not numeric).
155 | Otherwise it will return a ``Conversion`` object containing the correctly
156 | cast array and potential metadata.
157 |
158 | The converters can be used in two ways for casting a table of raw data: using
159 | :class:`lector.types.cast.Autocast` to infer the best type automatically, or
160 | using :class:`lector.types.cast.Cast` to specify the (semantic) type for each
161 | column explicitly (see below for more information).
162 |
163 | In both cases, exactly *how* types are used for inference and casting can be
164 | configured by
165 |
166 | - limiting or extending the list of allowed semantic types/converters
167 | - configuring each converter via parameters
168 |
169 | The single parameter common to all converters is the ``threshold``. It
170 | specifies the minimum proportion of values in an array that have to be
171 | valid according to the given type for it to return a cast result. I.e.
172 | a converter (e.g. ``Number``) should return ``None`` if the proportion
173 | of valid (e.g. numeric-like) values is less than ``threshold``.
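
To illustrate the interface, here is a hypothetical converter (not part of lector) that
would turn percentage strings such as ``"12%"`` into float fractions, respecting the
``threshold`` semantics described above:

.. code-block:: python

    from __future__ import annotations

    from dataclasses import dataclass

    import pyarrow as pa
    import pyarrow.compute as pc
    from pyarrow import Array

    from lector.types.abc import Conversion, Converter


    @dataclass
    class Percentage(Converter):
        """Hypothetical converter turning strings like "12%" into float fractions."""

        def convert(self, arr: Array) -> Conversion | None:
            if not pa.types.is_string(arr.type):
                return None

            # Proportion of non-null values that look like percentages
            is_pct = pc.match_substring_regex(arr, r"^-?\d+(\.\d+)?%$")
            n_valid = pc.sum(is_pct).as_py() or 0
            n_non_null = len(arr) - arr.null_count
            if n_non_null == 0 or n_valid / n_non_null < self.threshold:
                return None

            # Null out non-matching values, strip the "%" and cast
            clean = pc.if_else(is_pct, arr, pa.scalar(None, pa.string()))
            numbers = pc.cast(pc.utf8_rtrim(clean, characters="%"), pa.float64())
            return Conversion(pc.divide(numbers, 100.0), meta={"semantic": "number[double]"})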
174 |
175 | Automatic Table Cast
176 | --------------------
177 |
178 | For each column, the :class:`lector.types.cast.Autocast` simply tries each semantic
179 | type (i.e. ``Converter``) in an ordered list. The first ``Converter`` returning a
180 | ``Conversion`` (rather than ``None``) is used to cast the column. If no list of
181 | converters is specified explicitly (as in the first example above), a default
182 | order of all implemented converters with default parameters is used
183 | (:data:`lector.types.cast.DEFAULT_CONVERTERS`).
184 |
185 | As mentioned above, the ``Autocast`` can be configured by passing an ordered list
186 | of allowed converters, and by configuring the parameters of individual converters.
187 | If that is not enough, and a more complicated cast strategy is required, one can
188 | also implement a new subclass of :class:`lector.types.cast.CastStrategy`. The base
189 | class takes care of iteration over columns and some other minor boilerplate so
190 | that subclasses only have to implement the conversion of a single Array.
191 |
192 | Explicit Table Cast
193 | -------------------
194 |
195 | :class:`lector.types.cast.Cast` is even simpler. Given a mapping of column names to
196 | converters, it applies a specific converter to a specific column. If the conversion
197 | is successful, that column is cast; otherwise it is left as is. For example, given the CSV
198 | data above:
199 |
200 | .. code-block:: python
201 |
202 | from lector import Cast
203 | from lector.types import Category, Number
204 |
205 | strategy = Cast({
206 | "id": Number(),
207 | "genre": Category(max_cardinality=None),
208 | # ...
209 | })
210 |
211 | # High-level API
212 | tbl = lector.read_csv(io.BytesIO(csv), strategy=strategy)
213 |
214 | # Low-level API
215 | tbl = ArrowReader(io.BytesIO(csv)).read(types="string")
216 | tbl = strategy.cast(tbl)
217 | schema_view(tbl.schema)
218 |
219 | produces
220 |
221 | .. code-block::
222 |
223 | Schema
224 | ─────────────────────────────────────────────────────────
225 | Column Type Meta
226 | ─────────────────────────────────────────────────────────
227 | id uint64 {'semantic': 'number[UInt64]'}
228 | genre dict {'semantic': 'category'}
229 | metric string
230 | count string
231 | content string
232 | website string
233 | tags string
234 | ─────────────────────────────────────────────────────────
235 |
236 | I.e., only the two specified columns have been converted using the configured
237 | types.
238 |
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: lectorpy310
2 | channels:
3 | - conda-forge
4 | dependencies:
5 | - cchardet
6 | - gdown
7 | - humanize
8 | - rich
9 | - typer
10 | - pandas
11 | - pyarrow>=8.0.0
12 | - python=3.10
13 | - pip
14 | - pip:
15 | - clevercsv<0.8.1
16 | - lector @ git+https://github.com/graphext/lector@main
17 |
--------------------------------------------------------------------------------
/lector/__init__.py:
--------------------------------------------------------------------------------
1 | """A package for fast parsing of messy CSV files and smart-ish type inference."""
2 |
3 | from __future__ import annotations
4 |
5 | from enum import Enum
6 |
7 | from . import utils
8 | from .csv import ArrowReader, Dialect, EmptyFileError, Format, Preambles
9 | from .csv.abc import FileLike, PreambleRegistry
10 | from .csv.dialects import DialectDetector
11 | from .csv.encodings import EncodingDetector
12 | from .log import CONSOLE, LOG, schema_view, table_view
13 | from .types import Autocast, Cast, Converter, Registry
14 | from .types.cast import CastStrategy
15 |
16 |
17 | class Inference(str, Enum):
18 | Native = "Native"
19 | Auto = "Auto"
20 | Disable = "Disable"
21 |
22 |
23 | def read_csv(
24 | fp: FileLike,
25 | encoding: str | EncodingDetector | None = None,
26 | dialect: dict | DialectDetector | None = None,
27 | preamble: int | PreambleRegistry | None = None,
28 | types: str | dict | Inference = Inference.Auto,
29 | strategy: CastStrategy | None = None,
30 | to_pandas: bool = False,
31 | log: bool = False,
32 | ):
33 | """Thin wrapper around class-based reader interface."""
34 |
35 | reader = ArrowReader(fp, encoding=encoding, dialect=dialect, preamble=preamble, log=log)
36 |
37 | dtypes = types
38 | if isinstance(types, Inference):
39 | dtypes = None if types == Inference.Native else "string"
40 |
41 | tbl = reader.read(types=dtypes)
42 |
43 | if types == Inference.Auto:
44 | strategy = strategy or Autocast(log=log)
45 | tbl = strategy.cast(tbl)
46 |
47 | if to_pandas:
48 | if utils.PANDAS_INSTALLED:
49 | return utils.to_pandas(tbl)
50 |
51 | raise Exception("It seems pandas isn't installed in this environment!")
52 |
53 | return tbl
54 |
55 |
56 | __all__ = [
57 | "Autocast",
58 | "ArrowReader",
59 | "Cast",
60 | "CONSOLE",
61 | "Converter",
62 | "EmptyFileError",
63 | "Dialect",
64 | "Format",
65 | "LOG",
66 | "Preambles",
67 | "Registry",
68 | "schema_view",
69 | "table_view",
70 | ]
71 |
72 | __version__ = "0.3.5"
73 |
--------------------------------------------------------------------------------
/lector/cli.py:
--------------------------------------------------------------------------------
1 | """Command-line interface."""
2 | from pathlib import Path
3 | from typing import Optional
4 |
5 | import typer
6 |
7 | from . import Inference, read_csv
8 | from .log import LOG, pformat, schema_view, table_view
9 | from .utils import Timer
10 |
11 | CLI = typer.Typer()
12 |
13 |
14 | @CLI.command()
15 | def read(
16 | fp: Path = typer.Argument( # noqa: B008
17 | ..., exists=True, file_okay=True, dir_okay=False, resolve_path=True
18 | ),
19 | types: Optional[Inference] = typer.Option(Inference.Auto),
20 | log: Optional[bool] = typer.Option(False),
21 | ):
22 | """Read a CSV file into an Arrow table."""
23 | with Timer() as t:
24 | tbl = read_csv(fp, types=types, log=log)
25 |
26 | LOG.info(pformat(table_view(tbl, title="Final table")))
27 | LOG.info(pformat(schema_view(tbl.schema, title="Schema")))
28 | LOG.info(f"Import took {t.elapsed:.2f} seconds.")
29 |
--------------------------------------------------------------------------------
/lector/csv/__init__.py:
--------------------------------------------------------------------------------
1 | """Subpackage for smart parsing of CSV files.
2 |
3 | Helps detecting encoding, preambles (initial junk to skip), CSV dialects etc.
4 | """
5 | from .abc import EmptyFileError, Format, Reader
6 | from .arrow import ArrowReader
7 | from .dialects import Dialect, PySniffer
8 | from .encodings import Chardet
9 | from .preambles import Preambles
10 |
11 | __all__ = [
12 | "ArrowReader",
13 | "Chardet",
14 | "Dialect",
15 | "EmptyFileError",
16 | "Format",
17 | "Preambles",
18 | "PySniffer",
19 | "Reader",
20 | ]
21 |
--------------------------------------------------------------------------------
/lector/csv/abc.py:
--------------------------------------------------------------------------------
1 | """Fast and robust parser for potentially messy CSV data."""
2 | from __future__ import annotations
3 |
4 | import io
5 | from abc import ABC, abstractmethod
6 | from contextlib import suppress
7 | from csv import DictReader
8 | from dataclasses import dataclass, field
9 | from pathlib import Path
10 | from typing import IO, Any, TextIO, Union
11 |
12 | from rich.table import Table as RichTable
13 |
14 | from ..log import LOG, dict_view, pformat
15 | from ..utils import reset_buffer
16 | from . import dialects, encodings
17 | from .dialects import Dialect, DialectDetector
18 | from .encodings import EncodingDetector
19 | from .preambles import Preambles
20 |
21 | FileLike = Union[str, Path, IO]
22 |
23 | PreambleRegistry = type(Preambles)
24 |
25 |
26 | class EmptyFileError(Exception):
27 | """Raised when a binary file read() returns 0 bytes."""
28 |
29 |
30 | def is_empty(buffer: IO) -> bool:
31 | """Check if a binary or text buffer is empty (from current position onwards)."""
32 | pos = buffer.tell()
33 | empty = len(buffer.read(1)) == 0
34 | buffer.seek(pos)
35 | return empty
36 |
37 |
38 | class CleanTextBuffer(io.TextIOWrapper):
39 | """Remove null bytes on the fly."""
40 |
41 | def read(self, *args):
42 | return super().read(*args).replace("\x00", "")
43 |
44 | def readline(self, *args):
45 | return super().readline(*args).replace("\x00", "")
46 |
47 |
48 | @dataclass
49 | class Format:
50 | """Holds all parameters needed to successfully read a CSV file."""
51 |
52 | encoding: str | None = "utf-8"
53 | preamble: int | None = 0
54 | dialect: Dialect | None = field(default_factory=lambda: Dialect())
55 | columns: list[str] | None = None
56 |
57 | def __rich__(self) -> RichTable:
58 | return dict_view(
59 | {k: v for k, v in self.__dict__.items() if k != "columns"},
60 | title="CSV Format",
61 | width=120,
62 | )
63 |
64 |
65 | class Reader(ABC):
66 | """Base class for CSV readers."""
67 |
68 | def __init__(
69 | self,
70 | fp: FileLike,
71 | encoding: str | EncodingDetector | None = None,
72 | dialect: dict | Dialect | DialectDetector | None = None,
73 | preamble: int | PreambleRegistry | None = None,
74 | log: bool = True,
75 | ) -> None:
76 | self.fp = fp
77 | self.encoding = encoding or encodings.Chardet()
78 | self.dialect = dialect or dialects.CleverCSV()
79 | self.preamble = preamble if preamble is not None else Preambles
80 | self.log = log
81 |
82 | def decode(self, fp: FileLike) -> TextIO:
83 | """Make sure we have a text buffer."""
84 | buffer = fp
85 |
86 | if isinstance(buffer, (str, Path)):
87 | if isinstance(self.encoding, str):
88 | buffer = open(buffer, encoding=self.encoding, errors="replace") # noqa: SIM115
89 | else:
90 | buffer = open(buffer, "rb") # noqa: SIM115
91 |
92 | if is_empty(buffer):
93 | raise EmptyFileError(f"The passed object ({buffer}) contained 0 bytes of data.")
94 |
95 | if isinstance(buffer, io.BufferedIOBase):
96 | if isinstance(self.encoding, EncodingDetector):
97 | with reset_buffer(buffer):
98 | self.encoding = self.encoding.detect(buffer)
99 |
100 | buffer = CleanTextBuffer(buffer, encoding=self.encoding, errors="replace")
101 | else:
102 | self.encoding = buffer.encoding or "UTF-8"
103 |
104 | return buffer
105 |
106 | def detect_preamble(self, buffer: TextIO) -> int:
107 | """Detect the number of junk lines at the start of the file."""
108 | if self.preamble is None:
109 | return 0
110 | if isinstance(self.preamble, (int, float)):
111 | return self.preamble
112 | if issubclass(self.preamble, Preambles):
113 | return Preambles.detect(buffer, log=self.log) or 0
114 |
115 | return 0
116 |
117 | def detect_dialect(self, buffer: TextIO) -> dict:
118 | """Detect separator, quote character etc."""
119 | if isinstance(self.dialect, DialectDetector):
120 | return self.dialect.detect(buffer)
121 |
122 | if isinstance(self.dialect, dict):
123 | return Dialect(**self.dialect)
124 |
125 | return self.dialect
126 |
127 | @classmethod
128 | def detect_columns(cls, buffer: TextIO, dialect: Dialect) -> list[str]:
129 | """Extract column names from buffer pointing at header row."""
130 | reader = DictReader(buffer, dialect=dialect.to_builtin())
131 | with suppress(StopIteration):
132 | _ = next(reader)
133 |
134 | return reader.fieldnames
135 |
136 | def analyze(self):
137 | """Infer all parameters required for reading a csv file."""
138 | self.buffer = self.decode(self.fp)
139 | cursor = self.buffer.tell()
140 |
141 | with reset_buffer(self.buffer):
142 | self.preamble = self.detect_preamble(self.buffer)
143 |
144 | for _ in range(self.preamble):
145 | self.buffer.readline()
146 |
147 | with reset_buffer(self.buffer):
148 | self.dialect = self.detect_dialect(self.buffer)
149 |
150 | with reset_buffer(self.buffer):
151 | self.columns = self.detect_columns(self.buffer, self.dialect)
152 |
153 | self.format = Format(
154 | encoding=self.encoding,
155 | preamble=self.preamble,
156 | dialect=self.dialect,
157 | columns=self.columns,
158 | )
159 |
160 | if self.log:
161 | LOG.info(pformat(self.format))
162 |
163 | self.buffer.seek(cursor)
164 |
165 | @abstractmethod
166 | def parse(self, *args, **kwds) -> Any:
167 | """Parse the file pointer or text buffer. Args are forwarded to read()."""
168 |
169 | def read(self, *args, **kwds) -> Any:
170 | try:
171 | self.analyze()
172 | result = self.parse(*args, **kwds)
173 | self.buffer.close()
174 | return result
175 | except Exception:
176 | raise
177 |
178 | __call__ = read
179 |
--------------------------------------------------------------------------------
/lector/csv/arrow.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import codecs
4 | from codecs import StreamRecoder
5 | from collections.abc import Iterable
6 | from io import SEEK_CUR, BufferedIOBase, TextIOBase
7 | from pathlib import Path
8 | from typing import Union
9 |
10 | import pyarrow as pa
11 | import pyarrow.csv as pacsv
12 | from pyarrow import DataType
13 | from pyarrow.csv import InvalidRow
14 |
15 | from ..log import LOG
16 | from ..utils import MISSING_STRINGS, ensure_type, uniquify
17 | from .abc import EmptyFileError, FileLike, Format, Reader
18 |
19 | TypeDict = dict[str, Union[str, DataType]]
20 |
21 | MAX_MSG_LEN = 200 # characters
22 | SKIPPED_MSG_N_MAX = 20
23 |
24 |
25 | def clean_column_names(names: list[str]) -> list[str]:
26 | """Handle empty and duplicate column names."""
27 |
28 | # Arrow doesn't (yet?) have support for CSV dialect "skipinitialspace" option
29 | names = [name.strip() for name in names]
30 | unnamed = [i for i, x in enumerate(names) if not x]
31 | for i, col_idx in enumerate(unnamed):
32 | names[col_idx] = f"Unnamed_{i}"
33 |
34 | return uniquify(names)
35 |
36 |
37 | def transcode(
38 | fp: FileLike,
39 | codec_in: str = "utf-8",
40 | codec_out: str = "utf-8",
41 | errors="replace",
42 | ) -> StreamRecoder:
43 | """Safely transcode any readable byte stream from decoder to encoder codecs.
44 |
45 | Arrow only accepts byte streams and optional encoding, but has no option to
46 | automatically handle codec errors. It also doesn't seem to like the interface
47 | of a Python recoder when the encoding is "utf-16" (rather than more specific
48 | "utf-16-le" or "utf-16-be").
49 | """
50 | if isinstance(fp, (str, Path)):
51 | fp = open(fp, "rb") # noqa: SIM115
52 | elif isinstance(fp, TextIOBase):
53 |         # Not a no-operation! If we read 3 characters from a text buffer, the underlying binary
54 | # buffer might actually read more, since it reads in batches. Which means its internal
55 | # cursor might be in advance of the current position in the text buffer read so far.
56 | fp.seek(0, SEEK_CUR)
57 | fp = fp.buffer
58 |
59 | if not isinstance(fp, BufferedIOBase):
60 | raise ValueError(f"Have unsupported input: {type(fp)}")
61 |
62 | return codecs.EncodedFile(fp, data_encoding=codec_out, file_encoding=codec_in, errors=errors)
63 |
64 |
65 | class ArrowReader(Reader):
66 | """Use base class detection methods to configure a pyarrow.csv.read_csv() call."""
67 |
68 | def skip_invalid_row(self, row: InvalidRow) -> str:
69 | self.n_skipped += 1
70 |
71 | if self.n_skipped < SKIPPED_MSG_N_MAX:
72 | if row.text and len(row.text) > MAX_MSG_LEN:
73 | row = row._replace(text=row.text[:MAX_MSG_LEN])
74 | LOG.warning(f"Skipping row:\n{row}")
75 |
76 | elif self.n_skipped == SKIPPED_MSG_N_MAX:
77 | LOG.warning("Won't show more skipped row messages.")
78 |
79 | return "skip"
80 |
81 | def configure(self, format: Format) -> dict:
82 | return {
83 | "read_options": {
84 | "encoding": format.encoding,
85 | "skip_rows": format.preamble,
86 | "block_size": 2 << 20, # 2 MiB, twice arrow's default of 1 MiB (1 << 20)
87 | },
88 | "parse_options": {
89 | "delimiter": format.dialect.delimiter,
90 | "quote_char": format.dialect.quote_char,
91 | "double_quote": format.dialect.double_quote,
92 | "escape_char": format.dialect.escape_char,
93 | "newlines_in_values": True,
94 | "invalid_row_handler": self.skip_invalid_row,
95 | },
96 | "convert_options": {
97 | "check_utf8": False,
98 | "strings_can_be_null": True,
99 | "quoted_strings_can_be_null": True,
100 | },
101 | }
102 |
103 | def parse( # noqa: PLR0912
104 | self,
105 | types: str | TypeDict | None = None,
106 | timestamp_formats: str | list[str] | None = None,
107 | null_values: str | Iterable[str] | None = None,
108 | ) -> pa.Table:
109 | """Invoke Arrow's parser with inferred CSV format."""
110 | self.n_skipped = 0
111 |
112 | config = self.configure(self.format)
113 |
114 | ro = config["read_options"]
115 | po = config["parse_options"]
116 | co = config["convert_options"]
117 |
118 | if types is not None:
119 | if isinstance(types, (str, DataType)):
120 | types = {col: ensure_type(types) for col in self.columns}
121 | elif isinstance(types, dict):
122 | types = {col: ensure_type(type) for col, type in types.items()}
123 |
124 | co["column_types"] = types
125 |
126 | if timestamp_formats is not None:
127 | if not isinstance(timestamp_formats, list):
128 | timestamp_formats = [timestamp_formats]
129 |
130 | co["timestamp_parsers"] = timestamp_formats
131 |
132 | if null_values is not None:
133 | if isinstance(null_values, str):
134 | null_values = [null_values]
135 | else:
136 | null_values = list(null_values)
137 |
138 | co["null_values"] = null_values
139 | else:
140 | co["null_values"] = MISSING_STRINGS
141 |
142 | try:
143 | fp = transcode(self.fp, codec_in=self.encoding, codec_out="utf-8")
144 | ro["encoding"] = "utf-8"
145 |
146 | tbl = pacsv.read_csv(
147 | fp,
148 | read_options=pa.csv.ReadOptions(**ro),
149 | parse_options=pa.csv.ParseOptions(**po),
150 | convert_options=pa.csv.ConvertOptions(**co),
151 | )
152 |
153 | column_names = list(clean_column_names(tbl.column_names))
154 | tbl = tbl.rename_columns(column_names)
155 | return tbl
156 | except pa.ArrowInvalid as exc:
157 | if "Empty CSV file or block" in (msg := str(exc)):
158 | raise EmptyFileError(msg) from None
159 |
160 | raise
161 |
--------------------------------------------------------------------------------
/lector/csv/dialects.py:
--------------------------------------------------------------------------------
1 | """Detectors of CSV dialects (separator, quoting etc.).
2 |
3 | Note that python.csv is not even internally consistent. E.g. although the dialect used to produce a
4 | CSV may specify ``\\n`` as the line terminator, the python sniffer is hard-coded to return
5 | ``\\r\\n`` (it doesn't actually support detecting it). Its own reader (and others hopefully) deals
6 | internally with different line breaks, but it means one cannot compare a dialect used to generate a
7 | CSV and a dialect created by sniffing the same (quoting is equally hard-coded to ``QUOTE_MINIMAL``).
8 |
9 | Python quoting levels:
10 |
11 | - ``QUOTE_ALL``: 1
12 | - ``QUOTE_MINIMAL``: 0
13 | - ``QUOTE_NONE``: 3
14 | - ``QUOTE_NONNUMERIC``: 2
15 |
16 | """
17 | from __future__ import annotations
18 |
19 | from abc import ABC, abstractmethod
20 | from collections.abc import Iterable
21 | from contextlib import suppress
22 | from csv import QUOTE_MINIMAL, QUOTE_NONE, Sniffer, get_dialect
23 | from csv import Dialect as PyDialect
24 | from dataclasses import dataclass
25 | from itertools import islice
26 | from typing import TextIO
27 |
28 | from ..log import LOG
29 |
30 | try:
31 | import clevercsv as ccsv
32 |
33 | CLEVER_CSV = True
34 | except Exception:
35 | CLEVER_CSV = False
36 |
37 | PyDialectT = type(PyDialect)
38 |
39 | N_ROWS_DFAULT: int = 100
40 | """How many rows to use for dialect detection."""
41 |
42 | DELIMITER_OPTIONS: tuple[str] = (",", ";", "\t", "|")
43 | """Allowed delimiters for dialect detection."""
44 |
45 |
46 | @dataclass
47 | class Dialect:
48 | """A more convenient class for dialects than Python's built-in.
49 |
50 | The built-in Dialect is a class with class attributes only, and so instead of instances
51 | of that class, Python wants you to send references to subclasses around, which is, uhm,
52 | awkward to say the least (see below _to_builtin() for an example).
53 | """
54 |
55 | delimiter: str = ","
56 | quote_char: str = '"'
57 | escape_char: str | None = None
58 | double_quote: bool = True
59 | skip_initial_space: bool = False
60 | line_terminator: str = "\r\n" # Default in Python and correct according to official spec
61 | quoting: int = QUOTE_MINIMAL
62 |
63 | @classmethod
64 | def from_builtin(cls, dialect: str | PyDialectT) -> Dialect:
65 | """Make instance from built-in dialect class configured for reliable reading(!)."""
66 | if isinstance(dialect, str):
67 | dialect = get_dialect(dialect)
68 |
69 | # A dialect without delimiter doesn't make sense, though CleverCSV may return one,
70 | # e.g. when a CSV file contains a single column only
71 | delimiter = dialect.delimiter or ","
72 |
73 | # To read reliably we need one of escape_char or double quote defined
74 | double_quote = dialect.doublequote or (dialect.escapechar is None)
75 |
76 |         # Although most parsers ignore this, Python's csv module complains when it's missing
77 | line_terminator = dialect.lineterminator or "\r\n"
78 |
79 | # Minimal quoting won't hurt and is sensible if we already know how quoting is used
80 | quoting = dialect.quoting
81 | if quoting == QUOTE_NONE and (dialect.quotechar is not None or dialect.doublequote):
82 | quoting = QUOTE_MINIMAL
83 |
84 | return Dialect(
85 | delimiter=delimiter,
86 | quote_char=dialect.quotechar,
87 | escape_char=dialect.escapechar,
88 | double_quote=double_quote,
89 | skip_initial_space=dialect.skipinitialspace,
90 | line_terminator=line_terminator,
91 | quoting=quoting,
92 | )
93 |
94 | def to_builtin(self) -> PyDialectT:
95 | """Make a subclass of built-in Dialect from this instance."""
96 |
97 | class _Dialect(PyDialect):
98 | _name = "generated"
99 | lineterminator = self.line_terminator
100 | quoting = self.quoting
101 | escapechar = self.escape_char or None
102 | doublequote = self.double_quote
103 | delimiter = self.delimiter
104 | quotechar = self.quote_char
105 | skipinitialspace = self.skip_initial_space
106 | strict = False
107 |
108 | return _Dialect
109 |
110 |
111 | @dataclass
112 | class DialectDetector(ABC):
113 | """Base class for all dialect detectors."""
114 |
115 | @abstractmethod
116 | def detect(self, buffer: TextIO) -> Dialect:
117 | ...
118 |
119 |
120 | @dataclass
121 | class PySniffer(DialectDetector):
122 | """Use Python's built-in csv sniffer."""
123 |
124 | delimiters: Iterable[str] = DELIMITER_OPTIONS
125 | n_rows: int = N_ROWS_DFAULT
126 | log: bool = False
127 |
128 | def detect(self, buffer: TextIO) -> Dialect:
129 | """Detect a dialect we can read(!) a CSV with using the python sniffer.
130 |
131 | Note that the sniffer is not reliable for detecting quoting, quotechar etc., but reasonable
132 | defaults are almost guaranteed to work with most parsers. E.g. the lineterminator is not
133 | even configurable in pyarrow's csv reader, nor in pandas (python engine).
134 | """
135 |
136 | pos = buffer.tell()
137 | sniffer = Sniffer()
138 | sniffer.preferred = []
139 |
140 | for n_rows in (self.n_rows, 1):
141 | with suppress(Exception):
142 | buffer.seek(pos)
143 | sample = "\n".join(islice(buffer, n_rows))
144 | dialect = sniffer.sniff(sample, delimiters=self.delimiters)
145 |
146 | # To read(!) a CSV reliably, we must have either doublequote=True or an escapechar,
147 | # yet Python's sniffer may return doublequote=False and no escapechar if nothing
148 | # was escaped in any way in the given CSV.
149 | dialect.doublequote = dialect.escapechar is None
150 |
151 | # The lineterminator is always returned as "\r\n", but that's ok since parsers
152 | # tend to ignore it anyways
153 | # dialect.lineterminator = ... # noqa
154 |
155 | # May detect that sample has no quotes, but if correct, parsing with minimal quote
156 | # option will still work, and if detection was erroneous, assuming minimal quoting
157 | # is more robust. It's also the default in pandas (=0) and arrow ignores it.
158 | if dialect.quoting == QUOTE_NONE:
159 | dialect.quoting = QUOTE_MINIMAL
160 |
161 | return Dialect.from_builtin(dialect)
162 |
163 | if self.log:
164 | LOG.info("Falling back to default dialect...")
165 |
166 | return Dialect()
167 |
168 |
169 | if CLEVER_CSV:
170 |     # CleverCSV may return nonsensical characters as escapechar.
171 | # Monkey-patch to at least limit to ASCII chars.
172 | is_potential_escapechar_orig = ccsv.escape.is_potential_escapechar
173 |
174 | def is_potential_escapechar(char, encoding, block_char=None):
175 | if not char.isascii():
176 | return False
177 |
178 | return is_potential_escapechar_orig(char, encoding, block_char)
179 |
180 | ccsv.escape.is_potential_escapechar = is_potential_escapechar
181 | ccsv.potential_dialects.is_potential_escapechar = is_potential_escapechar
182 | ccsv.normal_form.is_potential_escapechar = is_potential_escapechar
183 |
184 | @dataclass
185 | class CleverCSV(DialectDetector):
186 | """A more advanced dialect detector using CleverCsv."""
187 |
188 | num_chars: int = int(1e6)
189 | skip: bool = True
190 | method: str = "auto"
191 | verbose: bool = False
192 |
193 | def detect(self, buffer: TextIO) -> Dialect:
194 | text = buffer.read(self.num_chars)
195 | dialect = ccsv.Detector().detect(
196 | text,
197 | delimiters=DELIMITER_OPTIONS,
198 | verbose=self.verbose,
199 | method=self.method,
200 | skip=self.skip,
201 | )
202 | return Dialect.from_builtin(dialect.to_csv_dialect())
203 |
--------------------------------------------------------------------------------
/lector/csv/encodings.py:
--------------------------------------------------------------------------------
1 | """Helpers for detecting character encodings in binary buffers."""
2 | from __future__ import annotations
3 |
4 | import codecs
5 | from abc import ABC, abstractmethod
6 | from dataclasses import dataclass
7 | from typing import BinaryIO, Literal
8 |
9 | import cchardet as cdet
10 |
11 | BOMS: dict[str, tuple[Literal, ...]] = {
12 | "utf-8-sig": (codecs.BOM_UTF8,),
13 | "utf-16": (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE),
14 | "utf-32": (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE),
15 | }
16 | """Map BOM (Byte-order mark) to encoding."""
17 |
18 |
19 | MAX_INT32: int = 2_147_483_647
20 | """Cannot read more than this number of bytes at once to detect encoding."""
21 |
22 | CODEC_ERR_CHAR = "�"
23 | """Character representing non-codable bytes."""
24 |
25 |
26 | def detect_bom(bs: bytes):
27 | """Detect encoding by looking for a BOM at the start of the file."""
28 | for enc, boms in BOMS.items():
29 | if any(bs.startswith(bom) for bom in boms):
30 | return enc
31 |
32 | return None
33 |
34 |
35 | def decoding_errors(bs: bytes, encoding: str, prop: bool = True) -> float:
36 | """The proportion of characters that couldn't be decoded correctly."""
37 | string = bytes.decode(bs, encoding, errors="replace")
38 | err = string.count(CODEC_ERR_CHAR) / (len(string) if prop else 1.0)
39 | return err
40 |
41 |
42 | @dataclass
43 | class EncodingDetector(ABC):
44 |     """Base class specifying the interface for all encoding detectors."""
45 |
46 | @abstractmethod
47 | def detect(self, buffer: BinaryIO) -> str:
48 | """Implement me."""
49 |
50 |
51 | @dataclass
52 | class Chardet(EncodingDetector):
53 | """An encoding detector using cchardet if the default utf-8 generates too many errors."""
54 |
55 | n_bytes: int = int(1e7) # 10 MB
56 | """Use this many bytes to detect encoding."""
57 | error_threshold: float = 0.001
58 | """A greater proportion of decoding errors than this will be considered a failed encoding."""
59 | confidence_threshold: float = 0.6
60 | """Minimum level of confidence to accept an encoding automatically detected by cchardet."""
61 |
62 | def detect(self, buffer: BinaryIO) -> str:
63 | """Somewhat 'opinionated' encoding detection.
64 |
65 | Assumes utf-8 as most common encoding, falling back on cchardet detection, and
66 | if all else fails on windows-1250 if encoding is latin-like.
67 | """
68 | head: bytes = buffer.read(min(self.n_bytes, MAX_INT32))
69 |
70 | bom_encoding = detect_bom(head)
71 | if bom_encoding:
72 | return bom_encoding
73 |
74 | if decoding_errors(head, "utf-8", prop=True) <= self.error_threshold:
75 | return "utf-8"
76 |
77 | detected = cdet.detect(head)
78 | encoding, confidence = detected["encoding"], detected["confidence"]
79 |
80 | if encoding:
81 | if confidence > self.confidence_threshold:
82 | return encoding
83 |
84 | if any(label in encoding.lower() for label in ("windows", "iso-8859")):
85 | # Iso-like, will use windows-1250 as super set for special chars
86 | return "windows-1250"
87 |
88 | return "windows-1250"
89 |
--------------------------------------------------------------------------------
/lector/csv/pandas.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 | from .abc import Reader
4 |
5 |
6 | class PandasReader(Reader):
7 | """Use base class detection methods to configure a pandas.read_csv() call."""
8 |
9 | def parse(self, *args, **kwds):
10 | """Invoke Pandas' parser with inferred CSV format."""
11 | cfg = {
12 | "encoding": self.format.encoding,
13 | "skiprows": self.format.preamble,
14 | "on_bad_lines": "warn",
15 | "engine": "python",
16 | }
17 |         cfg["dialect"] = self.format.dialect.to_builtin()()  # pandas accepts csv.Dialect instances
18 |
19 | # Or "\n"? Pandas doesn't allow "\r\n"...
20 | cfg["lineterminator"] = None
21 |
22 | kwds = {**cfg, **kwds}
23 | result = pd.read_csv(self.buffer, *args, **kwds)
24 | return result
25 |
--------------------------------------------------------------------------------
/lector/csv/preambles.py:
--------------------------------------------------------------------------------
1 | """Detectors of preambles in CSV files.
2 |
3 | This is generally a chicken-and-egg-type situation. To detect generic preambles robustly and
4 | efficiently, it would really help to know the CSV dialect, or at least the delimiter. But to detect
5 | the dialect/delimiter correctly, we need to ignore (skip) the preamble. Detectors may therefore
6 | rely on (sometimes) overly simplistic heuristics implicitly assuming a certain dialect.
7 | """
8 | from __future__ import annotations
9 |
10 | import csv
11 | from abc import ABC, abstractmethod
12 | from collections.abc import Iterable
13 | from dataclasses import dataclass, field
14 | from itertools import islice
15 | from typing import TextIO
16 |
17 | from ..log import LOG
18 |
19 |
20 | @dataclass
21 | class PreambleDetector(ABC):
22 | """Base class for detecting preambles (initial junk) in a CSV buffer."""
23 |
24 | n_rows: int = 100
25 |
26 | @abstractmethod
27 | def detect(self, buffer: TextIO) -> int:
28 | """Detect preamble and return number of lines to skip."""
29 |
30 |
31 | class Preambles:
32 | """Registry to manage preamble detectors."""
33 |
34 | DETECTORS = {}
35 |
36 | @classmethod
37 | def register(cls, registered: type) -> type:
38 | cls.DETECTORS[registered.__name__] = registered
39 | return registered
40 |
41 | @classmethod
42 | def detect(
43 | cls,
44 | buffer: TextIO,
45 | detectors: Iterable[PreambleDetector] | None = None,
46 | log: bool = False,
47 | ) -> int:
48 | """Get result of first preamble detector matching the csv buffer.
49 |
50 | Matching here means detecting more than 0 rows of preamble text, and result
51 | is the number of rows to skip.
52 |
53 | If no detectors are provided (as ordered sequence), all registered
54 | detector classes are tried in registered order and using default parameters.
55 | """
56 | cursor = buffer.tell()
57 |
58 | if detectors is None:
59 | detectors = (det() for det in cls.DETECTORS.values())
60 |
61 | for detector in detectors:
62 | skiprows = detector.detect(buffer)
63 | if skiprows:
64 | if log:
65 | name = detector.__class__.__name__
66 | msg = f"'{name}' matches CSV buffer: detected {skiprows} rows to skip."
67 | LOG.info(msg)
68 | return skiprows
69 |
70 | buffer.seek(cursor)
71 |
72 | return 0
73 |
74 |
75 | @Preambles.register
76 | @dataclass
77 | class Brandwatch(PreambleDetector):
78 | """Detect CSV files exported from Brandwatch.
79 |
80 | Brandwatch uses the comma as separator and includes a row of commas only
81 | to separate preamble texts from the CSV table as such.
82 | """
83 |
84 | def detect(self, buffer: TextIO) -> int:
85 | rows = [row.strip() for row in islice(buffer, self.n_rows)]
86 |
87 | for i, row in enumerate(rows):
88 | if len(row) > 0 and all(x == "," for x in row):
89 | return i + 1
90 |
91 | return 0
92 |
93 |
94 | @Preambles.register
95 | @dataclass
96 | class Fieldless(PreambleDetector):
97 | """Detects initial rows that don't contain any delimited fields.
98 |
99 | Tries parsing buffer using Python's built-in csv functionality, assuming as delimiter the most
100 | frequent character amongst those configured via ``delimiters``. Given this delimiter, the parser
101 | detects N initial lines containing a single field only, followed by at least one line containing
102 | multiple fields. N is then the number of rows to skip.
103 | """
104 |
105 | delimiters: str | list[str] = field(default_factory=lambda: [",", ";", "\t"])
106 |
107 | def detect_with_delimiter(self, buffer: TextIO, delimiter: str) -> int:
108 | """Count how many consecutive initial fieldless rows we have given specific delimiter."""
109 |
110 | reader = csv.reader(
111 | islice(buffer, self.n_rows),
112 | delimiter=delimiter,
113 | quotechar='"',
114 | quoting=csv.QUOTE_MINIMAL,
115 | doublequote=True,
116 | skipinitialspace=True,
117 | )
118 |
119 | for row in reader:
120 | if len(row) > 1:
121 | return reader.line_num - 1
122 |
123 | return 0
124 |
125 | def detect(self, buffer: TextIO) -> int:
126 | """Count consecutive initial fieldless rows given the most frequent delimiter."""
127 |
128 | cursor = buffer.tell()
129 | delimiters = [self.delimiters] if isinstance(self.delimiters, str) else self.delimiters
130 |
131 | text = "".join(islice(buffer, self.n_rows))
132 | counts = {delim: text.count(delim) for delim in delimiters}
133 | delimiter = max(counts.items(), key=lambda item: item[1])[0]
134 |
135 | buffer.seek(cursor)
136 | return self.detect_with_delimiter(buffer, delimiter)
137 |
138 |
139 | @Preambles.register
140 | @dataclass
141 | class GoogleAds(Fieldless):
142 | """In GoogleAds CSVs the garbage lines don't contain the separator (comma or tab).
143 |
144 | The only complications are that 1) GoogleAds has two CSV export formats: 'Excel' using tabs
145 | as separators and normal 'CSV' the comma; 2) A single column CSV wouldn't have the
146 | separator either.
147 |
148 | GoogleAds also seems to include two "totals" rows at the end, which we exclude here.
149 | """
150 |
151 | def detect(self, buffer: TextIO) -> int:
152 | cursor = buffer.tell()
153 | skip = super().detect(buffer)
154 |
155 | if skip:
156 | buffer.seek(cursor)
157 | rows = [row.strip() for row in islice(buffer, self.n_rows)]
158 |
159 | is_report = any("informe de" in row.lower() for row in rows[0:skip])
160 | has_campaign_col = any("Campaña" in col for col in rows[skip].split(","))
161 |
162 | if is_report and has_campaign_col:
163 | self.skipfooter = 2
164 | else:
165 | skip = 0
166 | self.skipfooter = 0
167 |
168 | return skip
169 |
--------------------------------------------------------------------------------
/lector/log.py:
--------------------------------------------------------------------------------
1 | """Helpers to pretty print/log objects using Rich."""
2 | from __future__ import annotations
3 |
4 | import logging
5 | import sys
6 | from collections.abc import Iterable, Sequence
7 | from functools import partial
8 | from typing import TypeVar
9 |
10 | import pyarrow as pa
11 | import pyarrow.types as pat
12 | from pyarrow import DataType, Schema
13 | from pyarrow import Table as PaTable
14 | from rich import box, get_console
15 | from rich.padding import Padding
16 | from rich.panel import Panel
17 | from rich.pretty import Pretty
18 | from rich.progress import Progress, TimeElapsedColumn
19 | from rich.table import Column, Table
20 | from rich.text import Text
21 |
22 | from .utils import decode_metadata
23 |
24 | CONSOLE = get_console()
25 |
26 | BOX = box.HORIZONTALS
27 |
28 | Item = TypeVar("Item")
29 |
30 |
31 | class ColoredFormatter(logging.Formatter):
32 | """A custom formatter controlling message color."""
33 |
34 | RESET = "\x1b[0m"
35 |
36 | FORMAT = "{asctime} {levelname} | {name} | {module}.{funcName}:{lineno} \n{message}"
37 |
38 | COLORS = {
39 | logging.DEBUG: "\x1b[38;20m", # grey
40 | logging.INFO: "\x1b[38;20m", # grey
41 | logging.WARNING: "\x1b[33;1m", # bold yellow
42 | logging.ERROR: "\x1b[31;1m", # bold red
43 | logging.CRITICAL: "\x1b[31;1m", # bold red
44 | }
45 |
46 | def __init__(self, datefmt=None, validate=True):
47 | super().__init__(self.FORMAT, style="{", datefmt=datefmt, validate=validate)
48 |
49 | def format(self, record):
50 | msg = super().format(record)
51 | col = self.COLORS.get(record.levelno)
52 |         return f"{col}{msg}{self.RESET}" if col else msg
53 |
54 |
55 | def setup_logging(level=logging.DEBUG, color=True):
56 | """Ensure logging handler is only added once."""
57 | date_fmt = "%H:%M:%S"
58 | if color:
59 | fmt = ColoredFormatter(datefmt=date_fmt)
60 | else:
61 | fmt = logging.Formatter(
62 | "{asctime} {levelname} | {name} | {module}.{funcName}:{lineno} \n{message}",
63 | datefmt=date_fmt,
64 | style="{",
65 | )
66 |
67 | logger = logging.getLogger("lector")
68 | logger.setLevel(level)
69 |
70 | _sh = logging.StreamHandler(sys.stdout)
71 | _sh.setFormatter(fmt)
72 | logger.addHandler(_sh)
73 |
74 | return logger
75 |
76 |
77 | LOG = setup_logging(level=logging.INFO, color=True)
78 |
79 |
80 | def pformat(obj, console=None, markup=True, end="", strip=False, **kwargs):
81 | """Pretty format any object, if possible with Rich."""
82 | console = console or CONSOLE
83 |
84 | with console.capture() as capture:
85 | console.print(obj, markup=markup, end=end)
86 |
87 | result = capture.get()
88 |
89 | if strip:
90 | result = result.strip()
91 |
92 | return result
93 |
94 |
95 | iformat = partial(pformat, strip=True)
96 | """Shortcut for inline formatting, avoiding strings terminating in line breaks."""
97 |
98 |
99 | def track(
100 | items: Iterable[Item] | Sequence[Item],
101 | columns: Iterable[Column] | None = None,
102 | total: float | None = None,
103 | desc: str = "Processing",
104 | update_period: float = 0.1,
105 | **kwds,
106 | ) -> Iterable[Item]:
107 | """Rich track with elapsed time by default."""
108 | if columns is None:
109 | columns = (*Progress.get_default_columns(), TimeElapsedColumn())
110 |
111 | with Progress(*columns, **kwds) as progress:
112 | yield from progress.track(
113 | items,
114 | total=total,
115 | description=desc,
116 | update_period=update_period,
117 | )
118 |
119 |
120 | def type_view(type: DataType) -> str:
121 |     """More compact string representation of arrow data types."""
122 | if pat.is_list(type):
123 | return f"list<{type.value_type}>"
124 | if pat.is_dictionary(type):
125 | if type.ordered:
126 | return f"dict<{type.value_type}, ordered>"
127 | return f"dict<{type.value_type}>"
128 | return str(type)
129 |
130 |
131 | def dict_view(
132 | d: dict, title: str = "", expand: bool = False, width=None, padding=1, **kwds
133 | ) -> Panel:
134 | dv = Pretty(d, **kwds)
135 | p = Panel(dv, expand=expand, title=title, width=width, box=BOX)
136 | return Padding(p, padding)
137 |
138 |
139 | def schema_view(schema: Schema, title: str | None = "Schema", padding: int = 1) -> Table:
140 | """Make a rich view for arrow schema."""
141 |
142 | meta = {field.name: decode_metadata(field.metadata or {}) for field in schema}
143 | have_meta = any(meta.values())
144 |
145 | rt = Table(title=title, title_justify="left", box=BOX)
146 | rt.add_column("Column", justify="left", style="indian_red1", no_wrap=True)
147 | rt.add_column("Type", style="yellow3")
148 | if have_meta:
149 | rt.add_column("Meta")
150 |
151 | for field in schema:
152 | if have_meta:
153 | field_meta = meta.get(field.name)
154 | field_meta = Pretty(field_meta) if field_meta else None
155 | rt.add_row(field.name, type_view(field.type), field_meta)
156 | else:
157 | rt.add_row(field.name, type_view(field.type))
158 |
159 | return Padding(rt, padding)
160 |
161 |
162 | def schema_comparison(
163 | s1: Schema,
164 | s2: Schema,
165 | title: str | None = None,
166 | padding: int = 1,
167 | left: str = "Before",
168 | right: str = "After",
169 | ):
170 | meta = {field.name: decode_metadata(field.metadata or {}) for field in s2}
171 | have_meta = any(meta.values())
172 |
173 | t = Table(title=title, title_justify="left", box=BOX)
174 | t.add_column("Column", justify="left", style="indian_red1", no_wrap=True)
175 | t.add_column(left, style="orange1")
176 | t.add_column(right, style="yellow3")
177 | if have_meta:
178 | t.add_column("Meta")
179 |
180 | for field in s2:
181 | if have_meta:
182 | field_meta = meta.get(field.name)
183 | field_meta = Pretty(field_meta) if field_meta else ""
184 |
185 | other = s1.field(field.name)
186 | if field.type != other.type:
187 | orig_type = type_view(other.type)
188 | else:
189 | orig_type = ""
190 |
191 | t.add_row(field.name, orig_type, type_view(field.type), field_meta)
192 |
193 | return Padding(t, padding)
194 |
195 |
196 | def schema_diff_view(diff: dict, title: str | None = None, padding: int = 1) -> Table:
197 | """Make a rich view for an arrow schema diff."""
198 |
199 | t = Table(title=title, title_justify="left", box=BOX)
200 | t.add_column("Column", justify="left", style="indian_red1", no_wrap=True)
201 | t.add_column("Before", style="orange1")
202 | t.add_column("After", style="yellow3")
203 |
204 | for col, (before, after) in diff.items():
205 | t.add_row(col, type_view(before), type_view(after))
206 |
207 | return Padding(t, padding)
208 |
209 |
210 | def table_view(
211 | tbl: PaTable,
212 | title: str | None = None,
213 | n_rows_max: int = 10,
214 | n_columns_max: int = 6,
215 | max_column_width: int = 20,
216 | padding: int = 1,
217 | ) -> Table:
218 | """Pyarrow table to rich table."""
219 |
220 | sample = tbl
221 |
222 | if sample.num_rows > n_rows_max:
223 | sample = sample.slice(0, n_rows_max)
224 |
225 | if sample.num_columns > n_columns_max:
226 | sample = sample.select(range(n_columns_max))
227 | rest = pa.array(["..."] * len(sample))
228 | sample = sample.append_column(field_="...", column=rest)
229 |
230 | style = "bold indian_red1"
231 | caption = Text.from_markup(
232 | f"[{style}]{tbl.num_rows:,}[/] rows ✕ [{style}]{tbl.num_columns}[/] columns"
233 | )
234 |
235 | table = Table(
236 | title=title,
237 | caption=caption,
238 | title_justify="left",
239 | caption_justify="left",
240 | box=BOX,
241 | )
242 |
243 | for field in sample.schema:
244 | name = field.name
245 | table.add_column(
246 | name,
247 | max_width=max_column_width,
248 | overflow="crop",
249 | no_wrap=True,
250 | )
251 |
252 | rows = sample.to_pylist()
253 | ellipses = len(rows) < tbl.num_rows
254 |
255 | def value_repr(x):
256 | if x is None:
257 | return None
258 | if x == "...":
259 | return x
260 | return Pretty(x, max_length=max_column_width, max_string=max_column_width)
261 |
262 | for i, row in enumerate(rows):
263 | row = [value_repr(x) for x in row.values()]
264 | end_section = False if ellipses else i == len(rows) - 1
265 | table.add_row(*row, end_section=end_section)
266 |
267 | if ellipses:
268 | table.add_row(*["..."] * len(rows[0]), end_section=True)
269 |
270 | def type_repr(table, column):
271 | if column == "...":
272 | return ""
273 |
274 | style = "italic yellow3"
275 | type_ = table.schema.field(column).type
276 | return Text.from_markup(f"[{style}]{type_view(type_)}[/]")
277 |
278 | def null_repr(table, column):
279 | if column == "...":
280 | return ""
281 |
282 | style = "italic"
283 | n_nulls = table.column(column).null_count
284 | if n_nulls:
285 | return Text.from_markup(f"[{style} bold]nulls {n_nulls}[/]")
286 |
287 | return Text.from_markup(f"[{style}]nulls 0[/]")
288 |
289 | types = [type_repr(sample, column) for column in sample.column_names]
290 | nulls = [null_repr(sample, column) for column in sample.column_names]
291 | table.add_row(*nulls)
292 | table.add_row(*types)
293 |
294 | return Padding(table, padding)
295 |
--------------------------------------------------------------------------------
/lector/types/__init__.py:
--------------------------------------------------------------------------------
1 | """Subpackage for inferring column types in CSV files.
2 |
3 | This is instead or on top of Arrow's built-in inference, which currently doesn't detect
4 | list columns, timestamps in non-ISO formats, or semantic types such as URLs, natural language
5 | text etc.
6 | """
7 | from .abc import Converter, Registry
8 | from .bools import Boolean
9 | from .cast import Autocast, Cast
10 | from .lists import List
11 | from .numbers import Number
12 | from .strings import Category, Text, Url
13 | from .timestamps import Timestamp
14 |
15 | """Note, we need to import the types here, otherwise they won't be registered."""
16 |
17 | __all__ = [
18 | "Autocast",
19 | "Boolean",
20 | "Cast",
21 | "Converter",
22 | "Registry",
23 | "Category",
24 | "List",
25 | "Number",
26 | "Text",
27 | "Timestamp",
28 | "Url",
29 | ]
30 |
--------------------------------------------------------------------------------
/lector/types/abc.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from abc import ABC, abstractmethod
4 | from dataclasses import dataclass, field
5 |
6 | import rich.repr
7 | from pyarrow import Array
8 |
9 |
10 | @dataclass
11 | class Conversion:
12 | """Return type of a conversion/cast."""
13 |
14 | result: Array
15 | meta: dict = field(default_factory=dict)
16 |
17 |
18 | @dataclass
19 | @rich.repr.auto
20 | class Converter(ABC):
21 | """Simple base class for dependency injection of new custom data types.
22 |
23 | If a proportion of values smaller than `threshold` can be successfully
24 | converted, the converter should return None.
25 | """
26 |
27 | threshold: float = 1.0
28 |
29 | @abstractmethod
30 | def convert(self, arr: Array) -> Conversion | None:
31 | """To be implemented in subclasses."""
32 |
33 |
34 | @dataclass
35 | class ConverterRegistry:
36 | """Registry to manage converters."""
37 |
38 | convs: dict[str, Converter] = field(default_factory=dict)
39 |
40 | def register(self, registered: type) -> type:
41 | self.convs[registered.__name__.lower()] = registered
42 | return registered
43 |
44 | def __getitem__(self, item: str) -> Converter:
45 | return self.convs[item.lower()]
46 |
47 |
48 | Registry = ConverterRegistry()
49 | """'Singleton' conversion registry."""
50 |
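51 | # Extension sketch: the "Percent" converter below is hypothetical (not part of lector) and only
52 | # illustrates the intended pattern: subclass Converter, implement convert(), and register the
53 | # class so that config-driven casting can refer to it by its lower-cased name.
54 | if __name__ == "__main__":
55 |     import pyarrow as pa
56 |     import pyarrow.compute as pac
57 | 
58 |     @dataclass
59 |     @Registry.register
60 |     class Percent(Converter):
61 |         def convert(self, arr: Array) -> Conversion | None:
62 |             if not pa.types.is_string(arr.type):
63 |                 return None
64 |             try:
65 |                 stripped = pac.replace_substring(arr, pattern="%", replacement="")
66 |                 return Conversion(stripped.cast(pa.float64()), meta={"semantic": "percent"})
67 |             except pa.ArrowInvalid:
68 |                 return None
69 | 
70 |     conv = Registry["percent"]().convert(pa.array(["10%", "25.5%"]))
71 |     print(conv.result)  # [10, 25.5] as float64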
--------------------------------------------------------------------------------
/lector/types/bools.py:
--------------------------------------------------------------------------------
1 | """Classes for converting arrays to the boolean type."""
2 | from __future__ import annotations
3 |
4 | from contextlib import suppress
5 | from dataclasses import dataclass
6 |
7 | import pyarrow as pa
8 | from pyarrow import Array
9 |
10 | from ..utils import is_stringy
11 | from .abc import Conversion, Converter, Registry
12 |
13 |
14 | @dataclass
15 | @Registry.register
16 | class Boolean(Converter):
17 |     """Converts stringy booleans ("true"/"False") and stringy ints ("0"/"1") to the boolean type."""
18 |
19 | def convert(self, array: Array) -> Conversion | None:
20 | if not is_stringy(array.type) or array.null_count == len(array):
21 | return None
22 |
23 | meta = {"semantic": "boolean"}
24 |
25 |         with suppress(pa.ArrowInvalid):
26 |             converted = array.cast(pa.bool_())
27 | 
28 |             n = len(array)
29 |             valid_before = n - array.null_count
30 |             valid_after = n - converted.null_count
31 |             proportion_valid = valid_after / valid_before
32 | 
33 |             if proportion_valid >= self.threshold:
34 |                 return Conversion(converted, meta=meta)
35 | 
36 |         return None
37 |
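38 | # Usage sketch (made-up data): strings Arrow can cast to bool are accepted; with the default
39 | # threshold of 1.0, a single unparseable value makes the converter return None.
40 | if __name__ == "__main__":
41 |     ok = Boolean().convert(pa.array(["true", "False", "1", None]))
42 |     print(ok.result)  # [true, false, true, null]
43 |     print(Boolean().convert(pa.array(["yes?", "no?"])))  # None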
--------------------------------------------------------------------------------
/lector/types/cast.py:
--------------------------------------------------------------------------------
1 | """Helpers to easily cast columns to their most appropriate/efficient type."""
2 | from __future__ import annotations
3 |
4 | from abc import ABC, abstractmethod
5 | from collections.abc import Iterable
6 | from dataclasses import dataclass, field
7 | from typing import Union
8 |
9 | import pyarrow as pa
10 | from pyarrow import Array, ChunkedArray, Table
11 | from tqdm.auto import tqdm
12 |
13 | from ..log import LOG, iformat, pformat, schema_diff_view
14 | from ..utils import encode_metadata, schema_diff
15 | from .abc import Conversion, Converter, Registry
16 | from .numbers import DecimalMode
17 | from .strings import Category
18 |
19 | Config = dict[str, dict]
20 | """An (ordered) dict of converter class names and corresponding parameters."""
21 |
22 | Converters = Union[Config, Iterable[Converter], None]
23 | """Accepted argument type where converters are expected."""
24 |
25 | DEFAULT_CONVERTERS: Config = {
26 | "number": {"threshold": 0.95, "allow_unsigned_int": True, "decimal": DecimalMode.INFER},
27 | "boolean": {"threshold": 1.0},
28 | "list": {"threshold": 0.95, "threshold_urls": 0.8},
29 | "timestamp": {"threshold": 0.95},
30 | "text": {"threshold": 0.8, "min_unique": 0.1},
31 | "url": {"threshold": 0.8},
32 | "category": {"threshold": 0.0, "max_cardinality": None},
33 | }
34 |
35 |
36 | def ensure_converters(converters: Converters = None) -> list[Converter]:
37 | """Turn a type conversion config into a list of converter instances."""
38 | if converters is None:
39 | converters = DEFAULT_CONVERTERS.copy()
40 |
41 | if isinstance(converters, dict):
42 | return [Registry[name](**params) for name, params in converters.items()]
43 |
44 | if isinstance(converters, list) and converters and isinstance(converters[0], Converter):
45 | return converters
46 |
47 | raise ValueError(f"Object cannot be made into type converters: {converters}")
48 |
49 |
50 | @dataclass
51 | class CastStrategy(ABC):
52 | """Base class for autocasting implementations."""
53 |
54 | converters: Converters | None = None
55 | columns: list[str] | None = None
56 | log: bool = False
57 |
58 | def __post_init__(self):
59 | self.converters = ensure_converters(self.converters)
60 |
61 | @abstractmethod
62 | def cast_array(self, array: Array, name: str | None = None) -> Conversion:
63 | """Only need to override this."""
64 |
65 | def cast_table(self, table: Table) -> Table:
66 | """Takes care of updating fields, including metadata etc."""
67 | schema = table.schema
68 | columns = self.columns or table.column_names
69 |
70 | for name in tqdm(columns, desc="Autocasting", disable=not self.log):
71 | array = table.column(name)
72 | conv = self.cast_array(array, name=name)
73 |
74 | if conv is not None:
75 | result = conv.result
76 | meta = conv.meta or {}
77 | meta = encode_metadata(meta) if meta else None
78 | field = pa.field(name, type=result.type, metadata=meta)
79 | table = table.set_column(table.column_names.index(name), field, result)
80 |
81 | if self.log:
82 | diff = schema_diff(schema, table.schema)
83 | if diff:
84 | LOG.info(pformat(schema_diff_view(diff, title="Changed types")))
85 |
86 | return table
87 |
88 | def cast(self, data: Array | ChunkedArray | Table) -> Conversion | Table:
89 | """Shouldn't be necessary, but @singledispatchmethod doesn't work with inheritance."""
90 | if isinstance(data, (Array, ChunkedArray)):
91 | return self.cast_array(data)
92 |
93 | if isinstance(data, Table):
94 | return self.cast_table(data)
95 |
96 | raise ValueError(f"Can only cast arrays or tables, got {type(data)}!")
97 |
98 |
99 | @dataclass
100 | class Autocast(CastStrategy):
101 | """Simple cast trying each registered type in order.
102 |
103 |     As a performance optimization (with a large effect on execution time), each type is first
104 |     tested on a small sample, allowing fast rejection of non-matching types.
105 | """
106 |
107 | n_samples: int = 100
108 | fallback: Converter | None = field(
109 | default_factory=lambda: Category(threshold=0.0, max_cardinality=None)
110 | )
111 |
112 | def cast_array(self, array: Array | ChunkedArray, name: str | None = None) -> Conversion:
113 | name = name or ""
114 |
115 | if array.null_count == len(array):
116 | if self.fallback:
117 | LOG.info(f"Column '{name}' is all null, trying fallback {iformat(self.fallback)}")
118 | return self.fallback.convert(array)
119 |
120 | LOG.debug(f"Column '{name}' is all null, skipping.")
121 | return None
122 |
123 | for converter in self.converters:
124 | sample = array.drop_null().slice(length=self.n_samples)
125 | if (
126 | len(sample) > 0
127 | and converter.convert(sample)
128 | and (result := converter.convert(array))
129 | ):
130 | if self.log:
131 | LOG.debug(f'Converted column "{name}" with converter\n{iformat(converter)}')
132 | return result
133 |
134 |         if self.fallback and (pa.types.is_string(array.type) or pa.types.is_null(array.type)):
135 | LOG.debug(
136 | f"Got no matching converter for string column '{name}'. "
137 | f"Will try fallback {iformat(self.fallback)}."
138 | )
139 | return self.fallback.convert(array)
140 |
141 | return None
142 |
143 |
144 | @dataclass
145 | class Cast:
146 | """Tries a specific cast for each column."""
147 |
148 | converters: dict[str, Converter]
149 | log: bool = False
150 |
151 | def cast(self, table: Table) -> Table:
152 | schema = table.schema
153 |
154 |         for name, converter in tqdm(
155 |             self.converters.items(),
156 |             total=len(self.converters),
157 |             desc="Explicit casting",
158 |             disable=not self.log,
159 |         ):
160 | array = table.column(name)
161 | try:
162 | conv = converter.convert(array)
163 | except Exception:
164 | LOG.error(f"Couldn't convert column {name} with converter {iformat(converter)}!")
165 | raise
166 |
167 | if conv is not None:
168 | result = conv.result
169 | meta = conv.meta or {}
170 | meta = encode_metadata(meta) if meta else None
171 | field = pa.field(name, type=result.type, metadata=meta)
172 | idx = table.schema.get_field_index(name)
173 | table = table.set_column(idx, field, result)
174 | else:
175 | LOG.error(
176 |                     f"Conversion of column '{name}' with converter '{iformat(converter)}' failed!"
177 | )
178 | LOG.error(f"Original column ({array.type}):\n{array}")
179 |
180 | if self.log:
181 | diff = schema_diff(schema, table.schema)
182 | if diff:
183 | LOG.info(pformat(schema_diff_view(diff, title="Changed types")))
184 |
185 | return table
186 |
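187 | # Usage sketch (made-up columns): Autocast tries the default converters per column, while Cast
188 | # applies one explicitly chosen converter per column and logs failures.
189 | if __name__ == "__main__":
190 |     from .numbers import Number
191 |     from .timestamps import Timestamp
192 | 
193 |     tbl = pa.table({"id": ["1", "2", "3"], "day": ["2021-10-01", "2021-10-02", "2021-10-03"]})
194 |     print(Autocast().cast(tbl).schema)
195 |     print(Cast({"id": Number(), "day": Timestamp()}).cast(tbl).schema)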
--------------------------------------------------------------------------------
/lector/types/lists.py:
--------------------------------------------------------------------------------
1 | """List parsing and casting.
2 |
3 | Currently NOT supported in CSV strings:
4 |
5 | - floats with comma as the decimal delimiter (must be the period character)
6 | - floats with thousands separator
7 | """
8 | from __future__ import annotations
9 |
10 | from collections.abc import Iterable
11 | from contextlib import suppress
12 | from csv import reader as csvreader
13 | from dataclasses import dataclass
14 |
15 | import msgspec
16 | import pyarrow as pa
17 | import pyarrow.compute as pac
18 | import pyarrow.types as pat
19 | from pyarrow import Array, DataType
20 |
21 | from ..log import LOG
22 | from ..utils import ensure_type, min_max, proportion_trueish, smallest_int_type
23 | from .abc import Conversion, Converter, Registry
24 | from .regex import RE_LIST_CLEAN, RE_LIST_LIKE
25 | from .strings import proportion_url
26 |
27 | LIST_TYPES: tuple[DataType, ...] = (pa.int64(), pa.float64(), pa.timestamp(unit="ms"))
28 |
29 | JSON_DECODE = msgspec.json.Decoder(type=list).decode
30 |
31 | SAFE_CSV_PARSING = False
32 |
33 |
34 | def parse_csvs(strings: Iterable[str], safe=SAFE_CSV_PARSING, **kwds) -> Iterable[list]:
35 |     """Parse a list of strings as CSV, to separate them into individual fields.
36 |
37 | The non-safe option uses python's built-in reader. But it either raises on invalid
38 | rows, or silently returns fewer parsed rows than original rows, depending on the
39 | "strict" parameter. The safe option will always return the expected number of rows,
40 | with values being None where a string couldn't be parsed.
41 | """
42 | if safe:
43 | for s in strings:
44 | try:
45 | yield next(csvreader([s], **kwds))
46 | except Exception:
47 | yield None
48 | else:
49 | yield from csvreader(strings, **kwds)
50 |
51 |
52 | def parse_lists_csv(arr: Array, **kwds) -> Array:
53 |     """Parse strings as lines of CSV, to separate them into individual fields.
54 |
55 | Respects the separator being escaped when enclosed in (double) quotes etc.
56 | """
57 | content = pac.replace_substring_regex(arr, pattern=RE_LIST_CLEAN, replacement="")
58 | strings = (s.as_py() if s.is_valid else "" for s in content)
59 | lists = parse_csvs(strings, **kwds)
60 | lists = ([elem.strip("' ") for elem in l] if l is not None else l for l in lists)
61 | result = pa.array(lists)
62 | result = pac.if_else(arr.is_null(), pa.NA, result) # Restore original nulls
63 | return result
64 |
65 |
66 | def parse_json(s: str):
67 | """Parse a single string as json."""
68 | l = JSON_DECODE(s)
69 |
70 | if l and any(isinstance(x, (list, dict)) for x in l):
71 | l = [str(x) for x in l]
72 |
73 | return l
74 |
75 |
76 | def parse_lists_json(arr: Array) -> Array:
77 | """Parse strings as lists using the significantly faster msgspec."""
78 | parsed = (parse_json(s.as_py()) if s.is_valid else None for s in arr)
79 | return pa.array(parsed)
80 |
81 |
82 | def proportion_listlike(arr: Array) -> float:
83 | """Calculate proportion of non-null strings that could be lists."""
84 | valid = arr.drop_null()
85 | is_list = pac.match_substring_regex(valid, RE_LIST_LIKE)
86 | return proportion_trueish(is_list)
87 |
88 |
89 | def maybe_cast_lists(
90 | arr: Array,
91 | types: list[DataType | str] = LIST_TYPES,
92 | downcast: bool = True,
93 | ) -> Array | None:
94 | """Cast lists (of strings) to first valid type, if any."""
95 |
96 | for type in types:
97 | type = ensure_type(type)
98 |
99 | if arr.type == type:
100 | return arr
101 |
102 | with suppress(Exception):
103 | result = pac.cast(arr, pa.list_(type))
104 |
105 | if type == "int64" and downcast:
106 | vmin, vmax = min_max(pac.list_flatten(result))
107 | itype = smallest_int_type(vmin, vmax)
108 |
109 | if itype is not None:
110 | try:
111 | itype = ensure_type(itype)
112 | result = pac.cast(result, pa.list_(itype))
113 | except Exception as exc:
114 | LOG.error(exc)
115 | LOG.error("Will not downcast lists of int64.")
116 |
117 | return result
118 |
119 | return None
120 |
121 |
122 | def maybe_parse_lists(
123 | arr: Array,
124 | type: str | DataType | None = None,
125 | threshold: float = 1.0,
126 | quote_char: str = '"',
127 | delimiter: str = ",",
128 | ) -> Array | None:
129 |     """Parse strings into lists, optionally with an (inferrable) element type."""
130 | if proportion_listlike(arr.drop_null()) < threshold:
131 | return None
132 |
133 | try:
134 | result = parse_lists_json(arr)
135 | LOG.debug("[List] Was able to fast-parse as json")
136 | except Exception:
137 | try:
138 | result = parse_lists_csv(
139 | arr, skipinitialspace=True, quotechar=quote_char, delimiter=delimiter
140 | )
141 | except Exception as exc:
142 | LOG.error(f"Cannot parse lists as CSV: {exc}")
143 | return None
144 |
145 | if type is not None:
146 | return result.cast(pa.list_(ensure_type(type)))
147 |
148 | return maybe_cast_lists(result, types=LIST_TYPES) or result
149 |
150 |
151 | @dataclass
152 | @Registry.register
153 | class List(Converter):
154 | type: str | DataType | None = None
155 | infer_urls: bool = True
156 | threshold_urls: float = 1.0
157 | quote_char: str = '"'
158 | delimiter: str = ","
159 |
160 | def convert(self, array: Array) -> Conversion | None:
161 | result = None
162 |
163 | if pat.is_string(array.type):
164 | result = maybe_parse_lists(
165 | array,
166 | type=self.type,
167 | threshold=self.threshold,
168 | quote_char=self.quote_char,
169 | delimiter=self.delimiter,
170 | )
171 | elif pat.is_list(array.type):
172 | result = array
173 |
174 | if result is None:
175 | return None
176 |
177 | vtype = result.type.value_type
178 |
179 | if pat.is_string(vtype):
180 | if self.infer_urls and proportion_url(pac.list_flatten(result)) >= self.threshold_urls:
181 | semantic = "list[url]"
182 | else:
183 | semantic = "list[category]"
184 | elif pat.is_timestamp(vtype):
185 | semantic = "list[date]"
186 | elif pat.is_integer(vtype):
187 | semantic = f"list[number[{vtype}]]"
188 | else:
189 | if str(vtype) == "float":
190 | # pyarrow's "float" means float32, but pandas would interpret it as float64
191 | vtype = "float32"
192 |
193 | semantic = f"list[number[{vtype}]]"
194 |
195 | return Conversion(result, meta={"semantic": semantic})
196 |
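197 | # Usage sketch (made-up strings): JSON-style lists take the fast msgspec path, bracketed
198 | # CSV-style lists fall back to the csv-based parser; element types are then inferred.
199 | if __name__ == "__main__":
200 |     jsonish = pa.array(["[1, 2, 3]", "[4, 5]", None])
201 |     conv = List(threshold=1.0).convert(jsonish)
202 |     print(conv.result.type, conv.meta)  # e.g. list<uint8> and {'semantic': 'list[number[uint8]]'}
203 | 
204 |     csvish = pa.array(['[a, "b, c"]', "[d]"])
205 |     print(List(threshold=1.0).convert(csvish).result)  # [["a", "b, c"], ["d"]]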
--------------------------------------------------------------------------------
/lector/types/numbers.py:
--------------------------------------------------------------------------------
1 | """Helpers for parsing and downcasting numeric data.
2 |
3 | Note: Arrow uses Google's RE2 to implement regex functionality:
4 | https://github.com/google/re2/wiki/Syntax
5 |
6 | """
7 | from __future__ import annotations
8 |
9 | from collections import Counter
10 | from dataclasses import dataclass
11 | from enum import Enum
12 |
13 | import pyarrow as pa
14 | import pyarrow.compute as pac
15 | import pyarrow.types as pat
16 | from pyarrow import Array
17 |
18 | from ..log import LOG
19 | from ..utils import (
20 | dtype_name,
21 | empty_to_null,
22 | min_max,
23 | proportion_equal,
24 | smallest_int_type,
25 | )
26 | from .abc import Conversion, Converter, Registry
27 | from .regex import RE_IS_FLOAT, RE_IS_INT
28 |
29 | DECIMAL_SUPPORT_MIN = 0.2 # 20%
30 | DECIMAL_CONFIDENCE_MIN = 1.5 # 150%
31 |
32 |
33 | class DecimalMode(str, Enum):
34 | INFER = "INFER"
35 | COMPARE = "COMPARE"
36 |
37 |
38 | def clean_float_pattern(thousands: str = ",") -> str:
39 |     """Regex matching the characters in number strings that Arrow cannot parse (to be removed)."""
40 | if thousands == ",":
41 | # Match a "+" at the beginning and commas anywhere
42 | return r"^\+|,"
43 |
44 | # Match a "+" at the beginning and a period anywhere
45 | return r"^\+|\."
46 |
47 |
48 | def decimal_delimiter( # noqa: PLR0911, PLR0912
49 | s: str,
50 | n_chars_max: int = 20,
51 | ) -> str | None:
52 | """Infer decimal delimiter from string representation s of an input number.
53 |
54 | Returns None if not unambiguously inferrable.
55 | """
56 | n_commas = n_dots = n_delims = 0
57 | first_comma_idx = first_dot_idx = None
58 | n = len(s)
59 |
60 | for i, c in enumerate(s):
61 | if i > n_chars_max and n_delims == 0: # noqa: PLR2004
62 | return None # Early out for long strings that are unlikely to represent numbers
63 |
64 | if c in ".,":
65 | if i == 0 or (i == 1 and s[0] == "0"):
66 | return c # ".123" or "0.123": can only be decimal
67 |
68 | if i >= 4 and n_delims == 0: # noqa: PLR2004
69 | return c # First delim at 5th position: cannot be thousands (1234.00)
70 |
71 | if i + 3 >= n:
72 | return c # Less than 3 characters after delim: cannot be thousands (1.12)
73 |
74 | n_delims += 1
75 |
76 | if c == ".":
77 | n_dots += 1
78 | if first_dot_idx is None:
79 | first_dot_idx = i
80 | else:
81 | n_commas += 1
82 | if first_comma_idx is None:
83 | first_comma_idx = i
84 |
85 | if n_dots == 1 and n_commas == 0:
86 | return "."
87 | if n_dots > 0 and n_commas > 0:
88 | return "." if first_comma_idx < first_dot_idx else ","
89 | if n_commas == 1 and n_dots == 0:
90 | return ","
91 | if n_commas > 1:
92 | return "."
93 | if n_dots > 1:
94 | return ","
95 |
96 | return None
97 |
98 |
99 | def infer_decimal_delimiter(arr: Array) -> str | None:
100 | """Get most frequent decimal delimiter in array.
101 |
102 | If most frequent delimiter doesn't occur in sufficient proportion (support),
103 | or not significantly more often than other delimiters (confidence), returns
104 | None.
105 | """
106 | n = len(arr)
107 | counts = Counter(decimal_delimiter(s.as_py()) for s in arr)
108 | counts.update({".": 0, ",": 0})
109 | ranked = [d for d in counts.most_common(3) if d[0]]
110 |
111 | if all(delim[1] == 0 for delim in ranked):
112 | return None
113 |
114 | if ranked[1][1] > 0:
115 | # If ambiguous
116 |
117 | if (ranked[0][1] / n) < DECIMAL_SUPPORT_MIN:
118 |             # Most frequent delimiter should occur in at least 20% of rows (DECIMAL_SUPPORT_MIN)
119 | return None
120 |
121 | if (ranked[0][1] / ranked[1][1]) < DECIMAL_CONFIDENCE_MIN:
122 | # Most frequent delimiter should occur at least 50% more often than next delimiter
123 | return None
124 |
125 | return ranked[0][0]
126 |
127 |
128 | def clean_float_strings(arr: Array, decimal: str) -> tuple[Array, Array, float]:
129 | """Prepare an array of strings so that Arrow can cast the result to floats.
130 |
131 | Arrow allows exponential syntax and omission of 0s before and after the decimal point,
132 | i.e. the following are all valid string representations of floating point numbers:
133 | "-1e10", "1e10", "1e-2", "1.2e3", "-1.2e3", "1." ".12", "-1.", "-.1".
134 |
135 | Arrow doesn't allow prefix of a positive sign indicator, nor thousands separator, i.e.
136 | the following are not(!) valid:
137 | "+1e10", "+1.", "+.1", "123,456.0"
138 |
139 | We hence remove occurrences of both the thousands character and the positive sign
140 | before extracting the floating point part of strings using regex.
141 |
142 | Also see following for more regex parsing options:
143 | https://stackoverflow.com/questions/12643009/regular-expression-for-floating-point-numbers
144 |
145 | Note, we don't parse as float if there isn't a single value with decimals. If this is
146 | the case they should be integers really, and if they haven't been parsed as ints before,
147 |     that's because the values didn't fit into Arrow's largest integer type, in which case it
148 | isn't safe to parse as float, which Arrow would otherwise do unsafely(!) and silently.
149 | """
150 | thousands = "," if decimal == "." else "."
151 | pattern = clean_float_pattern(thousands)
152 | clean = pac.replace_substring_regex(arr, pattern=pattern, replacement="")
153 | if decimal == ",":
154 | clean = pac.replace_substring(clean, pattern=",", replacement=".", max_replacements=1)
155 |
156 | # Arrow doesn't recognize upper case exponential ("1.03481E-11")
157 | clean = pac.utf8_lower(clean)
158 | is_float = pac.match_substring_regex(clean, pattern=RE_IS_FLOAT)
159 |
160 | if is_float.null_count == len(is_float):
161 | prop_valid = 0.0
162 | else:
163 | prop_valid = pac.sum(is_float).as_py() / (len(arr) - arr.null_count)
164 |
165 | return clean, is_float, prop_valid
166 |
167 |
168 | def maybe_parse_ints(
169 | arr: Array,
170 | threshold: float = 1.0,
171 | allow_unsigned: bool = False,
172 | ) -> Array | None:
173 | """Use regex to extract castable ints.
174 |
175 | Arrow's internal casting from string to int doesn't allow for an
176 | initial positive sign character, so we have to handle that separately.
177 | """
178 | is_int = pac.match_substring_regex(arr, pattern=RE_IS_INT)
179 | if is_int.null_count == len(is_int):
180 | return None
181 |
182 | valid_prop = pac.sum(is_int).as_py() / (len(arr) - arr.null_count)
183 | if valid_prop < threshold:
184 | return None
185 |
186 | clean = pac.if_else(is_int, arr, None)
187 | clean = pac.replace_substring_regex(clean, r"^\+", "")
188 |
189 | try:
190 | return pac.cast(clean, pa.int64())
191 | except Exception:
192 | if allow_unsigned:
193 | try:
194 | return pac.cast(clean, pa.uint64())
195 | except Exception as exc:
196 | LOG.error(exc)
197 |
198 | return None
199 |
200 |
201 | def maybe_parse_floats(
202 | arr: Array,
203 | threshold: float = 0.5,
204 | decimal: str | DecimalMode = DecimalMode.INFER,
205 | ) -> Array | None:
206 | """Parse valid string representations of floating point numbers."""
207 | if decimal == DecimalMode.INFER:
208 | decimal = infer_decimal_delimiter(arr.drop_null())
209 | if decimal is None:
210 | return None
211 |
212 | if isinstance(decimal, str) and decimal in ".,":
213 | clean, is_float, prop_valid = clean_float_strings(arr, decimal=decimal)
214 | elif decimal == DecimalMode.COMPARE:
215 | result_dot = clean_float_strings(arr, decimal=".")
216 | result_comma = clean_float_strings(arr, decimal=",")
217 | if result_dot[2] >= result_comma[2]:
218 | clean, is_float, prop_valid = result_dot
219 | else:
220 | clean, is_float, prop_valid = result_comma
221 | else:
222 | raise ValueError(f"Must have decimal char or one of ['infer', 'compare']! Got '{decimal}'.")
223 |
224 | if prop_valid < threshold:
225 | return None
226 |
227 | valid = pac.if_else(is_float, clean, None) # non-floats -> null
228 | valid = empty_to_null(valid)
229 |
230 | try:
231 | return pac.cast(valid, pa.float64())
232 | except Exception as exc:
233 | LOG.error(exc)
234 |
235 | return None
236 |
237 |
238 | def maybe_truncate_floats(arr: Array, threshold: float = 1.0) -> Array | None:
239 | """Float to int conversion if sufficient values are kept unchanged."""
240 | trunc = pac.trunc(arr)
241 |
242 | if proportion_equal(arr, trunc) < threshold:
243 | return None
244 |
245 | try:
246 | if pac.min(arr).as_py() >= 0:
247 | return pac.cast(trunc, pa.uint64())
248 |
249 | return pac.cast(trunc, pa.int64())
250 | except pa.ArrowInvalid as exc:
251 | LOG.error("Failed to convert floats to ints: " + str(exc))
252 | return None
253 |
254 |
255 | def maybe_downcast_ints(arr: Array) -> Array | None:
256 | """Convert to smallest applicable int type."""
257 | vmin, vmax = min_max(arr, skip_nulls=True)
258 | if (vmin is None) or (vmax is None):
259 | return None
260 |
261 | type = smallest_int_type(vmin, vmax)
262 |
263 | if type is not None:
264 | return pac.cast(arr, type)
265 |
266 | return None
267 |
268 |
269 | @dataclass
270 | @Registry.register
271 | class Downcast(Converter):
272 | """Attempts truncation of floats to ints and then downcasting of ints."""
273 |
274 | def convert(self, array: Array) -> Conversion | None:
275 | if pat.is_floating(array.type):
276 | array = maybe_truncate_floats(array, self.threshold)
277 | if array is None:
278 | return None
279 |
280 | if pat.is_integer(array.type):
281 | result = maybe_downcast_ints(array)
282 | return Conversion(result) if result is not None else Conversion(array)
283 |
284 | return None
285 |
286 |
287 | @dataclass
288 | @Registry.register
289 | class Number(Converter):
290 | """Attempts to parse strings into floats or ints followed by downcasting."""
291 |
292 | decimal: str | DecimalMode = DecimalMode.INFER
293 | allow_unsigned_int: bool = True
294 | max_int: int | None = None
295 |
296 | def convert(self, array: Array) -> Conversion | None:
297 | if pat.is_string(array.type):
298 | converted = maybe_parse_ints(
299 | array,
300 | threshold=self.threshold,
301 | allow_unsigned=self.allow_unsigned_int,
302 | )
303 |
304 | if converted is None:
305 | converted = maybe_parse_floats(
306 | array,
307 | threshold=self.threshold,
308 | decimal=self.decimal,
309 | )
310 |
311 | if converted is not None:
312 | downcast = Downcast().convert(converted)
313 | converted = downcast if downcast is not None else Conversion(converted)
314 | else:
315 | converted = Downcast().convert(array)
316 |
317 | if converted is None:
318 | return None
319 |
320 | if (
321 | pat.is_integer(converted.result.type)
322 | and self.max_int is not None
323 | and (pac.max(converted.result).as_py() or 0) > self.max_int
324 | ):
325 | return None
326 |
327 | converted.meta = {"semantic": f"number[{dtype_name(converted.result)}]"}
328 | return converted
329 |
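330 | # Worked sketch (made-up values): the comma is inferred as the decimal delimiter, the thousands
331 | # separator and leading "+" are stripped, and the result is cast to float64 ("double").
332 | if __name__ == "__main__":
333 |     arr = pa.array(["1.234,56", "7,5", "+3,0"])
334 |     print(infer_decimal_delimiter(arr))  # ","
335 |     conv = Number(threshold=1.0).convert(arr)
336 |     print(conv.result, conv.meta)  # [1234.56, 7.5, 3] and {'semantic': 'number[double]'}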
--------------------------------------------------------------------------------
/lector/types/regex.py:
--------------------------------------------------------------------------------
1 | """Common regex patterns used in multiple modules."""
2 |
3 | RE_INT_SIGN = "^(?P<sign>[+-])?(?P<num>[0-9]+)$"
4 | """Capture optional sign and numeric parts in integer strings."""
5 |
6 | RE_IS_INT = r"^\+?\-?[0-9]+$"
7 | """Strings matching int representations we're able to parse."""
8 |
9 | RE_IS_FLOAT = "^[-]?[0-9]*[.]?[0-9]*(?:[e][+-]?[0-9]+)?$"
10 | """Strings matching float representations convertible by Arrow. Allows ints too,
11 | but those should have been inferred before trying floats.
12 | """
13 |
14 | RE_LIST_LIKE: str = r"^[\(\[\|][\s\S]*[\)\]\|]$"
15 | """Lists start and end with parenthesis-like characters."""
16 |
17 | RE_LIST_CLEAN: str = r"^[\[\{\(\|<]|[\]\}\)\|>]$|\r?\n"
18 | """Remove all parenthesis-like characters from start and end as well as line breaks."""
19 |
20 |
21 | RE_URL = (
22 | r"^(http://www\.|https://www\.|http://|https://)?" # http:// or https://
23 | r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|" # domain...
24 | r"localhost|" # localhost...
25 | r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})" # ...or ip
26 | r"(?::\d+)?" # optional port
27 | r"(?:/?|[/?]\S+)$"
28 | )
29 |
30 | RE_TRAILING_DECIMALS: str = r"\.(\d+)$"
31 | """Strictly trailing, i.e. nothing after the decimals."""
32 |
33 | RE_FRATIONAL_SECONDS: str = r"(?P<frac>\.\d+)"
34 | """Allows for timezone after fractional seconds, capturing part to be replaced."""
35 |
36 | RE_TZ_OFFSET: str = r"(?P<tz>Z|[+-]\d{2}:?\d{2}$)"
37 | """Detect "Z" or 4-digit timezone offsets, with or without colon."""
38 |
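39 | # Sanity sketch (made-up inputs) of how these patterns are applied with pyarrow.compute; the
40 | # group names are only descriptive, since callers access extracted groups by index.
41 | if __name__ == "__main__":
42 |     import pyarrow as pa
43 |     import pyarrow.compute as pac
44 | 
45 |     ts = pa.array(["2021-05-01T10:00:00+02:00", "2021-05-01T09:00:00Z"])
46 |     print(pac.extract_regex(ts, RE_TZ_OFFSET))  # extracts "+02:00" and "Z"
47 |     print(pac.match_substring_regex(pa.array(["[1, 2]", "plain text"]), RE_LIST_LIKE))  # [true, false]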
--------------------------------------------------------------------------------
/lector/types/strings.py:
--------------------------------------------------------------------------------
1 | """Helpers to convert to types that logically remain strings (e.g. categoricals).
2 |
3 | TODO:
4 |
5 | - Find a fast way to recognize whitespaces with regex (see is_text)
6 | - Try faster early out for text recognition using sufficient_texts()
7 | - Make sure Sex creates an ordinal column with correct Female > Male order
8 |
9 | """
10 | from __future__ import annotations
11 |
12 | from dataclasses import dataclass
13 | from enum import Enum
14 | from math import inf as INF
15 |
16 | import pyarrow.compute as pac
17 | import pyarrow.types as pat
18 | from pyarrow import Array
19 |
20 | from ..log import LOG
21 | from ..utils import Number, map_values, proportion_trueish, proportion_unique, sorted_value_counts
22 | from .abc import Conversion, Converter, Registry
23 | from .regex import RE_LIST_LIKE, RE_URL
24 |
25 | MAX_CARDINALITY: Number = 0.1
26 | """Maximum cardinality for categoricals (arrow's default is 50 in ConvertOptions)."""
27 |
28 | TEXT_MIN_SPACES: Number = 2
29 | """Strings need to have this many spaces to be considered text."""
30 |
31 | TEXT_MIN_LENGTH: Number = 15
32 | """Strings need to be this long to be considered text."""
33 |
34 | TEXT_REJECT_LISTS: bool = True
35 | """Whether to count list-like strings as texts."""
36 |
37 | TEXT_PROPORTION_THRESHOLD: float = 0.8
38 | """Infer text type if a proportion of values greater than this is text-like."""
39 |
40 |
41 | def is_text(
42 | arr: Array,
43 | min_spaces: int = TEXT_MIN_SPACES,
44 | min_length: int = TEXT_MIN_LENGTH,
45 | reject_lists: bool = TEXT_REJECT_LISTS,
46 | ) -> bool:
47 | """Check for natural language-like texts using criteria like lengths, number of spaces."""
48 | is_long = pac.greater_equal(pac.utf8_length(arr), min_length)
49 | # This regex seems to be very slow
50 | # has_spaces = pac.greater_equal(pac.count_substring_regex(arr, pattern=r"\s"), min_spaces) # noqa
51 | has_spaces = pac.greater_equal(pac.count_substring(arr, pattern=" "), min_spaces)
52 | textlike = pac.and_(is_long, has_spaces)
53 |
54 | if reject_lists:
55 | listlike = pac.match_substring_regex(arr, RE_LIST_LIKE)
56 | return pac.and_not(textlike, listlike)
57 |
58 | return textlike
59 |
60 |
61 | def proportion_text(
62 | arr: Array,
63 | min_spaces: int = TEXT_MIN_SPACES,
64 | min_length: int = TEXT_MIN_LENGTH,
65 | reject_lists: bool = TEXT_REJECT_LISTS,
66 | ) -> float:
67 | """Calculate proportion of natural language-like texts given criteria."""
68 | is_txt = is_text(arr.drop_null(), min_spaces, min_length, reject_lists)
69 | return proportion_trueish(is_txt)
70 |
71 |
72 | def sufficient_texts(
73 | arr: Array,
74 | min_spaces: int = TEXT_MIN_SPACES,
75 | min_length: int = TEXT_MIN_LENGTH,
76 | reject_lists: bool = TEXT_REJECT_LISTS,
77 | threshold: float = 1.0,
78 | ) -> bool:
79 | """Check for natural language-like texts using criteria like lengths, number of spaces.
80 |
81 | This is different from above in that for each text condition, we can early out if the
82 | condition is not met, without evaluating the remaining conditions. I.e., should be faster.
83 | """
84 | is_long = pac.greater_equal(pac.utf8_length(arr), min_length)
85 | if proportion_trueish(is_long) < threshold:
86 | return False
87 |
88 | # This regex seems to be very slow
89 | # has_spaces = pac.greater_equal(pac.count_substring_regex(arr, pattern=r"\s"), min_spaces) # noqa
90 | has_spaces = pac.greater_equal(pac.count_substring(arr, pattern=" "), min_spaces)
91 | if proportion_trueish(has_spaces) < threshold:
92 | return False
93 |
94 | if reject_lists:
95 | is_listlike = pac.match_substring_regex(arr, RE_LIST_LIKE)
96 | if proportion_trueish(is_listlike) > (1.0 - threshold):
97 | return False
98 |
99 | return True
100 |
101 |
102 | def proportion_url(arr: Array) -> float:
103 | """Use regex to find proportion of strings that are (web) URL-like."""
104 | is_url = pac.match_substring_regex(arr.drop_null(), RE_URL, ignore_case=True)
105 | return proportion_trueish(is_url)
106 |
107 |
108 | def maybe_cast_category(
109 | arr: Array,
110 | max_cardinality: Number | None = MAX_CARDINALITY,
111 | ) -> Array | None:
112 |     """Cast to categorical depending on (absolute or relative) cardinality."""
113 |
114 | if max_cardinality is None or max_cardinality == INF:
115 | return pac.dictionary_encode(arr)
116 |
117 | n_unique = pac.count_distinct(arr, mode="only_valid").as_py()
118 |
119 | if max_cardinality > 1:
120 | do_cast = n_unique <= max_cardinality
121 | elif max_cardinality > 0:
122 | n_valid = len(arr) - arr.null_count
123 | if n_valid == 0:
124 | do_cast = True
125 | else:
126 | do_cast = (n_unique / n_valid) <= max_cardinality
127 | else:
128 | do_cast = False
129 |
130 | if max_cardinality is None or do_cast:
131 | return pac.dictionary_encode(arr)
132 |
133 | return None
134 |
135 |
136 | @dataclass
137 | @Registry.register
138 | class Text(Converter):
139 | """Anything could be text, but we can enforce text-likeness and uniqueness."""
140 |
141 | min_unique: float = 0.1
142 |
143 | def convert(self, array: Array) -> Conversion | None:
144 | if not pat.is_string(array.type):
145 | return None
146 |
147 | if proportion_unique(array) >= self.min_unique and proportion_text(array) >= self.threshold:
148 | # if sufficient_texts(array, self.threshold):
149 | return Conversion(array, meta={"semantic": "text"})
150 |
151 | return None
152 |
153 |
154 | @dataclass
155 | @Registry.register
156 | class Url(Converter):
157 |     """Detects string columns that mostly contain (web) URLs and dictionary-encodes them."""
158 |
159 | def convert(self, array: Array) -> Conversion | None:
160 | if not pat.is_string(array.type):
161 | return None
162 |
163 | array = pac.utf8_trim_whitespace(array)
164 | if proportion_url(array) >= self.threshold:
165 | result = pac.dictionary_encode(array)
166 | return Conversion(result, meta={"semantic": "url"})
167 |
168 | return None
169 |
170 |
171 | @dataclass
172 | @Registry.register
173 | class Category(Converter):
174 |     """Dictionary-encodes string columns as categoricals, optionally limited by cardinality."""
175 |
176 | max_cardinality: Number | None = MAX_CARDINALITY
177 |
178 | def convert(self, array: Array) -> Conversion | None:
179 | if not pat.is_string(array.type):
180 | return None
181 |
182 | result = maybe_cast_category(array, self.max_cardinality)
183 | return Conversion(result, meta={"semantic": "category"}) if result is not None else None
184 |
185 |
186 | class Sex(Enum):
187 | Female = 0
188 | Male = 1
189 |
190 |
191 | class SexMapper:
192 | """Infer values encoding a person's sex in a column and map to configurable labels."""
193 |
194 | DEFAULT_VALUES = {
195 | Sex.Female: ["female", "f", "femenino", "mujer", "m"],
196 | Sex.Male: ["male", "m", "masculino", "hombre", "varón", "varon", "h", "v"],
197 | }
198 |
199 | def __init__(self, values: tuple[str, str], labels: dict[Sex, str] | None = None):
200 | self.labels = labels or {Sex.Female: "Female", Sex.Male: "Male"}
201 | self.infer_values(values)
202 | self.make_mapping()
203 |
204 |     def infer_values(self, values: tuple[str, str]) -> None:
205 |         """Infer which values encode female/male categories."""
206 |         self.values = self.DEFAULT_VALUES
207 | 
208 |         if len(values[0]) == 1 and len(values[1]) == 1 and "m" in values:
209 |             f_label, m_label = self.labels[Sex.Female], self.labels[Sex.Male]
210 | 
211 |             if "f" in values:
212 |                 # male/female or masculino/femenino
213 |                 self.values = {Sex.Female: ["f", f_label], Sex.Male: ["m", m_label]}
214 |             elif "v" in values:
215 |                 # mujer/varon
216 |                 self.values = {Sex.Female: ["m", f_label], Sex.Male: ["v", m_label]}
217 |             elif "h" in values:
218 |                 # mujer/hombre
219 |                 self.values = {Sex.Female: ["m", f_label], Sex.Male: ["h", m_label]}
220 |
221 |     def make_mapping(self) -> None:
222 | """Create a mapping from inferred values to desired labels."""
223 | ensure_list = lambda x: x if isinstance(x, list) else [x]
224 | self.map = {val: self.labels[sex] for sex in Sex for val in ensure_list(self.values[sex])}
225 |
226 |
227 | def maybe_sex(arr: Array) -> Array:
228 |     """Map the two most common values to canonical sex labels (if inferrable) and dictionary-encode."""
229 | top_n = 2
230 | lower = pac.utf8_lower(arr)
231 | top2 = sorted_value_counts(lower, top_n=top_n)
232 | values = top2.field("values").to_pylist()
233 |
234 | if len(values) == top_n:
235 | mapper = SexMapper(values)
236 | LOG.debug(f"Sex mapping: {mapper.map}")
237 | mapped = map_values(lower, mapper.map)
238 | return mapped.dictionary_encode()
239 |
240 | return arr
241 |
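242 | # Worked sketch (made-up data): max_cardinality acts as an absolute limit when > 1 and as a
243 | # fraction of valid (non-null) values when 0 < max_cardinality <= 1.
244 | if __name__ == "__main__":
245 |     import pyarrow as pa
246 | 
247 |     arr = pa.array(["a", "b", "a", "a", None])
248 |     print(maybe_cast_category(arr, max_cardinality=2))     # dictionary-encoded: 2 unique <= 2
249 |     print(maybe_cast_category(arr, max_cardinality=0.25))  # None: 2 unique / 4 valid > 0.25
250 |     print(Category(max_cardinality=None).convert(arr).meta)  # {'semantic': 'category'}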
--------------------------------------------------------------------------------
/lector/types/timestamps.py:
--------------------------------------------------------------------------------
1 | """Helpers to convert timestamp strings or time-like columns to timestamps.
2 |
3 | Arrow seems to be using this parser under the hood:
4 | https://pubs.opengroup.org/onlinepubs/009695399/functions/strptime.html
5 |
6 | in its compute.strptime function, which doesn't support timezone offsets via
7 | the %z or %Z directives. Though they do support timezones when importing CSVs
8 | or casting...
9 |
10 | For arrow internals relating to timestamps also see:
11 |
12 | - Timezone internals:
13 | https://arrow.apache.org/docs/cpp/api/datatype.html#_CPPv4N5arrow13TimestampTypeE
14 | - CSV parsing:
15 | https://arrow.apache.org/docs/cpp/csv.html#timestamp-inference-parsing
16 | - Timestamp umbrella issue:
17 | https://github.com/apache/arrow/issues/31324
18 |
19 | TODO:
20 | - Fractional seconds are handled manually, also see
21 | https://github.com/apache/arrow/issues/20146. They are first removed via regex,
22 | converted to a pyarrow duration type and later added to parsed timestamps.
23 | - Timezones are only supported in format "+0100", but not e.g. "+01:00"
24 | - What to do with mixed timezones:
25 | https://stackoverflow.com/questions/75656639/computing-date-features-using-pyarrow-on-mixed-timezone-data
26 |
27 | """
28 | from __future__ import annotations
29 |
30 | from dataclasses import dataclass
31 | from functools import lru_cache
32 | from typing import ClassVar
33 |
34 | import pyarrow as pa
35 | import pyarrow.compute as pac
36 | import pyarrow.types as pat
37 | from pyarrow import Array, TimestampArray, TimestampScalar, TimestampType
38 |
39 | from ..log import LOG
40 | from ..utils import proportion_trueish
41 | from .abc import Conversion, Converter, Registry
42 | from .regex import RE_FRATIONAL_SECONDS, RE_TZ_OFFSET
43 |
44 | TIMESTAMP_FORMATS: list[str] = [
45 | "%Y-%m-%dT%H:%M:%S",
46 | "%Y-%m-%dT%H:%M",
47 | "%Y-%m-%dT%I:%M:%S %p",
48 | "%Y-%m-%dT%I:%M %p",
49 | "%Y-%m-%d%n%H:%M:%S",
50 | "%Y-%m-%d%n%I:%M:%S %p",
51 | "%a %b %d %H:%M:%S %Y",
52 | "%a %b %d %I:%M:%S %p %Y",
53 | "%a %d %b %H:%M:%S %Y",
54 | "%a %d %b %I:%M:%S %p %Y",
55 | "%a, %b %d %H:%M:%S %Y",
56 | "%a, %b %d %I:%M:%S %p %Y",
57 | "%a, %d %b %H:%M:%S %Y",
58 | "%a, %d %b %I:%M:%S %p %Y",
59 | "%a %d %b %Y %H:%M:%S",
60 | "%a %d %b %Y %I:%M:%S %p",
61 | "%a, %d %b %Y %H:%M:%S",
62 | "%a, %d %b %Y %I:%M:%S %p",
63 | "%a %b %d %H:%M:%S %z %Y",
64 | ]
65 |
66 | DATE_FORMATS: list[str] = [
67 | "%d-%m-%y", # %y first since it will fail with 4 digit years,
68 | "%d/%m/%y", # while %Y will not fail on 2 digit years(!)
69 | "%Y-%m-%d",
70 | "%d-%m-%Y",
71 | "%Y/%m/%d",
72 | "%d/%m/%Y",
73 | "%m/%d/%Y",
74 | "%a %d %b %Y",
75 | "%a, %d %b %Y",
76 | ]
77 |
78 | ISO_FORMAT: str = "ISO8601()"
79 | """String Arrow recognizes as meaning the ISO format."""
80 |
81 | UNIT = "ns"
82 | """Note that pandas' internal unit is fixed to nanoseconds, and at that resolution it can
83 | only represent a much narrower range of dates."""
84 |
85 |
86 | def timestamp_formats(tz: bool = True) -> list[str]:
87 | formats = TIMESTAMP_FORMATS
88 | if tz:
89 | with_tz = lambda fmt: (fmt, fmt + " %z", fmt + " Z", fmt + " UTC")
90 | formats = [ext for fmt in formats for ext in with_tz(fmt)]
91 | formats.extend(DATE_FORMATS)
92 | return [ISO_FORMAT] + formats
93 |
94 |
95 | ALL_FORMATS: list[str] = timestamp_formats()
96 | """All formats tried by default if None is explicitly provided when converting."""
97 |
98 |
99 | def proportion_fractional_seconds(arr: Array) -> float:
100 | """Proportion of non-null dates in arr having fractional seconds."""
101 | valid = arr.drop_null()
102 | has_frac = pac.match_substring_regex(valid, RE_FRATIONAL_SECONDS)
103 | return proportion_trueish(has_frac)
104 |
105 |
106 | def fraction_as_duration(arr: Array) -> Array:
107 | """Convert an array (of strings) representing fractional seconds to duration type."""
108 |
109 | if pat.is_string(arr.type):
110 | arr = pac.cast(arr, pa.float64())
111 |
112 | if pat.is_floating(arr.type):
113 | # Assume values in [0,1]: convert to nanoseconds
114 | arr = pac.multiply(arr, 1e9)
115 | arr = pac.trunc(arr)
116 | arr = pac.cast(arr, pa.int64())
117 |
118 | return pac.cast(arr, pa.duration("ns"))
119 |
120 |
121 | def extract_timezone(timestamps: pa.Array):
122 | """Extract timezone from a list of string timestamps.
123 |
124 | Currently, the only supported format is +/-HH[:]MM, e.g. +0100.
125 |
126 | Also, returns None if there are multiple different offsets, after
127 | some basic cleaning. E.g. Z and +0000 are considered the same.
128 | """
129 | res = pac.extract_regex(timestamps, RE_TZ_OFFSET)
130 | res = res.drop_null()
131 |
132 | if not len(res):
133 | return None
134 |
135 | offsets = pac.struct_field(res, indices=0)
136 | offsets = pac.replace_substring(offsets, ":", "")
137 | offsets = pac.replace_substring(offsets, "Z", "+0000")
138 | offsets = offsets.unique()
139 |
140 | if len(offsets) > 1:
141 | return None
142 |
143 | offset = offsets[0].as_py()
144 | return f"{offset[:-2]}:{offset[-2:]}"
145 |
146 |
147 | @lru_cache(maxsize=128, typed=False)
148 | def find_format(ts: TimestampScalar) -> str | None:
149 | """Try to find the first format that can parse given date."""
150 | if pac.is_null(ts).as_py():
151 | return None
152 |
153 | for fmt in ALL_FORMATS:
154 | try:
155 | pac.strptime(ts, format=fmt, unit="s")
156 | return fmt
157 | except Exception: # noqa: S112
158 | continue
159 |
160 | return None
161 |
162 |
163 | def maybe_parse_known_timestamps(
164 | arr: Array,
165 | format: str,
166 | unit: str = UNIT,
167 | threshold: float = 1.0,
168 | ) -> Array | None:
169 | """Helper for parsing with known format and no fractional seconds."""
170 |
171 | if threshold == 1.0: # noqa: PLR2004
172 | try:
173 | return pac.strptime(arr, format=format, unit=unit)
174 | except Exception:
175 | return None
176 |
177 | valid_before = len(arr) - arr.null_count
178 | result = pac.strptime(arr, format=format, unit=unit, error_is_null=True)
179 | valid_after = len(result) - result.null_count
180 |
181 | if (valid_after / valid_before) < threshold:
182 | return None
183 |
184 | return result
185 |
186 |
187 | def maybe_parse_timestamps(
188 | arr: Array,
189 | format: str | None = None,
190 | unit: str = UNIT,
191 | threshold: float = 1.0,
192 | return_format: bool = False,
193 | ) -> Array | None:
194 | """Parse lists of strings as dates with format inference."""
195 | min_prop_frac_secs = 0.1
196 |
197 | if proportion_fractional_seconds(arr) > min_prop_frac_secs:
198 | frac = pac.extract_regex(arr, RE_FRATIONAL_SECONDS)
199 | frac = pac.struct_field(frac, indices=[0])
200 | frac = fraction_as_duration(frac)
201 | arr = pac.replace_substring_regex(arr, RE_FRATIONAL_SECONDS, "")
202 | else:
203 | frac = None
204 |
205 | if format is None:
206 | formats = ALL_FORMATS
207 | valid = arr.drop_null()
208 |
209 | if len(valid) > 0:
210 | first_date = valid[0]
211 | first_format = find_format(first_date)
212 | if first_format is not None:
213 | LOG.info(f"Found date format '{first_format}'")
214 | formats = ALL_FORMATS.copy()
215 | formats.remove(first_format)
216 | formats.insert(0, first_format)
217 |
218 | else:
219 | formats = [format]
220 |
221 | for fmt in formats:
222 | result = maybe_parse_known_timestamps(arr, format=fmt, unit=unit, threshold=threshold)
223 | if result is not None:
224 | if frac is not None:
225 | result = pac.add(result, frac)
226 | return (result, fmt) if return_format else result
227 |
228 | return None
229 |
230 |
231 | @dataclass
232 | @Registry.register
233 | class Timestamp(Converter):
234 | """Convert string or time/date-like arrays to timestamp type.
235 |
236 | Note: Arrow will always _parse_ either into UTC or timezone-naive
237 | timestamps, but never into specific timezones other than UTC
238 | by default. Also, internally all timestamps are represented as UTC.
239 | The timezone metadata is then used by other functions to correctly
240 | extract for example the local day of the week, time etc.
241 |
242 | Non-UTC timestamps can only be created by specifying the TimestampType
243 | explicitly, or using the assume_timezone function.
244 |
245 | When converting to pandas, the timezone is handled correctly.
246 |
247 | When input strings have no explicit timezone information, uses `tz`
248 | parameter to interpret them as local to that tz. If tz=None, keeps
249 | them as timezone-naive timestamps. If input strings do have explicit
250 | timezone information, will be represented internally as UTC (as always),
251 | and simply set the tz metadata so that component extraction etc. will
252 | use correctly localized moments in time.
253 |
254 | TZ-naive timestamps ["2013-07-17 05:00", "2013-07-17 02:00"]:
255 |
256 | - assume_timezone(NY): interprets input timestamps as local to tz,
257 | converts and stores them as UTC, and keeps tz metadata for
258 | correct localization when printing/extracting components. I.e.,
259 | will convert to [2013-07-17 09:00:00, 2013-07-17 06:00:00] UTC,
260 | but when needed, will localize on demand to
261 | [2013-07-17 05:00:00-04:00 2013-07-17 02:00:00-04:00].
262 |
263 | - cast with timezone(NY): interprets input timestamps as local to UTC,
264 | and stores the tz as metadata for on-demand localization. I.e.,
265 | timestamps will be [2013-07-17 05:00:00, 2013-07-17 02:00:00] UTC,
266 | and when needed will localize on demand to
267 | [2013-07-17 01:00:00-04:00 2013-07-16 22:00:00-04:00].
268 |
269 | TZ-aware timestamps ["2013-07-17 05:00", "2013-07-17 02:00"] UTC:
270 |
271 | - cast with timezone(NY): since input timestamps internally are already
272 | always in UTC, keeps them as UTC ["2013-07-17 05:00", "2013-07-17 02:00"],
273 | but localizes to cast tz on demand, i.e. to
274 | [2013-07-17 01:00:00-04:00 2013-07-16 22:00:00-04:00].
275 | """
276 |
277 | format: str | None = None
278 | """When None, default formats are tried in order."""
279 | unit: str = UNIT
280 | """Resolution the timestamps are stored with internally."""
281 | tz: str | None = None
282 | """The desired timezone of the timestamps."""
283 | convert_temporal: bool = True
284 | """Whether time/date-only arrays should be converted to timestamps."""
285 |
286 | DEFAULT_TZ: ClassVar[str] = "UTC"
287 |
288 | @staticmethod
289 | def meta(dt: TimestampType) -> dict[str, str]:
290 | tz = f", {dt.tz}" if dt.tz is not None else ""
291 | return {"semantic": f"date[{dt.unit}{tz}]"}
292 |
293 | @staticmethod
294 | def to_timezone(array: TimestampArray, tz: str | None) -> TimestampArray:
295 | if tz is not None:
296 | if array.type.tz is None:
297 | # Interpret as local moments in given timezone to convert to UTC equivalent
298 | return pac.assume_timezone(
299 | array, timezone=tz, ambiguous="earliest", nonexistent="earliest"
300 | )
301 |
302 | # Keep UTC internally, simply change what local time is assumed in temporal functions
303 | return array.cast(pa.timestamp(unit=array.type.unit, tz=tz))
304 |
305 | if array.type.tz is not None:
306 | # Make local timezone-naive. Careful: the following will make the timestamps
307 | # naive, but with local time in UTC, not using the existing timezone metadata!
308 | # return array.cast(pa.timestamp(unit=array.type.unit, tz=None)) # noqa: ERA001
309 | raise NotImplementedError("Pyarrow's to_local() will not be implemented until v12.0!")
310 |
311 | # Keep as timezone-naive timestamps
312 | return array
313 |
314 | def convert_date_time(self, array: Array) -> Conversion | None:
315 | try:
316 | result = array.cast(pa.timestamp(unit=self.unit), safe=False)
317 | result = self.to_timezone(result, self.tz or self.DEFAULT_TZ)
318 | return Conversion(result, self.meta(result.type))
319 | except pa.ArrowNotImplementedError:
320 | LOG.error(f"Pyarrow cannot convert {array.type} to timestamp!")
321 | return None
322 |
323 | def convert_timestamp(self, array: Array) -> Conversion | None:
324 | result = array
325 | if array.type.unit != self.unit:
326 | result = array.cast(pa.timestamp(unit=self.unit), safe=False)
327 |
328 | result = self.to_timezone(result, self.tz or self.DEFAULT_TZ)
329 | return Conversion(result, self.meta(result.type))
330 |
331 | def convert_strings(self, array: Array) -> Conversion | None:
332 | try:
333 |             # Pyarrow's strptime behaves differently from its internal cast and inference. Only the
334 |             # latter support timezone offsets. So try cast first, and then strptime-based conversion.
335 | result = array.cast(pa.timestamp(unit=self.unit))
336 | except pa.ArrowInvalid:
337 | try:
338 | result = array.cast(pa.timestamp(unit=self.unit, tz="UTC"))
339 | except pa.ArrowInvalid:
340 | result = None
341 |
342 | if result is not None:
343 | tz = self.tz or extract_timezone(array)
344 | result = self.to_timezone(result, tz or self.DEFAULT_TZ)
345 | return Conversion(result, self.meta(result.type) | {"format": "arrow"})
346 |
347 | result = maybe_parse_timestamps(
348 | array,
349 | format=self.format,
350 | unit=self.unit,
351 | threshold=self.threshold,
352 | return_format=True,
353 | )
354 |
355 | if result is not None:
356 | result, format = result
357 | result = self.to_timezone(result, self.tz or self.DEFAULT_TZ)
358 | return Conversion(result, self.meta(result.type) | {"format": format})
359 |
360 | return None
361 |
362 | def convert(self, array: Array) -> Conversion | None:
363 | if (pat.is_time(array.type) or pat.is_date(array.type)) and self.convert_temporal:
364 | return self.convert_date_time(array)
365 |
366 | if pat.is_timestamp(array.type):
367 | return self.convert_timestamp(array)
368 |
369 | return self.convert_strings(array) if pat.is_string(array.type) else None
370 |
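371 | # Usage sketch (made-up timestamps): strings without an explicit offset are parsed by Arrow's
372 | # cast and then localized via `tz`, as described in the Timestamp docstring above.
373 | if __name__ == "__main__":
374 |     arr = pa.array(["2013-07-17 05:00", "2013-07-17 02:00"])
375 |     conv = Timestamp(tz="America/New_York").convert(arr)
376 |     print(conv.result.type)  # timestamp[ns, tz=America/New_York]
377 |     print(conv.meta)         # e.g. {'semantic': 'date[ns, America/New_York]', 'format': 'arrow'}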
--------------------------------------------------------------------------------
/lector/utils.py:
--------------------------------------------------------------------------------
1 | """Common helpers to work with pyarrow objects."""
2 |
3 | from __future__ import annotations
4 |
5 | import json
6 | from collections import namedtuple
7 | from collections.abc import Callable, Iterator, Sequence
8 | from contextlib import contextmanager
9 | from functools import singledispatch
10 | from time import perf_counter
11 | from typing import Any, Union
12 |
13 | import pyarrow as pa
14 | from pyarrow import (
15 | Array,
16 | ChunkedArray,
17 | DataType,
18 | Schema,
19 | Table,
20 | type_for_alias, # noqa: F401
21 | )
22 | from pyarrow import compute as pac
23 | from pyarrow import types as pat
24 | from pyarrow.lib import ensure_type # noqa: F401
25 |
26 | try:
27 | import pandas as pd
28 |
29 | PANDAS_INSTALLED = True
30 | except Exception:
31 | PANDAS_INSTALLED = False
32 |
33 |
34 | Number = Union[int, float]
35 |
36 | Limit = namedtuple("Limit", "min,max")
37 |
38 | INT_LIMITS: dict[str, Limit] = {
39 | "int8": Limit(-128, 127),
40 | "int16": Limit(-32_768, 32_767),
41 | "int32": Limit(-2_147_483_648, 2_147_483_647),
42 | "int64": Limit(-9_223_372_036_854_775_808, 9_223_372_036_854_775_807),
43 | "uint8": Limit(0, 255),
44 | "uint16": Limit(0, 65_535),
45 | "uint32": Limit(0, 4_294_967_295),
46 | "uint64": Limit(0, 18_446_744_073_709_551_615),
47 | }
48 | """Minimum and maximum for each integer subtype."""
49 |
50 | MISSING_STRINGS: set[str] = {
51 | "#N/A",
52 | "#N/A N/A",
53 | "#NA",
54 | "-1.#IND",
55 | "-1.#INF",
56 | "-1.#QNAN",
57 | "1.#IND",
58 | "1.#INF",
59 | "1.#INF000000",
60 | "1.#QNAN",
61 | "-NaN",
62 | "-nan",
63 |     "<NA>",
64 | "N/A",
65 | "n/a",
66 | "NA",
67 | "NAN",
68 | "NaN",
69 | "nan",
70 | "NULL",
71 | "Null",
72 | "null",
73 | # Would expect this to happen automatically, but not the case
74 | # (at least when Arrow reads CSV with types="string")
75 | "",
76 | }
77 | """Extension of pandas and arrow default missing values."""
78 |
79 |
80 | @contextmanager
81 | def reset_buffer(buffer):
82 | """Caches and resets buffer position."""
83 | cursor = buffer.tell()
84 | yield
85 | buffer.seek(cursor)
86 |
87 |
88 | def smallest_int_type(vmin: Number, vmax: Number) -> str | None:
89 | """Find the smallest int type able to hold vmin and vmax."""
90 |
91 | if vmin >= 0:
92 | types = ["uint8", "uint16", "uint32"]
93 | else:
94 | types = ["int8", "int16", "int32"]
95 |
96 | for type in types:
97 | limits = INT_LIMITS[type]
98 | if vmin >= limits.min and vmax <= limits.max:
99 | return type
100 |
101 | return None
102 |
103 |
104 | def dtype_name(arr: Array):
105 | """Return a pandas-compatible type name including extension types where possible."""
106 | type = arr.type
107 | name = str(type)
108 |
109 | if pat.is_integer(type) and arr.null_count > 0:
110 | name = name.replace("i", "I").replace("u", "U")
111 |
112 | return name
113 |
114 |
115 | def min_max(arr: Array, skip_nulls: bool = True) -> tuple[Number, Number]:
116 | """Wrapper to get minimum and maximum in arrow array as python tuple."""
117 | mm = pac.min_max(arr, skip_nulls=skip_nulls).as_py()
118 | return mm["min"], mm["max"]
119 |
120 |
121 | def proportion_valid(arr: Array) -> float:
122 | """Proportion of non-null values in array."""
123 | size = len(arr)
124 | return (size - arr.null_count) / size
125 |
126 |
127 | def proportion_unique(arr: Array) -> float:
128 | """Proportion of non-null values that are unique in array."""
129 | n_valid = len(arr) - arr.null_count
130 |
131 | if n_valid == 0:
132 | return 0
133 |
134 | n_unique = pac.count_distinct(arr, mode="only_valid").as_py()
135 | return n_unique / n_valid
136 |
137 |
138 | def proportion_trueish(arr: Array) -> float:
139 | if len(arr) == 0:
140 | # Still means we had no trueish values
141 | return 0
142 |
143 | n_trueish = pac.sum(arr).as_py() or 0 # may return None otherwise, which we consider falsish
144 | return n_trueish / len(arr)
145 |
146 |
147 | def proportion_equal(arr1: Array, arr2: Array, ignore_nulls=True) -> float:
148 |     """Proportion of equal values, optionally ignoring nulls (which otherwise compare falsish)."""
149 | equal = pac.equal(arr1, arr2)
150 | if ignore_nulls:
151 | equal = equal.drop_null()
152 |
153 | return proportion_trueish(equal)
154 |
155 |
156 | def empty_to_null(arr: Array) -> Array:
157 | """Convert empty strings to null values."""
158 | is_empty = pac.equal(arr, "")
159 | return pac.if_else(is_empty, None, arr)
160 |
161 |
162 | def sorted_value_counts(arr: Array, order: str = "descending", top_n: int | None = None) -> Array:
163 | """Arrow's built-in value count doesn't allow sorting."""
164 | valcnt = arr.value_counts()
165 | counts = valcnt.field("counts")
166 |     indices = pac.array_sort_indices(counts, order=order)
167 |     if top_n is None:
168 |         return valcnt.take(indices)
169 | 
170 |     return valcnt.take(indices[:top_n])
171 |
172 |
173 | def map_values(arr: Array, map: dict, unknown: str = "keep") -> Array:
174 | """Slow value mapping in pure Python while Arrow doesn't have a native compute function.
175 |
176 | For now assumes type can be left unchanged.
177 | """
178 | values = arr.to_pylist()
179 |
180 | if unknown == "keep":
181 | values = [map.get(val, val) for val in values]
182 | else:
183 | values = [map.get(val) for val in values]
184 |
185 | return pa.array(values, type=arr.type)
186 |
187 |
188 | def categories(array: Array | ChunkedArray) -> Array:
189 | """Returns an array containing categories in input array of dictionary type."""
190 |
191 | if not pat.is_dictionary(array.type):
192 | raise TypeError("Must have an array with dictionary type!")
193 |
194 | if isinstance(array, ChunkedArray):
195 | array = array.unify_dictionaries()
196 | return array.chunk(0).dictionary
197 |
198 | return array.dictionary
199 |
200 |
201 | def is_stringy(type: DataType) -> bool:
202 | """Check if array is stringy (string or dictionary of strings)."""
203 | if pat.is_string(type):
204 | return True
205 |
206 | return pat.is_dictionary(type) and pat.is_string(type.value_type)
207 |
208 |
209 | def with_flatten(arr: Array, func: Callable):
210 | """Apply a compute function to all elements of flattened (and restored) lists."""
211 | isna = pac.is_null(arr)
212 | flat = pac.list_flatten(arr)
213 | transformed = func(flat)
214 | nested = pa.ListArray.from_arrays(arr.offsets, transformed)
215 | return pac.if_else(isna, None, nested)
216 |
217 |
218 | def schema_diff(s1: Schema, s2: Schema) -> dict[str, tuple[DataType, DataType]]:
219 | """Check differences in schema's column types."""
220 | diff = {}
221 |
222 | for field in s1:
223 | other = s2.field(field.name)
224 | if field.type != other.type:
225 | diff[field.name] = (field.type, other.type)
226 |
227 | return diff
228 |
229 |
230 | def encode_metadata(d: dict):
231 | """Json-byte-encode a dict, like Arrow expects its metadata."""
232 | return {k.encode("utf-8"): json.dumps(v).encode("utf-8") for k, v in d.items()}
233 |
234 |
235 | def maybe_load_json(s: str) -> Any:
236 | """Try to load a string as json, returning the original string if it fails."""
237 | try:
238 | return json.loads(s)
239 | except (json.JSONDecodeError, TypeError):
240 | return s
241 |
242 |
243 | def decode_metadata(d: dict):
244 | """Decode Arrow metadata to dict."""
245 | return {k.decode("utf-8"): maybe_load_json(v.decode("utf-8")) for k, v in d.items()}
246 |
247 |
248 | class Timer:
249 | def __enter__(self):
250 | self.start = perf_counter()
251 | return self
252 |
253 | def __exit__(self, type, value, traceback):
254 | self.end = perf_counter()
255 | self.elapsed = self.end - self.start
256 |
257 |
258 | if PANDAS_INSTALLED:
259 |     # Arrow currently doesn't have any way to map its integer types to pandas
260 |     # extension dtypes depending on whether a column has missing values or not.
261 |
262 | @singledispatch
263 | def to_pandas(array: Array):
264 | """Proper conversion allowing pandas extension types."""
265 |
266 | atype = array.type
267 |
268 | if pat.is_string(atype):
269 | return array.to_pandas().astype("string")
270 |
271 | if pat.is_boolean(atype):
272 | return array.to_pandas().astype("boolean")
273 |
274 | if pat.is_integer(atype) and array.null_count > 0:
275 | dtype_name = str(atype).replace("i", "I").replace("u", "U")
276 | return array.to_pandas(integer_object_nulls=True).astype(dtype=dtype_name)
277 |
278 | return array.to_pandas()
279 |
280 | @to_pandas.register
281 | def _(table: Table):
282 | columns = [to_pandas(array) for array in table]
283 | df = pd.concat(columns, axis=1)
284 | df.columns = table.column_names
285 | return df
286 |
287 |
288 | def uniquify(items: Sequence[str]) -> Iterator[str]:
289 |     """Add suffixes to input strings where necessary to ensure each item is unique."""
290 | seen = set()
291 |
292 | for item in items:
293 | newitem = item
294 |
295 | suffix = 0
296 | while newitem in seen:
297 | suffix += 1
298 | newitem = f"{item}_{suffix}"
299 |
300 | seen.add(newitem)
301 | yield newitem
302 |
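A quick illustration of `uniquify` (a sketch only; the expected output follows directly from the generator above, where the suffix counter restarts for each input item):

    >>> list(uniquify(["a", "b", "a", "a"]))
    ['a', 'b', 'a_1', 'a_2']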
--------------------------------------------------------------------------------
/notebooks/example.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 22,
6 | "id": "58eb08f3",
7 | "metadata": {
8 | "ExecuteTime": {
9 | "end_time": "2022-12-12T16:38:57.788998Z",
10 | "start_time": "2022-12-12T16:38:57.782691Z"
11 | }
12 | },
13 | "outputs": [],
14 | "source": [
15 | "import lector\n",
16 | "import pandas as pd"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 23,
22 | "id": "5230ccf5",
23 | "metadata": {
24 | "ExecuteTime": {
25 | "end_time": "2022-12-12T16:38:59.183049Z",
26 | "start_time": "2022-12-12T16:38:59.177918Z"
27 | }
28 | },
29 | "outputs": [],
30 | "source": [
31 | "csv = \"\"\"\n",
32 | "Some preamble content here\n",
33 | "This is still \"part of the metadata preamble\"\n",
34 | "id;genre;metric;count;content;website;tags;vecs;date\n",
35 | "1234982348728374;a;0.1;1;; http://www.graphext.com;\"[a,b,c]\";\"[1.3, 1.4, 1.67]\";11/10/2022\n",
36 | ";b;0.12;;\"Natural language text is different from categorical data.\"; https://www.twitter.com;[d];\"[0, 1.9423]\";01/10/2022\n",
37 | "9007199254740993;a;3.14;3;\"The Project · Gutenberg » EBook « of Die Fürstin.\";http://www.google.com;\"['e', 'f']\";[\"84.234, 12509.99\"];13/10/2021\n",
38 | "\"\"\".encode(\"ISO-8859-1\")\n",
39 | "\n",
40 | "with open(\"example.csv\", \"wb\") as fp:\n",
41 | " fp.write(csv)"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": 24,
47 | "id": "91850fdc",
48 | "metadata": {
49 | "ExecuteTime": {
50 | "end_time": "2022-12-12T16:39:01.422026Z",
51 | "start_time": "2022-12-12T16:39:01.366891Z"
52 | }
53 | },
54 | "outputs": [
55 | {
56 | "data": {
57 | "text/html": [
58 | "\n",
59 | "\n"
60 | ],
61 | "text/plain": [
62 | "\n"
63 | ]
64 | },
65 | "metadata": {},
66 | "output_type": "display_data"
67 | },
68 | {
69 | "name": "stdout",
70 | "output_type": "stream",
71 | "text": [
72 | " id genre metric count \\\n",
73 | "0 1234982348728374 a 0.10 1 \n",
74 | "1 b 0.12 \n",
75 | "2 9007199254740993 a 3.14 3 \n",
76 | "\n",
77 | " content website \\\n",
78 | "0 http://www.graphext.com \n",
79 | "1 Natural language text is different from catego... https://www.twitter.com \n",
80 | "2 The Project · Gutenberg » EBook « of Die Fürstin. http://www.google.com \n",
81 | "\n",
82 | " tags vecs date \n",
83 | "0 [a, b, c] [1.3, 1.4, 1.67] 2022-10-11 \n",
84 | "1 [d] [0.0, 1.9423] 2022-10-01 \n",
85 | "2 [e, f] [84.234, 12509.99] 2021-10-13 \n",
86 | "\n",
87 | " id Int64\n",
88 | "genre category\n",
89 | "metric float64\n",
90 | "count UInt8\n",
91 | "content string\n",
92 | "website category\n",
93 | "tags object\n",
94 | "vecs object\n",
95 | "date datetime64[ns]\n",
96 | "dtype: object\n"
97 | ]
98 | }
99 | ],
100 | "source": [
101 | "df = lector.read_csv(\"example.csv\", to_pandas=True)\n",
102 | "print(df)\n",
103 | "print(\"\\n\", df.dtypes)"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": 26,
109 | "id": "b8c6261e",
110 | "metadata": {
111 | "ExecuteTime": {
112 | "end_time": "2022-12-12T16:39:16.280641Z",
113 | "start_time": "2022-12-12T16:39:16.262108Z"
114 | }
115 | },
116 | "outputs": [
117 | {
118 | "name": "stdout",
119 | "output_type": "stream",
120 | "text": [
121 | " id genre metric count \\\n",
122 | "0 1234982348728374 a 0.10 1 \n",
123 | "1 b 0.12 \n",
124 | "2 9007199254740992 a 3.14 3 \n",
125 | "\n",
126 | " content \\\n",
127 | "0 \n",
128 | "1 Natural language text is different from catego... \n",
129 | "2 The Project · Gutenberg » EBook « of Die Fürstin. \n",
130 | "\n",
131 | " website tags vecs date \n",
132 | "0 http://www.graphext.com [a,b,c] [1.3, 1.4, 1.67] 2022-11-10 \n",
133 | "1 https://www.twitter.com [d] [0, 1.9423] 2022-01-10 \n",
134 | "2 http://www.google.com ['e', 'f'] [\"84.234, 12509.99\"] 2021-10-13 \n",
135 | "\n",
136 | " id Int64\n",
137 | "genre category\n",
138 | "metric float64\n",
139 | "count UInt8\n",
140 | "content string\n",
141 | "website category\n",
142 | "tags object\n",
143 | "vecs object\n",
144 | "date datetime64[ns]\n",
145 | "dtype: object\n"
146 | ]
147 | },
148 | {
149 | "name": "stderr",
150 | "output_type": "stream",
151 | "text": [
152 | "/var/folders/b2/mplwbs6j15976g80v9bw7c4r0000gn/T/ipykernel_80150/366501397.py:14: UserWarning: Parsing dates in DD/MM/YYYY format when dayfirst=False (the default) was specified. This may lead to inconsistently parsed dates! Specify a format to ensure consistent parsing.\n",
153 | " df = pd.read_csv(\n"
154 | ]
155 | }
156 | ],
157 | "source": [
158 | "import pandas as pd\n",
159 | "\n",
160 | "dtypes = {\n",
161 | " \"id\": \"Int64\",\n",
162 | " \"genre\": \"category\",\n",
163 | " \"metric\": \"float\",\n",
164 | " \"count\": \"UInt8\", \n",
165 | " \"content\": \"string\",\n",
166 | " \"website\": \"category\",\n",
167 | " \"tags\": \"object\",\n",
168 | " \"vecs\": \"object\"\n",
169 | "}\n",
170 | "\n",
171 | "df = pd.read_csv(\n",
172 | " \"example.csv\",\n",
173 | " encoding=\"ISO-8859-1\",\n",
174 | " skiprows=3,\n",
175 | " sep=\";\",\n",
176 | " dtype=dtypes,\n",
177 | " parse_dates=[\"date\"],\n",
178 | " infer_datetime_format=True\n",
179 | ")\n",
180 | "\n",
181 | "print(df)\n",
182 | "print(\"\\n\", df.dtypes)"
183 | ]
184 | }
185 | ],
186 | "metadata": {
187 | "kernelspec": {
188 | "display_name": "lector",
189 | "language": "python",
190 | "name": "lector"
191 | },
192 | "language_info": {
193 | "codemirror_mode": {
194 | "name": "ipython",
195 | "version": 3
196 | },
197 | "file_extension": ".py",
198 | "mimetype": "text/x-python",
199 | "name": "python",
200 | "nbconvert_exporter": "python",
201 | "pygments_lexer": "ipython3",
202 | "version": "3.10.8"
203 | }
204 | },
205 | "nbformat": 4,
206 | "nbformat_minor": 5
207 | }
208 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools", "wheel"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [tool.coverage.run]
6 | omit = [
7 | "*test/*",
8 | ]
9 |
10 | [tool.ruff]
11 | target-version = "py39"
12 | line-length = 100
13 |
14 | src = ["libs/*"]
15 |
16 | select = [
17 | "E", # Pycodestyle errors
18 | "W", # Pycodestyle warnings
19 | "F", # Pyflakes
20 | "UP", # Pyupgrade
21 | "S", # Bandit (security),
22 | "B", # Bugbear
23 | "C4", # Comprehensions
24 | "ISC", # Implicit string concat
25 | "ICN", # Import conventions
26 | "I001", # isort
27 | "Q", # Quotes
28 | "SIM", # Simplify
29 | "ERA", # eradicate (commented out code)
30 | "PIE",
31 | "PD", # Pandas
32 | "PLC",
33 | "PLE",
34 | "PLR",
35 | "PLW", # Pylint extensions
36 | "RET", # Return statements
37 | ]
38 |
39 | # Never enforce these
40 | ignore = [
41 | "E731", # Do not assign a lambda expression, use a def
42 | "E741", # Ambiguous variable names, e.g. l/1, O/0. Just use appropriate font.
43 | "D", # All docstrings for now.
44 | "S101", # Don't warn for asserts in pytest
45 | "S105", # HardcodedPasswordString
46 | "S106", # HardcodedPasswordFuncArg
47 | "S107", # HardcodedPasswordDefault
48 | "PD901", # df as variable name
49 | "PD011", # Use .to_numpy() instead of .values
50 | "PD015", # Use .merge instead of pd.merge
51 | "PLR0913", # Too many arguments (in function signatures)
52 | "RET501", # explicit `return None` if only possible return value
53 | "RET504", # Unnecessary variable assignment before `return` statement
54 | "SIM108", # Ternary instead of if/else (doesn't take into account line length)
55 | ]
56 |
57 | # Never try to fix these
58 | unfixable = [
59 | "ERA001", # Don't automatically delete commented out code
60 | ]
61 |
62 | exclude = [
63 | ".bzr",
64 | ".direnv",
65 | ".eggs",
66 | ".git",
67 | ".hg",
68 | ".mypy_cache",
69 | ".nox",
70 | ".pants.d",
71 | ".ruff_cache",
72 | ".svn",
73 | ".tox",
74 | ".venv",
75 | "__pypackages__",
76 | "_build",
77 | "buck-out",
78 | "build",
79 | "dist",
80 | "node_modules",
81 | "venv",
82 | ]
83 |
84 | # Allow unused variables when underscore-prefixed.
85 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
86 |
87 | [tool.ruff.per-file-ignores]
88 | # "__init__.py" = ["E402"]
89 | # "test/" = ["S101"] # Don't warn for asserts in pytest, doesn't work for paths
90 |
91 | [tool.ruff.flake8-annotations]
92 | mypy-init-return = true
93 |
94 | [tool.ruff.flake8-quotes]
95 | docstring-quotes = "double"
96 |
97 | [tool.ruff.flake8-import-conventions]
98 | [tool.ruff.flake8-import-conventions.extend-aliases]
99 | "dask.dataframe" = "dd"
100 | "pyarrow" = "pa"
101 |
102 | [tool.ruff.mccabe]
103 | # Unlike Flake8, default to a complexity level of 10.
104 | max-complexity = 10
105 |
106 | [tool.ruff.isort]
107 | known-first-party = ["lector"]
108 |
109 | [tool.ruff.flake8-bugbear]
110 | extend-immutable-calls = ["typer.Option"]
111 |
112 | [tool.ruff.format]
113 | quote-style = "double"
114 | indent-style = "space"
115 |
--------------------------------------------------------------------------------
/recipe/meta.yaml:
--------------------------------------------------------------------------------
1 | {% set name = "lector" %}
2 | {% set version = "0.3.5" %}
3 |
4 | package:
5 | name: {{ name|lower }}
6 | version: {{ version }}
7 |
8 | source:
9 | path: ..
10 |
11 | build:
12 | number: 0
13 | skip: True # [win]
14 | script: "{{ PYTHON }} -m pip install . --no-deps --ignore-installed -vv "
15 |
16 | requirements:
17 | host:
18 | - python >=3.9,{{PY_VER}}*
19 | - pip
20 | - setuptools
21 | - wheel
22 | run:
23 | - python {{PY_VER}}*
24 | - cchardet
25 | - clevercsv <0.8.1
26 | - msgspec
27 | - pyarrow>=15.0.0
28 | - rich
29 | - tqdm
30 | - typer
31 |
32 | test:
33 | imports:
34 | - lector
35 | requires:
36 | - hypothesis
37 | - hypothesis-csv @ git+https://github.com/jeffbr13/hypothesis-csv.git@master
38 | - pytest
39 | - pytest-cov
40 | commands:
41 | - pip check
42 | - pytest -v
43 |
44 | about:
45 | home: https://github.com/graphext/lector
46 | summary: A fast and flexible CSV reader
47 | description: |
48 | Lector aims to be a fast reader for potentially messy CSV files with configurable column type inference.
49 | It combines automatic detection of file encodings, CSV dialects (separator, escaping etc.) and preambles (initial lines containing metadata or
50 | junk unrelated to the actual tabular data). Its goal is to just-read-the-effing-CSV file without manual configuration in most cases. Each of the
51 | detection components is configurable and can be swapped out easily with custom implementations.
52 | doc_url: https://lector.readthedocs.io/en/latest/
53 |
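The workflow described above boils down to a single call; a minimal usage sketch mirroring notebooks/example.ipynb (read_csv returns a pyarrow Table by default, or a pandas DataFrame when to_pandas=True):

    import lector

    # Encoding, dialect (separator, escaping etc.) and preamble are detected automatically.
    df = lector.read_csv("example.csv", to_pandas=True)
    print(df.dtypes)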
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = lector
3 | version = attr: lector.__version__
4 | description = A fast reader for messy CSV files and type inference.
5 | long_description = file: README.md
6 | long_description_content_type = text/markdown; charset=UTF-8
7 | url = https://github.com/graphext/lector
8 | author = Graphext
9 | author_email = thomasb@hey.com
10 | keywords =
11 | csv
12 | json
13 | parser
14 | reader
15 | type inference
16 | license = Apache License 2.0
17 | license_file = LICENSE
18 | classifiers =
19 | Development Status :: 4 - Beta
20 | Intended Audience :: Developers
21 | Topic :: Scientific/Engineering
22 | License :: OSI Approved :: Apache Software License
23 | Programming Language :: Python :: 3
24 | project_urls =
25 | Source = https://github.com/graphext/lector
26 |
27 | [options]
28 | packages = find:
29 | platforms = any
30 | python_requires = >=3.8
31 | install_requires =
32 | cchardet
33 | clevercsv<0.8.1
34 | msgspec
35 | pyarrow>=15.0.0
36 | pyarrow-hotfix
37 | rich
38 | tqdm
39 | typer
40 |
41 | [aliases]
42 | test=pytest
43 |
44 | [options.entry_points]
45 | console_scripts =
46 | lector = lector.cli:CLI
47 |
48 | [options.extras_require]
49 | test =
50 | hypothesis
51 | pytest
52 | pytest-cov
53 | docs =
54 | pydata-sphinx-theme<0.10
55 | sphinx
56 | sphinx-autoapi
57 | sphinx-autobuild
58 | sphinx-autodoc-typehints
59 | dev =
60 | ipywidgets
61 | jupyter_contrib_nbextensions
62 | jupyter_nbextensions_configurator
63 | notebook
64 | pandas
65 | pre-commit
66 | ruff
67 | lector[test]
68 | lector[docs]
69 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 |
3 | if __name__ == "__main__":
4 | setuptools.setup()
5 |
--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graphext/lector/fb76591acd69aa01ee37a03f08fc8e68fd5f8aa8/test/__init__.py
--------------------------------------------------------------------------------
/test/conftest.py:
--------------------------------------------------------------------------------
1 | """For future reference, if we want to make tests extensible to future implementations
2 | of some of the interfaces (ABCs) in this package, we can use parameterized fixtures,
3 | like this:
4 |
5 | - https://github.com/pytest-dev/pytest/issues/421
6 | - https://stackoverflow.com/q/26266481/3519145
7 | """
8 |
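A minimal sketch of the parameterized-fixture pattern referenced above (illustrative only: the fixture and test names are assumptions, and only the existing PySniffer detector is listed; new implementations of the ABC would simply be appended to params):

    import io

    import pytest

    from lector.csv.dialects import PySniffer

    # Every test using this fixture runs once per listed implementation.
    @pytest.fixture(params=[PySniffer])
    def dialect_detector(request):
        return request.param()

    def test_detects_some_dialect(dialect_detector):
        # Smoke test: assumes the detector returns a (non-None) dialect for simple input.
        csv = "a,b,c\n1,2,3\n4,5,6\n"
        assert dialect_detector.detect(io.StringIO(csv)) is not None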
--------------------------------------------------------------------------------
/test/test_dialects.py:
--------------------------------------------------------------------------------
1 | """Test detection of dialects of otherwise valid CSV files."""
2 | import io
3 | from csv import QUOTE_MINIMAL, get_dialect
4 |
5 | import pytest
6 | from hypothesis import given
7 | from hypothesis.strategies import data
8 | from hypothesis_csv.strategies import csv as csv_strat
9 |
10 | from lector.csv.dialects import Dialect, PySniffer
11 |
12 | from .utils import equal
13 |
14 |
15 | def fix_expected_dialect(dialect):
16 | dialect.line_terminator = "\r\n" # Hardcoded in sniffer (not detectable)
17 | dialect.quoting = QUOTE_MINIMAL # Hardcoded in sniffer (not detectable)
18 | return dialect
19 |
20 |
21 | @pytest.mark.parametrize("dialect_name", ["excel", "excel-tab", "unix"])
22 | def test_pydialect_roundtrip(dialect_name: str):
23 | attrs = [
24 | "delimiter",
25 | "doublequote",
26 | "escapechar",
27 | "lineterminator",
28 | "quotechar",
29 | "quoting",
30 | "skipinitialspace",
31 | "strict",
32 | ]
33 | d1 = get_dialect(dialect_name)
34 | d2 = Dialect.from_builtin(d1).to_builtin()
35 | assert all(getattr(d1, a, None) == getattr(d2, a, None) for a in attrs)
36 |
37 |
38 | @pytest.mark.parametrize("dialect_name", ["excel", "excel-tab", "unix"])
39 | def test_dialect_roundtrip(dialect_name: str):
40 | b = get_dialect(dialect_name)
41 | d1 = Dialect.from_builtin(b)
42 | d2 = Dialect.from_builtin(d1.to_builtin())
43 | assert equal(d1, d2)
44 |
45 |
46 | @given(data=data())
47 | @pytest.mark.parametrize("dialect", ["excel", "excel-tab", "unix"])
48 | def test_dialects(dialect, data):
49 | strategy = csv_strat(dialect=dialect, lines=3, header=2)
50 | csv = data.draw(strategy)
51 | expected = Dialect.from_builtin(get_dialect(dialect))
52 | expected = fix_expected_dialect(expected)
53 | detected = PySniffer().detect(io.StringIO(csv))
54 | assert equal(expected, detected)
55 |
--------------------------------------------------------------------------------
/test/test_encodings.py:
--------------------------------------------------------------------------------
1 | """Test encoding detectors.
2 |
3 | Notes:
4 |
5 | - ISO-8859-1 (Latin-1) is mostly identical to Windows-1252 (CP1252):
6 | https://www.i18nqa.com/debug/table-iso8859-1-vs-windows-1252.html
7 |
8 | """
9 | import io
10 |
11 | import pytest
12 |
13 | from lector.csv.encodings import Chardet
14 |
15 | CODEC_ERR = "�"
16 |
17 | CODEC_SAMPLES = [
18 | ("刺靑 谷崎潤一郞著", "utf-8", "windows-1250", 2),
19 | ("顏是家訓 北齊 顏之推", "BIG5", "utf-8", 12),
20 | ("The Project · Gutenberg » EBook « of Die Fürstin.", "ISO-8859-1", "utf-8", 4),
21 | ("Той и сам не знае кога е роден, но като го запитат.", "windows-1251", "utf-8", 38),
22 | ("première is first", "utf-8", "ascii", 2),
23 | ("première is first", "utf-16", "utf-8", 3),
24 | ("première is first", "windows-1252", "utf-8", 1),
25 | ("première is first", "ISO-8859-1", "utf-16", 1),
26 | ("𐐀 am Deseret", "utf-8", "windows-1250", 2),
27 | ("𐐀 am Deseret", "utf-8", "windows-1252", 2),
28 | ("𐐀 am Deseret", "utf-16", "utf-8", 4),
29 | ]
30 | """When encoded with first then decoded with second codecs, n unknown chars are produced."""
31 |
32 | CODECS = [
33 | "utf-8",
34 | "utf-16",
35 | "utf-8-sig",
36 | "windows-1250",
37 | "windows-1251",
38 | "windows-1252",
39 | "iso-8859-1",
40 | "ascii",
41 | ]
42 |
43 |
44 | def codecs_equal(c1, c2):
45 | c1 = c1.lower()
46 | c2 = c2.lower()
47 | if c1 == c2:
48 | return True
49 |
50 |     # Allow some variations where small differences between codecs cannot be inferred
51 |     # from a small sample
52 | equivalent = ("iso-8859-1", "windows-1252")
53 | return c1 in equivalent and c2 in equivalent
54 |
55 |
56 | def codecs_compatible(c1, c2, encoded, n_err_max=0):
57 |     """Check if c1 and c2 are equivalent, or if c2 can decode the string without error."""
58 | if codecs_equal(c1, c2):
59 | return True
60 |
61 | return encoded.decode(c2, errors="replace").count(CODEC_ERR) <= n_err_max
62 |
63 |
64 | @pytest.mark.parametrize("codec", CODECS)
65 | def test_simple_roundtrip(codec):
66 | """Test correct detection if string can be encoded given codec."""
67 | s = "première is first"
68 |
69 | try:
70 | encoded = s.encode(codec)
71 | detected = Chardet().detect(io.BytesIO(encoded))
72 | assert codecs_equal(codec, detected)
73 | except Exception:
74 | pass # noqa
75 |
76 |
77 | @pytest.mark.parametrize("example", CODEC_SAMPLES)
78 | def test_roundtrips(example):
79 | text, codec, _, _ = example
80 | encoded = text.encode(codec)
81 | detected = Chardet().detect(io.BytesIO(encoded))
82 | decoded = encoded.decode(detected, errors="replace")
83 |
84 | assert codecs_equal(codec, detected)
85 | assert decoded.count(CODEC_ERR) == 0
86 |
--------------------------------------------------------------------------------
/test/test_formats.py:
--------------------------------------------------------------------------------
1 | """Test integration/interactions between encoding, preambles, dialect."""
2 | from __future__ import annotations
3 |
4 | import io
5 | from csv import get_dialect
6 |
7 | import pytest
8 | from hypothesis import given
9 | from hypothesis.strategies import data
10 | from hypothesis_csv.strategies import csv as csv_strat
11 |
12 | from lector.csv import Dialect, EmptyFileError, Reader
13 |
14 | from .test_dialects import fix_expected_dialect
15 | from .test_encodings import CODECS, codecs_compatible
16 | from .utils import equal
17 |
18 | PREAMBLES = [
19 |     # Brandwatch: the delimiter should be ignored in the preamble, as long as a row of commas
20 |     # separates the preamble from the CSV table
21 | ("a{d}b{d}c\n,,", 2),
22 | ("some text\n,,", 2),
23 | ("a{d}b{d}c\nsome text\n,,", 3),
24 | # Fieldless
25 | # Start with header line having multiple (two) fields (delimited)
26 | ("abc{d}def", 0),
27 | ('"abc"{d}"def"', 0),
28 | ('"abc"{d}def', 0),
29 | ('abc{d} "123,456"', 0),
30 | ('"[cat1, cat2]"{d} "123,67"', 0),
31 | # Since the line isn't quoted as a whole, the delimiter creates two fields
32 | ('A line that has "something{d}in quotes" and then some', 0),
33 | ('A line that has ""something{d} in quotes"" and then some', 0),
34 | # Start with line having single field
35 | ("abc", 1),
36 | ('"abc, def"', 1),
37 | ('"The ""text,"" is double-quoted, and contains a comma"', 1),
38 | ('"A line that has ""something, in quotes"" and then some"', 1),
39 | ('"A line that has something, and then some more"', 1),
40 | ('"One line, spreading over\n multiple, lines"', 2),
41 | ("One line\nAnother line", 2),
42 | ('One line\nAnother line\n"And, a\nthird"', 4),
43 | ]
44 |
45 |
46 | def with_delimiter(s, delim):
47 | if "{d}" in s:
48 | return s.format(d=delim)
49 | return s
50 |
51 |
52 | class NonParser(Reader):
53 | """Simple implementation of abstract class that only infers format."""
54 |
55 | def parse(self, *args, **kwds) -> None:
56 | return None
57 |
58 |
59 | def detect_format(csv: str | bytes):
60 | if isinstance(csv, str):
61 | buffer = io.StringIO(csv)
62 | else:
63 | buffer = io.BytesIO(csv)
64 |
65 | reader = NonParser(buffer, log=False)
66 | reader.read()
67 | return reader.format
68 |
69 |
70 | def test_empty_file():
71 | with pytest.raises(EmptyFileError):
72 | detect_format("")
73 |
74 |
75 | @given(data=data())
76 | @pytest.mark.parametrize("codec", CODECS)
77 | @pytest.mark.parametrize("preamble", PREAMBLES)
78 | @pytest.mark.parametrize("dialect", ["excel", "excel-tab", "unix"])
79 | def test_formats(codec, preamble, dialect, data):
80 | """Test parsing with different encoding, dialect and preamble."""
81 |
82 | pydialect = get_dialect(dialect)
83 |
84 |     # Make preamble compatible with dialect & generated csv
85 | preamble, skiprows = preamble
86 | preamble = with_delimiter(preamble, pydialect.delimiter)
87 | preamble = preamble.replace("\n", pydialect.lineterminator)
88 | if not pydialect.skipinitialspace:
89 | preamble = preamble.replace(f"{pydialect.delimiter} ", f"{pydialect.delimiter}")
90 |
91 | # Create the full, encoded CSV "file"
92 | strategy = csv_strat(dialect=pydialect, lines=3, header=2)
93 | csv = data.draw(strategy)
94 | csv = preamble + pydialect.lineterminator + csv
95 |
96 | try:
97 | encoded = csv.encode(codec)
98 | format = detect_format(encoded)
99 |
100 | # We have very short CSVs with preambles including the ñ, whose encoding
101 | # cannot always be detected correctly (would be easier with larger examples)
102 | assert codecs_compatible(codec, format.encoding, encoded, n_err_max=1)
103 | assert equal(skiprows, format.preamble, extra=csv)
104 |
105 | exp_dialect = fix_expected_dialect(Dialect.from_builtin(pydialect))
106 | assert equal(exp_dialect, format.dialect, extra=csv)
107 |
108 | except UnicodeEncodeError:
109 | pass
110 |
--------------------------------------------------------------------------------
/test/test_preambles.py:
--------------------------------------------------------------------------------
1 | import io
2 |
3 | import pytest
4 | from hypothesis import given
5 | from hypothesis.strategies import data
6 | from hypothesis_csv.strategies import csv as csv_strat
7 |
8 | from lector.csv.preambles import Brandwatch, Fieldless, GoogleAds, Preambles
9 |
10 | from .utils import equal
11 |
12 | BRANDWATCH_PREAMBLES = [
13 | ("a,b,c", 0),
14 | ("a,b,c\n,,", 2),
15 | ("some text\n,,", 2),
16 | ("a,b,c\nsome text\n,,", 3),
17 | ]
18 | """Brandwatch uses a line of delimiters only to mark end of preamble."""
19 |
20 | FIELDLESS_PREAMBLES = [
21 | # Start with line having multiple fields
22 | ("abc,def", 0),
23 | ('"abc","def"', 0),
24 | ('"abc",def', 0),
25 | ('abc, "123,456"', 0),
26 | ('"[cat1, cat2]", "123,67"', 0),
27 | ('A line that has "something,in quotes" and then some', 0),
28 | ('A line that has ""something, in quotes"" and then some', 0),
29 | # Start with line having single field
30 | ("abc", 1),
31 | ('"abc, def"', 1),
32 | ('"The ""text,"" is double-quoted, and contains a comma"', 1),
33 | ('"A line that has ""something, in quotes"" and then some"', 1),
34 | ('"A line that has something, and then some more"', 1),
35 | ('"One line, spreading over\n multiple, lines"', 2),
36 | ("One line\nAnother line", 2),
37 | ('One line\nAnother line\n"And, a\nthird"', 4),
38 | ]
39 | """Test cases for Fieldless detector, ints indicating how many lines to skip for preamble."""
40 |
41 |
42 | GOOGLEADS_PREAMBLES = [
43 | ("Informe de something something\nSome other info\nCampaña, Column", 2),
44 | ("Informe de something something\nCampaña, Column", 1),
45 | ("Something something\nCampaña,Column", 0),
46 | ("Informe de something something\nCampana,Column", 0),
47 | ]
48 |
49 |
50 | @given(data=data())
51 | @pytest.mark.parametrize("preamble", BRANDWATCH_PREAMBLES)
52 | def test_brandwatch(preamble, data):
53 | preamble, skip_n_exp = preamble
54 | csv = preamble + "\n" + data.draw(csv_strat(header=3))
55 | skip_n_det = Brandwatch().detect(io.StringIO(csv))
56 | assert equal(skip_n_exp, skip_n_det)
57 |
58 |
59 | @given(data=data())
60 | @pytest.mark.parametrize("preamble", FIELDLESS_PREAMBLES)
61 | def test_fieldless(preamble, data):
62 | preamble, skip_n_exp = preamble
63 | csv = preamble + "\n" + data.draw(csv_strat(header=2))
64 | skip_n_det = Fieldless().detect(io.StringIO(csv))
65 | assert equal(skip_n_exp, skip_n_det)
66 |
67 |
68 | @given(data=data())
69 | @pytest.mark.parametrize("preamble", GOOGLEADS_PREAMBLES)
70 | def test_googleads(preamble, data):
71 | preamble, skip_n_exp = preamble
72 | csv = preamble + "\n" + data.draw(csv_strat(header=2))
73 | skip_n_det = GoogleAds().detect(io.StringIO(csv))
74 | assert equal(skip_n_exp, skip_n_det)
75 |
76 |
77 | @given(data=data())
78 | @pytest.mark.parametrize("preamble", BRANDWATCH_PREAMBLES + FIELDLESS_PREAMBLES)
79 | def test_preambles(preamble, data):
80 | preamble, skip_n_exp = preamble
81 | csv = preamble + "\n" + data.draw(csv_strat(header=3))
82 | skip_n_det = Preambles.detect(io.StringIO(csv))
83 | assert equal(skip_n_exp, skip_n_det)
84 |
--------------------------------------------------------------------------------
/test/test_reader.py:
--------------------------------------------------------------------------------
1 | """Test CSV readers."""
2 | import csv
3 | import io
4 | import sys
5 | from csv import get_dialect
6 |
7 | import pytest
8 | from hypothesis import given
9 | from hypothesis.strategies import data
10 | from hypothesis_csv.strategies import csv as csv_strat
11 |
12 | import lector
13 | from lector.csv import ArrowReader, Dialect, EmptyFileError
14 |
15 | from .test_dialects import fix_expected_dialect
16 | from .test_encodings import CODECS, codecs_compatible
17 | from .test_formats import PREAMBLES, with_delimiter
18 | from .utils import equal
19 |
20 | SHAPES = [
21 | # (0, 1), # hypothesis_csv cannot generate a 0 row file # noqa
22 | # (1, 0), # This produces an empty file ("\r\n\r\n") # noqa
23 | (1, 1),
24 | (1, 3),
25 | (3, 1),
26 | (10, 5),
27 | ]
28 | """Different combinations of n_rows and n_cols."""
29 |
30 | EMPTY = ["", "\n", "\n\n", "\r\n"]
31 |
32 | NULL_BYTES_CSV = b"""
33 | col1,col_\0_2,col3
34 | 1,abc,x
35 | 2,de\0fg,y
36 | """
37 |
38 | UNNAMED = b"""
39 | col1,,col3,,col5
40 | 1,2,3,4,5
41 | 6,7,8,9,10
42 | """
43 |
44 |
45 | @pytest.mark.parametrize("csv", EMPTY)
46 | def test_empty(csv: str):
47 | fp = io.BytesIO(csv.encode("utf-8"))
48 | with pytest.raises(EmptyFileError):
49 | ArrowReader(fp, log=False).read()
50 |
51 |
52 | def test_unnamed():
53 |     """Automatic names for unnamed columns."""
54 | fp = io.BytesIO(UNNAMED)
55 | tbl = lector.read_csv(fp)
56 | assert tbl.column_names == ["col1", "Unnamed_0", "col3", "Unnamed_1", "col5"]
57 |
58 |
59 | def test_null_bytes():
60 |     """For now, null bytes don't throw an error, but they are also not removed automatically!"""
61 |
62 | with pytest.raises(csv.Error):
63 | # python's csv reader throws error on null byte
64 | s = io.StringIO(NULL_BYTES_CSV.decode("utf-8"))
65 | print("Null-byte CSV:", list(csv.reader(s)))
66 |
67 | tbl = lector.read_csv(io.BytesIO(NULL_BYTES_CSV))
68 | assert tbl.column_names == ["col1", "col_\x00_2", "col3"]
69 | assert tbl.column("col_\x00_2").to_pylist() == ["abc", "de\x00fg"]
70 |
71 |
72 | @given(data=data())
73 | @pytest.mark.parametrize("codec", CODECS)
74 | @pytest.mark.parametrize("preamble", PREAMBLES)
75 | @pytest.mark.parametrize("dialect", ["excel", "excel-tab", "unix"])
76 | def test_all(codec, preamble, dialect, data):
77 | """Test parsing with different encoding, dialect and preamble."""
78 |
79 | pydialect = get_dialect(dialect)
80 |
81 |     # Make preamble compatible with dialect & generated csv
82 | preamble, skiprows = preamble
83 | preamble = with_delimiter(preamble, pydialect.delimiter)
84 | preamble = preamble.replace("\n", pydialect.lineterminator)
85 | if not pydialect.skipinitialspace:
86 | preamble = preamble.replace(f"{pydialect.delimiter} ", f"{pydialect.delimiter}")
87 |
88 | # Create the full, encoded CSV "file"
89 | n_lines = 3
90 | n_columns = 2
91 | strategy = csv_strat(dialect=pydialect, lines=n_lines, header=n_columns)
92 | csv = data.draw(strategy)
93 | csv = preamble + pydialect.lineterminator + csv
94 |
95 | try:
96 | encoded = csv.encode(codec)
97 | reader = ArrowReader(io.BytesIO(encoded), log=False)
98 | tbl = reader.read()
99 |
100 | # If a preamble test case specifies 0 as the number of skiprows, it means the preamble text
101 | # consists of a single line that should be interpreted as the header. In this case, the
102 | # header (first line) of the csv created automatically will effectively become another row
103 | # in the table.
104 | exp_num_rows = n_lines if skiprows > 0 else n_lines + 1
105 | assert equal(exp_num_rows, tbl.num_rows, extra=csv)
106 | assert equal(2, tbl.num_columns, extra=csv)
107 |
108 | # We have very short CSVs with preambles including the ñ, whose encoding
109 | # cannot always be detected correctly (would be easier with larger examples)
110 | assert codecs_compatible(codec, reader.format.encoding, encoded, n_err_max=1)
111 | assert equal(skiprows, reader.format.preamble, extra=csv)
112 |
113 | exp_dialect = fix_expected_dialect(Dialect.from_builtin(pydialect))
114 | assert equal(exp_dialect, reader.format.dialect, extra=csv)
115 |
116 | except UnicodeEncodeError:
117 | print(f"FAILED ON CSV:\n{csv}")
118 | sys.exit()
120 |
--------------------------------------------------------------------------------
/test/test_types.py:
--------------------------------------------------------------------------------
1 | import io
2 | from collections import namedtuple
3 |
4 | import pyarrow as pa
5 | import pyarrow.types as pat
6 |
7 | import lector
8 | from lector import ArrowReader, Autocast
9 |
10 | from .utils import equal
11 |
12 | TC = namedtuple("TC", "min,max,extra,null")
13 | """Test Case definition."""
14 |
15 | TYPE_COLUMNS = {
16 | "num_int8": TC("-128", "127", "0", "NA"),
17 | "num_int16": TC("-32768", "32767", "0", "NA"),
18 | "num_int32": TC("-2147483648", "2147483647", "0", "NA"),
19 | "num_int64": TC("-9223372036854775808", "9223372036854775807", "0", "NA"),
20 | "num_uint8": TC("0", "255", "1", "NA"),
21 | "num_uint16": TC("0", "65535", "1", "NA"),
22 | "num_uint32": TC("0", "4294967295", "1", "NA"),
23 |     # "num_uint64": TC("0", "18446744073709551615", "", "NA"), # noqa
24 | "list_uint8": TC('"[0,1,2]"', '"[7,8,9]"', '"[]"', "NA"),
25 | "list_uint8_2": TC('"[0,1,2]"', '"[7,8,9]"', '"[4]"', "NA"),
26 | "list_str": TC('"[a,b,c]"', '"[x,y,z]"', '"[]"', "NA"),
27 | "list_str_2": TC('"[a,b,c]"', '"[x,y,z]"', '"[test]"', "NA"),
28 | "date_iso": TC(
29 | "2022-06-17T10:31:40.000746",
30 | "2022-06-17T10:31:40.000746",
31 | "2022-06-17T10:31:40.000000",
32 | "NA",
33 | ),
34 | "date_custom": TC(
35 | "2022-06-17",
36 | "2022-06-17",
37 | "2022-05-03",
38 | "NA",
39 | ),
40 | "text": TC(
41 | '"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor."',
42 | '"No one rejects, dislikes, or avoids pleasure itself, because it is pleasure."',
43 | '"Nor again is there anyone who loves or pursues or desires to obtain pain of itself."',
44 | "NA",
45 | ),
46 | "cat": TC("cat1", "cat2", "", "NA"),
47 | }
48 |
49 | TYPE_CSV = ",".join(TYPE_COLUMNS)
50 | for row in map(list, zip(*TYPE_COLUMNS.values())):
51 | TYPE_CSV += "\n" + ",".join(row)
52 |
53 | ARROW_TYPES = {
54 | "num_int8": pa.int64(),
55 | "num_int16": pa.int64(),
56 | "num_int32": pa.int64(),
57 | "num_int64": pa.int64(),
58 | "num_uint8": pa.int64(),
59 | "num_uint16": pa.int64(),
60 | "num_uint32": pa.int64(),
61 | "list_uint8": pa.string(),
62 | "list_uint8_2": pa.string(),
63 | "list_str": pa.string(),
64 | "list_str_2": pa.string(),
65 | "date_iso": pa.timestamp(unit="ns"),
66 | "date_custom": pa.date32(),
67 | "text": pa.string(),
68 | "cat": pa.string(),
69 | }
70 |
71 | LECTOR_TYPES = {
72 | "num_int8": pa.int8(),
73 | "num_int16": pa.int16(),
74 | "num_int32": pa.int32(),
75 | "num_int64": pa.int64(),
76 | "num_uint8": pa.uint8(),
77 | "num_uint16": pa.uint16(),
78 | "num_uint32": pa.uint32(),
79 | "list_uint8": pa.list_(pa.uint8()),
80 | "list_uint8_2": pa.list_(pa.uint8()),
81 | "list_str": pa.list_(pa.string()),
82 | "list_str_2": pa.list_(pa.string()),
83 | "date_iso": pa.timestamp(unit="ns", tz="UTC"),
84 | "date_custom": pa.timestamp(unit="ns", tz="UTC"),
85 | "text": pa.string(),
86 | "cat": pa.dictionary(index_type=pa.int32(), value_type=pa.string()),
87 | }
88 |
89 | DELIM_CSV = b"""
90 | dot_delim, comma_delim, mixed_delim_dot, mixed_delim_comma, mixed_delim
91 | "1,234.0","1.234,0","1,234.0","1.234,0","1.234,0"
92 | "1,234,456",1.234.456,"1,234,456",1.234.456,1.234.456
93 | NA,NA,NA,NA,NA
94 | "1,234,456.987","1.234.456,987","1,234,456.987","1.234.456,987","1.234.456,987"
95 | 0.1,"0,1","0.1","0,1",0.1
96 | .1,",1",",1",.1,.1
97 | 98765.123,"98765,123","98765,123",98765.123,98765.123
98 | """
99 |
100 | # ruff: noqa: E501
101 | LIST_CSV = """
102 | lnum1,lnum2,lnum_NA_3,lnum4,lcat5,lfloat6,lfloat7,lfloat8,lfloat_DEL_9
103 | "[0,1,2]","[0,1,2]","['123', '456', NA, '789']","[123, 456, 789]","[a,b,c]","[123.45, 678.90]","[""123.45"", ""678.90""]","['123.45', '678.90']","['123,45', '678,90']"
104 | "[7,8,9]","[7,8,9]","['123', '456', NA, '789']","[123, 456, 789]","(d,e)","[123, 678]","[""123"", ""678""]","['123', '678']","['123', '678']"
105 | "[]","[4]","[123, 456, NA, 789]","[123, 456, 789]","|f|","[123.45, 678.90]","[""123.45"", ""678.90""]","['123.45', '678.90', '0.0']","['123,45', '678,90', '0,0']"
106 | "NA","NA",NA,NA,NA,NA,NA,NA,NA
107 | """
108 |
109 |
110 | def test_decimals():
111 | """Based on inferred decimal delimiter, thousands delimiter gets removed.
112 |
113 |     If the delimiter is ambiguous, the column is left as a dictionary (string) type.
114 | """
115 | tbl = lector.read_csv(io.BytesIO(DELIM_CSV))
116 |
117 | for i in range(4):
118 | assert pat.is_floating(tbl.column(i).type)
119 |
120 | assert pat.is_dictionary(tbl.column(4).type)
121 |
122 |
123 | def test_list():
124 | """List parsing. NAs are not allowed in float lists. Also, decimal delimiter must be the period character!"""
125 | fp = io.BytesIO(LIST_CSV.encode("utf-8"))
126 | tbl = lector.read_csv(fp)
127 |
128 | exp_types = {
129 | "lnum1": pa.list_(pa.uint8()),
130 | "lnum2": pa.list_(pa.uint8()),
131 | "lnum_NA_3": pa.list_(pa.string()), # NA not supported in numeric lists
132 | "lnum4": pa.list_(pa.uint16()),
133 | "lcat5": pa.list_(pa.string()),
134 | "lfloat6": pa.list_(pa.float64()),
135 | "lfloat7": pa.list_(pa.float64()),
136 | "lfloat8": pa.list_(pa.float64()),
137 | "lfloat_DEL_9": pa.list_(
138 | pa.uint16()
139 |         ),  # comma as decimal delimiter not supported (interpreted as csv delimiter)
140 | }
141 |
142 | for col in tbl.column_names:
143 | assert equal(tbl.column(col).type, exp_types[col], col)
144 |
145 |
146 | def test_inference():
147 | """Compare Arrow and Lector inferred types against reference.
148 |
149 | TODO:
150 | - Deal correctly with ns/ms in timestamps.
151 | - Default dictionary encoding uses int32 always. Downcast?
152 | """
153 | file = io.BytesIO(TYPE_CSV.encode("utf-8"))
154 | tbl = ArrowReader(file, log=False).read()
155 | schema = tbl.schema
156 |
157 | for name, type in ARROW_TYPES.items():
158 | assert equal(type, schema.field(name).type, extra=name)
159 |
160 | tbl = Autocast(log=False).cast(tbl)
161 | schema = tbl.schema
162 |
163 | for name, type in LECTOR_TYPES.items():
164 | assert equal(type, schema.field(name).type, extra=name)
165 |
--------------------------------------------------------------------------------
/test/utils.py:
--------------------------------------------------------------------------------
1 | from lector.log import CONSOLE
2 |
3 |
4 | def equal(obj1, obj2, extra=None):
5 | """Helper to print useful info if result is unexpected."""
6 | eq = obj1 == obj2
7 |
8 | if not eq:
9 | CONSOLE.print(obj1)
10 | CONSOLE.print(obj2)
11 |
12 | if extra is not None:
13 | CONSOLE.print(extra)
14 |
15 | return False
16 |
17 | return True
18 |
--------------------------------------------------------------------------------