├── clevercsv ├── py.typed ├── __version__.py ├── console │ ├── __init__.py │ ├── commands │ │ ├── __init__.py │ │ ├── _docs.py │ │ ├── _utils.py │ │ ├── code.py │ │ ├── view.py │ │ ├── explore.py │ │ └── detect.py │ └── application.py ├── cabstraction.pyi ├── exceptions.py ├── __main__.py ├── utils.py ├── __init__.py ├── _types.py ├── cparser.pyi ├── encoding.py ├── cparser_util.pyi ├── escape.py ├── read.py ├── write.py ├── _optional.py ├── detect_pattern.py ├── cparser_util.py ├── dict_read_write.py ├── dialect.py └── consistency.py ├── stubs ├── pythonfuzz │ ├── __init__.pyi │ └── main.pyi ├── tabview │ ├── __init__.pyi │ └── tabview.pyi ├── regex │ ├── _regex.pyi │ ├── __init__.pyi │ └── regex.pyi ├── termcolor │ └── __init__.pyi ├── pandas │ └── __init__.pyi └── wilderness │ └── __init__.pyi ├── docs ├── _readme.rst ├── _changelog.rst ├── source │ ├── modules.rst │ ├── clevercsv.console.rst │ ├── clevercsv.console.commands.rst │ └── clevercsv.rst ├── index.rst ├── Makefile ├── make.bat └── conf.py ├── tests ├── test_integration │ ├── error.log │ ├── error_partial.log │ ├── README.md │ ├── failed.log │ ├── failed_partial.log │ └── test_dialect_detection.py ├── test_unit │ ├── data │ │ └── abstraction_testcases.json.gz │ ├── test_fuzzing.py │ ├── test_c_file_naming.py │ ├── test_consistency.py │ ├── test_potential_dialects.py │ ├── test_abstraction.py │ ├── test_encoding.py │ ├── test_write.py │ ├── test_normal_forms.py │ ├── test_detect.py │ └── test_detect_pattern.py ├── README.md └── test_fuzz │ └── fuzz_sniffer.py ├── .github ├── dependabot.yml └── workflows │ ├── build.yml │ └── deploy.yml ├── .readthedocs.yml ├── .gitignore ├── notes └── date_regex │ ├── README.md │ ├── dateregexmin.txt │ ├── dateregex_formats.txt │ ├── dateregex.txt │ ├── datefmt.py │ └── dateregex_annotated.txt ├── MANIFEST.in ├── .pre-commit-config.yaml ├── pyproject.toml ├── example ├── README.md └── airedale.csv ├── LICENSE ├── man ├── clevercsv-help.1 ├── clevercsv-view.1 ├── 
clevercsv-code.1 ├── clevercsv-explore.1 ├── clevercsv.1 ├── clevercsv-standardize.1 └── clevercsv-detect.1 ├── CODE_OF_CONDUCT.md ├── Makefile └── setup.py /clevercsv/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stubs/pythonfuzz/__init__.pyi: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/_readme.rst: -------------------------------------------------------------------------------- 1 | .. include:: ./README.rst 2 | -------------------------------------------------------------------------------- /docs/_changelog.rst: -------------------------------------------------------------------------------- 1 | .. include:: ./CHANGELOG.rst 2 | -------------------------------------------------------------------------------- /stubs/tabview/__init__.pyi: -------------------------------------------------------------------------------- 1 | from .tabview import view as view 2 | -------------------------------------------------------------------------------- /tests/test_integration/error.log: -------------------------------------------------------------------------------- 1 | 12f6fa751d2b2a491a54bc9e0e39d05f 2 | -------------------------------------------------------------------------------- /tests/test_integration/error_partial.log: -------------------------------------------------------------------------------- 1 | 12f6fa751d2b2a491a54bc9e0e39d05f 2 | -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | CleverCSV API Documentation 2 | =========================== 3 | 4 | .. 
toctree:: 5 | :maxdepth: 4 6 | 7 | clevercsv 8 | -------------------------------------------------------------------------------- /tests/test_unit/data/abstraction_testcases.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/CleverCSV/HEAD/tests/test_unit/data/abstraction_testcases.json.gz -------------------------------------------------------------------------------- /clevercsv/__version__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from typing import Tuple 4 | 5 | VERSION: Tuple[int, int, int] = (0, 8, 4) 6 | 7 | __version__: str = ".".join(map(str, VERSION)) 8 | -------------------------------------------------------------------------------- /clevercsv/console/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .application import build_application 4 | 5 | 6 | def main() -> int: 7 | app = build_application() 8 | return app.run() 9 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: "weekly" 7 | ignore: 8 | - dependency-name: "actions/*" 9 | -------------------------------------------------------------------------------- /stubs/pythonfuzz/main.pyi: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | from typing import Callable 3 | 4 | class PythonFuzz: 5 | def __init__(self, func: Callable[[bytes], Any]) -> None: ... 6 | def __call__(self, *args: Any, **kwargs: Any) -> None: ... 
7 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # Tests 2 | 3 | We have two types of tests for CleverCSV: unit tests and integration tests. 4 | 5 | * Unit tests evaluate the functionality of the package in the usual way 6 | * Integration tests specifically evaluate the dialect detection accuracy on a 7 | large set of test files. 8 | -------------------------------------------------------------------------------- /clevercsv/cabstraction.pyi: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from typing import Optional 4 | 5 | def base_abstraction( 6 | data: str, 7 | delimiter: Optional[str], 8 | quotechar: Optional[str], 9 | escapechar: Optional[str], 10 | ) -> str: ... 11 | def c_merge_with_quotechar(data: str) -> str: ... 12 | -------------------------------------------------------------------------------- /clevercsv/exceptions.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Exceptions for CleverCSV 5 | 6 | Author: Gertjan van den Burg 7 | 8 | """ 9 | 10 | from .cparser import Error as ParserError 11 | 12 | 13 | class Error(ParserError): 14 | pass 15 | 16 | 17 | class NoDetectionResult(Exception): 18 | pass 19 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file for CleverCSV 2 | # 3 | version: 2 4 | 5 | build: 6 | os: ubuntu-22.04 7 | tools: 8 | python: "latest" 9 | 10 | sphinx: 11 | configuration: docs/conf.py 12 | 13 | python: 14 | install: 15 | - method: pip 16 | path: . 
17 | extra_requirements: 18 | - docs 19 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | 2 | .. include:: ./_readme.rst 3 | 4 | .. toctree:: 5 | :maxdepth: 2 6 | 7 | _readme 8 | 9 | .. toctree:: 10 | :caption: Further Documentation 11 | :maxdepth: 2 12 | 13 | _changelog 14 | source/modules 15 | 16 | 17 | Indices and tables 18 | ================== 19 | 20 | * :ref:`genindex` 21 | * :ref:`modindex` 22 | * :ref:`search` 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | */__pycache__/ 2 | *.pyc 3 | build/ 4 | cover/* 5 | tests/test_integration/data 6 | tests/test_integration/__pycache__/ 7 | tests/test_unit/__pycache__/ 8 | .coverage 9 | clevercsv.egg-info/ 10 | clevercsv/.coverage 11 | clevercsv/*.so 12 | dist/* 13 | docs/_build 14 | docs/_static 15 | cgrep 16 | vgrep 17 | auxiliary/ 18 | notes/ 19 | _logo 20 | .logo.png 21 | docs/source/AUTOGENERATED 22 | comparison/ 23 | -------------------------------------------------------------------------------- /clevercsv/console/commands/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .code import CodeCommand 4 | from .detect import DetectCommand 5 | from .explore import ExploreCommand 6 | from .standardize import StandardizeCommand 7 | from .view import ViewCommand 8 | 9 | __all__ = [ 10 | "CodeCommand", 11 | "DetectCommand", 12 | "ExploreCommand", 13 | "StandardizeCommand", 14 | "ViewCommand", 15 | ] 16 | -------------------------------------------------------------------------------- /notes/date_regex/README.md: -------------------------------------------------------------------------------- 1 | # Date regex 2 | 3 | These are some files used to develop the date regular 
expression. It is based 4 | on individual regular expressions for each of the date formats generated by 5 | ``datefmt.py``, but merged into a single expression by hand. 6 | 7 | Note that this regex checks whether a string is a valid *date format*, not 8 | whether it's a valid *date* (i.e. 2019-02-31 is considered valid). This is for 9 | both speed and simplicity. 10 | -------------------------------------------------------------------------------- /stubs/regex/_regex.pyi: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | CODE_SIZE: int 4 | MAGIC: int 5 | copyright: str 6 | 7 | def compile(*args, **kwargs) -> Any: ... 8 | def fold_case(*args, **kwargs) -> Any: ... 9 | def get_all_cases(*args, **kwargs) -> Any: ... 10 | def get_code_size(*args, **kwargs) -> Any: ... 11 | def get_expand_on_folding(*args, **kwargs) -> Any: ... 12 | def get_properties(*args, **kwargs) -> Any: ... 13 | def has_property_value(*args, **kwargs) -> Any: ... 14 | -------------------------------------------------------------------------------- /stubs/termcolor/__init__.pyi: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | __ALL__: Any 4 | VERSION: Any 5 | ATTRIBUTES: Any 6 | HIGHLIGHTS: Any 7 | COLORS: Any 8 | RESET: str 9 | 10 | def colored( 11 | text, 12 | color: Any | None = ..., 13 | on_color: Any | None = ..., 14 | attrs: Any | None = ..., 15 | ): ... 16 | def cprint( 17 | text, 18 | color: Any | None = ..., 19 | on_color: Any | None = ..., 20 | attrs: Any | None = ..., 21 | **kwargs 22 | ) -> None: ... 
23 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include setup.py 2 | include README.md 3 | include CHANGELOG.md 4 | include LICENSE 5 | include requirements.txt 6 | recursive-include clevercsv *.py 7 | recursive-include src *.c 8 | recursive-include bin * 9 | recursive-include tests/test_unit *.py 10 | recursive-include man *.1 11 | prune tests/test_integration 12 | exclude Makefile 13 | exclude .gitignore 14 | exclude .travis.yml 15 | exclude .readthedocs.yml 16 | exclude make_release.py 17 | exclude cgrep 18 | exclude vgrep 19 | prune notes 20 | prune auxiliary 21 | -------------------------------------------------------------------------------- /clevercsv/__main__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Caller for the command line application. 5 | 6 | """ 7 | 8 | import sys 9 | 10 | from ._optional import import_optional_dependency 11 | 12 | 13 | def main() -> None: 14 | # Check that necessary dependencies are available 15 | import_optional_dependency("wilderness") 16 | 17 | # if so, load the actual main function and call it. 18 | from .console import main as realmain 19 | 20 | sys.exit(realmain()) 21 | 22 | 23 | if __name__ == "__main__": 24 | main() 25 | -------------------------------------------------------------------------------- /tests/test_fuzz/fuzz_sniffer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Script to run PythonFuzz to detect unhandled exceptions in the Sniffer 5 | 6 | This file is part of CleverCSV. 
7 | 8 | """ 9 | 10 | from pythonfuzz.main import PythonFuzz 11 | 12 | import clevercsv 13 | 14 | 15 | @PythonFuzz 16 | def fuzz(buf): 17 | try: 18 | string = buf.decode("utf-8") 19 | _ = clevercsv.Sniffer().sniff(string) 20 | except UnicodeDecodeError: 21 | pass 22 | except clevercsv.exceptions.Error: 23 | pass 24 | 25 | 26 | if __name__ == "__main__": 27 | fuzz() 28 | -------------------------------------------------------------------------------- /tests/test_unit/test_fuzzing.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Unit tests based on fuzzing 5 | 6 | """ 7 | 8 | import unittest 9 | 10 | import clevercsv 11 | 12 | 13 | class FuzzingTestCase(unittest.TestCase): 14 | def test_sniffer_fuzzing(self) -> None: 15 | strings = ['"""', "```", "\"'", "'@'", "'\"", "'''", "O##P~` "] 16 | for string in strings: 17 | with self.subTest(string=string): 18 | try: 19 | _ = clevercsv.Sniffer().sniff(string) 20 | except clevercsv.exceptions.Error: 21 | pass 22 | -------------------------------------------------------------------------------- /docs/source/clevercsv.console.rst: -------------------------------------------------------------------------------- 1 | clevercsv.console package 2 | ========================= 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | clevercsv.console.commands 11 | 12 | Submodules 13 | ---------- 14 | 15 | clevercsv.console.application module 16 | ------------------------------------ 17 | 18 | .. automodule:: clevercsv.console.application 19 | :members: 20 | :show-inheritance: 21 | :undoc-members: 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. 
automodule:: clevercsv.console 27 | :members: 28 | :show-inheritance: 29 | :undoc-members: 30 | -------------------------------------------------------------------------------- /notes/date_regex/dateregexmin.txt: -------------------------------------------------------------------------------- 1 | ((0[1-9]|1[0-2])((0[1-9]|[12]\d|3[01])([12]\d{3}|\d{2})|(?P[-\/. ])(0?[1-9]|[12]\d|3[01])(?P=sep1)([12]\d{3}|\d{2}))|(0[1-9]|[12]\d|3[01])((0[1-9]|1[0-2])([12]\d{3}|\d{2})|(?P[-\/. ])(0?[1-9]|1[0-2])(?P=sep2)([12]\d{3}|\d{2}))|([12]\d{3}|\d{2})((?P[-\/. ])(0?[1-9]|1[0-2])(?P=sep3)(0?[1-9]|[12]\d|3[01])|年(0?[1-9]|1[0-2])月(0?[1-9]|[12]\d|3[01])日|년(0?[1-9]|1[0-2])월(0?[1-9]|[12]\d|3[01])일|(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01]))|(([1-9]|1[0-2])(?P[-\/. ])(0?[1-9]|[12]\d|3[01])(?P=sep4)([12]\d{3}|\d{2})|([1-9]|[12]\d|3[01])(?P[-\/. ])(0?[1-9]|1[0-2])(?P=sep5)([12]\d{3}|\d{2}))) 2 | -------------------------------------------------------------------------------- /notes/date_regex/dateregex_formats.txt: -------------------------------------------------------------------------------- 1 | DDMMYY 2 | DDMMYYYY 3 | DDxMMxYY 4 | DDxMMxYYYY 5 | DDxMxYY 6 | DDxMxYYYY 7 | DxMMxYY 8 | DxMMxYYYY 9 | DxMxYY 10 | DxMxYYYY 11 | MMDDYY 12 | MMDDYYYY 13 | MMxDDxYY 14 | MMxDDxYYYY 15 | MMxDxYY 16 | MMxDxYYYY 17 | MxDDxYY 18 | MxDDxYYYY 19 | MxDxYY 20 | MxDxYYYY 21 | YYMMDD 22 | YYYYMMDD 23 | YYYYxMMxD 24 | YYYYxMMxDD 25 | YYYYxMxD 26 | YYYYxMxDD 27 | YYYY年MM月DD日 28 | YYYY年MM月D日 29 | YYYY年M月DD日 30 | YYYY年M月D日 31 | YYYY년MM월DD일 32 | YYYY년MM월D일 33 | YYYY년M월DD일 34 | YYYY년M월D일 35 | YYxMMxD 36 | YYxMMxDD 37 | YYxMxD 38 | YYxMxDD 39 | YY年MM月DD日 40 | YY年MM月D日 41 | YY年M月DD日 42 | YY年M月D日 43 | YY년MM월DD일 44 | YY년MM월D일 45 | YY년M월DD일 46 | YY년M월D일 47 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these 
variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # NOTE: Keep versions of tools in sync with Github Actions build.yml 2 | repos: 3 | - repo: https://github.com/psf/black 4 | rev: 23.3.0 5 | hooks: 6 | - id: black 7 | language_version: python3 8 | 9 | - repo: https://github.com/pycqa/isort 10 | rev: 5.12.0 11 | hooks: 12 | - id: isort 13 | name: isort (python) 14 | - id: isort 15 | name: isort (cython) 16 | types: [cython] 17 | - id: isort 18 | name: isort (pyi) 19 | types: [pyi] 20 | 21 | - repo: https://github.com/charliermarsh/ruff-pre-commit 22 | rev: "v0.0.261" 23 | hooks: 24 | - id: ruff 25 | name: ruff (python) 26 | args: [--fix, --exit-non-zero-on-fix] 27 | -------------------------------------------------------------------------------- /tests/test_integration/README.md: -------------------------------------------------------------------------------- 1 | # Integration Tests 2 | 3 | This directory is for the integration tests that evaluate the accuracy of 4 | dialect detection in CleverCSV. We have a ``data`` folder that contains 5 | annotated dialects for CSV files scraped from GitHub from repositories with 6 | the MIT license (allowing their redistribution and use). 
The 7 | ``test_dialect_detection.py`` script runs CleverCSV on each file for which 8 | ground truth is available, and writes the file hash to either ``success.log``, 9 | ``failed.log``, or ``error.log``. By keeping these files in Git we can keep 10 | track of CleverCSVs performance. 11 | 12 | Note that runtime should be interpreted very carefully and only when 13 | experimental conditions are constant, and then generally only as averages. 14 | -------------------------------------------------------------------------------- /tests/test_unit/test_c_file_naming.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import clevercsv 4 | 5 | 6 | class CNamingTestCase(unittest.TestCase): 7 | def test_name_cabstraction_module(self) -> None: 8 | self.assertEqual( 9 | clevercsv.cabstraction.__name__, "clevercsv.cabstraction" 10 | ) 11 | 12 | def test_name_cparser_module(self) -> None: 13 | self.assertEqual(clevercsv.cparser.__name__, "clevercsv.cparser") 14 | 15 | def test_name_cparser_error(self) -> None: 16 | self.assertEqual( 17 | clevercsv.cparser.Error.__module__, "clevercsv.cparser" 18 | ) 19 | 20 | def test_name_cparser_parser(self) -> None: 21 | self.assertEqual( 22 | clevercsv.cparser.Parser.__module__, "clevercsv.cparser" 23 | ) 24 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length=79 3 | 4 | [tool.isort] 5 | profile="black" 6 | sections=["FUTURE", "STDLIB", "TYPING", "THIRDPARTY", "FIRSTPARTY", "LOCALFOLDER"] 7 | known_typing=["typing"] 8 | force_single_line=true 9 | lines_between_types=1 10 | 11 | [tool.ruff] 12 | # Exclude stubs directory for now 13 | exclude = ["stubs"] 14 | 15 | [tool.mypy] 16 | python_version = 3.10 17 | warn_unused_configs = true 18 | warn_redundant_casts = true 19 | warn_unused_ignores = true 20 | strict_equality = true 21 
| strict_concatenate = true 22 | check_untyped_defs = true 23 | disallow_subclassing_any = true 24 | disallow_untyped_decorators = true 25 | disallow_any_generics = true 26 | disallow_untyped_calls = true 27 | disallow_incomplete_defs = true 28 | disallow_untyped_defs = false 29 | 30 | [[tool.mypy.overrides]] 31 | packages = ["stubs", "clevercsv"] 32 | disallow_incomplete_defs = true 33 | -------------------------------------------------------------------------------- /tests/test_unit/test_consistency.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Unit test for consistency score 5 | 6 | Author: G.J.J. van den Burg 7 | 8 | """ 9 | 10 | import unittest 11 | 12 | from clevercsv.consistency import ConsistencyDetector 13 | from clevercsv.consistency import ConsistencyScore 14 | from clevercsv.dialect import SimpleDialect 15 | 16 | 17 | class ConsistencyTestCase(unittest.TestCase): 18 | def test_get_best_set_1(self) -> None: 19 | scores = { 20 | SimpleDialect(",", None, None): ConsistencyScore(P=1, T=1, Q=1), 21 | SimpleDialect(";", None, None): ConsistencyScore( 22 | P=1, T=None, Q=None 23 | ), 24 | SimpleDialect("|", None, None): ConsistencyScore(P=2, T=1, Q=2), 25 | } 26 | H = ConsistencyDetector.get_best_dialects(scores) 27 | self.assertEqual(H, [SimpleDialect("|", None, None)]) 28 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /stubs/regex/__init__.pyi: -------------------------------------------------------------------------------- 1 | from .regex import * 2 | 3 | # Names in __all__ with no definition: 4 | # A 5 | # ASCII 6 | # B 7 | # BESTMATCH 8 | # D 9 | # DEBUG 10 | # DEFAULT_VERSION 11 | # DOTALL 12 | # E 13 | # ENHANCEMATCH 14 | # F 15 | # FULLCASE 16 | # I 17 | # IGNORECASE 18 | # L 19 | # LOCALE 20 | # M 21 | # MULTILINE 22 | # Match 23 | # P 24 | # POSIX 25 | # Pattern 26 | # R 27 | # REVERSE 28 | # Regex 29 | # S 30 | # Scanner 31 | # T 32 | # TEMPLATE 33 | # U 34 | # UNICODE 35 | # V0 36 | # V1 37 | # VERBOSE 38 | # VERSION0 39 | # VERSION1 40 | # W 41 | # WORD 42 | # X 43 | # __doc__ 44 | # __version__ 45 | # cache_all 46 | # compile 47 | # error 48 | # escape 49 | # findall 50 | # finditer 51 | # fullmatch 52 | # match 53 | # purge 54 | # search 55 | # split 56 | # splititer 57 | # sub 58 | # subf 59 | # subfn 60 | # subn 61 | # template 62 | -------------------------------------------------------------------------------- /example/README.md: -------------------------------------------------------------------------------- 1 | # Example files 2 | 3 | This directory contains some example files for the demo. 4 | 5 | These are the sources: 6 | 7 | - ``imdb.csv`` comes from https://www.kaggle.com/orgesleka/imdbmovies and is 8 | CC0 licensed. 
We only use the first 100 rows in this example. 9 | 10 | - ``airedale.csv`` comes from 11 | https://data.gov.uk/dataset/9c0b1334-dcaf-4a25-ad24-31425933afd9/spend-over-25-000-in-airedale-nhs-foundation-trust 12 | and is the file "2011 July Return". This data is made available under the 13 | [Open Government 14 | License](http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/). 15 | 16 | - ``milk.csv`` comes from 17 | https://data.gov.uk/dataset/b4674861-f2a0-4dcd-bd5e-687d01380259/utilisation-of-milk-by-dairies-in-england-and-wales 18 | and is the file " UK Availability, Disposals and Production of Milk and Milk 19 | Products January 1987 to June 2015 ". This data is made available under the 20 | [Open Government 21 | License](http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/). 22 | 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018 The Alan Turing Institute 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /clevercsv/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Various utilities 5 | 6 | Author: Gertjan van den Burg 7 | 8 | """ 9 | 10 | import hashlib 11 | 12 | from typing import Iterable 13 | from typing import Iterator 14 | from typing import Tuple 15 | from typing import TypeVar 16 | 17 | from clevercsv._types import AnyPath 18 | 19 | T = TypeVar("T") 20 | 21 | 22 | def pairwise(iterable: Iterable[T]) -> Iterator[Tuple[T, T]]: 23 | "s - > (s0, s1), (s1, s2), (s2, s3), ..." 24 | a = iter(iterable) 25 | b = iter(iterable) 26 | next(b, None) 27 | return zip(a, b) 28 | 29 | 30 | def sha1sum(filename: AnyPath) -> str: 31 | """Compute the SHA1 checksum of a given file 32 | 33 | Parameters 34 | ---------- 35 | filename : str 36 | Path to a file 37 | 38 | Returns 39 | ------- 40 | checksum : str 41 | The SHA1 checksum of the file contents. 
42 | """ 43 | blocksize = 1 << 16 44 | hasher = hashlib.sha1() 45 | with open(filename, "rb") as fp: 46 | buf = fp.read(blocksize) 47 | while len(buf) > 0: 48 | hasher.update(buf) 49 | buf = fp.read(blocksize) 50 | return hasher.hexdigest() 51 | -------------------------------------------------------------------------------- /tests/test_integration/failed.log: -------------------------------------------------------------------------------- 1 | 0069e7bfc8ca0884a84752226a8fb78d 2 | 026061a7526455946a3f983899d2f0c6 3 | 0349cc6c33ecda401bff13f23905ed72 4 | 04095bd80e50f90df503ff7d09ae8672 5 | 044196aa9f527ccd8aeff80cfa757dd1 6 | 0475a9956f5bcbb16b58f4a7ccaea973 7 | 060b4e623f38d193b16aaee0527a4f20 8 | 07675dd515504407b82f897ec886434f 9 | 080ca82bd5b56e99bd2a3db334a1a1aa 10 | 081dec2b2d490d2745b5ff99dcd98640 11 | 09af9825ae42e1cb4fb4f3609d5ec1e1 12 | 09cab00748d36387afd638c61ec077df 13 | 0a06a4a6b4151a5288bdb5f25f2754d5 14 | 0a677459b727fc7a7cc583054d7b0f42 15 | 0aded0a7e6428183e30b4dfe0edf476b 16 | 0b869b132594763bba8bb85b8f54688b 17 | 0e7a7f43c445ef171e5132372ff63601 18 | 104761c04f7278b2f5afce85c96db719 19 | 120b852c984ad304b3393c7beeea6491 20 | 1390ca6ccd8500cbbfbc5c7f64979004 21 | 13a6c86a18f053c593feda3d98755010 22 | 17c8007d6eb9baf19d075cb33759e313 23 | 17ccdf2fd0edef2d3bf5fca779cb2161 24 | 17e16b55d1d9ee2e13068db7cc69dbf9 25 | 1a53f0e394e74914659007cc5f153b9f 26 | 1a63a9d56584ec38adf5458adc4764f4 27 | 1a92ace99cdc356b862211df8c3ddc85 28 | 1ca7753332e716f667217edcd90efa83 29 | 1d0bf45700dccca4fa294b7c14fb578e 30 | 30e0f5bcbf1b29b01b27dea8353d1a62 31 | 62ea927849e53c95f0a6bff63ef26f82 32 | 74adf57365bf722eec497c8a2f306ca9 33 | -------------------------------------------------------------------------------- /notes/date_regex/dateregex.txt: -------------------------------------------------------------------------------- 1 | ( 2 | (0[1-9]|1[0-2]) 3 | ( 4 | (0[1-9]|[12]\d|3[01]) 5 | ( 6 | [12]\d{3} 7 | | 8 | \d{2} 9 | ) 10 | | 11 | (?P[-\/. 
]) 12 | (0?[1-9]|[12]\d|3[01]) 13 | (?P=sep1) 14 | ( 15 | [12]\d{3} 16 | | 17 | \d{2} 18 | ) 19 | ) 20 | | 21 | (0[1-9]|[12]\d|3[01]) 22 | ( 23 | (0[1-9]|1[0-2]) 24 | ( 25 | [12]\d{3} 26 | | 27 | \d{2} 28 | ) 29 | | 30 | (?P[-\/. ]) 31 | (0?[1-9]|1[0-2]) 32 | (?P=sep2) 33 | ( 34 | [12]\d{3} 35 | | 36 | \d{2} 37 | ) 38 | ) 39 | | 40 | ( 41 | [12]\d{3} 42 | | 43 | \d{2} 44 | ) 45 | ( 46 | (?P[-\/. ]) 47 | (0?[1-9]|1[0-2]) 48 | (?P=sep3) 49 | (0?[1-9]|[12]\d|3[01]) 50 | | 51 | 年 52 | (0?[1-9]|1[0-2]) 53 | 月 54 | (0?[1-9]|[12]\d|3[01]) 55 | 日 56 | | 57 | 년 58 | (0?[1-9]|1[0-2]) 59 | 월 60 | (0?[1-9]|[12]\d|3[01]) 61 | 일 62 | | 63 | (0[1-9]|1[0-2]) 64 | (0[1-9]|[12]\d|3[01]) 65 | ) 66 | | 67 | ( 68 | ([1-9]|1[0-2]) 69 | (?P[-\/. ]) 70 | (0?[1-9]|[12]\d|3[01]) 71 | (?P=sep4) 72 | ( 73 | [12]\d{3} 74 | | 75 | \d{2} 76 | ) 77 | | 78 | ([1-9]|[12]\d|3[01]) 79 | (?P[-\/. ]) 80 | (0?[1-9]|1[0-2]) 81 | (?P=sep5) 82 | ( 83 | [12]\d{3} 84 | | 85 | \d{2} 86 | ) 87 | ) 88 | ) 89 | -------------------------------------------------------------------------------- /docs/source/clevercsv.console.commands.rst: -------------------------------------------------------------------------------- 1 | clevercsv.console.commands package 2 | ================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | clevercsv.console.commands.code module 8 | -------------------------------------- 9 | 10 | .. automodule:: clevercsv.console.commands.code 11 | :members: 12 | :show-inheritance: 13 | :undoc-members: 14 | 15 | clevercsv.console.commands.detect module 16 | ---------------------------------------- 17 | 18 | .. automodule:: clevercsv.console.commands.detect 19 | :members: 20 | :show-inheritance: 21 | :undoc-members: 22 | 23 | clevercsv.console.commands.explore module 24 | ----------------------------------------- 25 | 26 | .. 
automodule:: clevercsv.console.commands.explore 27 | :members: 28 | :show-inheritance: 29 | :undoc-members: 30 | 31 | clevercsv.console.commands.standardize module 32 | --------------------------------------------- 33 | 34 | .. automodule:: clevercsv.console.commands.standardize 35 | :members: 36 | :show-inheritance: 37 | :undoc-members: 38 | 39 | clevercsv.console.commands.view module 40 | -------------------------------------- 41 | 42 | .. automodule:: clevercsv.console.commands.view 43 | :members: 44 | :show-inheritance: 45 | :undoc-members: 46 | 47 | Module contents 48 | --------------- 49 | 50 | .. automodule:: clevercsv.console.commands 51 | :members: 52 | :show-inheritance: 53 | :undoc-members: 54 | -------------------------------------------------------------------------------- /clevercsv/console/commands/_docs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | FLAG_DESCRIPTIONS = { 4 | "encoding": ( 5 | "The file encoding of the given CSV file is automatically " 6 | "detected using chardet. While chardet is incredibly " 7 | "accurate, it is not perfect. In the rare cases that it makes " 8 | "a mistake in detecting the file encoding, you can override " 9 | "the encoding by providing it through this flag. Moreover, " 10 | "when you have a number of CSV files with a known file " 11 | "encoding, you can use this option to speed up the code " 12 | "generation process." 13 | ), 14 | "num-chars": ( 15 | "On large CSV files, dialect detection can sometimes be a bit " 16 | "slow due to the large number of possible dialects to " 17 | "consider. To alleviate this, you can limit the number of " 18 | "characters to use for detection.\n\n" 19 | "One aspect to keep in mind is that CleverCSV may need to " 20 | "read a specific number of characters to be able to correctly " 21 | "infer the dialect. 
For example, in the ``imdb.csv`` file " 22 | "in the GitHub repository, the correct dialect can only " 23 | "be found after at least 66 lines of the file are read. " 24 | "Therefore, if there is availability to run CleverCSV on " 25 | "the entire file, that is generally recommended." 26 | ), 27 | } 28 | -------------------------------------------------------------------------------- /clevercsv/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from csv import QUOTE_ALL 4 | from csv import QUOTE_MINIMAL 5 | from csv import QUOTE_NONE 6 | from csv import QUOTE_NONNUMERIC 7 | 8 | from .__version__ import __version__ 9 | from .cparser_util import field_size_limit 10 | from .detect import Detector 11 | from .detect import Detector as Sniffer 12 | from .dialect import excel 13 | from .dialect import excel_tab 14 | from .dialect import unix_dialect 15 | from .dict_read_write import DictReader 16 | from .dict_read_write import DictWriter 17 | from .exceptions import Error 18 | from .read import reader 19 | from .wrappers import detect_dialect 20 | from .wrappers import read_dataframe 21 | from .wrappers import read_dicts 22 | from .wrappers import read_table 23 | from .wrappers import stream_dicts 24 | from .wrappers import stream_table 25 | from .wrappers import write_table 26 | from .write import writer 27 | 28 | __all__ = [ 29 | "QUOTE_ALL", 30 | "QUOTE_MINIMAL", 31 | "QUOTE_NONE", 32 | "QUOTE_NONNUMERIC", 33 | "__version__", 34 | "field_size_limit", 35 | "Detector", 36 | "Sniffer", 37 | "excel", 38 | "excel_tab", 39 | "unix_dialect", 40 | "DictReader", 41 | "DictWriter", 42 | "Error", 43 | "reader", 44 | "detect_dialect", 45 | "read_dataframe", 46 | "read_dicts", 47 | "read_table", 48 | "stream_dicts", 49 | "stream_table", 50 | "write_table", 51 | "writer", 52 | ] 53 | -------------------------------------------------------------------------------- /notes/date_regex/datefmt.py: 
def main():
    """Print, in sorted order, a template string for every supported date format.

    Placeholders: ``YYYY``/``YY`` for the year, ``MM``/``M`` for the month,
    ``DD``/``D`` for the day, and ``x`` standing in for a separator character.
    """
    year2 = "YY"
    year4 = "YYYY"
    month_leading = "MM"  # zero-padded month
    month_sparse = "M"  # month without leading zero
    day_leading = "DD"  # zero-padded day (was assigned "D" then "DD"; dead first assignment removed)
    day_sparse = "D"  # day without leading zero
    sep = "x"  # stand-in for the actual separator character
    pats = []
    # Separated formats: every combination of 2/4-digit year and padded or
    # unpadded month/day, in Y-M-D, D-M-Y and M-D-Y orders, plus CJK forms.
    for year in [year2, year4]:
        for month in [month_leading, month_sparse]:
            for day in [day_leading, day_sparse]:
                fmt = dict(year=year, month=month, day=day, sep=sep)
                pats.append("{year}{sep}{month}{sep}{day}".format(**fmt))
                pats.append("{day}{sep}{month}{sep}{year}".format(**fmt))
                pats.append("{month}{sep}{day}{sep}{year}".format(**fmt))
                pats.append("{year}年{month}月{day}日".format(**fmt))
                pats.append("{year}년{month}월{day}일".format(**fmt))

    # Compact (separator-less) formats are only generated with zero padding,
    # since "YMD" style strings would be ambiguous without it.
    for year in [year2, year4]:
        fmt = dict(year=year, month=month_leading, day=day_leading, sep="")
        pats.append("{year}{sep}{month}{sep}{day}".format(**fmt))
        pats.append("{day}{sep}{month}{sep}{year}".format(**fmt))
        pats.append("{month}{sep}{day}{sep}{year}".format(**fmt))

    for pat in sorted(pats):
        print(pat)
"os.PathLike[str]"] 21 | _OpenFile = Union[AnyPath, int] 22 | _DictRow = Mapping[str, Any] 23 | _DialectLike = Union[ 24 | str, 25 | csv.Dialect, 26 | _csv.Dialect, 27 | type[_csv.Dialect], 28 | SimpleDialect, 29 | ] 30 | _T = TypeVar("_T") 31 | 32 | if sys.version_info >= (3, 8): 33 | from typing import Dict as _DictReadMapping 34 | else: 35 | from collections import OrderedDict as _DictReadMapping 36 | 37 | 38 | if TYPE_CHECKING: 39 | from _typeshed import FileDescriptorOrPath # NOQA 40 | from _typeshed import SupportsIter # NOQA 41 | from _typeshed import SupportsWrite # NOQA 42 | 43 | __all__ = [ 44 | "SupportsWrite", 45 | "SupportsIter", 46 | "FileDescriptorOrPath", 47 | "AnyPath", 48 | "_OpenFile", 49 | "_DictRow", 50 | "_DialectLike", 51 | "_DictReadMapping", 52 | ] 53 | else: 54 | __all__ = [ 55 | "AnyPath", 56 | "_OpenFile", 57 | "_DictRow", 58 | "_DialectLike", 59 | "_DictReadMapping", 60 | ] 61 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: build 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | branches: 9 | - master 10 | schedule: 11 | - cron: 53 18 */10 * * 12 | 13 | jobs: 14 | code-quality: 15 | name: Code quality checks for CleverCSV 16 | runs-on: ubuntu-latest 17 | steps: 18 | - name: Checkout 19 | uses: actions/checkout@v3 20 | 21 | # NOTE: Keep versions in sync with .pre-commit-config.yaml 22 | 23 | - name: Run black 24 | uses: psf/black@stable 25 | with: 26 | version: "23.3.0" 27 | 28 | - name: Run isort 29 | uses: jamescurtin/isort-action@master 30 | with: 31 | isortVersion: "5.12.0" 32 | 33 | - name: Run ruff 34 | uses: chartboost/ruff-action@v1 35 | with: 36 | version: "v0.0.261" 37 | 38 | python-test: 39 | needs: [code-quality] 40 | name: Tests 41 | runs-on: ${{ matrix.os }} 42 | strategy: 43 | matrix: 44 | os: [ 'ubuntu-latest', 'macos-latest', 'windows-latest' ] 45 
| py: [ '3.9', '3.14' ] # minimal and latest 46 | steps: 47 | - name: Install Python ${{ matrix.py }} 48 | uses: actions/setup-python@v5 49 | with: 50 | python-version: ${{ matrix.py }} 51 | 52 | - name: Checkout 53 | uses: actions/checkout@v3 54 | 55 | - name: Install CleverCSV 56 | run: pip install -e .[full,tests] 57 | 58 | - name: Run unit tests 59 | run: python -m unittest discover -v -f -s ./tests/test_unit 60 | -------------------------------------------------------------------------------- /tests/test_integration/failed_partial.log: -------------------------------------------------------------------------------- 1 | 0069e7bfc8ca0884a84752226a8fb78d 2 | 0123de82b4e6f80b2da76a76611efd1a 3 | 026061a7526455946a3f983899d2f0c6 4 | 04095bd80e50f90df503ff7d09ae8672 5 | 044196aa9f527ccd8aeff80cfa757dd1 6 | 0475a9956f5bcbb16b58f4a7ccaea973 7 | 04c88c064919bd89510d1dd22cb8642e 8 | 060b4e623f38d193b16aaee0527a4f20 9 | 068ff5ebfca3329887fa6289880004d1 10 | 0747d24252451f9110b252294a1cfd75 11 | 07675dd515504407b82f897ec886434f 12 | 080ca82bd5b56e99bd2a3db334a1a1aa 13 | 081dec2b2d490d2745b5ff99dcd98640 14 | 090a15a2f7f10009ad43406b4f60fe04 15 | 0985985f5dd21760ec916d66e73546fd 16 | 09af9825ae42e1cb4fb4f3609d5ec1e1 17 | 09cab00748d36387afd638c61ec077df 18 | 0a06a4a6b4151a5288bdb5f25f2754d5 19 | 0a677459b727fc7a7cc583054d7b0f42 20 | 0aded0a7e6428183e30b4dfe0edf476b 21 | 0af0ffb351e75c7756d273fca2a2f82b 22 | 0b869b132594763bba8bb85b8f54688b 23 | 0cf0e1bc97595760217806686a32ff39 24 | 0d00e37b443633368e8f136cffed5730 25 | 0da1208093afc2b90f3f4df49e8d996c 26 | 0e51f0fe2a2ddd25ff7a0d1bac00d17b 27 | 0e7a7f43c445ef171e5132372ff63601 28 | 102a1c511703cee9bb3decd49764ab38 29 | 107d3dbbfa4a37773b24e7d095ddfce2 30 | 119cecf07dd8af6a3c2229fb35b3103f 31 | 1390ca6ccd8500cbbfbc5c7f64979004 32 | 13989b94a814dfe6b7b784a3a8c5c581 33 | 13a6c86a18f053c593feda3d98755010 34 | 13fa5d67c7315502e255e3f53672775a 35 | 30e0f5bcbf1b29b01b27dea8353d1a62 36 | 32a92fc9acf632202de46d25b1c0fc3b 37 | 
class PotentialDialectTestCase(unittest.TestCase):
    """Unit tests for the helpers used to select potential dialects."""

    def test_masked_by_quotechar(self) -> None:
        # '&' occurs only inside the quoted section, so it is masked.
        self.assertTrue(masked_by_quotechar('A"B&C"A', '"', "", "&"))
        # A second '&' outside the quotes means it is not fully masked.
        self.assertFalse(masked_by_quotechar('A"B&C"A&A', '"', "", "&"))
        self.assertFalse(masked_by_quotechar('A|"B&C"A', '"', "|", "&"))
        self.assertFalse(masked_by_quotechar('A"B"C', '"', "", ""))

    def test_filter_urls(self) -> None:
        # URLs are collapsed to a single 'U' placeholder character.
        data = "A,B\nwww.google.com,10\nhttps://gertjanvandenburg.com,25\n"
        exp = "A,B\nU,10\nU,25\n"
        self.assertEqual(exp, filter_urls(data))

    def test_get_quotechars(self) -> None:
        # Both quote styles present in the data are returned; the empty
        # string (no quoting) is always a candidate.
        data = "A,B,'A',B\"D\"E"
        exp = set(['"', "'", ""])
        out = get_quotechars(data)
        self.assertEqual(out, exp)

    def test_get_delimiters(self) -> None:
        # Candidate delimiters include punctuation, tab, and currency
        # symbols; the empty string (single-column) is always a candidate.
        data = "A,B|CD,E;F\tD123£123€10.,0"
        exp = set([",", "|", ";", "\t", "€", "£", ""])
        out = get_delimiters(data, "UTF-8")
        self.assertEqual(out, exp)
-------------------------------------------------------------------------------- 1 | '\" t 2 | .\" Title: clevercsv-help 3 | .\" Author: G.J.J. van den Burg 4 | .\" Generator: Wilderness 5 | .\" Date: 2025-10-30 6 | .\" Manual: clevercsv Manual 7 | .\" Source: clevercsv 0.8.4 8 | .\" Language: English 9 | .\" 10 | .TH "CLEVERCSV-HELP" "1" "2025\-10\-30" "Clevercsv 0\&.8\&.4" "Clevercsv Manual" 11 | .\" ----------------------------------------------------------------- 12 | .\" * Define some portability stuff 13 | .\" ----------------------------------------------------------------- 14 | .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 15 | .\" http://bugs.debian.org/507673 16 | .\" http://lists.gnu.org/archive/html/groff/2009-02/msg00013.html 17 | .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 18 | .ie \n(.g .ds Aq \(aq 19 | .el .ds Aq ' 20 | .\" ----------------------------------------------------------------- 21 | .\" * set default formatting * 22 | .\" ----------------------------------------------------------------- 23 | .\" disable hyphenation 24 | .nh 25 | .\" disable justification 26 | .ad l 27 | .\" ----------------------------------------------------------------- 28 | .\" * MAIN CONTENT STARTS HERE * 29 | .\" ----------------------------------------------------------------- 30 | .SH "NAME" 31 | clevercsv-help \- Display help information 32 | .SH "SYNOPSIS" 33 | .sp 34 | .nf 35 | \fIclevercsv help [command] 36 | .fi 37 | .sp 38 | .SH "DESCRIPTION" 39 | .sp 40 | Display help information 41 | .SH "OPTIONS" 42 | .sp 43 | .sp 44 | .sp 45 | \-h, \-\-help 46 | .RS 4 47 | show this help message and exit 48 | .RE 49 | .PP 50 | .sp -------------------------------------------------------------------------------- /clevercsv/cparser.pyi: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import annotations 4 | 5 | from typing import Final 6 | 
from typing import Generic 7 | from typing import Iterable 8 | from typing import List 9 | from typing import Literal 10 | from typing import Optional 11 | from typing import Tuple 12 | from typing import TypeVar 13 | from typing import overload 14 | 15 | _T = TypeVar("_T") 16 | 17 | class Parser(Generic[_T]): 18 | _return_quoted: Final[bool] 19 | 20 | @overload 21 | def __init__( 22 | self: Parser[List[Tuple[str, bool]]], 23 | delimiter: Optional[str] = "", 24 | quotechar: Optional[str] = "", 25 | escapechar: Optional[str] = "", 26 | field_limit: Optional[int] = 128 * 1024, 27 | strict: Optional[bool] = False, 28 | return_quoted: Literal[True] = ..., 29 | ) -> None: ... 30 | @overload 31 | def __init__( 32 | self: Parser[List[str]], 33 | delimiter: Optional[str] = "", 34 | quotechar: Optional[str] = "", 35 | escapechar: Optional[str] = "", 36 | field_limit: Optional[int] = 128 * 1024, 37 | strict: Optional[bool] = False, 38 | return_quoted: Literal[False] = ..., 39 | ) -> None: ... 40 | @overload 41 | def __init__( 42 | self, 43 | data: Iterable[str], 44 | delimiter: Optional[str] = "", 45 | quotechar: Optional[str] = "", 46 | escapechar: Optional[str] = "", 47 | field_limit: Optional[int] = 128 * 1024, 48 | strict: Optional[bool] = False, 49 | return_quoted: bool = ..., 50 | ) -> None: ... 51 | def __iter__(self) -> "Parser": ... 52 | def __next__(self) -> _T: ... 53 | 54 | class Error(Exception): ... 55 | -------------------------------------------------------------------------------- /clevercsv/encoding.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Functionality to detect file encodings 4 | 5 | Author: G.J.J. van den Burg 6 | License: See the LICENSE file 7 | 8 | This file is part of CleverCSV. 
def get_encoding(
    filename: _OpenFile, try_cchardet: bool = True
) -> Optional[str]:
    """Detect the character encoding of a file.

    Detection is performed with the chardet package, or with cChardet when
    it is installed and requested via ``try_cchardet``.

    Parameters
    ----------
    filename: str
        Path to a file

    try_cchardet: bool
        Whether to run detection using cChardet if it is available. This can
        be faster, but may give different results than using chardet.

    Returns
    -------
    encoding: str
        Encoding of the file.
    """
    cchardet = None
    if try_cchardet:
        cchardet = import_optional_dependency(
            "cchardet", raise_on_missing=False
        )

    # Fall back to chardet when cChardet is unavailable or not requested.
    module = chardet if cchardet is None else cchardet
    detector = module.UniversalDetector()

    chunk_size = 65536
    with open(filename, "rb") as handle:
        # Feed chunks until the detector is confident or the file ends; a
        # short read signals the final chunk.
        while not detector.done:
            block = handle.read(chunk_size)
            detector.feed(block)
            if len(block) < chunk_size:
                break
    detector.close()
    return detector.result.get("encoding", None)
def generate_code(
    filename: str,
    dialect: SimpleDialect,
    encoding: Optional[str],
    use_pandas: bool = False,
) -> List[str]:
    """Generate Python code that reads *filename* with the detected dialect.

    Parameters
    ----------
    filename : str
        Path of the CSV file; embedded verbatim in the generated code.
        (Previously the placeholder string "(unknown)" was emitted instead
        of the actual path, leaving this parameter unused.)

    dialect : SimpleDialect
        The detected dialect; ``quotechar`` must not be None.

    encoding : Optional[str]
        File encoding to embed in the generated ``open`` call, or None.

    use_pandas : bool
        If True, generate code based on ``read_dataframe`` instead of the
        plain ``clevercsv.reader``.

    Returns
    -------
    List[str]
        The lines of the generated program.
    """
    assert dialect.quotechar is not None
    # Render the dialect components as quoted Python string literals so they
    # can be pasted directly into the generated source.
    d = '"\\t"' if dialect.delimiter == "\t" else f'"{dialect.delimiter}"'
    q = '"%s"' % (dialect.quotechar.replace('"', '\\"'))
    e = repr(f"{dialect.escapechar}").replace("'", '"')
    base = [
        "",
        f"# Code generated with CleverCSV version {__version__}",
        "",
        "import clevercsv",
    ]
    if use_pandas:
        return [
            *base,
            "",
            f'df = clevercsv.read_dataframe("{filename}", delimiter={d}, '
            f"quotechar={q}, escapechar={e})",
            "",
        ]

    enc = "None" if encoding is None else f'"{encoding}"'
    lines = [
        *base,
        "",
        f'with open("{filename}", "r", newline="", encoding={enc}) as fp:',
        "    reader = clevercsv.reader(fp, "
        + f"delimiter={d}, quotechar={q}, escapechar={e})",
        "    rows = list(reader)",
        "",
    ]
    return lines
]) 31 | (0?[1-9]|1[0-2]) # M|MM 32 | (?P=sep2) 33 | ( 34 | [12]\d{3} # YYYY 35 | | 36 | \d{2} # YY 37 | ) 38 | ) 39 | | 40 | ( 41 | [12]\d{3} # YYYY 42 | | 43 | \d{2} # YY 44 | ) 45 | ( # YY(YY)xM(M)xD(D) 46 | (?P[-\/. ]) 47 | (0?[1-9]|1[0-2]) # MM|M 48 | (?P=sep3) 49 | (0?[1-9]|[12]\d|3[01]) # DD|D 50 | | 51 | 年 # YY(YY)年M(M)月D(D)日 52 | (0?[1-9]|1[0-2]) # MM|M 53 | 月 54 | (0?[1-9]|[12]\d|3[01]) # DD|D 55 | 日 56 | | 57 | 년 # YY(YY)년M(M)월D(D)일 58 | (0?[1-9]|1[0-2]) # MM|M 59 | 월 60 | (0?[1-9]|[12]\d|3[01]) # DD|D 61 | 일 62 | | # YY(YY)MMDD 63 | (0[1-9]|1[0-2]) # MM 64 | (0[1-9]|[12]\d|3[01]) # DD 65 | ) 66 | | 67 | ( # MxD(D)xYY(YY) 68 | ([1-9]|1[0-2]) # M 69 | (?P[-\/. ]) 70 | (0?[1-9]|[12]\d|3[01]) # DD|D 71 | (?P=sep4) 72 | ( 73 | [12]\d{3} # YYYY 74 | | 75 | \d{2} # YY 76 | ) 77 | | # DxM(M)xYY(YY) 78 | ([1-9]|[12]\d|3[01]) # D 79 | (?P[-\/. ]) 80 | (0?[1-9]|1[0-2]) # MM|M 81 | (?P=sep5) 82 | ( 83 | [12]\d{3} # YYYY 84 | | 85 | \d{2} # YY 86 | ) 87 | ) 88 | ) 89 | -------------------------------------------------------------------------------- /clevercsv/cparser_util.pyi: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from typing import Any 4 | from typing import Iterable 5 | from typing import Iterator 6 | from typing import List 7 | from typing import Literal 8 | from typing import Optional 9 | from typing import Tuple 10 | from typing import Union 11 | from typing import overload 12 | 13 | from .dialect import SimpleDialect 14 | 15 | def field_size_limit(*args: Any, **kwargs: Any) -> int: ... 16 | @overload 17 | def _parse_data( 18 | data: Iterable[str], 19 | delimiter: str, 20 | quotechar: str, 21 | escapechar: str, 22 | strict: bool, 23 | return_quoted: Literal[False] = ..., 24 | ) -> Iterator[List[str]]: ... 
25 | @overload 26 | def _parse_data( 27 | data: Iterable[str], 28 | delimiter: str, 29 | quotechar: str, 30 | escapechar: str, 31 | strict: bool, 32 | return_quoted: Literal[True], 33 | ) -> Iterator[List[Tuple[str, bool]]]: ... 34 | @overload 35 | def _parse_data( 36 | data: Iterable[str], 37 | delimiter: str, 38 | quotechar: str, 39 | escapechar: str, 40 | strict: bool, 41 | return_quoted: bool = ..., 42 | ) -> Iterator[Union[List[str], List[Tuple[str, bool]]]]: ... 43 | def parse_data( 44 | data: Iterable[str], 45 | dialect: Optional[SimpleDialect] = None, 46 | delimiter: Optional[str] = None, 47 | quotechar: Optional[str] = None, 48 | escapechar: Optional[str] = None, 49 | strict: Optional[bool] = None, 50 | return_quoted: bool = False, 51 | ) -> Iterator[Union[List[str], List[Tuple[str, bool]]]]: ... 52 | @overload 53 | def parse_string( 54 | data: str, 55 | dialect: SimpleDialect, 56 | return_quoted: Literal[False] = ..., 57 | ) -> Iterator[List[str]]: ... 58 | @overload 59 | def parse_string( 60 | data: str, 61 | dialect: SimpleDialect, 62 | return_quoted: Literal[True], 63 | ) -> Iterator[List[Tuple[str, bool]]]: ... 64 | @overload 65 | def parse_string( 66 | data: str, 67 | dialect: SimpleDialect, 68 | return_quoted: bool = ..., 69 | ) -> Iterator[Union[List[str], List[Tuple[str, bool]]]]: ... 70 | -------------------------------------------------------------------------------- /clevercsv/escape.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Common functions for dealing with escape characters. 
#: Characters that must never be considered as escape characters.
DEFAULT_BLOCK_CHARS: Set[str] = {
    "!",
    "?",
    '"',
    "'",
    ".",
    ",",
    ";",
    ":",
    "%",
    "*",
    "&",
    "#",
}

#: Every character whose Unicode category is "Punctuation, Other" (Po).
UNICODE_PO_CHARS: Set[str] = {
    chr(cp)
    for cp in range(sys.maxunicode + 1)
    if unicodedata.category(chr(cp)) == "Po"
}


def is_potential_escapechar(
    char: str, encoding: str, block_char: Optional[Iterable[str]] = None
) -> bool:
    """Check if a character is a potential escape character.

    A character qualifies as a potential escape character when it belongs
    to the "Punctuation, Other" Unicode category and is not explicitly
    blocked.

    Parameters
    ----------
    char: str
        The character to check

    encoding : str
        The encoding of the character

    block_char : Optional[Iterable[str]]
        Characters that are in the Punctuation Other category but that should
        not be considered as escape character. If None, the default set is
        used, which is defined in :py:data:`DEFAULT_BLOCK_CHARS`.

    Returns
    -------
    is_escape : bool
        Whether the character is considered a potential escape or not.

    """
    if encoding.lower() in ("utf-8", "ascii"):
        uchar = char
    else:
        # Round-trip through the declared encoding to obtain the proper
        # unicode character for the category/blocklist comparison below.
        uchar = codecs.decode(bytes(char, encoding), encoding=encoding)

    blocked = (
        set(block_char) if block_char is not None else DEFAULT_BLOCK_CHARS
    )
    return uchar in UNICODE_PO_CHARS and uchar not in blocked
40 | templates_path = ["_templates"] 41 | 42 | # List of patterns, relative to source directory, that match files and 43 | # directories to ignore when looking for source files. 44 | # This pattern also affects html_static_path and html_extra_path. 45 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 46 | 47 | 48 | # -- Options for HTML output ------------------------------------------------- 49 | 50 | # The theme to use for HTML and HTML Help pages. See the documentation for 51 | # a list of builtin themes. 52 | # 53 | html_theme = "furo" 54 | html_logo = "https://raw.githubusercontent.com/alan-turing-institute/CleverCSV/eea72549195e37bd4347d87fd82bc98be2f1383d/.logo.png" 55 | 56 | html_theme_options = { 57 | "sidebar_hide_name": True, 58 | "light_css_variables": { 59 | "color-brand-primary": "#336790", # "blue" 60 | "color-brand-content": "#336790", 61 | }, 62 | "dark_css_variables": { 63 | "color-brand-primary": "#c03232ff", # "red" 64 | "color-brand-content": "#c03232ff", 65 | }, 66 | } 67 | 68 | # Add any paths that contain custom static files (such as style sheets) here, 69 | # relative to this directory. They are copied after the builtin static files, 70 | # so a file named "default.css" will overwrite the builtin "default.css". 71 | html_static_path = ["_static"] 72 | -------------------------------------------------------------------------------- /clevercsv/read.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Drop-in replacement for the Python csv reader class. This is a wrapper for the 5 | Parser class, defined in :mod:`cparser`. 6 | 7 | Author: Gertjan van den Burg 8 | 9 | """ 10 | 11 | import csv 12 | 13 | from typing import Any 14 | from typing import Iterable 15 | from typing import Iterator 16 | from typing import List 17 | from typing import Optional 18 | 19 | from . 
class reader:
    """Drop-in replacement for the built-in :class:`csv.reader`.

    Wraps the C ``Parser`` so that iterating yields one list of strings
    per row. The dialect may be given as a builtin dialect name, a
    :class:`csv.Dialect` instance, or a :class:`SimpleDialect`; format
    parameters override individual dialect attributes.
    """

    def __init__(
        self,
        csvfile: Iterable[str],
        dialect: _DialectLike = "excel",
        **fmtparams: Any,
    ):
        self.csvfile = csvfile
        self.original_dialect = dialect
        self._dialect = self._make_simple_dialect(dialect, **fmtparams)
        # Number of rows returned so far, mirroring csv.reader.line_num.
        self.line_num: int = 0
        # Created lazily on first iteration (see __iter__/__next__).
        self.parser_gen: Optional[Parser] = None

    @property
    def dialect(self) -> csv.Dialect:
        """The dialect in use, converted to a :class:`csv.Dialect`."""
        return self._dialect.to_csv_dialect()

    def _make_simple_dialect(
        self, dialect: _DialectLike, **fmtparams: Any
    ) -> SimpleDialect:
        """Normalize any supported dialect specification to a SimpleDialect.

        Parameters
        ----------
        dialect : str, csv.Dialect, or SimpleDialect
            The dialect specification to normalize.
        **fmtparams : Any
            Overrides for individual dialect attributes; only the keys
            supported by SimpleDialect are applied, others are ignored.

        Raises
        ------
        ValueError
            If ``dialect`` is not one of the supported types.
        """
        if isinstance(dialect, str):
            sd = SimpleDialect.from_csv_dialect(csv.get_dialect(dialect))
        elif isinstance(dialect, csv.Dialect):
            sd = SimpleDialect.from_csv_dialect(dialect)
        elif isinstance(dialect, SimpleDialect):
            sd = dialect
        else:
            raise ValueError("Unknown dialect type: %r" % dialect)
        for key, value in fmtparams.items():
            # SimpleDialect only models these four attributes; other
            # csv fmtparams (e.g. doublequote) have no equivalent here.
            if key in ("delimiter", "quotechar", "escapechar", "strict"):
                setattr(sd, key, value)
        sd.validate()
        return sd

    def __iter__(self) -> Iterator[List[str]]:
        # (Re)create the parser each time iteration starts; parsing
        # resumes from wherever the underlying iterable currently is.
        self.parser_gen = Parser(
            self.csvfile,
            delimiter=self._dialect.delimiter,
            quotechar=self._dialect.quotechar,
            escapechar=self._dialect.escapechar,
            field_limit=field_size_limit(),
            strict=self._dialect.strict,
        )
        return self

    def __next__(self) -> List[str]:
        if self.parser_gen is None:
            self.__iter__()
        assert self.parser_gen is not None
        try:
            row = next(self.parser_gen)
        except ParserError as e:
            # Chain the low-level parser error so tracebacks retain the
            # original cause (PEP 3134) instead of hiding it.
            raise Error(str(e)) from e
        self.line_num += 1
        return row
class AbstractionTestCase(unittest.TestCase):
    """Data-driven tests for the pattern-abstraction pipeline.

    Cases are stored as gzipped JSON-lines next to this test file; each
    case records the expected output of every stage of the pipeline.
    """

    def setUp(self) -> None:
        # Locate the bundled test-case archive relative to this file.
        data_file = (
            Path(__file__).parent / "data" / "abstraction_testcases.json.gz"
        )
        self._cases = (
            self._load_cases(data_file) if data_file.exists() else []
        )

    @staticmethod
    def _load_cases(filename: Path) -> List[Dict[str, Any]]:
        # One JSON document per line of the gzipped file.
        with gzip.open(filename, "rt", newline="", encoding="utf-8") as fp:
            return [json.loads(line) for line in fp]

    def test_abstraction_multi(self) -> None:
        if not self._cases:
            self.skipTest("no abstraction test cases found")

        for case in self._cases:
            content = case["content"]
            dialect = case["dialect"]

            expected_base = case["base_abstraction"]
            expected_merge = case["after_merge_with_quotechar"]
            expected_empties = case["after_fill_empties"]
            expected_trailing = case["after_strip_trailing"]

            # Each pipeline stage feeds the next; assert them in order.
            with self.subTest(name=case["name"], kind="base"):
                base = base_abstraction(
                    content,
                    dialect["delimiter"],
                    dialect["quotechar"],
                    dialect["escapechar"],
                )
                self.assertEqual(base, expected_base)

            with self.subTest(name=case["name"], kind="merge"):
                merge = c_merge_with_quotechar(base)
                self.assertEqual(merge, expected_merge)

            with self.subTest(name=case["name"], kind="empties"):
                empties = fill_empties(merge)
                self.assertEqual(empties, expected_empties)

            with self.subTest(name=case["name"], kind="trailing"):
                trailing = strip_trailing(empties)
                self.assertEqual(trailing, expected_trailing)


if __name__ == "__main__":
    unittest.main()
class CodeCommand(Command):
    """CLI command that prints a ready-made Python snippet for a CSV file.

    Detects the dialect (and, unless overridden, the encoding) of the
    given file and emits import code so the user does not have to rerun
    detection every time.
    """

    # Shown in --help and rendered into the generated man page; keep
    # the exact wording in sync with man/clevercsv-code.1.
    _description = (
        "Generate Python code for importing a given CSV file. This is "
        "especially useful if you don't want to repeatedly detect the dialect "
        "of the same file. Simply run:\n\n"
        "\tclevercsv code your_csv_file.csv\n\n"
        "and copy the generated code to a Python script."
    )

    def __init__(self) -> None:
        super().__init__(
            name="code",
            title="Generate Python code to import a CSV file",
            description=self._description,
            extra_sections={"CleverCSV": "Part of the CleverCSV suite"},
        )

    def register(self) -> None:
        """Declare the command-line arguments for this command."""
        self.add_argument("path", help="Path to the CSV file")
        self.add_argument(
            "-e",
            "--encoding",
            help="Set the encoding of the file",
            description=FLAG_DESCRIPTIONS["encoding"],
        )
        self.add_argument(
            "-n",
            "--num-chars",
            type=int,
            help="Number of characters to use for detection",
            description=FLAG_DESCRIPTIONS["num-chars"],
        )
        self.add_argument(
            "-p",
            "--pandas",
            action="store_true",
            help="Write code that uses a Pandas DataFrame",
            description=(
                "By default, this command writes a small Python script to "
                "import the CSV file as a list of lists. By enabling this "
                "option the script will be written such that the file will be "
                "read as a Pandas DataFrame instead."
            ),
        )

    def handle(self) -> int:
        """Run the command; returns a process exit code (0 ok, 1 failure)."""
        filename = self.args.path
        # An explicit --encoding wins over automatic detection.
        encoding = self.args.encoding or get_encoding(filename)
        num_chars = parse_int(self.args.num_chars, "num-chars")
        dialect = detect_dialect(
            filename,
            num_chars=num_chars,
            encoding=encoding,
            verbose=self.args.verbose,
        )
        if dialect is None:
            print("Error: dialect detection failed.", file=sys.stderr)
            return 1

        code_lines = generate_code(
            filename, dialect, encoding, use_pandas=self.args.pandas
        )
        print("\n".join(code_lines))
        return 0
van den Burg 4 | .\" Generator: Wilderness 5 | .\" Date: 2025-10-30 6 | .\" Manual: clevercsv Manual 7 | .\" Source: clevercsv 0.8.4 8 | .\" Language: English 9 | .\" 10 | .TH "CLEVERCSV-VIEW" "1" "2025\-10\-30" "Clevercsv 0\&.8\&.4" "Clevercsv Manual" 11 | .\" ----------------------------------------------------------------- 12 | .\" * Define some portability stuff 13 | .\" ----------------------------------------------------------------- 14 | .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 15 | .\" http://bugs.debian.org/507673 16 | .\" http://lists.gnu.org/archive/html/groff/2009-02/msg00013.html 17 | .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 18 | .ie \n(.g .ds Aq \(aq 19 | .el .ds Aq ' 20 | .\" ----------------------------------------------------------------- 21 | .\" * set default formatting * 22 | .\" ----------------------------------------------------------------- 23 | .\" disable hyphenation 24 | .nh 25 | .\" disable justification 26 | .ad l 27 | .\" ----------------------------------------------------------------- 28 | .\" * MAIN CONTENT STARTS HERE * 29 | .\" ----------------------------------------------------------------- 30 | .SH "NAME" 31 | clevercsv-view \- View the CSV file on the command line using TabView 32 | .SH "SYNOPSIS" 33 | .sp 34 | .nf 35 | \fIclevercsv view [\-e ENCODING | \-\-encoding=ENCODING] 36 | [\-n NUM_CHARS | \-\-num\-chars=NUM_CHARS] [\-t | \-\-transpose] 37 | .fi 38 | .sp 39 | .SH "DESCRIPTION" 40 | .sp 41 | The view command is useful to quickly inspect a messy CSV file on the command line. 42 | .SH "OPTIONS" 43 | .sp 44 | .sp 45 | .sp 46 | \-h, \-\-help 47 | .RS 4 48 | show this help message and exit 49 | .RE 50 | .PP 51 | \-e, \-\-encoding 52 | .RS 4 53 | The file encoding of the given CSV file is automatically detected using chardet. While chardet is incredibly accurate, it is not perfect. 
In the rare cases that it makes a mistake in detecting the file encoding, you can override the encoding by providing it through this flag. Moreover, when you have a number of CSV files with a known file encoding, you can use this option to speed up the code generation process. 54 | .RE 55 | .PP 56 | \-n, \-\-num\-chars 57 | .RS 4 58 | On large CSV files, dialect detection can sometimes be a bit slow due to the large number of possible dialects to consider. To alleviate this, you can limit the number of characters to use for detection. 59 | .sp 60 | One aspect to keep in mind is that CleverCSV may need to read a specific number of characters to be able to correctly infer the dialect. For example, in the ``imdb.csv`` file in the GitHub repository, the correct dialect can only be found after at least 66 lines of the file are read. Therefore, if there is availability to run CleverCSV on the entire file, that is generally recommended. 61 | .RE 62 | .PP 63 | \-t, \-\-transpose 64 | .RS 4 65 | Transpose the columns of the input file before viewing 66 | .RE 67 | .PP 68 | 69 | .RS 4 70 | Path to the CSV file 71 | .RE 72 | .PP 73 | .sp 74 | .SH "CLEVERCSV" 75 | .sp 76 | Part of the CleverCSV suite -------------------------------------------------------------------------------- /tests/test_unit/test_encoding.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Unit tests for encoding detection 4 | 5 | Author: G.J.J. van den Burg 6 | License: See the LICENSE file. 7 | 8 | This file is part of CleverCSV. 
9 | 10 | """ 11 | 12 | import os 13 | import tempfile 14 | import unittest 15 | 16 | from dataclasses import dataclass 17 | 18 | from typing import Any 19 | from typing import List 20 | 21 | from clevercsv._optional import import_optional_dependency 22 | from clevercsv._types import AnyPath 23 | from clevercsv.encoding import get_encoding 24 | from clevercsv.write import writer 25 | 26 | 27 | class EncodingTestCase(unittest.TestCase): 28 | @dataclass 29 | class Instance: 30 | table: List[List[Any]] 31 | encoding: str 32 | cchardet_encoding: str 33 | 34 | cases: List[Instance] = [ 35 | Instance( 36 | table=[["Å", "B", "C"], [1, 2, 3], [4, 5, 6]], 37 | encoding="ISO-8859-1", 38 | cchardet_encoding="WINDOWS-1252", 39 | ), 40 | Instance( 41 | table=[["A", "B", "C"], [1, 2, 3], [4, 5, 6]], 42 | encoding="ascii", 43 | cchardet_encoding="ASCII", 44 | ), 45 | Instance( 46 | table=[["亜唖", "娃阿", "哀愛"], [1, 2, 3], ["挨", "姶", "葵"]], 47 | encoding="ISO-2022-JP", 48 | cchardet_encoding="ISO-2022-JP", 49 | ), 50 | ] 51 | 52 | def setUp(self) -> None: 53 | self._tmpfiles: List[AnyPath] = [] 54 | 55 | def tearDown(self) -> None: 56 | for f in self._tmpfiles: 57 | os.unlink(f) 58 | 59 | def _build_file(self, table: List[List[str]], encoding: str) -> str: 60 | tmpfd, tmpfname = tempfile.mkstemp( 61 | prefix="ccsv_", 62 | suffix=".csv", 63 | ) 64 | tmpfp = os.fdopen(tmpfd, "w", newline=None, encoding=encoding) 65 | w = writer(tmpfp, dialect="excel") 66 | w.writerows(table) 67 | tmpfp.close() 68 | self._tmpfiles.append(tmpfname) 69 | return tmpfname 70 | 71 | def test_encoding_chardet(self) -> None: 72 | for case in self.cases: 73 | table = case.table 74 | encoding = case.encoding 75 | with self.subTest(encoding=encoding): 76 | tmpfname = self._build_file(table, encoding) 77 | detected = get_encoding(tmpfname, try_cchardet=False) 78 | self.assertEqual(encoding, detected) 79 | 80 | def test_encoding_cchardet(self) -> None: 81 | try: 82 | _ = import_optional_dependency("cchardet") 83 | 
class ViewCommand(Command):
    """CLI command that opens a CSV file in the TabView curses viewer.

    Detects the dialect (and optionally the encoding), reads the full
    table, and hands the rows to TabView for interactive inspection.
    """

    # Shown in --help and rendered into the generated man page.
    _description = (
        "The view command is useful to quickly inspect a messy CSV file on "
        "the command line."
    )

    def __init__(self) -> None:
        super().__init__(
            name="view",
            title="View the CSV file on the command line using TabView",
            description=self._description,
            extra_sections={"CleverCSV": "Part of the CleverCSV suite"},
        )

    def register(self) -> None:
        """Declare the command-line arguments for this command."""
        self.add_argument("path", help="Path to the CSV file")
        self.add_argument(
            "-e",
            "--encoding",
            help="Set the encoding of the file",
            description=FLAG_DESCRIPTIONS["encoding"],
        )
        self.add_argument(
            "-n",
            "--num-chars",
            help="Number of characters to use for detection",
            type=int,
            description=FLAG_DESCRIPTIONS["num-chars"],
        )
        self.add_argument(
            "-t",
            "--transpose",
            action="store_true",
            help="Transpose the columns of the input file before viewing",
        )

    def _tabview(self, rows: List[List[str]]) -> None:
        """Display ``rows`` using TabView (unsupported on Windows)."""
        # TabView depends on curses, which is unavailable on Windows.
        if sys.platform == "win32":
            print(
                "Error: unfortunately Tabview is not available on Windows, so "
                "the clevercsv view command is not available",
                file=sys.stderr,
            )
            return

        import_optional_dependency("tabview", raise_on_missing=True)
        from tabview import view

        view(rows)

    def handle(self) -> int:
        """Run the command; returns a process exit code (0 ok, 1 failure)."""
        verbose = self.args.verbose
        num_chars = parse_int(self.args.num_chars, "num-chars")
        try:
            rows = read_table(
                self.args.path,
                encoding=self.args.encoding,
                num_chars=num_chars,
                verbose=verbose,
            )
        except NoDetectionResult:
            print("Error: dialect detection failed.", file=sys.stderr)
            return 1

        # Guard on `rows` being non-empty: max() on an empty sequence
        # raises ValueError, so transposing an empty table would crash.
        if self.args.transpose and rows:
            # Pad ragged rows with None to the longest row so that
            # zip() does not silently drop trailing fields.
            max_row_length = max(map(len, rows))
            fixed_rows: List[Sequence[Optional[str]]] = []
            for row in rows:
                if len(row) == max_row_length:
                    fixed_rows.append(row)
                else:
                    fixed_rows.append(
                        row + [None] * (max_row_length - len(row))
                    )
            rows = list(map(list, zip(*fixed_rows)))
        self._tabview(rows)
        return 0
68 | 69 | """ 70 | msg = ( 71 | f"\nOptional dependency '{name}' is missing. You can install it using " 72 | "pip or conda, or you can install CleverCSV with all of its optional " 73 | "dependencies by running: pip install clevercsv[full]" 74 | ) 75 | try: 76 | module = importlib.import_module(name) 77 | except ImportError: 78 | if raise_on_missing: 79 | raise ImportError(msg) from None 80 | else: 81 | return None 82 | 83 | opt_dependencies: Dict[str, OptionalDependency] = { 84 | d.import_name: d for d in OPTIONAL_DEPENDENCIES 85 | } 86 | 87 | dependency = opt_dependencies.get(name) 88 | if dependency is None: 89 | raise ImportError(f"No known optional dependency with name: {name}") 90 | 91 | version = getattr(module, "__version__", None) 92 | if version is None: 93 | return module 94 | 95 | if Version(version) < Version(dependency.min_version): 96 | msg = ( 97 | f"CleverCSV requires version '{dependency.min_version}' or newer " 98 | f"for optional dependency '{dependency.package_name}'. Please " 99 | "update the package or install CleverCSV with all its optional " 100 | "dependencies using: pip install clevercsv[full]" 101 | ) 102 | raise ImportError(msg) 103 | 104 | return module 105 | -------------------------------------------------------------------------------- /man/clevercsv-code.1: -------------------------------------------------------------------------------- 1 | '\" t 2 | .\" Title: clevercsv-code 3 | .\" Author: G.J.J. 
van den Burg 4 | .\" Generator: Wilderness 5 | .\" Date: 2025-10-30 6 | .\" Manual: clevercsv Manual 7 | .\" Source: clevercsv 0.8.4 8 | .\" Language: English 9 | .\" 10 | .TH "CLEVERCSV-CODE" "1" "2025\-10\-30" "Clevercsv 0\&.8\&.4" "Clevercsv Manual" 11 | .\" ----------------------------------------------------------------- 12 | .\" * Define some portability stuff 13 | .\" ----------------------------------------------------------------- 14 | .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 15 | .\" http://bugs.debian.org/507673 16 | .\" http://lists.gnu.org/archive/html/groff/2009-02/msg00013.html 17 | .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 18 | .ie \n(.g .ds Aq \(aq 19 | .el .ds Aq ' 20 | .\" ----------------------------------------------------------------- 21 | .\" * set default formatting * 22 | .\" ----------------------------------------------------------------- 23 | .\" disable hyphenation 24 | .nh 25 | .\" disable justification 26 | .ad l 27 | .\" ----------------------------------------------------------------- 28 | .\" * MAIN CONTENT STARTS HERE * 29 | .\" ----------------------------------------------------------------- 30 | .SH "NAME" 31 | clevercsv-code \- Generate Python code to import a CSV file 32 | .SH "SYNOPSIS" 33 | .sp 34 | .nf 35 | \fIclevercsv code [\-e ENCODING | \-\-encoding=ENCODING] 36 | [\-n NUM_CHARS | \-\-num\-chars=NUM_CHARS] [\-p | \-\-pandas] 37 | .fi 38 | .sp 39 | .SH "DESCRIPTION" 40 | .sp 41 | Generate Python code for importing a given CSV file. This is especially useful if you don't want to repeatedly detect the dialect of the same file. Simply run: 42 | .sp 43 | .RS 4 44 | clevercsv code your_csv_file.csv 45 | .RE 46 | .sp 47 | and copy the generated code to a Python script. 
48 | .SH "OPTIONS" 49 | .sp 50 | .sp 51 | .sp 52 | \-h, \-\-help 53 | .RS 4 54 | show this help message and exit 55 | .RE 56 | .PP 57 | \-e, \-\-encoding 58 | .RS 4 59 | The file encoding of the given CSV file is automatically detected using chardet. While chardet is incredibly accurate, it is not perfect. In the rare cases that it makes a mistake in detecting the file encoding, you can override the encoding by providing it through this flag. Moreover, when you have a number of CSV files with a known file encoding, you can use this option to speed up the code generation process. 60 | .RE 61 | .PP 62 | \-n, \-\-num\-chars 63 | .RS 4 64 | On large CSV files, dialect detection can sometimes be a bit slow due to the large number of possible dialects to consider. To alleviate this, you can limit the number of characters to use for detection. 65 | .sp 66 | One aspect to keep in mind is that CleverCSV may need to read a specific number of characters to be able to correctly infer the dialect. For example, in the ``imdb.csv`` file in the GitHub repository, the correct dialect can only be found after at least 66 lines of the file are read. Therefore, if there is availability to run CleverCSV on the entire file, that is generally recommended. 67 | .RE 68 | .PP 69 | \-p, \-\-pandas 70 | .RS 4 71 | By default, this command writes a small Python script to import the CSV file as a list of lists. By enabling this option the script will be written such that the file will be read as a Pandas DataFrame instead. 
72 | .RE 73 | .PP 74 | 75 | .RS 4 76 | Path to the CSV file 77 | .RE 78 | .PP 79 | .sp 80 | .SH "CLEVERCSV" 81 | .sp 82 | Part of the CleverCSV suite -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, 8 | body size, disability, ethnicity, sex characteristics, gender identity and 9 | expression, level of experience, education, socio-economic status, 10 | nationality, personal appearance, race, religion, or sexual identity and 11 | orientation. 12 | 13 | ## Our Standards 14 | 15 | Examples of behavior that contributes to creating a positive environment 16 | include: 17 | 18 | * Using welcoming and inclusive language 19 | * Being respectful of differing viewpoints and experiences 20 | * Gracefully accepting constructive criticism 21 | * Focusing on what is best for the community 22 | * Showing empathy towards other community members 23 | 24 | Examples of unacceptable behavior by participants include: 25 | 26 | * The use of sexualized language or imagery and unwelcome sexual attention or 27 | advances 28 | * Trolling, insulting/derogatory comments, and personal or political attacks 29 | * Public or private harassment 30 | * Publishing others' private information, such as a physical or electronic 31 | address, without explicit permission 32 | * Other conduct which could reasonably be considered inappropriate in a 33 | professional setting 34 | 35 | ## Our Responsibilities 36 | 37 | Project maintainers are responsible for clarifying the standards of acceptable 38 | behavior and are expected to take appropriate and fair corrective action in 
39 | response to any instances of unacceptable behavior. 40 | 41 | Project maintainers have the right and responsibility to remove, edit, or 42 | reject comments, commits, code, wiki edits, issues, and other contributions 43 | that are not aligned to this Code of Conduct, or to ban temporarily or 44 | permanently any contributor for other behaviors that they deem inappropriate, 45 | threatening, offensive, or harmful. 46 | 47 | ## Scope 48 | 49 | This Code of Conduct applies both within project spaces and in public spaces 50 | when an individual is representing the project or its community. Examples of 51 | representing a project or community include using an official project e-mail 52 | address, posting via an official social media account, or acting as an 53 | appointed representative at an online or offline event. Representation of a 54 | project may be further defined and clarified by project maintainers. 55 | 56 | ## Enforcement 57 | 58 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 59 | reported by contacting the project team at gvandenburg@turing.ac.uk. All 60 | complaints will be reviewed and investigated and will result in a response 61 | that is deemed necessary and appropriate to the circumstances. The project 62 | team is obligated to maintain confidentiality with regard to the reporter of 63 | an incident. Further details of specific enforcement policies may be posted 64 | separately. 65 | 66 | Project maintainers who do not follow or enforce the Code of Conduct in good 67 | faith may face temporary or permanent repercussions as determined by other 68 | members of the project's leadership. 
class ExploreCommand(Command):
    """CLI command that loads a CSV file into an interactive Python shell.

    Detects the file's dialect, generates the same import code as the
    ``code`` command, executes it in an :class:`code.InteractiveConsole`,
    and then drops the user into that console with the data pre-loaded.
    """

    # Shown in --help and rendered into the generated man page; keep
    # the exact wording in sync with man/clevercsv-explore.1.
    _description = (
        "The explore command allows you to quickly explore a CSV file in an "
        "interactive Python shell. This command detects the dialect of the "
        "CSV file and drops you into a Python interactive shell (REPL), "
        "with the CSV file already loaded. Simply run:\n\n"
        "\tclevercsv explore FILE\n\n"
        "to start working with the file loaded as a list of lists. "
        "Alternatively, you can run:\n\n"
        "\tclevercsv explore -p FILE\n\n"
        "to read the file as a Pandas dataframe."
    )

    def __init__(self) -> None:
        super().__init__(
            name="explore",
            title="Explore the CSV file in an interactive Python shell",
            description=self._description,
            extra_sections={"CleverCSV": "Part of the CleverCSV suite"},
        )

    def register(self) -> None:
        """Declare the command-line arguments for this command."""
        self.add_argument("path", help="Path to the CSV file")
        self.add_argument(
            "-e",
            "--encoding",
            help="Set the encoding of the file",
            description=FLAG_DESCRIPTIONS["encoding"],
        )
        self.add_argument(
            "-n",
            "--num-chars",
            help="Number of characters to use for detection",
            type=int,
            description=FLAG_DESCRIPTIONS["num-chars"],
        )
        self.add_argument(
            "-p",
            "--pandas",
            action="store_true",
            help="Read the file into a Pandas DataFrame",
            description=(
                "By default, this command imports the CSV file as a list of "
                "lists. By enabling this option the script will be written "
                "such that the file will be read as a Pandas DataFrame "
                "instead."
            ),
        )

    def handle(self) -> int:
        """Run the command; returns a process exit code (0 ok, 1 failure)."""
        filename = self.args.path
        # An explicit --encoding wins over automatic detection.
        encoding = self.args.encoding or get_encoding(filename)
        num_chars = parse_int(self.args.num_chars, "num-chars")
        dialect = detect_dialect(
            filename,
            num_chars=num_chars,
            encoding=encoding,
            verbose=self.args.verbose,
        )
        if dialect is None:
            print("Error: dialect detection failed.", file=sys.stderr)
            return 1

        code_lines = generate_code(
            filename, dialect, encoding, use_pandas=self.args.pandas
        )

        console = code.InteractiveConsole()
        for line in code_lines:
            # NOTE(review): InteractiveConsole.push returns True when more
            # input is needed (an incomplete statement), not on execution
            # errors — presumably every generated line is a complete
            # statement, so a truthy value here is treated as failure;
            # verify against generate_code's output format.
            retcode = console.push(line)
            if retcode:
                print(
                    "An error occurred starting the interactive console. "
                    "Printing commands instead:\n"
                )
                print("\n".join(code_lines))
                return 1

        print("Dropping you into an interactive shell.\n")
        banner = "CleverCSV has loaded the data into the variable: "
        banner += "df" if self.args.pandas else "rows"
        console.interact(banner=banner)
        return 0
NUM_CHARS | \-\-num\-chars=NUM_CHARS] [\-p | \-\-pandas] 37 | .fi 38 | .sp 39 | .SH "DESCRIPTION" 40 | .sp 41 | The explore command allows you to quickly explore a CSV file in an interactive Python shell. This command detects the dialect of the CSV file and drops you into a Python interactive shell (REPL), with the CSV file already loaded. Simply run: 42 | .sp 43 | .RS 4 44 | clevercsv explore FILE 45 | .RE 46 | .sp 47 | to start working with the file loaded as a list of lists. Alternatively, you can run: 48 | .sp 49 | .RS 4 50 | clevercsv explore \-p FILE 51 | .RE 52 | .sp 53 | to read the file as a Pandas dataframe. 54 | .SH "OPTIONS" 55 | .sp 56 | .sp 57 | .sp 58 | \-h, \-\-help 59 | .RS 4 60 | show this help message and exit 61 | .RE 62 | .PP 63 | \-e, \-\-encoding 64 | .RS 4 65 | The file encoding of the given CSV file is automatically detected using chardet. While chardet is incredibly accurate, it is not perfect. In the rare cases that it makes a mistake in detecting the file encoding, you can override the encoding by providing it through this flag. Moreover, when you have a number of CSV files with a known file encoding, you can use this option to speed up the code generation process. 66 | .RE 67 | .PP 68 | \-n, \-\-num\-chars 69 | .RS 4 70 | On large CSV files, dialect detection can sometimes be a bit slow due to the large number of possible dialects to consider. To alleviate this, you can limit the number of characters to use for detection. 71 | .sp 72 | One aspect to keep in mind is that CleverCSV may need to read a specific number of characters to be able to correctly infer the dialect. For example, in the ``imdb.csv`` file in the GitHub repository, the correct dialect can only be found after at least 66 lines of the file are read. Therefore, if there is availability to run CleverCSV on the entire file, that is generally recommended. 73 | .RE 74 | .PP 75 | \-p, \-\-pandas 76 | .RS 4 77 | By default, this command imports the CSV file as a list of lists. 
By enabling this option the script will be written such that the file will be read as a Pandas DataFrame instead. 78 | .RE 79 | .PP 80 | 81 | .RS 4 82 | Path to the CSV file 83 | .RE 84 | .PP 85 | .sp 86 | .SH "CLEVERCSV" 87 | .sp 88 | Part of the CleverCSV suite -------------------------------------------------------------------------------- /docs/source/clevercsv.rst: -------------------------------------------------------------------------------- 1 | clevercsv package 2 | ================= 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | clevercsv.console 11 | 12 | Submodules 13 | ---------- 14 | 15 | clevercsv.break\_ties module 16 | ---------------------------- 17 | 18 | .. automodule:: clevercsv.break_ties 19 | :members: 20 | :show-inheritance: 21 | :undoc-members: 22 | 23 | clevercsv.consistency module 24 | ---------------------------- 25 | 26 | .. automodule:: clevercsv.consistency 27 | :members: 28 | :show-inheritance: 29 | :undoc-members: 30 | 31 | clevercsv.cparser\_util module 32 | ------------------------------ 33 | 34 | .. automodule:: clevercsv.cparser_util 35 | :members: 36 | :show-inheritance: 37 | :undoc-members: 38 | 39 | clevercsv.detect module 40 | ----------------------- 41 | 42 | .. automodule:: clevercsv.detect 43 | :members: 44 | :show-inheritance: 45 | :undoc-members: 46 | 47 | clevercsv.detect\_pattern module 48 | -------------------------------- 49 | 50 | .. automodule:: clevercsv.detect_pattern 51 | :members: 52 | :show-inheritance: 53 | :undoc-members: 54 | 55 | clevercsv.detect\_type module 56 | ----------------------------- 57 | 58 | .. automodule:: clevercsv.detect_type 59 | :members: 60 | :show-inheritance: 61 | :undoc-members: 62 | 63 | clevercsv.dialect module 64 | ------------------------ 65 | 66 | .. automodule:: clevercsv.dialect 67 | :members: 68 | :show-inheritance: 69 | :undoc-members: 70 | 71 | clevercsv.dict\_read\_write module 72 | ---------------------------------- 73 | 74 | .. 
automodule:: clevercsv.dict_read_write 75 | :members: 76 | :show-inheritance: 77 | :undoc-members: 78 | 79 | clevercsv.encoding module 80 | ------------------------- 81 | 82 | .. automodule:: clevercsv.encoding 83 | :members: 84 | :show-inheritance: 85 | :undoc-members: 86 | 87 | clevercsv.escape module 88 | ----------------------- 89 | 90 | .. automodule:: clevercsv.escape 91 | :members: 92 | :show-inheritance: 93 | :undoc-members: 94 | 95 | clevercsv.exceptions module 96 | --------------------------- 97 | 98 | .. automodule:: clevercsv.exceptions 99 | :members: 100 | :show-inheritance: 101 | :undoc-members: 102 | 103 | clevercsv.method module 104 | ----------------------- 105 | 106 | .. automodule:: clevercsv.method 107 | :members: 108 | :show-inheritance: 109 | :undoc-members: 110 | 111 | clevercsv.normal\_form module 112 | ----------------------------- 113 | 114 | .. automodule:: clevercsv.normal_form 115 | :members: 116 | :show-inheritance: 117 | :undoc-members: 118 | 119 | clevercsv.potential\_dialects module 120 | ------------------------------------ 121 | 122 | .. automodule:: clevercsv.potential_dialects 123 | :members: 124 | :show-inheritance: 125 | :undoc-members: 126 | 127 | clevercsv.read module 128 | --------------------- 129 | 130 | .. automodule:: clevercsv.read 131 | :members: 132 | :show-inheritance: 133 | :undoc-members: 134 | 135 | clevercsv.utils module 136 | ---------------------- 137 | 138 | .. automodule:: clevercsv.utils 139 | :members: 140 | :show-inheritance: 141 | :undoc-members: 142 | 143 | clevercsv.wrappers module 144 | ------------------------- 145 | 146 | .. automodule:: clevercsv.wrappers 147 | :members: 148 | :show-inheritance: 149 | :undoc-members: 150 | 151 | clevercsv.write module 152 | ---------------------- 153 | 154 | .. automodule:: clevercsv.write 155 | :members: 156 | :show-inheritance: 157 | :undoc-members: 158 | 159 | Module contents 160 | --------------- 161 | 162 | .. 
automodule:: clevercsv 163 | :members: 164 | :show-inheritance: 165 | :undoc-members: 166 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for easier installation and cleanup. 2 | # 3 | # Uses self-documenting macros from here: 4 | # http://marmelab.com/blog/2016/02/29/auto-documented-makefile.html 5 | 6 | SHELL := bash 7 | .SHELLFLAGS := -eu -o pipefail -c 8 | MAKEFLAGS += --no-builtin-rules 9 | 10 | PACKAGE=clevercsv 11 | DOC_DIR=./docs/ 12 | VENV_DIR=/tmp/clevercsv_venv 13 | PYTHON ?= python 14 | 15 | .PHONY: help 16 | 17 | .DEFAULT_GOAL := help 18 | 19 | help: 20 | @grep -E '^[0-9a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) |\ 21 | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-15s\033[0m\ 22 | %s\n", $$1, $$2}' 23 | 24 | ################ 25 | # Installation # 26 | ################ 27 | 28 | .PHONY: inplace install 29 | 30 | inplace: 31 | $(PYTHON) setup.py build_ext -i 32 | 33 | install: dist ## Install for the current user using the default python command 34 | $(PYTHON) -m pip install --user ./dist/$(PACKAGE)-*.tar.gz 35 | 36 | ################ 37 | # Distribution # 38 | ################ 39 | 40 | .PHONY: release dist 41 | 42 | release: ## Make a release 43 | $(PYTHON) make_release.py 44 | 45 | dist: man ## Make Python source distribution 46 | $(PYTHON) setup.py sdist 47 | 48 | ########### 49 | # Testing # 50 | ########### 51 | 52 | .PHONY: test integration integration_partial 53 | 54 | test: mypy green pytest 55 | 56 | green: venv ## Run unit tests 57 | source $(VENV_DIR)/bin/activate && green -a -vv ./tests/test_unit 58 | 59 | pytest: venv ## Run unit tests with PyTest 60 | source $(VENV_DIR)/bin/activate && pytest -ra -m 'not network' 61 | 62 | mypy: venv ## Run type checks 63 | source $(VENV_DIR)/bin/activate && \ 64 | mypy --check-untyped-defs ./stubs $(PACKAGE) ./tests 65 | 66 | integration: venv ## Run integration 
tests 67 | source $(VENV_DIR)/bin/activate && python ./tests/test_integration/test_dialect_detection.py -v 68 | 69 | integration_partial: venv ## Run partial integration tests 70 | source $(VENV_DIR)/bin/activate && python ./tests/test_integration/test_dialect_detection.py -v --partial 71 | 72 | 73 | ################# 74 | # Documentation # 75 | ################# 76 | 77 | .PHONY: docs doc man 78 | 79 | docs: doc 80 | doc: venv ## Build documentation with Sphinx 81 | source $(VENV_DIR)/bin/activate && m2r2 README.md && mv README.rst $(DOC_DIR) 82 | source $(VENV_DIR)/bin/activate && m2r2 CHANGELOG.md && mv CHANGELOG.rst $(DOC_DIR) 83 | cd $(DOC_DIR) && \ 84 | rm source/* && \ 85 | source $(VENV_DIR)/bin/activate && \ 86 | sphinx-apidoc -H 'CleverCSV API Documentation' -o source ../$(PACKAGE) && \ 87 | touch source/AUTOGENERATED 88 | source $(VENV_DIR)/bin/activate && $(MAKE) -C $(DOC_DIR) html 89 | 90 | man: venv ## Build man pages using Wilderness 91 | source $(VENV_DIR)/bin/activate && \ 92 | python setup.py build_manpages 93 | 94 | ####################### 95 | # Virtual environment # 96 | ####################### 97 | 98 | .PHONY: venv clean_venv 99 | 100 | venv: $(VENV_DIR)/bin/activate 101 | 102 | $(VENV_DIR)/bin/activate: 103 | test -d $(VENV_DIR) || $(PYTHON) -m venv $(VENV_DIR) 104 | source $(VENV_DIR)/bin/activate && python -m pip install -e .[dev] 105 | touch $(VENV_DIR)/bin/activate 106 | 107 | clean_venv: 108 | rm -rf $(VENV_DIR) 109 | 110 | ############ 111 | # Clean up # 112 | ############ 113 | 114 | .PHONY: clean 115 | 116 | clean: clean_venv ## Clean build dist and egg directories left after install 117 | rm -rf ./dist 118 | rm -rf ./build 119 | rm -rf ./$(PACKAGE).egg-info 120 | rm -rf ./cover 121 | rm -f MANIFEST 122 | rm -f ./$(PACKAGE)/*.so 123 | rm -f ./*_valgrind.log* 124 | rm -f ./man/* 125 | find . -type f -iname '*.pyc' -delete 126 | find . 
# -*- coding: utf-8 -*-

"""
Unit tests for the CCSV write module.

Author: Gertjan van den Burg

"""


import csv
import tempfile
import unittest

from typing import Any
from typing import Iterable

import clevercsv

from clevercsv.dialect import SimpleDialect


class WriterTestCase(unittest.TestCase):
    # Tests mirror CPython's csv writer tests: each helper writes a single
    # row to a temporary file and compares the raw file contents.

    def _write_test(
        self, fields: Iterable[Any], expect: str, **kwargs: Any
    ) -> None:
        """Write ``fields`` as one row and assert the file equals ``expect``.

        ``kwargs`` are forwarded to ``clevercsv.writer`` (e.g. ``quoting``
        or ``dialect``). The expected string excludes the line terminator,
        which is appended here from the writer's dialect.
        """
        with tempfile.TemporaryFile("w+", newline="", prefix="ccsv_") as fp:
            writer = clevercsv.writer(fp, **kwargs)
            writer.writerow(fields)
            fp.seek(0)
            self.assertEqual(fp.read(), expect + writer.dialect.lineterminator)

    def _write_error_test(
        self, exc: type[Exception], fields: Any, **kwargs: Any
    ) -> None:
        """Assert that writing ``fields`` raises ``exc`` and writes nothing."""
        with tempfile.TemporaryFile("w+", newline="", prefix="ccsv_") as fp:
            writer = clevercsv.writer(fp, **kwargs)
            with self.assertRaises(exc):
                writer.writerow(fields)
            fp.seek(0)
            # The file must be empty: a failed writerow may not emit output.
            self.assertEqual(fp.read(), "")

    def test_write_arg_valid(self) -> None:
        """Validate argument handling: None rows, empty rows, None fields."""
        self._write_error_test(clevercsv.Error, None)
        self._write_test((), "")
        self._write_test([None], '""')
        # None cannot be represented without quoting.
        self._write_error_test(
            clevercsv.Error, [None], quoting=clevercsv.QUOTE_NONE
        )

        # Check that exceptions are passed up the chain
        class BadList:
            def __len__(self) -> int:
                return 10

            def __getitem__(self, i: int) -> None:
                if i > 2:
                    raise OSError

        self._write_error_test(OSError, BadList())

        class BadItem:
            def __str__(self) -> str:
                raise OSError

        self._write_error_test(OSError, [BadItem()])

    def test_write_bigfield(self) -> None:
        """Fields larger than typical buffer sizes must round-trip intact."""
        bigstring = "X" * 50000
        self._write_test(
            [bigstring, bigstring], "%s,%s" % (bigstring, bigstring)
        )

    def test_write_quoting(self) -> None:
        """Exercise each QUOTE_* mode on fields containing the delimiter."""
        self._write_test(["a", 1, "p,q"], 'a,1,"p,q"')
        self._write_error_test(
            clevercsv.Error, ["a", 1, "p,q"], quoting=clevercsv.QUOTE_NONE
        )
        self._write_test(
            ["a", 1, "p,q"], 'a,1,"p,q"', quoting=clevercsv.QUOTE_MINIMAL
        )
        self._write_test(
            ["a", 1, "p,q"], '"a",1,"p,q"', quoting=clevercsv.QUOTE_NONNUMERIC
        )
        self._write_test(
            ["a", 1, "p,q"], '"a","1","p,q"', quoting=clevercsv.QUOTE_ALL
        )
        self._write_test(
            ["a\nb", 1], '"a\nb","1"', quoting=clevercsv.QUOTE_ALL
        )

    def test_write_simpledialect(self) -> None:
        """The writer must accept CleverCSV's own SimpleDialect objects."""
        self._write_test(
            ["a", 1, "p,q"],
            "a,1,|p,q|",
            dialect=SimpleDialect(delimiter=",", quotechar="|", escapechar=""),
        )

    def test_write_csv_dialect(self) -> None:
        """The writer must accept stdlib csv dialects by name and by class."""
        self._write_test(
            ["a", 1, "p,q"],
            'a,1,"p,q"',
            dialect="excel",
        )
        self._write_test(
            ["a", 1, "p,q"],
            '"a","1","p,q"',
            dialect=csv.unix_dialect,
        )
        self._write_test(
            [1, 2, 3],
            "1\t2\t3",
            dialect=clevercsv.excel_tab,
        )


if __name__ == "__main__":
    unittest.main()
van den Burg 4 | .\" Generator: Wilderness 5 | .\" Date: 2025-10-30 6 | .\" Manual: clevercsv Manual 7 | .\" Source: clevercsv 0.8.4 8 | .\" Language: English 9 | .\" 10 | .TH "CLEVERCSV" "1" "2025\-10\-30" "Clevercsv 0\&.8\&.4" "Clevercsv Manual" 11 | .\" ----------------------------------------------------------------- 12 | .\" * Define some portability stuff 13 | .\" ----------------------------------------------------------------- 14 | .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 15 | .\" http://bugs.debian.org/507673 16 | .\" http://lists.gnu.org/archive/html/groff/2009-02/msg00013.html 17 | .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 18 | .ie \n(.g .ds Aq \(aq 19 | .el .ds Aq ' 20 | .\" ----------------------------------------------------------------- 21 | .\" * set default formatting * 22 | .\" ----------------------------------------------------------------- 23 | .\" disable hyphenation 24 | .nh 25 | .\" disable justification 26 | .ad l 27 | .\" ----------------------------------------------------------------- 28 | .\" * MAIN CONTENT STARTS HERE * 29 | .\" ----------------------------------------------------------------- 30 | .SH "NAME" 31 | clevercsv \- CleverCSV command line tool 32 | .SH "SYNOPSIS" 33 | .sp 34 | .nf 35 | \fIclevercsv [\-h | \-\-help] [\-V | \-\-version] [\-v | \-\-verbose] 36 | .fi 37 | .sp 38 | .SH "DESCRIPTION" 39 | .sp 40 | CleverCSV is a Python library and command line tool for dealing with messy CSV files. It consists of a number of commands that can be used to analyze, explore, or standardize a messy CSV file. 
41 | .sp 42 | Further help and documentation can be found online at https://github.com/alan\-turing\-institute/CleverCSV or https://clevercsv.readthedocs.io 43 | .SH "OPTIONS" 44 | .sp 45 | .sp 46 | .sp 47 | \-h, \-\-help 48 | .RS 4 49 | show this help message and exit 50 | .RE 51 | .PP 52 | \-V, \-\-version 53 | .RS 4 54 | Show version and exit 55 | .RE 56 | .PP 57 | \-v, \-\-verbose 58 | .RS 4 59 | Enable verbose mode 60 | .RE 61 | .PP 62 | .sp 63 | .SH "COMMANDS" 64 | .sp 65 | The following commands are available in CleverCSV: 66 | .sp 67 | clevercsv\-code(1) 68 | .RS 4 69 | Generate Python code to import a given CSV file 70 | .RE 71 | .sp 72 | clevercsv\-detect(1) 73 | .RS 4 74 | Detect the dialect of a CSV file 75 | .RE 76 | .sp 77 | clevercsv\-explore(1) 78 | .RS 4 79 | Infer the dialect and open the file in an interactive Python session 80 | .RE 81 | .sp 82 | clevercsv\-standardize(1) 83 | .RS 4 84 | Convert a messy CSV file to one that follows RFC\-4180 85 | .RE 86 | .sp 87 | clevercsv\-view(1) 88 | .RS 4 89 | Detect the dialect and open the CSV file using TabView 90 | .RE 91 | .SH "AUTHORS" 92 | .sp 93 | The CleverCSV package was originally written by Gerrit van den Burg and came out of scientific research on wrangling messy CSV files by Gerrit van den Burg, Alfredo Nazabal, and Charles Sutton. This research was conducted at and supported by The Alan Turing Institute. CleverCSV has since benefitted from a number of open\-source contributors on GitHub. 94 | .SH "REPORTING BUGS" 95 | .sp 96 | If you encounter an issue in CleverCSV, please open an issue or submit a pull request at https://github.com/alan\-turing\-institute/CleverCSV. Please don't hesitate, you're helping to make this project better for everyone! 97 | .SH "NOTES" 98 | .sp 99 | \fB1. \fRCleverCSV GitHub repository 100 | .br 101 | https://github.com/alan\-turing\-institute/CleverCSV 102 | .sp 103 | \fB2. 
# -*- coding: utf-8 -*-

"""
CleverCSV Command line application.

"""

from wilderness import Application

from .. import __version__
from .commands import CodeCommand
from .commands import DetectCommand
from .commands import ExploreCommand
from .commands import StandardizeCommand
from .commands import ViewCommand


class CleverCSVApplication(Application):
    """Top-level Wilderness application for the ``clevercsv`` CLI.

    The string constants below are rendered verbatim into the clevercsv(1)
    man page by Wilderness, so edits here must stay in sync with man/.
    """

    _description = (
        "CleverCSV is a Python library and command line tool for dealing "
        "with messy CSV files. It consists of a number of commands that can "
        "be used to analyze, explore, or standardize a messy CSV file.\n\n"
        "Further help and documentation can be found online at "
        "https://github.com/alan-turing-institute/CleverCSV or "
        "https://clevercsv.readthedocs.io"
    )
    # Extra man-page sections, keyed by section title.
    _extra = {
        "Commands": (
            "The following commands are available in CleverCSV:\n\n"
            "clevercsv-code(1)\n"
            "\tGenerate Python code to import a given CSV file\n\n"
            "clevercsv-detect(1)\n"
            "\tDetect the dialect of a CSV file\n\n"
            "clevercsv-explore(1)\n"
            "\tInfer the dialect and open the file in an interactive Python "
            "session\n\n"
            "clevercsv-standardize(1)\n"
            "\tConvert a messy CSV file to one that follows RFC-4180\n\n"
            "clevercsv-view(1)\n"
            "\tDetect the dialect and open the CSV file using TabView"
        ),
        "Authors": (
            "The CleverCSV package was originally written by Gerrit van den "
            "Burg and came out of scientific research on wrangling messy CSV "
            "files by Gerrit van den Burg, Alfredo Nazabal, and Charles "
            "Sutton. This research was conducted at and supported by The "
            "Alan Turing Institute. CleverCSV has since benefitted from a "
            "number of open-source contributors on GitHub."
        ),
        "Reporting Bugs": (
            "If you encounter an issue in CleverCSV, please open an issue "
            "or submit a pull request at "
            "https://github.com/alan-turing-institute/CleverCSV. Please don't "
            "hesitate, you're helping to make this project better for "
            "everyone!"
        ),
        "Notes": (
            "1. CleverCSV GitHub repository\n"
            "   https://github.com/alan-turing-institute/CleverCSV\n\n"
            "2. CleverCSV documentation\n"
            "   https://clevercsv.readthedocs.io\n\n"
            "3. Wrangling Messy CSV Files by Detecting Row and Type Patterns\n"
            "   https://gertjanvandenburg.com/papers/VandenBurg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2019.pdf"
        ),
    }

    def __init__(self) -> None:
        """Initialize the application with its metadata and man-page text."""
        super().__init__(
            "clevercsv",
            version=__version__,
            title="CleverCSV command line tool",
            author="G.J.J. van den Burg",
            description=self._description,
            extra_sections=self._extra,
        )

    def register(self) -> None:
        """Register the global (application-level) command line flags."""
        self.add_argument(
            "-V",
            "--version",
            help="Show version and exit",
            action="version",
            version=__version__,
        )
        self.add_argument(
            "-v", "--verbose", help="Enable verbose mode", action="store_true"
        )


def build_application() -> Application:
    """Construct the CLI application with all subcommands registered."""
    app = CleverCSVApplication()
    app.add(DetectCommand())
    app.add(ViewCommand())
    app.add(StandardizeCommand())
    app.add(CodeCommand())
    app.add(ExploreCommand())
    return app
The standardize command can be used to convert a non\-standard CSV file to the standard RFC\-4180 format [1]. When using the \-\-in\-place option, the return code of CleverCSV can be used to check whether a file was altered or not. The return code will be 2 when the file was altered and 0 otherwise.
If omitted, the output file encoding will be the same as that of the original file.
# -*- coding: utf-8 -*-

"""
Code for computing the pattern score.

Author: Gertjan van den Burg

"""

import collections
import re

from typing import Optional
from typing import Pattern

from .cabstraction import base_abstraction
from .cabstraction import c_merge_with_quotechar
from .dialect import SimpleDialect

# Minimum per-row contribution, so that single-cell rows do not zero out the
# product in the pattern score.
DEFAULT_EPS_PAT: float = 1e-3

# Matches runs of two or more consecutive cell markers ("C").
RE_MULTI_C: Pattern[str] = re.compile(r"C{2,}")


def pattern_score(
    data: str, dialect: SimpleDialect, eps: float = DEFAULT_EPS_PAT
) -> float:
    """
    Compute the pattern score for given data and a dialect.

    Parameters
    ----------

    data : str
        The data of the file as a raw character string

    dialect: dialect.Dialect
        The dialect object

    eps : float
        Lower bound on the contribution of a row pattern, used so that rows
        with a single cell still contribute a small positive amount.

    Returns
    -------
    score : float
        the pattern score

    """
    A = make_abstraction(data, dialect)
    # Count how often each distinct row pattern occurs.
    row_patterns = collections.Counter(A.split("R"))
    P = 0.0
    for pat_k, Nk in row_patterns.items():
        # Lk is the number of cells in this row pattern.
        Lk = len(pat_k.split("D"))
        P += Nk * (max(eps, Lk - 1) / Lk)
    P /= len(row_patterns)
    return P


def make_abstraction(data: str, dialect: SimpleDialect) -> str:
    """Create an abstract representation of the CSV file based on the dialect.

    This function constructs the basic abstraction used to compute the row
    patterns.

    Parameters
    ----------
    data : str
        The data of the file as a string.

    dialect : SimpleDialect
        A dialect to parse the file with.

    Returns
    -------
    abstraction : str
        An abstract representation of the CSV file.

    """
    A = base_abstraction(
        data, dialect.delimiter, dialect.quotechar, dialect.escapechar
    )
    A = merge_with_quotechar(A)
    A = fill_empties(A)
    A = strip_trailing(A)
    return A


def merge_with_quotechar(
    S: str, dialect: Optional[SimpleDialect] = None
) -> str:
    """Merge quoted blocks in the abstraction

    This function takes the abstract representation and merges quoted blocks
    (``QC...CQ``) to a single cell (``C``). The function takes nested quotes
    into account.

    Parameters
    ----------
    S : str
        The abstract representation of the file.

    dialect : SimpleDialect
        The dialect used to make the abstraction. This is not used but kept for
        backwards compatibility. Will be removed in a future version.

    Returns
    -------
    abstraction : str
        A simplified version of the abstraction with quoted blocks merged.

    """
    return c_merge_with_quotechar(S)


def fill_empties(abstract: str) -> str:
    """Fill empty cells in the abstraction

    The way the row patterns are constructed assumes that empty cells are
    marked by the letter ``C`` as well. This function fills those in. The
    function also removes duplicate occurrences of ``CC`` and replaces these
    with ``C``.

    Parameters
    ----------
    abstract : str
        The abstract representation of the file.

    Returns
    -------
    abstraction : str
        The abstract representation with empties filled.

    """
    # Insert an explicit empty cell between consecutive delimiters and around
    # row boundaries. Loops are needed because each substitution can create a
    # new adjacent pair (e.g. "DDD" -> "DCDD" -> "DCDCD").
    while "DD" in abstract:
        abstract = abstract.replace("DD", "DCD")

    while "DR" in abstract:
        abstract = abstract.replace("DR", "DCR")

    while "RD" in abstract:
        abstract = abstract.replace("RD", "RCD")

    # Collapse runs of cell markers to a single cell marker.
    abstract = RE_MULTI_C.sub("C", abstract)

    # A leading or trailing delimiter implies an empty first or last cell.
    if abstract.startswith("D"):
        abstract = "C" + abstract

    if abstract.endswith("D"):
        abstract += "C"

    return abstract


def strip_trailing(abstract: str) -> str:
    """Strip trailing row separators ("R") from the abstraction."""
    # rstrip removes *all* trailing occurrences, matching the original loop.
    return abstract.rstrip("R")
self.assertFalse(is_form_1('"A",,"C"'.split("\n"), dialect)) 32 | self.assertFalse(is_form_1('"A",C'.split("\n"), dialect)) 33 | self.assertFalse(is_form_1('"A"\n"b""A""c","B"'.split("\n"), dialect)) 34 | 35 | def test_form_2(self) -> None: 36 | dialect = SimpleDialect(delimiter=",", quotechar="", escapechar="") 37 | 38 | self.assertTrue(is_form_2("1,2,3".split("\n"), dialect)) 39 | self.assertTrue(is_form_2("1,2,3\na,b,c".split("\n"), dialect)) 40 | self.assertTrue(is_form_2("a@b.com,3".split("\n"), dialect)) 41 | self.assertTrue(is_form_2("a,,3\n1,2,3".split("\n"), dialect)) 42 | 43 | self.assertFalse(is_form_2("1,2,3\n1,2\n4,5,6".split("\n"), dialect)) 44 | self.assertFalse(is_form_2("1".split("\n"), dialect)) 45 | self.assertFalse(is_form_2('1,"a"'.split("\n"), dialect)) 46 | self.assertFalse(is_form_2("a;b,3".split("\n"), dialect)) 47 | self.assertFalse(is_form_2('"a,3,3\n1,2,3'.split("\n"), dialect)) 48 | self.assertFalse(is_form_2('a,"",3\n1,2,3'.split("\n"), dialect)) 49 | 50 | def test_form_3(self) -> None: 51 | A = SimpleDialect(delimiter=",", quotechar="'", escapechar="") 52 | Q = SimpleDialect(delimiter=",", quotechar='"', escapechar="") 53 | 54 | self.assertTrue(is_form_3('A,B\nC,"D"'.split("\n"), Q)) 55 | self.assertTrue(is_form_3('A,B\nC,"d,e"'.split("\n"), Q)) 56 | 57 | self.assertFalse(is_form_3('A,\nC,"d,e"'.split("\n"), Q)) 58 | self.assertFalse(is_form_3("3;4,B\nC,D".split("\n"), Q)) 59 | 60 | self.assertFalse(is_form_3('A,B\n"C",D'.split("\n"), A)) 61 | self.assertTrue(is_form_3('A,B\n"C",D'.split("\n"), Q)) 62 | 63 | def test_form_4(self) -> None: 64 | quoted = SimpleDialect(delimiter="", quotechar='"', escapechar="") 65 | unquoted = SimpleDialect(delimiter="", quotechar="", escapechar="") 66 | 67 | self.assertTrue(is_form_4("A\nB\nC".split("\n"), unquoted)) 68 | self.assertTrue(is_form_4("1\n2\n3".split("\n"), unquoted)) 69 | self.assertTrue(is_form_4("A_B\n1\n2".split("\n"), unquoted)) 70 | self.assertTrue(is_form_4("A&B\n1\n2".split("\n"), 
unquoted)) 71 | self.assertTrue(is_form_4("A&B\n-1\n2".split("\n"), unquoted)) 72 | self.assertTrue(is_form_4('"A"\n"B"\n"C"'.split("\n"), quoted)) 73 | 74 | self.assertFalse(is_form_4('"A", "B"\n"B"\n"C"'.split("\n"), quoted)) 75 | self.assertFalse(is_form_4('"A","B"\n"B"\n"C"'.split("\n"), quoted)) 76 | self.assertFalse(is_form_4('"A@b"\n"B"\n"C"'.split("\n"), quoted)) 77 | self.assertFalse(is_form_4('A\n"-1"\n2'.split("\n"), unquoted)) 78 | self.assertFalse(is_form_4("A B\n-1 3\n2 4".split("\n"), unquoted)) 79 | 80 | def test_form_5(self) -> None: 81 | dialect = SimpleDialect(delimiter=",", quotechar='"', escapechar="") 82 | 83 | self.assertTrue(is_form_5('"A,B"\n"1,2"\n"3,4"'.split("\n"), dialect)) 84 | self.assertTrue(is_form_5('"A,B"\n"1,"\n"2,3"'.split("\n"), dialect)) 85 | 86 | self.assertFalse(is_form_5("A,B\n1,2\n3,4".split("\n"), dialect)) 87 | self.assertFalse(is_form_5("A,B\n1,\n2,3".split("\n"), dialect)) 88 | self.assertFalse( 89 | is_form_5('"A,""B"""\n"1,"\n"2,3"'.split("\n"), dialect) 90 | ) 91 | 92 | 93 | if __name__ == "__main__": 94 | unittest.main() 95 | -------------------------------------------------------------------------------- /stubs/regex/regex.pyi: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from regex._regex_core import VERSION0 4 | 5 | def match( 6 | pattern, 7 | string, 8 | flags: int = ..., 9 | pos: Any | None = ..., 10 | endpos: Any | None = ..., 11 | partial: bool = ..., 12 | concurrent: Any | None = ..., 13 | timeout: Any | None = ..., 14 | ignore_unused: bool = ..., 15 | **kwargs 16 | ): ... 17 | def fullmatch( 18 | pattern, 19 | string, 20 | flags: int = ..., 21 | pos: Any | None = ..., 22 | endpos: Any | None = ..., 23 | partial: bool = ..., 24 | concurrent: Any | None = ..., 25 | timeout: Any | None = ..., 26 | ignore_unused: bool = ..., 27 | **kwargs 28 | ): ... 
29 | def search( 30 | pattern, 31 | string, 32 | flags: int = ..., 33 | pos: Any | None = ..., 34 | endpos: Any | None = ..., 35 | partial: bool = ..., 36 | concurrent: Any | None = ..., 37 | timeout: Any | None = ..., 38 | ignore_unused: bool = ..., 39 | **kwargs 40 | ): ... 41 | def sub( 42 | pattern, 43 | repl, 44 | string, 45 | count: int = ..., 46 | flags: int = ..., 47 | pos: Any | None = ..., 48 | endpos: Any | None = ..., 49 | concurrent: Any | None = ..., 50 | timeout: Any | None = ..., 51 | ignore_unused: bool = ..., 52 | **kwargs 53 | ): ... 54 | def subf( 55 | pattern, 56 | format, 57 | string, 58 | count: int = ..., 59 | flags: int = ..., 60 | pos: Any | None = ..., 61 | endpos: Any | None = ..., 62 | concurrent: Any | None = ..., 63 | timeout: Any | None = ..., 64 | ignore_unused: bool = ..., 65 | **kwargs 66 | ): ... 67 | def subn( 68 | pattern, 69 | repl, 70 | string, 71 | count: int = ..., 72 | flags: int = ..., 73 | pos: Any | None = ..., 74 | endpos: Any | None = ..., 75 | concurrent: Any | None = ..., 76 | timeout: Any | None = ..., 77 | ignore_unused: bool = ..., 78 | **kwargs 79 | ): ... 80 | def subfn( 81 | pattern, 82 | format, 83 | string, 84 | count: int = ..., 85 | flags: int = ..., 86 | pos: Any | None = ..., 87 | endpos: Any | None = ..., 88 | concurrent: Any | None = ..., 89 | timeout: Any | None = ..., 90 | ignore_unused: bool = ..., 91 | **kwargs 92 | ): ... 93 | def split( 94 | pattern, 95 | string, 96 | maxsplit: int = ..., 97 | flags: int = ..., 98 | concurrent: Any | None = ..., 99 | timeout: Any | None = ..., 100 | ignore_unused: bool = ..., 101 | **kwargs 102 | ): ... 103 | def splititer( 104 | pattern, 105 | string, 106 | maxsplit: int = ..., 107 | flags: int = ..., 108 | concurrent: Any | None = ..., 109 | timeout: Any | None = ..., 110 | ignore_unused: bool = ..., 111 | **kwargs 112 | ): ... 
113 | def findall( 114 | pattern, 115 | string, 116 | flags: int = ..., 117 | pos: Any | None = ..., 118 | endpos: Any | None = ..., 119 | overlapped: bool = ..., 120 | concurrent: Any | None = ..., 121 | timeout: Any | None = ..., 122 | ignore_unused: bool = ..., 123 | **kwargs 124 | ): ... 125 | def finditer( 126 | pattern, 127 | string, 128 | flags: int = ..., 129 | pos: Any | None = ..., 130 | endpos: Any | None = ..., 131 | overlapped: bool = ..., 132 | partial: bool = ..., 133 | concurrent: Any | None = ..., 134 | timeout: Any | None = ..., 135 | ignore_unused: bool = ..., 136 | **kwargs 137 | ): ... 138 | def compile( 139 | pattern, flags: int = ..., ignore_unused: bool = ..., **kwargs 140 | ): ... 141 | def purge() -> None: ... 142 | def cache_all(value: bool = ...): ... 143 | def template(pattern, flags: int = ...): ... 144 | def escape(pattern, special_only: bool = ..., literal_spaces: bool = ...): ... 145 | 146 | DEFAULT_VERSION = VERSION0 147 | Pattern: Any 148 | Match: Any 149 | Regex = compile 150 | 151 | # Names in __all__ with no definition: 152 | # A 153 | # ASCII 154 | # B 155 | # BESTMATCH 156 | # D 157 | # DEBUG 158 | # DOTALL 159 | # E 160 | # ENHANCEMATCH 161 | # F 162 | # FULLCASE 163 | # I 164 | # IGNORECASE 165 | # L 166 | # LOCALE 167 | # M 168 | # MULTILINE 169 | # P 170 | # POSIX 171 | # R 172 | # REVERSE 173 | # S 174 | # Scanner 175 | # T 176 | # TEMPLATE 177 | # U 178 | # UNICODE 179 | # V0 180 | # V1 181 | # VERBOSE 182 | # VERSION0 183 | # VERSION1 184 | # W 185 | # WORD 186 | # X 187 | # __doc__ 188 | # __version__ 189 | # error 190 | -------------------------------------------------------------------------------- /man/clevercsv-detect.1: -------------------------------------------------------------------------------- 1 | '\" t 2 | .\" Title: clevercsv-detect 3 | .\" Author: G.J.J. 
van den Burg 4 | .\" Generator: Wilderness 5 | .\" Date: 2025-10-30 6 | .\" Manual: clevercsv Manual 7 | .\" Source: clevercsv 0.8.4 8 | .\" Language: English 9 | .\" 10 | .TH "CLEVERCSV-DETECT" "1" "2025\-10\-30" "Clevercsv 0\&.8\&.4" "Clevercsv Manual" 11 | .\" ----------------------------------------------------------------- 12 | .\" * Define some portability stuff 13 | .\" ----------------------------------------------------------------- 14 | .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 15 | .\" http://bugs.debian.org/507673 16 | .\" http://lists.gnu.org/archive/html/groff/2009-02/msg00013.html 17 | .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 18 | .ie \n(.g .ds Aq \(aq 19 | .el .ds Aq ' 20 | .\" ----------------------------------------------------------------- 21 | .\" * set default formatting * 22 | .\" ----------------------------------------------------------------- 23 | .\" disable hyphenation 24 | .nh 25 | .\" disable justification 26 | .ad l 27 | .\" ----------------------------------------------------------------- 28 | .\" * MAIN CONTENT STARTS HERE * 29 | .\" ----------------------------------------------------------------- 30 | .SH "NAME" 31 | clevercsv-detect \- Detect the dialect of a CSV file 32 | .SH "SYNOPSIS" 33 | .sp 34 | .nf 35 | \fIclevercsv detect [\-c | \-\-consistency] [\-e ENCODING | \-\-encoding=ENCODING] 36 | [\-n NUM_CHARS | \-\-num\-chars=NUM_CHARS] [ \-p | \-\-plain | 37 | \-j | \-\-json ] [\-\-no\-skip] [\-\-add\-runtime] 38 | .fi 39 | .sp 40 | .SH "DESCRIPTION" 41 | .sp 42 | Detect the dialect of a CSV file. 43 | .SH "OPTIONS" 44 | .sp 45 | .sp 46 | .sp 47 | \-h, \-\-help 48 | .RS 4 49 | show this help message and exit 50 | .RE 51 | .PP 52 | \-c, \-\-consistency 53 | .RS 4 54 | By default, the dialect of CSV files is detected using a two\-step process. 
First, a strict set of checks is used to see if the file adheres to a very basic format (for example, when all cells in the file are integers). If none of these checks succeed, the data consistency measure of Van den Burg, et al. (2019) is used to detect the dialect. With this option, you can force the detection to always use the data consistency measure. This can be useful for testing or research purposes, for instance. 55 | .RE 56 | .PP 57 | \-e, \-\-encoding 58 | .RS 4 59 | The file encoding of the given CSV file is automatically detected using chardet. While chardet is incredibly accurate, it is not perfect. In the rare cases that it makes a mistake in detecting the file encoding, you can override the encoding by providing it through this flag. Moreover, when you have a number of CSV files with a known file encoding, you can use this option to speed up the code generation process. 60 | .RE 61 | .PP 62 | \-n, \-\-num\-chars 63 | .RS 4 64 | On large CSV files, dialect detection can sometimes be a bit slow due to the large number of possible dialects to consider. To alleviate this, you can limit the number of characters to use for detection. 65 | .sp 66 | One aspect to keep in mind is that CleverCSV may need to read a specific number of characters to be able to correctly infer the dialect. For example, in the ``imdb.csv`` file in the GitHub repository, the correct dialect can only be found after at least 66 lines of the file are read. Therefore, if there is availability to run CleverCSV on the entire file, that is generally recommended. 67 | .RE 68 | .PP 69 | \-p, \-\-plain 70 | .RS 4 71 | Print the components of the dialect on separate lines 72 | .RE 73 | .PP 74 | \-j, \-\-json 75 | .RS 4 76 | Print the dialect to standard output in the form of a JSON object. This object will always have the 'delimiter', 'quotechar', 'escapechar', and 'strict' keys. If \-\-add\-runtime is specified, it will also have a 'runtime' key. 
77 | .RE 78 | .PP 79 | \-\-no\-skip 80 | .RS 4 81 | The data consistency score used for dialect detection consists of two components: a pattern score and a type score. The type score lies between 0 and 1. When computing the data consistency measures for different dialects, we skip the computation of the type score if we see that the pattern score is lower than the best data consistency score we've seen so far. This option can be used to disable this behaviour and compute the type score for all dialects. This is mainly useful for debugging and testing purposes. 82 | .RE 83 | .PP 84 | \-\-add\-runtime 85 | .RS 4 86 | Add the runtime of the detection to the detection output. 87 | .RE 88 | .PP 89 | 90 | .RS 4 91 | Path to the CSV file 92 | .RE 93 | .PP 94 | .sp 95 | .SH "CLEVERCSV" 96 | .sp 97 | Part of the CleverCSV suite -------------------------------------------------------------------------------- /clevercsv/cparser_util.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Python utility functions that wrap the C parser. 5 | 6 | """ 7 | 8 | import io 9 | 10 | from typing import Any 11 | from typing import Iterable 12 | from typing import Iterator 13 | from typing import List 14 | from typing import Optional 15 | from typing import Tuple 16 | from typing import Union 17 | 18 | from .cparser import Error as ParserError 19 | from .cparser import Parser 20 | from .dialect import SimpleDialect 21 | from .exceptions import Error 22 | 23 | _FIELD_SIZE_LIMIT: int = 128 * 1024 24 | 25 | 26 | def field_size_limit(*args: Any, **kwargs: Any) -> int: 27 | """Get/Set the limit to the field size. 28 | 29 | This function is adapted from the one in the Python CSV module. See the 30 | documentation there. 
31 | """ 32 | global _FIELD_SIZE_LIMIT 33 | old_limit = _FIELD_SIZE_LIMIT 34 | all_args = list(args) + list(kwargs.values()) 35 | if not 0 <= len(all_args) <= 1: 36 | raise TypeError( 37 | "field_size_limit expected at most 1 arguments, got %i" 38 | % len(all_args) 39 | ) 40 | if len(all_args) == 0: 41 | return old_limit 42 | limit = all_args[0] 43 | if not isinstance(limit, int): 44 | raise TypeError("limit must be an integer") 45 | _FIELD_SIZE_LIMIT = int(limit) 46 | return old_limit 47 | 48 | 49 | def _parse_data( 50 | data: Iterable[str], 51 | delimiter: str, 52 | quotechar: str, 53 | escapechar: str, 54 | strict: bool, 55 | return_quoted: bool = False, 56 | ) -> Iterator[Union[List[str], List[Tuple[str, bool]]]]: 57 | parser = Parser( 58 | data, 59 | delimiter=delimiter, 60 | quotechar=quotechar, 61 | escapechar=escapechar, 62 | field_limit=field_size_limit(), 63 | strict=strict, 64 | return_quoted=return_quoted, 65 | ) 66 | try: 67 | for row in parser: 68 | yield row 69 | except ParserError as e: 70 | raise Error(str(e)) 71 | 72 | 73 | def parse_data( 74 | data: Iterable[str], 75 | dialect: Optional[SimpleDialect] = None, 76 | delimiter: Optional[str] = None, 77 | quotechar: Optional[str] = None, 78 | escapechar: Optional[str] = None, 79 | strict: Optional[bool] = None, 80 | return_quoted: bool = False, 81 | ) -> Iterator[Union[List[str], List[Tuple[str, bool]]]]: 82 | """Parse the data given a dialect using the C parser 83 | 84 | Parameters 85 | ---------- 86 | data : iterable 87 | The data of the CSV file as an iterable 88 | 89 | dialect : SimpleDialect 90 | The dialect to use for the parsing. If None, the dialect with each 91 | component set to the empty string is used. 92 | 93 | delimiter : str 94 | The delimiter to use. If not None, overwrites the delimiter in the 95 | dialect. 96 | 97 | quotechar : str 98 | The quote character to use. If not None, overwrites the quote character 99 | in the dialect. 
100 | 101 | escapechar : str 102 | The escape character to use. If not None, overwrites the escape 103 | character in the dialect. 104 | 105 | strict : bool 106 | Enable strict mode or not. If not None, overwrites the strict mode set 107 | in the dialect. 108 | 109 | return_quoted : bool 110 | For each cell, return a tuple "(field, is_quoted)" where the second 111 | element indicates whether the cell was a quoted cell or not. 112 | 113 | Yields 114 | ------ 115 | rows : list 116 | The rows of the file as a list of cells. 117 | 118 | Raises 119 | ------ 120 | Error : clevercsv.exceptions.Error 121 | When an error occurs during parsing. 122 | 123 | """ 124 | if dialect is None: 125 | dialect = SimpleDialect("", "", "") 126 | 127 | delimiter_ = delimiter if delimiter is not None else dialect.delimiter 128 | quotechar_ = quotechar if quotechar is not None else dialect.quotechar 129 | escapechar_ = escapechar if escapechar is not None else dialect.escapechar 130 | strict_ = strict if strict is not None else dialect.strict 131 | 132 | yield from _parse_data( 133 | data, 134 | delimiter_, 135 | quotechar_, 136 | escapechar_, 137 | strict_, 138 | return_quoted=return_quoted, 139 | ) 140 | 141 | 142 | def parse_string( 143 | data: str, 144 | dialect: SimpleDialect, 145 | return_quoted: bool = False, 146 | ) -> Iterator[Union[List[str], List[Tuple[str, bool]]]]: 147 | """Utility for when the CSV file is encoded as a single string""" 148 | return parse_data( 149 | iter(io.StringIO(data, newline="")), 150 | dialect=dialect, 151 | return_quoted=return_quoted, 152 | ) 153 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import glob 5 | import io 6 | import os 7 | 8 | from setuptools import Command 9 | from setuptools import Extension 10 | from setuptools import find_packages 11 | from 
setuptools import setup 12 | 13 | # Package meta-data. 14 | AUTHOR = "Gertjan van den Burg" 15 | DESCRIPTION = "A Python package for handling messy CSV files" 16 | EMAIL = "gertjanvandenburg@gmail.com" 17 | LICENSE = "MIT" 18 | LICENSE_TROVE = "License :: OSI Approved :: MIT License" 19 | NAME = "clevercsv" 20 | REQUIRES_PYTHON = ">=3.9.0" 21 | URL = "https://github.com/alan-turing-institute/CleverCSV" 22 | VERSION = None 23 | 24 | # What packages are required for this module to be executed? 25 | REQUIRED = [ 26 | "chardet>=3.0", 27 | "regex>=2018.11", 28 | "packaging>=23.0", 29 | ] 30 | 31 | # Dependencies only needed for pre-commit 32 | precommit_require = [ 33 | "wilderness>=0.1.5", 34 | ] 35 | 36 | # When these are changed, update clevercsv/_optional.py accordingly 37 | full_require = [ 38 | *precommit_require, 39 | "faust-cchardet>=2.1.18", 40 | "pandas>=1.0.0", 41 | "tabview>=1.4", 42 | ] 43 | 44 | docs_require = ["sphinx", "m2r2", "furo"] 45 | test_require = full_require 46 | dev_require = [ 47 | "green", 48 | # "pythonfuzz", 49 | "pytest>=2.6", 50 | "termcolor", 51 | "mypy", 52 | ] 53 | 54 | # What packages are optional? 55 | EXTRAS = { 56 | "full": full_require, 57 | "docs": docs_require, 58 | "tests": test_require, 59 | "dev": docs_require + test_require + dev_require, 60 | "precommit": precommit_require, 61 | } 62 | 63 | 64 | class build_manpages(Command): 65 | description = "Generate manpages" 66 | user_options = [] 67 | 68 | def initialize_options(self): 69 | pass 70 | 71 | def finalize_options(self): 72 | pass 73 | 74 | def run(self): 75 | from wilderness import build_manpages 76 | 77 | from clevercsv.console import build_application 78 | 79 | build_manpages(build_application()) 80 | 81 | 82 | # The rest you shouldn't have to touch too much :) 83 | # ------------------------------------------------ 84 | # Except, perhaps the License and Trove Classifiers! 85 | # If you do change the License, remember to change the Trove Classifier for that! 
86 | 87 | here = os.path.abspath(os.path.dirname(__file__)) 88 | 89 | # Import the README and use it as the long-description. 90 | # Note: this will only work if 'README.md' is present in your MANIFEST.in file! 91 | try: 92 | with io.open(os.path.join(here, "README.md"), encoding="utf-8") as f: 93 | long_description = "\n" + f.read() 94 | except FileNotFoundError: 95 | long_description = DESCRIPTION 96 | 97 | # Load the package's __version__.py module as a dictionary. 98 | about = {} 99 | if not VERSION: 100 | project_slug = NAME.lower().replace("-", "_").replace(" ", "_") 101 | with open(os.path.join(here, project_slug, "__version__.py")) as f: 102 | exec(f.read(), about) 103 | else: 104 | about["__version__"] = VERSION 105 | 106 | # Where the magic happens: 107 | setup( 108 | name=NAME, 109 | version=about["__version__"], 110 | description=DESCRIPTION, 111 | long_description=long_description, 112 | long_description_content_type="text/markdown", 113 | author=AUTHOR, 114 | author_email=EMAIL, 115 | python_requires=REQUIRES_PYTHON, 116 | url=URL, 117 | packages=find_packages( 118 | exclude=["tests", "*.tests", "*.tests.*", "tests.*"] 119 | ), 120 | install_requires=REQUIRED, 121 | extras_require=EXTRAS, 122 | include_package_data=True, 123 | package_data={"clevercsv": ["py.typed"]}, 124 | license=LICENSE, 125 | ext_modules=[ 126 | Extension("clevercsv.cparser", sources=["src/cparser.c"]), 127 | Extension("clevercsv.cabstraction", sources=["src/abstraction.c"]), 128 | ], 129 | entry_points={"console_scripts": ["clevercsv = clevercsv.__main__:main"]}, 130 | data_files=[("man/man1", glob.glob("man/*.1"))], 131 | cmdclass={"build_manpages": build_manpages}, 132 | classifiers=[ 133 | # Trove classifiers 134 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers 135 | LICENSE_TROVE, 136 | "Programming Language :: Python", 137 | "Programming Language :: Python :: 3", 138 | "Programming Language :: Python :: 3.9", 139 | "Programming Language :: Python :: 
3.10", 140 | "Programming Language :: Python :: 3.11", 141 | "Programming Language :: Python :: 3.12", 142 | "Programming Language :: Python :: 3.13", 143 | "Programming Language :: Python :: 3.14", 144 | "Programming Language :: Python :: Implementation :: CPython", 145 | "Programming Language :: Python :: Implementation :: PyPy", 146 | ], 147 | ) 148 | -------------------------------------------------------------------------------- /.github/workflows/deploy.yml: -------------------------------------------------------------------------------- 1 | name: Deploy to PyPI 2 | 3 | on: 4 | push: 5 | tags: 6 | - v* 7 | 8 | jobs: 9 | build_wheels: 10 | name: Build wheels on ${{ matrix.os }} 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | matrix: 14 | os: [ 'ubuntu-latest', 'windows-latest', 'macos-latest' ] 15 | 16 | steps: 17 | - name: Checkout repo 18 | uses: actions/checkout@v4 19 | 20 | - uses: actions/setup-python@v5 21 | name: Install Python 22 | with: 23 | python-version: '3.10' 24 | 25 | - name: Set up QEMU 26 | if: runner.os == 'Linux' 27 | uses: docker/setup-qemu-action@v3 28 | with: 29 | platforms: all 30 | 31 | - name: Build wheels 32 | uses: pypa/cibuildwheel@v3.2.1 33 | env: 34 | CIBW_TEST_COMMAND: "python -VV && python -m unittest discover -f -s {project}/tests/test_unit/" 35 | CIBW_TEST_EXTRAS: "full" 36 | CIBW_SKIP: "pp* cp27-* cp33-* cp34-* cp35-* cp36-* cp37-* cp38-* *-win32 *-musllinux_* *-manylinux_i686" 37 | CIBW_ARCHS_MACOS: x86_64 arm64 universal2 38 | CIBW_ARCHS_LINUX: auto aarch64 39 | 40 | - uses: actions/upload-artifact@v4 41 | with: 42 | path: ./wheelhouse/*.whl 43 | name: dist-${{ matrix.os }}-${{ matrix.python-version }} 44 | 45 | build_sdist: 46 | name: Build source distribution 47 | runs-on: ubuntu-latest 48 | steps: 49 | - uses: actions/checkout@v4 50 | name: Checkout repo 51 | 52 | - uses: actions/setup-python@v5 53 | name: Install Python 54 | with: 55 | python-version: '3.10' 56 | 57 | - name: Update setuptools 58 | run: pip install 
-U setuptools 59 | 60 | - name: Build sdist 61 | run: python setup.py sdist 62 | 63 | - uses: actions/upload-artifact@v4 64 | with: 65 | path: dist/*.tar.gz 66 | name: dist-source 67 | 68 | upload_testpypi: 69 | needs: [build_wheels, build_sdist] 70 | runs-on: ubuntu-latest 71 | # upload to PyPI on every tag starting with 'v' 72 | if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v') && contains(github.event.ref, '-rc.') 73 | 74 | environment: 75 | name: testpypi 76 | url: https://test.pypi.org/p/clevercsv 77 | 78 | permissions: 79 | id-token: write # IMPORTANT: mandatory for trusted publishing 80 | 81 | steps: 82 | - name: Download all the dists 83 | uses: actions/download-artifact@v4 84 | with: 85 | pattern: dist-* 86 | merge-multiple: true 87 | path: dist/ 88 | 89 | - name: Publish distribution to TestPyPI 90 | uses: pypa/gh-action-pypi-publish@release/v1 91 | with: 92 | repository-url: https://test.pypi.org/legacy/ 93 | verbose: true 94 | 95 | upload_pypi: 96 | needs: [build_wheels, build_sdist] 97 | runs-on: ubuntu-latest 98 | # upload to PyPI on tags starting with 'v' that don't contain '-rc.' 
99 | if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v') && !contains(github.event.ref, '-rc.') 100 | 101 | environment: 102 | name: pypi 103 | url: https://pypi.org/p/clevercsv 104 | 105 | permissions: 106 | id-token: write # IMPORTANT: mandatory for trusted publishing 107 | 108 | steps: 109 | - name: Download all the dists 110 | uses: actions/download-artifact@v4 111 | with: 112 | pattern: dist-* 113 | merge-multiple: true 114 | path: dist/ 115 | 116 | - name: Publish distribution to PyPI 117 | uses: pypa/gh-action-pypi-publish@release/v1 118 | with: 119 | verbose: true 120 | 121 | github-release: 122 | name: >- 123 | Sign the Python 🐍 distribution 📦 with Sigstore 124 | and upload the files to GitHub Release 125 | needs: 126 | - upload_pypi 127 | runs-on: ubuntu-latest 128 | 129 | permissions: 130 | contents: write # IMPORTANT: mandatory for making GitHub Releases 131 | id-token: write # IMPORTANT: mandatory for sigstore 132 | 133 | steps: 134 | - name: Checkout repo 135 | uses: actions/checkout@v4 136 | 137 | - name: Download all the dists 138 | uses: actions/download-artifact@v4 139 | with: 140 | pattern: dist-* 141 | merge-multiple: true 142 | path: dist/ 143 | 144 | - name: Sign the dists with Sigstore 145 | uses: sigstore/gh-action-sigstore-python@v3.0.1 146 | with: 147 | inputs: >- 148 | ./dist/*.tar.gz 149 | ./dist/*.whl 150 | 151 | - name: Create GitHub Release 152 | env: 153 | GITHUB_TOKEN: ${{ github.token }} 154 | run: >- 155 | gh release create 156 | '${{ github.ref_name }}' 157 | --notes "" 158 | 159 | - name: Upload artifact signatures to GitHub Release 160 | env: 161 | GITHUB_TOKEN: ${{ github.token }} 162 | # Upload to GitHub Release using the `gh` CLI. 163 | # `dist/` contains the built packages, and the 164 | # sigstore-produced signatures and certificates. 
165 | run: >- 166 | gh release upload 167 | '${{ github.ref_name }}' dist/** 168 | --repo '${{ github.repository }}' 169 | -------------------------------------------------------------------------------- /clevercsv/dict_read_write.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | DictReader and DictWriter. 5 | 6 | This code is entirely copied from the Python csv module. The only exception is 7 | that it uses the `reader` and `writer` classes from our package. 8 | 9 | Author: Gertjan van den Burg 10 | 11 | """ 12 | 13 | from __future__ import annotations 14 | 15 | import warnings 16 | 17 | from collections import OrderedDict 18 | from collections.abc import Collection 19 | 20 | from typing import TYPE_CHECKING 21 | from typing import Any 22 | from typing import Generic 23 | from typing import Iterable 24 | from typing import Iterator 25 | from typing import Literal 26 | from typing import Mapping 27 | from typing import Optional 28 | from typing import Sequence 29 | from typing import TypeVar 30 | from typing import Union 31 | from typing import cast 32 | 33 | from clevercsv.read import reader 34 | from clevercsv.write import writer 35 | 36 | if TYPE_CHECKING: 37 | from clevercsv._types import SupportsWrite 38 | from clevercsv._types import _DialectLike 39 | from clevercsv._types import _DictReadMapping 40 | 41 | _T = TypeVar("_T") 42 | 43 | 44 | class DictReader( 45 | Generic[_T], Iterator["_DictReadMapping[Union[_T, Any], Union[str, Any]]"] 46 | ): 47 | def __init__( 48 | self, 49 | f: Iterable[str], 50 | fieldnames: Optional[Sequence[_T]] = None, 51 | restkey: Optional[str] = None, 52 | restval: Optional[str] = None, 53 | dialect: "_DialectLike" = "excel", 54 | *args: Any, 55 | **kwds: Any, 56 | ) -> None: 57 | self._fieldnames = fieldnames 58 | self.restkey = restkey 59 | self.restval = restval 60 | self.reader: reader = reader(f, dialect, *args, **kwds) 61 | self.dialect = dialect 62 
| self.line_num = 0 63 | 64 | def __iter__(self) -> "DictReader[_T]": 65 | return self 66 | 67 | @property 68 | def fieldnames(self) -> Sequence[_T]: 69 | if self._fieldnames is None: 70 | try: 71 | fieldnames = next(self.reader) 72 | self._fieldnames = [cast(_T, f) for f in fieldnames] 73 | except StopIteration: 74 | pass 75 | 76 | assert self._fieldnames is not None 77 | 78 | # Note: this was added because I don't think it's expected that Python 79 | # simply drops information if there are duplicate headers. There is 80 | # discussion on this issue in the Python bug tracker here: 81 | # https://bugs.python.org/issue17537 (see linked thread therein). A 82 | # warning is easy enough to suppress and should ensure that the user 83 | # is at least aware of this behavior. 84 | if not len(self._fieldnames) == len(set(self._fieldnames)): 85 | warnings.warn( 86 | "fieldnames are not unique, some columns will be dropped." 87 | ) 88 | 89 | self.line_num = self.reader.line_num 90 | return self._fieldnames 91 | 92 | @fieldnames.setter 93 | def fieldnames(self, value: Sequence[_T]) -> None: 94 | self._fieldnames = value 95 | 96 | def __next__(self) -> "_DictReadMapping[Union[_T, Any], Union[str, Any]]": 97 | if self.line_num == 0: 98 | self.fieldnames 99 | row = next(self.reader) 100 | self.line_num = self.reader.line_num 101 | 102 | while row == []: 103 | row = next(self.reader) 104 | 105 | d: _DictReadMapping = OrderedDict(zip(self.fieldnames, row)) 106 | lf = len(self.fieldnames) 107 | lr = len(row) 108 | if lf < lr: 109 | d[self.restkey] = row[lf:] 110 | elif lf > lr: 111 | for key in self.fieldnames[lr:]: 112 | d[key] = self.restval 113 | return d 114 | 115 | 116 | class DictWriter(Generic[_T]): 117 | def __init__( 118 | self, 119 | f: SupportsWrite[str], 120 | fieldnames: Collection[_T], 121 | restval: Optional[Any] = "", 122 | extrasaction: Literal["raise", "ignore"] = "raise", 123 | dialect: "_DialectLike" = "excel", 124 | *args: Any, 125 | **kwds: Any, 126 | ): 127 | 
self.fieldnames = fieldnames 128 | self.restval = restval 129 | if extrasaction.lower() not in ("raise", "ignore"): 130 | raise ValueError( 131 | "extrasaction (%s) must be 'raise' or 'ignore'" % extrasaction 132 | ) 133 | self.extrasaction = extrasaction 134 | self.writer = writer(f, dialect, *args, **kwds) 135 | 136 | def writeheader(self) -> Any: 137 | header = dict(zip(self.fieldnames, self.fieldnames)) 138 | return self.writerow(header) 139 | 140 | def _dict_to_list(self, rowdict: Mapping[_T, Any]) -> Iterator[Any]: 141 | if self.extrasaction == "raise": 142 | wrong_fields = rowdict.keys() - self.fieldnames 143 | if wrong_fields: 144 | raise ValueError( 145 | "dict contains fields not in fieldnames: " 146 | + ", ".join([repr(x) for x in wrong_fields]) 147 | ) 148 | return (rowdict.get(key, self.restval) for key in self.fieldnames) 149 | 150 | def writerow(self, rowdict: Mapping[_T, Any]) -> Any: 151 | return self.writer.writerow(self._dict_to_list(rowdict)) 152 | 153 | def writerows(self, rowdicts: Iterable[Mapping[_T, Any]]) -> None: 154 | return self.writer.writerows(map(self._dict_to_list, rowdicts)) 155 | -------------------------------------------------------------------------------- /stubs/tabview/tabview.pyi: -------------------------------------------------------------------------------- 1 | import io 2 | 3 | from typing import Any 4 | 5 | basestring = str 6 | file = io.FileIO 7 | 8 | def KEY_CTRL(key): ... 9 | def addstr(*args): ... 10 | def insstr(*args): ... 11 | 12 | class ReloadException(Exception): 13 | start_pos: Any 14 | column_width_mode: Any 15 | column_gap: Any 16 | column_widths: Any 17 | search_str: Any 18 | def __init__( 19 | self, start_pos, column_width, column_gap, column_widths, search_str 20 | ) -> None: ... 21 | 22 | class QuitException(Exception): ... 
23 | 24 | class Viewer: 25 | scr: Any 26 | data: Any 27 | info: Any 28 | header_offset_orig: int 29 | header: Any 30 | header_offset: Any 31 | num_data_columns: Any 32 | column_width_mode: Any 33 | column_gap: Any 34 | trunc_char: Any 35 | num_columns: int 36 | vis_columns: int 37 | init_search: Any 38 | modifier: Any 39 | def __init__(self, *args, **kwargs) -> None: ... 40 | def column_xw(self, x): ... 41 | def quit(self) -> None: ... 42 | def reload(self) -> None: ... 43 | def consume_modifier(self, default: int = ...): ... 44 | def down(self) -> None: ... 45 | def up(self) -> None: ... 46 | def left(self) -> None: ... 47 | def right(self) -> None: ... 48 | y: Any 49 | win_y: Any 50 | def page_down(self) -> None: ... 51 | def page_up(self) -> None: ... 52 | x: Any 53 | win_x: Any 54 | def page_right(self) -> None: ... 55 | def page_left(self) -> None: ... 56 | def mark(self) -> None: ... 57 | def goto_mark(self) -> None: ... 58 | def home(self) -> None: ... 59 | def goto_y(self, y) -> None: ... 60 | def goto_row(self) -> None: ... 61 | def goto_x(self, x) -> None: ... 62 | def goto_col(self) -> None: ... 63 | def goto_yx(self, y, x) -> None: ... 64 | def line_home(self) -> None: ... 65 | def line_end(self) -> None: ... 66 | def show_cell(self) -> None: ... 67 | def show_info(self): ... 68 | textpad: Any 69 | search_str: Any 70 | def search(self) -> None: ... 71 | def search_results( 72 | self, rev: bool = ..., look_in_cur: bool = ... 73 | ) -> None: ... 74 | def search_results_prev( 75 | self, rev: bool = ..., look_in_cur: bool = ... 76 | ) -> None: ... 77 | def help(self) -> None: ... 78 | def toggle_header(self) -> None: ... 79 | def column_gap_down(self) -> None: ... 80 | def column_gap_up(self) -> None: ... 81 | column_width: Any 82 | def column_width_all_down(self) -> None: ... 83 | def column_width_all_up(self) -> None: ... 84 | def column_width_down(self) -> None: ... 85 | def column_width_up(self) -> None: ... 86 | def sort_by_column_numeric(self): ... 
87 | def sort_by_column_numeric_reverse(self): ... 88 | def sort_by_column(self) -> None: ... 89 | def sort_by_column_reverse(self) -> None: ... 90 | def sort_by_column_natural(self) -> None: ... 91 | def sort_by_column_natural_reverse(self) -> None: ... 92 | def sorted_nicely(self, ls, key, rev: bool = ...): ... 93 | def float_string_key(self, value): ... 94 | def toggle_column_width(self) -> None: ... 95 | def set_current_column_width(self) -> None: ... 96 | def yank_cell(self) -> None: ... 97 | keys: Any 98 | def define_keys(self) -> None: ... 99 | def run(self) -> None: ... 100 | def handle_keys(self) -> None: ... 101 | def handle_modifier(self, mod) -> None: ... 102 | def resize(self) -> None: ... 103 | def num_columns_fwd(self, x): ... 104 | def num_columns_rev(self, x): ... 105 | def recalculate_layout(self) -> None: ... 106 | def location_string(self, yp, xp): ... 107 | def display(self) -> None: ... 108 | def strpad(self, s, width): ... 109 | def hdrstr(self, x, width): ... 110 | def cellstr(self, y, x, width): ... 111 | def skip_to_row_change(self) -> None: ... 112 | def skip_to_row_change_reverse(self) -> None: ... 113 | def skip_to_col_change(self) -> None: ... 114 | def skip_to_col_change_reverse(self) -> None: ... 115 | 116 | class TextBox: 117 | scr: Any 118 | data: Any 119 | title: Any 120 | tdata: Any 121 | hid_rows: int 122 | def __init__(self, scr, data: str = ..., title: str = ...) -> None: ... 123 | def __call__(self) -> None: ... 124 | handlers: Any 125 | def setup_handlers(self) -> None: ... 126 | def run(self) -> None: ... 127 | def handle_key(self, key) -> None: ... 128 | def close(self) -> None: ... 129 | def scroll_down(self) -> None: ... 130 | def scroll_up(self) -> None: ... 131 | def display(self) -> None: ... 132 | 133 | def csv_sniff(data, enc): ... 134 | def fix_newlines(data): ... 135 | def adjust_space_delim(data, enc): ... 
136 | def process_data( 137 | data, 138 | enc: Any | None = ..., 139 | delim: Any | None = ..., 140 | quoting: Any | None = ..., 141 | quote_char=..., 142 | ): ... 143 | def data_list_or_file(data): ... 144 | def pad_data(d): ... 145 | def readme(): ... 146 | def detect_encoding(data: Any | None = ...): ... 147 | def main(stdscr, *args, **kwargs) -> None: ... 148 | def view( 149 | data, 150 | enc: Any | None = ..., 151 | start_pos=..., 152 | column_width: int = ..., 153 | column_gap: int = ..., 154 | trunc_char: str = ..., 155 | column_widths: Any | None = ..., 156 | search_str: Any | None = ..., 157 | double_width: bool = ..., 158 | delimiter: Any | None = ..., 159 | quoting: Any | None = ..., 160 | info: Any | None = ..., 161 | quote_char=..., 162 | ): ... 163 | def parse_path(path): ... 164 | -------------------------------------------------------------------------------- /clevercsv/console/commands/detect.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import json 4 | import sys 5 | import time 6 | 7 | from typing import Any 8 | from typing import Dict 9 | 10 | from wilderness import Command 11 | 12 | from clevercsv.wrappers import detect_dialect 13 | 14 | from ._docs import FLAG_DESCRIPTIONS 15 | from ._utils import parse_int 16 | 17 | 18 | class DetectCommand(Command): 19 | _description = "Detect the dialect of a CSV file." 
20 | 21 | def __init__(self) -> None: 22 | super().__init__( 23 | name="detect", 24 | title="Detect the dialect of a CSV file", 25 | description=self._description, 26 | extra_sections={"CleverCSV": "Part of the CleverCSV suite"}, 27 | ) 28 | 29 | def register(self) -> None: 30 | self.add_argument("path", help="Path to the CSV file") 31 | self.add_argument( 32 | "-c", 33 | "--consistency", 34 | action="store_true", 35 | help="Only use the consistency measure for detection.", 36 | description=( 37 | "By default, the dialect of CSV files is detected using " 38 | "atwo-step process. First, a strict set of checks is used to " 39 | "see if the file adheres to a very basic format (for example, " 40 | "when all cells in the file are integers). If none of these " 41 | "checks succeed, the data consistency measure of Van den " 42 | "Burg, et al. (2019) is used to detect the dialect. With this " 43 | "option, you can force the detection to always use the data " 44 | "consistency measure. This can be useful for testing or " 45 | "research purposes, for instance." 46 | ), 47 | ) 48 | self.add_argument( 49 | "-e", 50 | "--encoding", 51 | help="Set the encoding of the file", 52 | description=FLAG_DESCRIPTIONS["encoding"], 53 | ) 54 | self.add_argument( 55 | "-n", 56 | "--num-chars", 57 | help="Number of characters to use for detection", 58 | type=int, 59 | description=FLAG_DESCRIPTIONS["num-chars"], 60 | ) 61 | group = self.add_mutually_exclusive_group() 62 | group.add_argument( 63 | "-p", 64 | "--plain", 65 | action="store_true", 66 | help="Print the components of the dialect on separate lines", 67 | ) 68 | group.add_argument( 69 | "-j", 70 | "--json", 71 | action="store_true", 72 | help="Print the components of the dialect as a JSON object", 73 | description=( 74 | "Print the dialect to standard output in the form of a JSON " 75 | "object. This object will always have the 'delimiter', " 76 | "'quotechar', 'escapechar', and 'strict' keys. 
If " 77 | "--add-runtime is specified, it will also have a 'runtime' " 78 | "key." 79 | ), 80 | ) 81 | self.add_argument( 82 | "--no-skip", 83 | action="store_true", 84 | help="Don't skip type detection for dialects with a low pattern score", 85 | description=( 86 | "The data consistency score used for dialect detection " 87 | "consists of two components: a pattern score and a type " 88 | "score. The type score lies between 0 and 1. When computing " 89 | "the data consistency measures for different dialects, we " 90 | "skip the computation of the type score if we see that the " 91 | "pattern score is lower than the best data consistency score " 92 | "we've seen so far. This option can be used to disable this " 93 | "behaviour and compute the type score for all dialects. This " 94 | "is mainly useful for debugging and testing purposes." 95 | ), 96 | ) 97 | self.add_argument( 98 | "--add-runtime", 99 | action="store_true", 100 | help="Add the runtime of the detection to the detection output.", 101 | ) 102 | 103 | def handle(self) -> int: 104 | verbose = self.args.verbose 105 | num_chars = parse_int(self.args.num_chars, "num-chars") 106 | method = "consistency" if self.args.consistency else "auto" 107 | skip = not self.args.no_skip 108 | 109 | t_start = time.time() 110 | dialect = detect_dialect( 111 | self.args.path, 112 | num_chars=num_chars, 113 | encoding=self.args.encoding, 114 | verbose=verbose, 115 | method=method, 116 | skip=skip, 117 | ) 118 | runtime = time.time() - t_start 119 | 120 | if dialect is None: 121 | print("Error: Dialect detection failed.", file=sys.stderr) 122 | return 1 123 | 124 | if self.args.plain: 125 | print(f"delimiter = {dialect.delimiter}".strip()) 126 | print(f"quotechar = {dialect.quotechar}".strip()) 127 | print(f"escapechar = {dialect.escapechar}".strip()) 128 | if self.args.add_runtime: 129 | print(f"runtime = {runtime}") 130 | elif self.args.json: 131 | dialect_dict: Dict[str, Any] = dialect.to_dict() 132 | if 
self.args.add_runtime: 133 | dialect_dict["runtime"] = runtime 134 | print(json.dumps(dialect_dict)) 135 | else: 136 | print("Detected: " + str(dialect)) 137 | if self.args.add_runtime: 138 | print(f"Runtime: {runtime:.6f} seconds") 139 | return 0 140 | -------------------------------------------------------------------------------- /clevercsv/dialect.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Definitions for the dialect object. 6 | 7 | Author: Gertjan van den Burg 8 | 9 | """ 10 | 11 | import csv 12 | import functools 13 | import json 14 | 15 | from typing import Any 16 | from typing import Dict 17 | from typing import Optional 18 | from typing import Tuple 19 | from typing import Union 20 | 21 | import _csv 22 | 23 | excel = csv.excel 24 | excel_tab = csv.excel_tab 25 | unix_dialect = csv.unix_dialect 26 | 27 | 28 | @functools.total_ordering 29 | class SimpleDialect: 30 | """ 31 | The simplified dialect object. 32 | 33 | For the delimiter, quotechar, and escapechar the empty string means no 34 | delimiter/quotechar/escapechar in the file. None is used to mark it 35 | undefined. 36 | 37 | Parameters 38 | ---------- 39 | delimiter : str 40 | The delimiter of the CSV file. 41 | 42 | quotechar : str 43 | The quotechar of the file. 44 | 45 | escapechar : str 46 | The escapechar of the file. 47 | 48 | strict : bool 49 | Whether strict parsing should be enforced. Same as in the csv module. 

    """

    def __init__(
        self,
        delimiter: Optional[str],
        quotechar: Optional[str],
        escapechar: Optional[str],
        strict: bool = False,
    ):
        # None marks a component as undefined; the empty string means the
        # file has no such component (see the class docstring).
        self.delimiter = delimiter
        self.quotechar = quotechar
        self.escapechar = escapechar
        self.strict = strict

    def validate(self) -> None:
        """Raise ValueError unless every component is a defined str of
        length zero or one and strict is a bool."""
        if self.delimiter is None or len(self.delimiter) > 1:
            raise ValueError(
                "Delimiter should be zero or one characters, got: %r"
                % self.delimiter
            )
        if self.quotechar is None or len(self.quotechar) > 1:
            raise ValueError(
                "Quotechar should be zero or one characters, got: %r"
                % self.quotechar
            )
        if self.escapechar is None or len(self.escapechar) > 1:
            raise ValueError(
                "Escapechar should be zero or one characters, got: %r"
                % self.escapechar
            )
        # NOTE(review): since bool is an int subclass, 0/1 also pass this
        # membership test.
        if self.strict not in set([False, True]):
            raise ValueError(
                "Strict should be True or False, got: %r" % self.strict
            )

    @classmethod
    def from_dict(
        cls: type["SimpleDialect"], d: Dict[str, Any]
    ) -> "SimpleDialect":
        """Build a SimpleDialect from a dict with the keys 'delimiter',
        'quotechar', 'escapechar', and 'strict' (inverse of to_dict)."""
        dialect = cls(
            d["delimiter"], d["quotechar"], d["escapechar"], strict=d["strict"]
        )
        return dialect

    @classmethod
    def from_csv_dialect(
        cls: type["SimpleDialect"],
        d: Union[_csv.Dialect, csv.Dialect],
    ) -> "SimpleDialect":
        """Convert a stdlib csv dialect: None components become the empty
        string, and QUOTE_NONE maps to an empty quotechar."""
        delimiter = "" if d.delimiter is None else d.delimiter
        quotechar = "" if d.quoting == csv.QUOTE_NONE else d.quotechar
        escapechar = "" if d.escapechar is None else d.escapechar
        return cls(delimiter, quotechar, escapechar, strict=d.strict)

    def to_csv_dialect(self) -> csv.Dialect:
        """Convert to a stdlib csv.Dialect instance (inverse mapping of
        from_csv_dialect; empty quotechar becomes QUOTE_NONE)."""
        # The class body below evaluates at definition time, closing over
        # self; the assert rules out an undefined delimiter.
        class dialect(csv.Dialect):
            assert self.delimiter is not None
            delimiter = self.delimiter
            quotechar = '"' if self.quotechar == "" else self.quotechar
            escapechar = None if self.escapechar == "" else self.escapechar
            doublequote = True
            quoting = (
                csv.QUOTE_NONE if self.quotechar == "" else csv.QUOTE_MINIMAL
            )
            skipinitialspace = False
            # TODO: We need to set this because it can't be None anymore in
            # recent versions of Python
            lineterminator = "\n"

        return dialect()

    def to_dict(self) -> Dict[str, Union[str, bool, None]]:
        """Return the four components as a dict; validates first, so all
        components must be defined."""
        self.validate()
        d = dict(
            delimiter=self.delimiter,
            quotechar=self.quotechar,
            escapechar=self.escapechar,
            strict=self.strict,
        )
        return d

    def serialize(self) -> str:
        """Serialize dialect to a JSON object"""
        return json.dumps(self.to_dict())

    @classmethod
    def deserialize(cls: type["SimpleDialect"], obj: str) -> "SimpleDialect":
        """Deserialize dialect from a JSON object"""
        return cls.from_dict(json.loads(obj))

    def __repr__(self) -> str:
        # strict is deliberately omitted from the repr.
        return "SimpleDialect(%r, %r, %r)" % (
            self.delimiter,
            self.quotechar,
            self.escapechar,
        )

    def __key(
        self,
    ) -> Tuple[Optional[str], Optional[str], Optional[str], bool]:
        # Single source of truth for equality, hashing, and ordering.
        return (self.delimiter, self.quotechar, self.escapechar, self.strict)

    def __hash__(self) -> int:
        return hash(self.__key())

    def __eq__(self, other: Any) -> bool:
        if not isinstance(other, SimpleDialect):
            return False
        return self.__key() == other.__key()

    def __lt__(self, other: Any) -> bool:
        # This provides a partial order on dialect objects with the goal of
        # speeding up the consistency measure.
        # Dialects with delimiter "," sort first, then ";", then everything
        # else by tuple comparison of __key. Non-SimpleDialect operands
        # yield False rather than NotImplemented.
        if not isinstance(other, SimpleDialect):
            return False
        if self.delimiter == "," and not other.delimiter == ",":
            return True
        elif other.delimiter == "," and not self.delimiter == ",":
            return False
        if self.delimiter == ";" and not other.delimiter == ";":
            return True
        elif other.delimiter == ";" and not self.delimiter == ";":
            return False
        return self.__key() < other.__key()
--------------------------------------------------------------------------------
/stubs/pandas/__init__.pyi:
--------------------------------------------------------------------------------
from typing import Any

from pandas._config import describe_option as describe_option
from pandas._config import get_option as get_option
from pandas._config import option_context as option_context
from pandas._config import options as options
from pandas._config import reset_option as reset_option
from pandas._config import set_option as set_option
from pandas.core.api import NA as NA
from pandas.core.api import BooleanDtype as BooleanDtype
from pandas.core.api import Categorical as Categorical
from pandas.core.api import CategoricalDtype as CategoricalDtype
from pandas.core.api import CategoricalIndex as CategoricalIndex
from pandas.core.api import DataFrame as DataFrame
from pandas.core.api import DateOffset as DateOffset
from pandas.core.api import DatetimeIndex as DatetimeIndex
from pandas.core.api import DatetimeTZDtype as DatetimeTZDtype
from pandas.core.api import Flags as Flags
from pandas.core.api import Float32Dtype as Float32Dtype
from pandas.core.api import Float64Dtype as Float64Dtype
from pandas.core.api import Float64Index as Float64Index
from pandas.core.api import Grouper as Grouper
from pandas.core.api import Index as Index
from pandas.core.api import IndexSlice as IndexSlice
from pandas.core.api import
Int8Dtype as Int8Dtype 26 | from pandas.core.api import Int16Dtype as Int16Dtype 27 | from pandas.core.api import Int32Dtype as Int32Dtype 28 | from pandas.core.api import Int64Dtype as Int64Dtype 29 | from pandas.core.api import Int64Index as Int64Index 30 | from pandas.core.api import Interval as Interval 31 | from pandas.core.api import IntervalDtype as IntervalDtype 32 | from pandas.core.api import IntervalIndex as IntervalIndex 33 | from pandas.core.api import MultiIndex as MultiIndex 34 | from pandas.core.api import NamedAgg as NamedAgg 35 | from pandas.core.api import NaT as NaT 36 | from pandas.core.api import Period as Period 37 | from pandas.core.api import PeriodDtype as PeriodDtype 38 | from pandas.core.api import PeriodIndex as PeriodIndex 39 | from pandas.core.api import RangeIndex as RangeIndex 40 | from pandas.core.api import Series as Series 41 | from pandas.core.api import StringDtype as StringDtype 42 | from pandas.core.api import Timedelta as Timedelta 43 | from pandas.core.api import TimedeltaIndex as TimedeltaIndex 44 | from pandas.core.api import Timestamp as Timestamp 45 | from pandas.core.api import UInt8Dtype as UInt8Dtype 46 | from pandas.core.api import UInt16Dtype as UInt16Dtype 47 | from pandas.core.api import UInt32Dtype as UInt32Dtype 48 | from pandas.core.api import UInt64Dtype as UInt64Dtype 49 | from pandas.core.api import UInt64Index as UInt64Index 50 | from pandas.core.api import array as array 51 | from pandas.core.api import bdate_range as bdate_range 52 | from pandas.core.api import date_range as date_range 53 | from pandas.core.api import factorize as factorize 54 | from pandas.core.api import interval_range as interval_range 55 | from pandas.core.api import isna as isna 56 | from pandas.core.api import isnull as isnull 57 | from pandas.core.api import notna as notna 58 | from pandas.core.api import notnull as notnull 59 | from pandas.core.api import period_range as period_range 60 | from pandas.core.api import 
set_eng_float_format as set_eng_float_format 61 | from pandas.core.api import timedelta_range as timedelta_range 62 | from pandas.core.api import to_datetime as to_datetime 63 | from pandas.core.api import to_numeric as to_numeric 64 | from pandas.core.api import to_timedelta as to_timedelta 65 | from pandas.core.api import unique as unique 66 | from pandas.core.api import value_counts as value_counts 67 | from pandas.core.arrays.sparse import SparseDtype as SparseDtype 68 | from pandas.core.computation.api import eval as eval 69 | from pandas.core.reshape.api import concat as concat 70 | from pandas.core.reshape.api import crosstab as crosstab 71 | from pandas.core.reshape.api import cut as cut 72 | from pandas.core.reshape.api import get_dummies as get_dummies 73 | from pandas.core.reshape.api import lreshape as lreshape 74 | from pandas.core.reshape.api import melt as melt 75 | from pandas.core.reshape.api import merge as merge 76 | from pandas.core.reshape.api import merge_asof as merge_asof 77 | from pandas.core.reshape.api import merge_ordered as merge_ordered 78 | from pandas.core.reshape.api import pivot as pivot 79 | from pandas.core.reshape.api import pivot_table as pivot_table 80 | from pandas.core.reshape.api import qcut as qcut 81 | from pandas.core.reshape.api import wide_to_long as wide_to_long 82 | from pandas.io.api import ExcelFile as ExcelFile 83 | from pandas.io.api import ExcelWriter as ExcelWriter 84 | from pandas.io.api import HDFStore as HDFStore 85 | from pandas.io.api import read_clipboard as read_clipboard 86 | from pandas.io.api import read_csv as read_csv 87 | from pandas.io.api import read_excel as read_excel 88 | from pandas.io.api import read_feather as read_feather 89 | from pandas.io.api import read_fwf as read_fwf 90 | from pandas.io.api import read_gbq as read_gbq 91 | from pandas.io.api import read_hdf as read_hdf 92 | from pandas.io.api import read_html as read_html 93 | from pandas.io.api import read_json as read_json 94 | 
from pandas.io.api import read_orc as read_orc 95 | from pandas.io.api import read_parquet as read_parquet 96 | from pandas.io.api import read_pickle as read_pickle 97 | from pandas.io.api import read_sas as read_sas 98 | from pandas.io.api import read_spss as read_spss 99 | from pandas.io.api import read_sql as read_sql 100 | from pandas.io.api import read_sql_query as read_sql_query 101 | from pandas.io.api import read_sql_table as read_sql_table 102 | from pandas.io.api import read_stata as read_stata 103 | from pandas.io.api import read_table as read_table 104 | from pandas.io.api import to_pickle as to_pickle 105 | from pandas.tseries import offsets as offsets 106 | from pandas.tseries.api import infer_freq as infer_freq 107 | from pandas.util._print_versions import show_versions as show_versions 108 | from pandas.util._tester import test as test 109 | 110 | __docformat__: str 111 | hard_dependencies: Any 112 | missing_dependencies: Any 113 | module: Any 114 | v: Any 115 | __git_version__: Any 116 | 117 | def __getattr__(name: Any): ... 118 | 119 | # __doc__: str 120 | -------------------------------------------------------------------------------- /tests/test_unit/test_detect.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Unit tests for the dialect detection. 5 | 6 | Author: Gertjan van den Burg 7 | 8 | """ 9 | 10 | import unittest 11 | 12 | from clevercsv.detect import Detector 13 | 14 | 15 | class DetectorTestCase(unittest.TestCase): 16 | # Initially we copy the results from CPython test suite. 

    # Comma-delimited, unquoted (from the CPython csv test suite).
    sample1 = """\
Harry's, Arlington Heights, IL, 2/1/03, Kimi Hayes
Shark City, Glendale Heights, IL, 12/28/02, Prezence
Tommy's Place, Blue Island, IL, 12/28/02, Blue Sunday/White Crow
Stonecutters Seafood and Chop House, Lemont, IL, 12/19/02, Week Back
"""
    # Colon-delimited, single-quoted with doubled-quote escaping.
    sample2 = """\
'Harry''s':'Arlington Heights':'IL':'2/1/03':'Kimi Hayes'
'Shark City':'Glendale Heights':'IL':'12/28/02':'Prezence'
'Tommy''s Place':'Blue Island':'IL':'12/28/02':'Blue Sunday/White Crow'
'Stonecutters ''Seafood'' and Chop House':'Lemont':'IL':'12/19/02':'Week Back'
"""
    # Header row matching sample1's five columns.
    header1 = """\
"venue","city","state","date","performers"
"""
    # '?'-delimited; cells also contain '/', used to exercise the
    # delimiters= restriction in test_delimiters.
    sample3 = """\
05/05/03?05/05/03?05/05/03?05/05/03?05/05/03?05/05/03
05/05/03?05/05/03?05/05/03?05/05/03?05/05/03?05/05/03
05/05/03?05/05/03?05/05/03?05/05/03?05/05/03?05/05/03
"""

    # ';'-delimited numeric data.
    sample4 = """\
2147483648;43.0e12;17;abc;def
147483648;43.0e2;17;abc;def
47483648;43.0;170;abc;def
"""

    sample5 = "aaa\tbbb\r\nAAA\t\r\nBBB\t\r\n"
    sample6 = "a|b|c\r\nd|e|f\r\n"
    sample7 = "'a'|'b'|'c'\r\n'd'|e|f\r\n"

    # '+'-delimited header/data pair for the regex-special-delimiter tests.
    header2 = """\
"venue"+"city"+"state"+"date"+"performers"
"""
    sample8 = """\
Harry's+ Arlington Heights+ IL+ 2/1/03+ Kimi Hayes
Shark City+ Glendale Heights+ IL+ 12/28/02+ Prezence
Tommy's Place+ Blue Island+ IL+ 12/28/02+ Blue Sunday/White Crow
Stonecutters Seafood and Chop House+ Lemont+ IL+ 12/19/02+ Week Back
"""
    # adapted to be not broken
    sample9 = """\
'Harry''s'+ 'Arlington Heights'+ 'IL'+ '2/1/03'+ 'Kimi Hayes'
'Shark City'+ 'Glendale Heights'+' IL'+ '12/28/02'+ 'Prezence'
'Tommy''s Place'+ 'Blue Island'+ 'IL'+ '12/28/02'+ 'Blue Sunday/White Crow'
'Stonecutters ''Seafood'' and Chop House'+ 'Lemont'+ 'IL'+ '12/19/02'+ 'Week Back'
"""

    # Comma-delimited, unquoted, with parentheses/quotes inside cells.
    sample10 = """\
bytearray(b'fake data'),20:53:06,2019-09-01T19:28:21
bytearray(b'fake data'),19:33:15,2005-02-15T19:10:31
bytearray(b'fake data'),10:43:05,1992-10-12T14:49:24
bytearray(b'fake data'),10:36:49,1999-07-18T17:27:55
bytearray(b'fake data'),03:33:35,1982-04-24T17:38:45
bytearray(b'fake data'),14:49:47,1983-01-05T22:17:42
bytearray(b'fake data'),10:35:30,2006-10-27T02:30:45
"""

    # Double-quoted JSON cells with doubled-quote escaping.
    sample11 = """\
"{""fake"": ""json"", ""fake2"":""json2""}",13:31:38,06:00:04+01:00
"{""fake"": ""json"", ""fake2"":""json2""}",22:13:29,14:20:11+02:00
"{""fake"": ""json"", ""fake2"":""json2""}",04:37:27,22:04:28+03:00
"{""fake"": ""json"", ""fake2"":""json2""}",04:25:28,23:12:53+01:00
"{""fake"": ""json"", ""fake2"":""json2""}",21:04:15,08:23:58+02:00
"{""fake"": ""json"", ""fake2"":""json2""}",10:37:03,11:06:42+05:30
"{""fake"": ""json"", ""fake2"":""json2""}",10:17:24,23:38:47+06:00
"{""fake"": ""json"", ""fake2"":""json2""}",00:02:51,20:04:45-06:00
"""

    def test_detect(self) -> None:
        # Adapted from CPython
        # Basic detection: delimiter, quotechar, and escapechar for an
        # unquoted and a quoted sample.
        detector = Detector()
        dialect = detector.detect(self.sample1)
        assert dialect is not None
        self.assertEqual(dialect.delimiter, ",")
        self.assertEqual(dialect.quotechar, "")
        self.assertEqual(dialect.escapechar, "")

        dialect = detector.detect(self.sample2)
        assert dialect is not None
        self.assertEqual(dialect.delimiter, ":")
        self.assertEqual(dialect.quotechar, "'")
        self.assertEqual(dialect.escapechar, "")

    def test_delimiters(self) -> None:
        # Adapted from CPython
        # Detection across many delimiters; the delimiters= argument
        # restricts the candidate set (sample3 can resolve to '?' or '/').
        detector = Detector()
        dialect = detector.detect(self.sample3)
        assert dialect is not None
        self.assertIn(dialect.delimiter, self.sample3)
        dialect = detector.detect(self.sample3, delimiters="?,")
        assert dialect is not None
        self.assertEqual(dialect.delimiter, "?")
        dialect = detector.detect(self.sample3, delimiters="/,")
        assert dialect is not None
        self.assertEqual(dialect.delimiter, "/")
        dialect = detector.detect(self.sample4)
        assert dialect is not None
        self.assertEqual(dialect.delimiter, ";")
        dialect = detector.detect(self.sample5)
        assert dialect is not None
        self.assertEqual(dialect.delimiter, "\t")
        dialect = detector.detect(self.sample6)
        assert dialect is not None
        self.assertEqual(dialect.delimiter, "|")
        dialect = detector.detect(self.sample7)
        assert dialect is not None
        self.assertEqual(dialect.delimiter, "|")
        self.assertEqual(dialect.quotechar, "'")
        dialect = detector.detect(self.sample8)
        assert dialect is not None
        self.assertEqual(dialect.delimiter, "+")
        dialect = detector.detect(self.sample9)
        assert dialect is not None
        self.assertEqual(dialect.delimiter, "+")
        self.assertEqual(dialect.quotechar, "'")
        dialect = detector.detect(self.sample10)
        assert dialect is not None
        self.assertEqual(dialect.delimiter, ",")
        self.assertEqual(dialect.quotechar, "")
        dialect = detector.detect(self.sample11)
        assert dialect is not None
        self.assertEqual(dialect.delimiter, ",")
        self.assertEqual(dialect.quotechar, '"')

    def test_has_header(self) -> None:
        # Header detection: data-only sample vs. header + data.
        detector = Detector()
        self.assertEqual(detector.has_header(self.sample1), False)
        self.assertEqual(
            detector.has_header(self.header1 + self.sample1), True
        )

    def test_has_header_regex_special_delimiter(self) -> None:
        # Same check with '+' as delimiter, which is special in regexes.
        detector = Detector()
        self.assertEqual(detector.has_header(self.sample8), False)
        self.assertEqual(
            detector.has_header(self.header2 + self.sample8), True
        )


if __name__ == "__main__":
    unittest.main()
--------------------------------------------------------------------------------
/stubs/wilderness/__init__.pyi:
--------------------------------------------------------------------------------
import abc 2 | import argparse 3 | 4 | from typing import Dict 5 | from typing import List 6 | from typing import Optional 7 | from typing import TextIO 8 | 9 | class DocumentableMixin(metaclass=abc.ABCMeta): 10 | def __init__( 11 | self, 12 | description: Optional[str] = None, 13 | extra_sections: Optional[Dict[str, str]] = None, 14 | options_prolog: Optional[str] = None, 15 | options_epilog: Optional[str] = None, 16 | ) -> None: ... 17 | @property 18 | def description(self) -> Optional[str]: ... 19 | @property 20 | def parser(self) -> argparse.ArgumentParser: ... 21 | @parser.setter 22 | def parser(self, parser: argparse.ArgumentParser) -> None: ... 23 | @property 24 | def args(self) -> argparse.Namespace: ... 25 | @args.setter 26 | def args(self, args: argparse.Namespace) -> None: ... 27 | @property 28 | def argument_help(self) -> Dict[str, Optional[str]]: ... 29 | 30 | class Application(DocumentableMixin): 31 | def __init__( 32 | self, 33 | name: str, 34 | version: str, 35 | author: Optional[str] = None, 36 | title: Optional[str] = None, 37 | description: Optional[str] = None, 38 | default_command: Optional[str] = None, 39 | add_help: bool = True, 40 | extra_sections: Optional[Dict[str, str]] = None, 41 | prolog: Optional[str] = None, 42 | epilog: Optional[str] = None, 43 | options_prolog: Optional[str] = None, 44 | options_epilog: Optional[str] = None, 45 | add_commands_section: bool = False, 46 | ) -> None: ... 47 | @property 48 | def name(self) -> str: ... 49 | @property 50 | def author(self) -> str: ... 51 | @property 52 | def version(self) -> str: ... 53 | @property 54 | def commands(self) -> List[Command]: ... 55 | @property 56 | def groups(self) -> List[Group]: ... 57 | def add_argument(self, *args, **kwargs) -> argparse.Action: ... 58 | def add(self, command: Command): ... 59 | def add_group(self, title: str) -> Group: ... 60 | def register(self): ... 61 | def handle(self) -> int: ... 
62 | def run( 63 | self, 64 | args: Optional[List[str]] = None, 65 | namespace: Optional[argparse.Namespace] = None, 66 | exit_on_error: bool = True, 67 | ) -> int: ... 68 | def run_command(self, command: Command) -> int: ... 69 | def get_command(self, command_name: str) -> Command: ... 70 | def set_prolog(self, prolog: str) -> None: ... 71 | def set_epilog(self, epilog: str) -> None: ... 72 | def get_commands_text(self) -> str: ... 73 | def create_manpage(self) -> ManPage: ... 74 | def format_help(self) -> str: ... 75 | def print_help(self, file: Optional[TextIO] = None) -> None: ... 76 | 77 | class Group: 78 | def __init__( 79 | self, title: Optional[str] = None, is_root: bool = False 80 | ) -> None: ... 81 | @property 82 | def application(self) -> Optional[Application]: ... 83 | @property 84 | def title(self) -> Optional[str]: ... 85 | @property 86 | def commands(self) -> List[Command]: ... 87 | @property 88 | def is_root(self) -> bool: ... 89 | def commands_as_actions(self) -> List[argparse.Action]: ... 90 | def set_app(self, app: Application) -> None: ... 91 | def add(self, command: Command) -> None: ... 92 | def __len__(self) -> int: ... 93 | 94 | class Command(DocumentableMixin, metaclass=abc.ABCMeta): 95 | def __init__( 96 | self, 97 | name: str, 98 | title: Optional[str] = None, 99 | description: Optional[str] = None, 100 | add_help: bool = True, 101 | extra_sections: Optional[Dict[str, str]] = None, 102 | options_prolog: Optional[str] = None, 103 | options_epilog: Optional[str] = None, 104 | ) -> None: ... 105 | @property 106 | def application(self) -> Optional[Application]: ... 107 | @property 108 | def name(self) -> str: ... 109 | @property 110 | def title(self) -> Optional[str]: ... 111 | def add_argument(self, *args, **kwargs) -> None: ... 112 | def add_argument_group(self, *args, **kwargs) -> ArgumentGroup: ... 113 | def add_mutually_exclusive_group( 114 | self, *args, **kwargs 115 | ) -> MutuallyExclusiveGroup: ... 
116 | def register(self) -> None: ... 117 | @abc.abstractmethod 118 | def handle(self) -> int: ... 119 | def create_manpage(self) -> ManPage: ... 120 | 121 | class ManPage: 122 | def __init__( 123 | self, 124 | application_name: str, 125 | author: Optional[str] = "", 126 | command_name: Optional[str] = None, 127 | date: Optional[str] = None, 128 | title: Optional[str] = None, 129 | version: Optional[str] = "", 130 | ) -> None: ... 131 | @property 132 | def name(self) -> str: ... 133 | def metadata(self) -> List[str]: ... 134 | def preamble(self) -> List[str]: ... 135 | def header(self) -> str: ... 136 | def section_name(self) -> str: ... 137 | def add_section_synopsis(self, synopsis: str) -> None: ... 138 | def add_section(self, label: str, text: str) -> None: ... 139 | def groffify(self, text: str) -> str: ... 140 | def groffify_line(self, line: str) -> str: ... 141 | def export(self, output_dir: str) -> str: ... 142 | 143 | class ArgumentGroup: 144 | def __init__(self, group: argparse._ArgumentGroup) -> None: ... 145 | @property 146 | def command(self) -> Optional[Command]: ... 147 | @command.setter 148 | def command(self, command: Command) -> None: ... 149 | def add_argument(self, *args, **kwargs) -> None: ... 150 | 151 | class MutuallyExclusiveGroup: 152 | def __init__(self, meg: argparse._MutuallyExclusiveGroup) -> None: ... 153 | @property 154 | def command(self) -> Optional[Command]: ... 155 | @command.setter 156 | def command(self, command: Command) -> None: ... 157 | def add_argument(self, *args, **kwargs) -> None: ... 158 | 159 | class Tester: 160 | def __init__(self, app: Application) -> None: ... 161 | @property 162 | def application(self) -> Application: ... 163 | def clear(self) -> None: ... 164 | def get_return_code(self) -> Optional[int]: ... 165 | def get_stdout(self) -> Optional[str]: ... 166 | def get_stderr(self) -> Optional[str]: ... 167 | def test_command(self, cmd_name: str, args: List[str]) -> None: ... 
# -*- coding: utf-8 -*-

"""
Unit tests for the pattern score.

Author: Gertjan van den Burg

"""

import unittest

from clevercsv import detect_pattern
from clevercsv.dialect import SimpleDialect


class PatternTestCase(unittest.TestCase):
    """Tests for the row-pattern abstraction and the pattern score."""

    # --- Abstraction tests ---

    def test_abstraction_1(self) -> None:
        dialect = SimpleDialect(delimiter=",", quotechar="", escapechar="")
        result = detect_pattern.make_abstraction("A,B,C", dialect)
        self.assertEqual("CDCDC", result)

    def test_abstraction_2(self) -> None:
        dialect = SimpleDialect(delimiter=",", quotechar="", escapechar="")
        result = detect_pattern.make_abstraction("A,\rA,A,A\r", dialect)
        self.assertEqual("CDCRCDCDC", result)

    def test_abstraction_3(self) -> None:
        dialect = SimpleDialect(delimiter=",", quotechar="", escapechar="")
        result = detect_pattern.make_abstraction("a,a,\n,a,a\ra,a,a\r\n", dialect)
        self.assertEqual("CDCDCRCDCDCRCDCDC", result)

    def test_abstraction_4(self) -> None:
        dialect = SimpleDialect(delimiter=",", quotechar='"', escapechar="")
        result = detect_pattern.make_abstraction(
            'a,"bc""d""e""f""a",\r\n', dialect
        )
        self.assertEqual("CDCDC", result)

    def test_abstraction_5(self) -> None:
        dialect = SimpleDialect(delimiter=",", quotechar='"', escapechar="|")
        result = detect_pattern.make_abstraction('a,"bc""d"",|"f|""', dialect)
        self.assertEqual("CDC", result)

    def test_abstraction_6(self) -> None:
        dialect = SimpleDialect(delimiter=",", quotechar="", escapechar="")
        result = detect_pattern.make_abstraction(",,,", dialect)
        self.assertEqual("CDCDCDC", result)

    def test_abstraction_7(self) -> None:
        dialect = SimpleDialect(delimiter=",", quotechar='"', escapechar="")
        result = detect_pattern.make_abstraction(',"",,', dialect)
        self.assertEqual("CDCDCDC", result)

    def test_abstraction_8(self) -> None:
        dialect = SimpleDialect(delimiter=",", quotechar='"', escapechar="")
        result = detect_pattern.make_abstraction(',"",,\r\n', dialect)
        self.assertEqual("CDCDCDC", result)

    # --- Escape char tests ---

    def test_abstraction_9(self) -> None:
        dialect = SimpleDialect(delimiter=",", quotechar="", escapechar="|")
        result = detect_pattern.make_abstraction("A,B|,C", dialect)
        self.assertEqual("CDC", result)

    def test_abstraction_10(self) -> None:
        dialect = SimpleDialect(delimiter=",", quotechar='"', escapechar="|")
        result = detect_pattern.make_abstraction('A,"B,C|"D"', dialect)
        self.assertEqual("CDC", result)

    def test_abstraction_11(self) -> None:
        dialect = SimpleDialect(delimiter=",", quotechar="", escapechar="|")
        result = detect_pattern.make_abstraction("a,|b,c", dialect)
        self.assertEqual("CDCDC", result)

    def test_abstraction_12(self) -> None:
        dialect = SimpleDialect(delimiter=",", quotechar="", escapechar="|")
        result = detect_pattern.make_abstraction("a,b|,c", dialect)
        self.assertEqual("CDC", result)

    def test_abstraction_13(self) -> None:
        dialect = SimpleDialect(delimiter=",", quotechar='"', escapechar="|")
        result = detect_pattern.make_abstraction('a,"b,c|""', dialect)
        self.assertEqual("CDC", result)

    def test_abstraction_14(self) -> None:
        dialect = SimpleDialect(delimiter=",", quotechar="", escapechar="|")
        result = detect_pattern.make_abstraction("a,b||c", dialect)
        self.assertEqual("CDC", result)

    def test_abstraction_15(self) -> None:
        dialect = SimpleDialect(delimiter=",", quotechar='"', escapechar="|")
        result = detect_pattern.make_abstraction('a,"b|"c||d|"e"', dialect)
        self.assertEqual("CDC", result)

    def test_abstraction_16(self) -> None:
        dialect = SimpleDialect(delimiter=",", quotechar='"', escapechar="|")
        result = detect_pattern.make_abstraction('a,"b|"c||d","e"', dialect)
        self.assertEqual("CDCDC", result)

    # --- Fill empties ---

    def test_fill_empties_1(self) -> None:
        result = detect_pattern.fill_empties("DDD")
        self.assertEqual("CDCDCDC", result)

    # --- Pattern score tests ---

    def test_pattern_score_1(self) -> None:
        # theta_1 from paper
        data = (
            "7,5; Mon, Jan 12;6,40\n100; Fri, Mar 21;8,23\n8,2; Thu, Sep 17;"
            '2,71\n538,0;;7,26\n"NA"; Wed, Oct 4;6,93'
        )
        dialect = SimpleDialect(delimiter=",", quotechar="", escapechar="")
        result = detect_pattern.pattern_score(data, dialect)
        self.assertAlmostEqual(7 / 4, result)

    def test_pattern_score_2(self) -> None:
        # theta_2 from paper
        data = (
            "7,5; Mon, Jan 12;6,40\n100; Fri, Mar 21;8,23\n8,2; Thu, Sep 17;"
            '2,71\n538,0;;7,26\n"NA"; Wed, Oct 4;6,93'
        )
        dialect = SimpleDialect(delimiter=";", quotechar="", escapechar="")
        result = detect_pattern.pattern_score(data, dialect)
        self.assertAlmostEqual(10 / 3, result)

    def test_pattern_score_3(self) -> None:
        # theta_3 from paper
        data = (
            "7,5; Mon, Jan 12;6,40\n100; Fri, Mar 21;8,23\n8,2; Thu, Sep 17;"
            '2,71\n538,0;;7,26\n"NA"; Wed, Oct 4;6,93'
        )
        dialect = SimpleDialect(delimiter=";", quotechar='"', escapechar="")
        result = detect_pattern.pattern_score(data, dialect)
        self.assertAlmostEqual(10 / 3, result)


if __name__ == "__main__":
    unittest.main()
# -*- coding: utf-8 -*-

"""
Integration tests for dialect detection.

Author: G.J.J. van den Burg

"""

import argparse
import gzip
import json
import multiprocessing
import os
import time
import warnings

from typing import Any
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple

import chardet
import termcolor

import clevercsv

from clevercsv.dialect import SimpleDialect

THIS_DIR = os.path.abspath(os.path.dirname(__file__))
SOURCE_DIR = os.path.join(THIS_DIR, "data")
TEST_FILES = os.path.join(SOURCE_DIR, "files")
TEST_DIALECTS = os.path.join(SOURCE_DIR, "dialects")

LOG_SUCCESS = os.path.join(THIS_DIR, "success.log")
LOG_ERROR = os.path.join(THIS_DIR, "error.log")
LOG_FAILED = os.path.join(THIS_DIR, "failed.log")
LOG_METHOD = os.path.join(THIS_DIR, "method.log")
LOG_RUNTIME = os.path.join(THIS_DIR, "runtime.log")

LOG_SUCCESS_PARTIAL = os.path.join(THIS_DIR, "success_partial.log")
LOG_ERROR_PARTIAL = os.path.join(THIS_DIR, "error_partial.log")
LOG_FAILED_PARTIAL = os.path.join(THIS_DIR, "failed_partial.log")
LOG_METHOD_PARTIAL = os.path.join(THIS_DIR, "method_partial.log")
LOG_RUNTIME_PARTIAL = os.path.join(THIS_DIR, "runtime_partial.log")

TIMEOUT = 5 * 60
N_BYTES_PARTIAL = 10000


def log_result(name: str, kind: str, verbose: int, partial: bool) -> None:
    """Append *name* to the log file for outcome *kind*.

    kind must be one of "error", "success", or "failure". When *verbose* is
    truthy the name is also printed in the corresponding color.
    """
    table = {
        "error": (LOG_ERROR, LOG_ERROR_PARTIAL, "yellow"),
        "success": (LOG_SUCCESS, LOG_SUCCESS_PARTIAL, "green"),
        "failure": (LOG_FAILED, LOG_FAILED_PARTIAL, "red"),
    }
    assert kind in table
    outfull, outpartial, color = table[kind]
    fname = outpartial if partial else outfull

    with open(fname, "a") as fp:
        fp.write(name + "\n")
    if verbose:
        termcolor.cprint(name, color=color)


def log_method(name: str, method: str, partial: bool) -> None:
    """Record which detection method was used for test case *name*."""
    fname = LOG_METHOD_PARTIAL if partial else LOG_METHOD
    with open(fname, "a") as fp:
        fp.write(f"{name},{method}\n")


def log_runtime(name: str, runtime: float, partial: bool) -> None:
    """Record the detection runtime (in seconds) for test case *name*."""
    fname = LOG_RUNTIME_PARTIAL if partial else LOG_RUNTIME
    with open(fname, "a") as fp:
        fp.write(f"{name},{runtime}\n")


def worker(
    args: Tuple[Any, ...], return_dict: Dict[str, Any], **kwargs: Any
) -> None:
    """Run dialect detection in a child process.

    Results are communicated back through the manager dict *return_dict*.
    NOTE: the defaults are written into return_dict *before* any work is
    done, so the parent always finds the keys even if this process dies
    mid-detection. (Annotation fixed: callers pass a tuple, not a list.)
    """
    det = clevercsv.Detector()
    filename, encoding, partial = args
    return_dict["error"] = False
    return_dict["dialect"] = None
    return_dict["method"] = None
    return_dict["runtime"] = float("nan")
    with gzip.open(filename, "rt", newline="", encoding=encoding) as fp:
        data = fp.read(N_BYTES_PARTIAL) if partial else fp.read()
    try:
        t = time.time()
        return_dict["dialect"] = det.detect(data, **kwargs)
        return_dict["runtime"] = time.time() - t
        return_dict["method"] = det.method_.value
    except clevercsv.Error:
        return_dict["error"] = True


def run_with_timeout(
    args: Tuple[Any, ...], kwargs: Dict[str, Any], limit: Optional[int]
) -> Tuple[Optional[SimpleDialect], bool, Optional[str], float]:
    """Run :func:`worker` in a subprocess with a wall-clock time limit.

    Returns (dialect, error, method, runtime); on timeout the dialect and
    method are None and the runtime is NaN.
    """
    manager = multiprocessing.Manager()
    return_dict = manager.dict()
    p = multiprocessing.Process(
        target=worker, args=(args, return_dict), kwargs=kwargs
    )
    p.start()
    p.join(limit)
    if p.is_alive():
        p.terminate()
        # BUG FIX: join after terminate to reap the child process; without
        # this the terminated process lingers as a zombie for the remainder
        # of the (long) test run.
        p.join()
        return None, True, None, float("nan")
    return (
        return_dict["dialect"],
        return_dict["error"],
        return_dict["method"],
        return_dict["runtime"],
    )


def run_test(
    name: str,
    gz_filename: str,
    annotation: Dict[str, Any],
    verbose: int = 1,
    partial: bool = False,
) -> None:
    """Run dialect detection for one annotated file and log the outcome.

    The detected dialect is compared component-wise against the annotated
    ground-truth dialect; any mismatch counts as a failure.
    """
    if "encoding" in annotation:
        enc = annotation["encoding"]
    else:
        # No annotated encoding: fall back to automatic detection.
        with gzip.open(gz_filename, "rb") as fid:
            enc = chardet.detect(fid.read())["encoding"]

    true_dialect = annotation["dialect"]
    dialect, error, method, runtime = run_with_timeout(
        (gz_filename, enc, partial), {"verbose": verbose > 1}, TIMEOUT
    )
    if error:
        return log_result(name, "error", verbose, partial)

    if dialect is None:
        log_result(name, "failure", verbose, partial)
    elif dialect.delimiter != true_dialect["delimiter"]:
        log_result(name, "failure", verbose, partial)
    elif dialect.quotechar != true_dialect["quotechar"]:
        log_result(name, "failure", verbose, partial)
    elif dialect.escapechar != true_dialect["escapechar"]:
        log_result(name, "failure", verbose, partial)
    else:
        log_result(name, "success", verbose, partial)

    assert method is not None
    log_method(name, method, partial)
    log_runtime(name, runtime, partial)
def load_test_cases() -> List[Tuple[str, str, Dict[str, Any]]]:
    """Collect (name, gz_filename, annotation) triples for annotated files.

    Files without a matching dialect annotation, with a mismatching
    annotated filename, or annotated with status "skip" are excluded.
    """
    cases = []
    for f in sorted(os.listdir(TEST_FILES)):
        base = f[: -len(".csv.gz")]
        dialect_file = os.path.join(TEST_DIALECTS, base + ".json")
        if not os.path.exists(dialect_file):
            continue
        filename = os.path.join(TEST_FILES, f)
        with open(dialect_file, "r") as fid:
            annotation = json.load(fid)
        if not annotation["filename"] == f[: -len(".gz")]:
            warnings.warn(
                "filename doesn't match! Input file: %s\nDialect file: %s"
                % (filename, dialect_file)
            )
            continue
        if annotation["status"] == "skip":
            continue
        cases.append((base, filename, annotation))
    return cases


def clear_output_files(partial: bool) -> None:
    """Delete the log files of a previous (full or partial) run, if any."""
    files = {
        True: [
            LOG_SUCCESS_PARTIAL,
            LOG_FAILED_PARTIAL,
            LOG_ERROR_PARTIAL,
            LOG_METHOD_PARTIAL,
            LOG_RUNTIME_PARTIAL,
        ],
        False: [LOG_SUCCESS, LOG_FAILED, LOG_ERROR, LOG_METHOD, LOG_RUNTIME],
    }
    for filename in files[partial]:
        if os.path.exists(filename):
            os.unlink(filename)


def parse_args() -> argparse.Namespace:
    """Parse the command line arguments for the integration test runner."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--partial",
        help="Run test with partial file data",
        action="store_true",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        help="Be verbose",
        action="count",
        # BUG FIX: action="count" defaults to None, so running without -v
        # made run_test evaluate `verbose > 1` on None and crash with a
        # TypeError. Default to 0 so the count is always an int.
        default=0,
    )
    return parser.parse_args()


def main() -> None:
    """Entry point: clear old logs and run every annotated test case."""
    args = parse_args()
    clear_output_files(args.partial)
    cases = load_test_cases()
    for name, gz_filename, annotation in cases:
        run_test(
            name,
            gz_filename,
            annotation,
            verbose=args.verbose,
            partial=args.partial,
        )


if __name__ == "__main__":
    main()
from . import field_size_limit
from .break_ties import tie_breaker
from .cparser_util import parse_string
from .detect_pattern import pattern_score
from .detect_type import DEFAULT_EPS_TYPE
from .detect_type import TypeDetector
from .dialect import SimpleDialect
from .potential_dialects import get_dialects


@dataclass
class ConsistencyScore:
    """Container to track the consistency score calculation

    Parameters
    ----------
    P : float
        The pattern score

    T : Optional[float]
        The type score. Can be None if not computed for speed.

    Q : Optional[float]
        The consistency score. Can be None if not computed for speed.

    """

    P: float
    T: Optional[float]
    Q: Optional[float]


class ConsistencyDetector:
    """Detect the dialect with the data consistency measure

    This class uses the data consistency measure to detect the dialect. See the
    paper for details.

    Parameters
    ----------
    skip : bool
        Skip computation of the type score for dialects with a low pattern
        score.

    verbose : bool
        Print out the dialects considered and their scores.

    cache_capacity: int
        The size of the cache for type detection. Caching the type detection
        result greatly speeds up the computation of the consistency measure.
        The size of the cache can be changed to trade off memory use and speed.

    """

    def __init__(
        self,
        skip: bool = True,
        verbose: bool = False,
        cache_capacity: int = 100_000,
    ) -> None:
        self._skip = skip
        self._verbose = verbose
        self._type_detector = TypeDetector()
        self._cache_capacity = cache_capacity

        # NOTE: A bit ugly but allows setting the cache size dynamically.
        # Wrapping a closure (instead of decorating a method) also avoids
        # the lru_cache-on-method pitfall of keeping instances alive.
        @lru_cache(cache_capacity)
        def cached_is_known_type(cell: str, is_quoted: bool) -> bool:
            return self._type_detector.is_known_type(cell, is_quoted)

        self._cached_is_known_type = cached_is_known_type

    def detect(
        self, data: str, delimiters: Optional[List[str]] = None
    ) -> Optional[SimpleDialect]:
        """Detect the dialect using the consistency measure

        Parameters
        ----------
        data : str
            The data of the file as a string

        delimiters : iterable
            List of delimiters to consider. If None, the :func:`get_delimiters`
            function is used to automatically detect this (as described in the
            paper).

        Returns
        -------
        dialect : SimpleDialect
            The detected dialect. If no dialect could be detected, returns None.

        """
        self._cached_is_known_type.cache_clear()

        # TODO: probably some optimization there too
        dialects = get_dialects(data, delimiters=delimiters)

        # TODO: This is not thread-safe and this object can simply own a Parser
        # for each dialect and set the limit directly there (we can also cache
        # the best parsing result)
        old_limit = field_size_limit(len(data) + 1)
        try:
            scores = self.compute_consistency_scores(data, dialects)
            best_dialects = ConsistencyDetector.get_best_dialects(scores)
            result: Optional[SimpleDialect] = None
            if len(best_dialects) == 1:
                result = best_dialects[0]
            else:
                result = tie_breaker(data, best_dialects)
        finally:
            # BUG FIX: restore the previous field size limit even when scoring
            # or tie breaking raises; previously an exception here left the
            # global limit permanently modified for subsequent parses.
            field_size_limit(old_limit)
        return result

    def compute_consistency_scores(
        self, data: str, dialects: List[SimpleDialect]
    ) -> Dict[SimpleDialect, ConsistencyScore]:
        """Compute the consistency score for each dialect

        This function computes the consistency score for each dialect. This is
        done by first computing the pattern score for a dialect. If the class
        is instantiated with ``skip`` set to False, it also computes the type
        score for each dialect. If ``skip`` is True (the default), the type
        score is only computed if the pattern score is larger or equal to the
        current best combined score.

        Parameters
        ----------
        data : str
            The data of the file as a string

        dialects : Iterable[SimpleDialect]
            An iterable of dialects to consider.

        Returns
        -------
        scores : Dict[SimpleDialect, ConsistencyScore]
            A map with a :class:`ConsistencyScore` object for each dialect
            provided as input.

        """

        scores: Dict[SimpleDialect, ConsistencyScore] = {}
        incumbent_score = -float("inf")
        for dialect in sorted(dialects):
            P = pattern_score(data, dialect)
            # Since the type score T is at most 1, Q = P * T can never beat
            # the incumbent when P is already below it; skip the (expensive)
            # type score in that case.
            if P < incumbent_score and self._skip:
                scores[dialect] = ConsistencyScore(P, None, None)
                if self._verbose:
                    print("%15r:\tP = %15.6f\tskip." % (dialect, P))
                continue

            T = self.compute_type_score(data, dialect)
            Q = P * T
            incumbent_score = max(incumbent_score, Q)
            scores[dialect] = ConsistencyScore(P, T, Q)
            if self._verbose:
                print(
                    "%15r:\tP = %15.6f\tT = %15.6f\tQ = %15.6f"
                    % (dialect, P, T, Q)
                )
        return scores

    @staticmethod
    def get_best_dialects(
        scores: Dict[SimpleDialect, ConsistencyScore]
    ) -> List[SimpleDialect]:
        """Identify the dialects with the highest consistency score"""
        Qscores = [score.Q for score in scores.values()]
        Qmax = -float("inf")
        for q in Qscores:
            if q is None:
                # Skipped dialects have no Q; they can never be the best.
                continue
            Qmax = max(Qmax, q)
        return [d for d, score in scores.items() if score.Q == Qmax]

    def compute_type_score(
        self, data: str, dialect: SimpleDialect, eps: float = DEFAULT_EPS_TYPE
    ) -> float:
        """Compute the type score

        The type score is the fraction of cells (parsed with the given
        dialect) whose value has a known type; it is floored at ``eps`` so
        the product P * T never collapses to zero.
        """
        total = known = 0
        for row in parse_string(data, dialect, return_quoted=True):
            assert all(isinstance(cell, tuple) for cell in row)
            for cell, is_quoted in row:
                total += 1
                known += self._cached_is_known_type(cell, is_quoted=is_quoted)
        if not total:
            return eps
        return max(eps, known / total)


def detect_dialect_consistency(
    data: str,
    delimiters: Optional[Iterable[str]] = None,
    skip: bool = True,
    verbose: bool = False,
) -> Optional[SimpleDialect]:
    """Helper function that wraps ConsistencyDetector"""
    # Mostly kept for backwards compatibility
    consistency_detector = ConsistencyDetector(skip=skip, verbose=verbose)
    if delimiters is not None:
        delimiters = list(delimiters)
    return consistency_detector.detect(data, delimiters=delimiters)
ConsistencyDetector(skip=skip, verbose=verbose) 218 | if delimiters is not None: 219 | delimiters = list(delimiters) 220 | return consistency_detector.detect(data, delimiters=delimiters) 221 | -------------------------------------------------------------------------------- /example/airedale.csv: -------------------------------------------------------------------------------- 1 | Department Family,Entity,Payment Date,Expense Type,Expense Area,Supplier,Transaction No.,Amount 2 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,16/07/2010,COMPUTER SOFTWARE / LICENSE FEES,INFORMATION MANAGEMENT & TECHNOLOGY,ACCENTURE PACS,3003126885,"43,774.58" 3 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,16/07/2010,COMPUTER SOFTWARE / LICENSE FEES,INFORMATION MANAGEMENT & TECHNOLOGY,ACCENTURE PACS,3003126885,"43,774.58" 4 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,16/07/2010,COMPUTER SOFTWARE / LICENSE FEES,INFORMATION MANAGEMENT & TECHNOLOGY,ACCENTURE PACS,3003126885,"7,660.55" 5 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,16/07/2010,COMPUTER SOFTWARE / LICENSE FEES,INFORMATION MANAGEMENT & TECHNOLOGY,ACCENTURE PACS,3003126885,"7,660.55" 6 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,16/07/2010,COMPUTER SOFTWARE / LICENSE FEES,INFORMATION MANAGEMENT & TECHNOLOGY,ACCENTURE PACS,3003129243,"42,022.79" 7 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,16/07/2010,COMPUTER SOFTWARE / LICENSE FEES,INFORMATION MANAGEMENT & TECHNOLOGY,ACCENTURE PACS,3003129243,"42,022.79" 8 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,16/07/2010,COMPUTER SOFTWARE / LICENSE FEES,INFORMATION MANAGEMENT & TECHNOLOGY,ACCENTURE PACS,3003129243,"7,353.99" 9 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,16/07/2010,COMPUTER SOFTWARE / LICENSE FEES,INFORMATION MANAGEMENT & TECHNOLOGY,ACCENTURE PACS,3003129243,"7,353.99" 10 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,ADDITIONS - PLANT AND MACHINERY,BALANCE SHEET,CARDIAC SERVICES 
LTD,G-INV139216,"5,584.80" 11 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,ADDITIONS - PLANT AND MACHINERY,BALANCE SHEET,CARDIAC SERVICES LTD,G-INV139216,"5,584.80" 12 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,ADDITIONS - PLANT AND MACHINERY,BALANCE SHEET,CARDIAC SERVICES LTD,G-INV139216,"5,584.80" 13 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,ADDITIONS - PLANT AND MACHINERY,BALANCE SHEET,CARDIAC SERVICES LTD,G-INV139216,"5,584.80" 14 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,ADDITIONS - PLANT AND MACHINERY,BALANCE SHEET,CARDIAC SERVICES LTD,G-INV139216,"5,584.80" 15 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,ADDITIONS - PLANT AND MACHINERY,BALANCE SHEET,CARDIAC SERVICES LTD,G-INV139216,977.34 16 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,ADDITIONS - PLANT AND MACHINERY,BALANCE SHEET,CARDIAC SERVICES LTD,G-INV139216,977.34 17 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,ADDITIONS - PLANT AND MACHINERY,BALANCE SHEET,CARDIAC SERVICES LTD,G-INV139216,977.34 18 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,ADDITIONS - PLANT AND MACHINERY,BALANCE SHEET,CARDIAC SERVICES LTD,G-INV139216,977.34 19 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,ADDITIONS - PLANT AND MACHINERY,BALANCE SHEET,CARDIAC SERVICES LTD,G-INV139216,977.34 20 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,07/07/2010,CONTRACT : PREMISES SECURITY,SECURITY / CAR PARKING,CP PLUS LTD,11394/06,"25,028.08" 21 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,CONTRACT : PREMISES SECURITY,SECURITY / CAR PARKING,CP PLUS LTD,11524/07,"25,028.08" 22 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,COMPUTER SOFTWARE / LICENSE FEES,STRATEGY & DEV,DR FOSTER LTD,1006761,"27,000.00" 23 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,COMPUTER SOFTWARE / LICENSE FEES,STRATEGY & DEV,DR FOSTER 
LTD,1006761,"4,725.00" 24 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,30/07/2010,DRUGS,PHARMACY,HEALTHCARE AT HOME LTD,OP/2097110,"34,320.00" 25 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,07/07/2010,ADDITIONS - NON RESIDENTIAL BUILDINGS,BALANCE SHEET,INTERSERVE PROJECT SERVICES LTD,VYO06614,"32,602.01" 26 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,07/07/2010,ADDITIONS - NON RESIDENTIAL BUILDINGS,BALANCE SHEET,INTERSERVE PROJECT SERVICES LTD,VYO06614,"5,705.35" 27 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,07/07/2010,SPECIALIST REGISTRAR,NURSING MANAGEMENT,LEEDS TEACHING HOSPITALS NHS TRUST,334054,"25,000.00" 28 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,07/07/2010,SPECIALIST REGISTRAR,NURSING MANAGEMENT,LEEDS TEACHING HOSPITALS NHS TRUST,334054,"25,000.00" 29 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,07/07/2010,BLOOD PRODUCTS,PATHOLOGY,NHS BLOOD AND TRANSPLANT,795172,"64,926.70" 30 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,DRUGS,PHARMACY,NHS BUSINESS SERVICES AUTHORITY,PHS1000023817,"34,283.21" 31 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,N.H.S. CREDITORS < ONE YEAR,BALANCE SHEET,NHS SUPPLY CHAIN,WNE1785878,"60,520.87" 32 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,07/07/2010,N.H.S. CREDITORS < ONE YEAR,BALANCE SHEET,NHS SUPPLY CHAIN,WNEI771700,"44,419.66" 33 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,07/07/2010,N.H.S. CREDITORS < ONE YEAR,BALANCE SHEET,NHS SUPPLY CHAIN,WNEI775748,"51,157.92" 34 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,12/07/2010,N.H.S. CREDITORS < ONE YEAR,BALANCE SHEET,NHS SUPPLY CHAIN,WNEI779508,"63,683.90" 35 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,16/07/2010,N.H.S. 
CREDITORS < ONE YEAR,BALANCE SHEET,NHS SUPPLY CHAIN,WNEI783035,"70,963.90" 36 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,07/07/2010,COMPUTER MAINTENANCE,FINANCE DEPARTMENT,NORTHUMBRIA HEALTHCARE NHS FOUNDATION TRUST,44584,"27,212.46" 37 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,ELECTRICITY,UTILITIES,NPOWER LTD,LGUC8SSS,"40,057.57" 38 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,ELECTRICITY,UTILITIES,NPOWER LTD,LGUC8SSS,"7,010.08" 39 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,07/07/2010,COMPUTER HARDWARE PURCHASES,SURGICAL MANAGEMENT,RED EMBEDDED DESIGN LTD,80010,"48,000.00" 40 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,07/07/2010,COMPUTER HARDWARE PURCHASES,SURGICAL MANAGEMENT,RED EMBEDDED DESIGN LTD,80010,"8,400.00" 41 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,ADDITIONS - NON RESIDENTIAL BUILDINGS,BALANCE SHEET,RN WOOLER & CO LTD,20603,"28,547.50" 42 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,ADDITIONS - NON RESIDENTIAL BUILDINGS,BALANCE SHEET,RN WOOLER & CO LTD,20603,"4,995.81" 43 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,ADDITIONS - NON RESIDENTIAL BUILDINGS,BALANCE SHEET,RN WOOLER & CO LTD,20603,"1,502.50" 44 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,X-RAY EQUIPMENT : PURCHASES,RADIOLOGY,SIEMENS PLC,1019839343,"94,274.86" 45 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,12/07/2010,UTILISATION - OTHER PROVISIONS/LIABILITIES,BALANCE SHEET,SODEXO HEALTHCARE SERVICES LTD,9050725474,"142,313.25" 46 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,07/07/2010,EXTERNAL CONTRACTS : CATERING,CATERING,SODEXO HEALTHCARE SERVICES LTD,9050731742,"153,030.11" 47 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,12/07/2010,GAS,UTILITIES,TOTAL GAS AND POWER LTD,59355378/10,"29,184.61" 48 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,12/07/2010,GAS,UTILITIES,TOTAL GAS AND POWER 
LTD,59355378/10,"5,107.30" 49 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,HEALTHCARE - OTHER NHS BODIES,SUB CONTRACTED HEALTHCARE,YORKSHIRE CLINIC,20101,"18,126.00" 50 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,HEALTHCARE - OTHER NHS BODIES,SUB CONTRACTED HEALTHCARE,YORKSHIRE CLINIC,20101,"17,252.00" 51 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,HEALTHCARE - OTHER NHS BODIES,SUB CONTRACTED HEALTHCARE,YORKSHIRE CLINIC,20101,"8,275.00" 52 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,HEALTHCARE - OTHER NHS BODIES,SUB CONTRACTED HEALTHCARE,YORKSHIRE CLINIC,20101,"1,503.00" 53 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,HEALTHCARE - OTHER NHS BODIES,MEDICAL MANAGEMENT,YORKSHIRE CLINIC,20101,193.00 54 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,30/07/2010,HEALTHCARE - OTHER NHS BODIES,SUB CONTRACTED HEALTHCARE,YORKSHIRE EYE HOSPITAL,200011-,"-25,728.00" 55 | 56 | Report created October 10 - RX7 RWW,,,,,,, 57 | --------------------------------------------------------------------------------