├── clevercsv ├── py.typed ├── __version__.py ├── console │ ├── __init__.py │ ├── commands │ │ ├── __init__.py │ │ ├── _docs.py │ │ ├── _utils.py │ │ ├── code.py │ │ ├── view.py │ │ ├── explore.py │ │ └── detect.py │ └── application.py ├── cabstraction.pyi ├── exceptions.py ├── __main__.py ├── utils.py ├── __init__.py ├── _types.py ├── cparser.pyi ├── encoding.py ├── cparser_util.pyi ├── escape.py ├── read.py ├── write.py ├── _optional.py ├── detect_pattern.py ├── cparser_util.py ├── dict_read_write.py ├── dialect.py └── consistency.py ├── stubs ├── pythonfuzz │ ├── __init__.pyi │ └── main.pyi ├── tabview │ ├── __init__.pyi │ └── tabview.pyi ├── regex │ ├── _regex.pyi │ ├── __init__.pyi │ └── regex.pyi ├── termcolor │ └── __init__.pyi ├── pandas │ └── __init__.pyi └── wilderness │ └── __init__.pyi ├── docs ├── _readme.rst ├── _changelog.rst ├── source │ ├── modules.rst │ ├── clevercsv.console.rst │ ├── clevercsv.console.commands.rst │ └── clevercsv.rst ├── index.rst ├── Makefile ├── make.bat └── conf.py ├── tests ├── test_integration │ ├── error.log │ ├── error_partial.log │ ├── README.md │ ├── failed.log │ ├── failed_partial.log │ └── test_dialect_detection.py ├── test_unit │ ├── data │ │ └── abstraction_testcases.json.gz │ ├── test_fuzzing.py │ ├── test_c_file_naming.py │ ├── test_consistency.py │ ├── test_potential_dialects.py │ ├── test_abstraction.py │ ├── test_encoding.py │ ├── test_write.py │ ├── test_normal_forms.py │ ├── test_detect.py │ └── test_detect_pattern.py ├── README.md └── test_fuzz │ └── fuzz_sniffer.py ├── .github ├── dependabot.yml └── workflows │ ├── build.yml │ └── deploy.yml ├── .readthedocs.yml ├── .gitignore ├── notes └── date_regex │ ├── README.md │ ├── dateregexmin.txt │ ├── dateregex_formats.txt │ ├── dateregex.txt │ ├── datefmt.py │ └── dateregex_annotated.txt ├── MANIFEST.in ├── .pre-commit-config.yaml ├── pyproject.toml ├── example ├── README.md └── airedale.csv ├── LICENSE ├── man ├── clevercsv-help.1 ├── clevercsv-view.1 ├── 
clevercsv-code.1 ├── clevercsv-explore.1 ├── clevercsv.1 ├── clevercsv-standardize.1 └── clevercsv-detect.1 ├── CODE_OF_CONDUCT.md ├── Makefile └── setup.py /clevercsv/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stubs/pythonfuzz/__init__.pyi: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/_readme.rst: -------------------------------------------------------------------------------- 1 | .. include:: ./README.rst 2 | -------------------------------------------------------------------------------- /docs/_changelog.rst: -------------------------------------------------------------------------------- 1 | .. include:: ./CHANGELOG.rst 2 | -------------------------------------------------------------------------------- /stubs/tabview/__init__.pyi: -------------------------------------------------------------------------------- 1 | from .tabview import view as view 2 | -------------------------------------------------------------------------------- /tests/test_integration/error.log: -------------------------------------------------------------------------------- 1 | 12f6fa751d2b2a491a54bc9e0e39d05f 2 | -------------------------------------------------------------------------------- /tests/test_integration/error_partial.log: -------------------------------------------------------------------------------- 1 | 12f6fa751d2b2a491a54bc9e0e39d05f 2 | -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | CleverCSV API Documentation 2 | =========================== 3 | 4 | .. 
toctree:: 5 | :maxdepth: 4 6 | 7 | clevercsv 8 | -------------------------------------------------------------------------------- /tests/test_unit/data/abstraction_testcases.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/CleverCSV/HEAD/tests/test_unit/data/abstraction_testcases.json.gz -------------------------------------------------------------------------------- /clevercsv/__version__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from typing import Tuple 4 | 5 | VERSION: Tuple[int, int, int] = (0, 8, 4) 6 | 7 | __version__: str = ".".join(map(str, VERSION)) 8 | -------------------------------------------------------------------------------- /clevercsv/console/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .application import build_application 4 | 5 | 6 | def main() -> int: 7 | app = build_application() 8 | return app.run() 9 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: "weekly" 7 | ignore: 8 | - dependency-name: "actions/*" 9 | -------------------------------------------------------------------------------- /stubs/pythonfuzz/main.pyi: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | from typing import Callable 3 | 4 | class PythonFuzz: 5 | def __init__(self, func: Callable[[bytes], Any]) -> None: ... 6 | def __call__(self, *args: Any, **kwargs: Any) -> None: ... 
7 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # Tests 2 | 3 | We have two types of tests for CleverCSV: unit tests and integration tests. 4 | 5 | * Unit tests evaluate the functionality of the package in the usual way 6 | * Integration tests specifically evaluate the dialect detection accuracy on a 7 | large set of test files. 8 | -------------------------------------------------------------------------------- /clevercsv/cabstraction.pyi: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from typing import Optional 4 | 5 | def base_abstraction( 6 | data: str, 7 | delimiter: Optional[str], 8 | quotechar: Optional[str], 9 | escapechar: Optional[str], 10 | ) -> str: ... 11 | def c_merge_with_quotechar(data: str) -> str: ... 12 | -------------------------------------------------------------------------------- /clevercsv/exceptions.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Exceptions for CleverCSV 5 | 6 | Author: Gertjan van den Burg 7 | 8 | """ 9 | 10 | from .cparser import Error as ParserError 11 | 12 | 13 | class Error(ParserError): 14 | pass 15 | 16 | 17 | class NoDetectionResult(Exception): 18 | pass 19 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file for CleverCSV 2 | # 3 | version: 2 4 | 5 | build: 6 | os: ubuntu-22.04 7 | tools: 8 | python: "latest" 9 | 10 | sphinx: 11 | configuration: docs/conf.py 12 | 13 | python: 14 | install: 15 | - method: pip 16 | path: . 
17 | extra_requirements: 18 | - docs 19 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | 2 | .. include:: ./_readme.rst 3 | 4 | .. toctree:: 5 | :maxdepth: 2 6 | 7 | _readme 8 | 9 | .. toctree:: 10 | :caption: Further Documentation 11 | :maxdepth: 2 12 | 13 | _changelog 14 | source/modules 15 | 16 | 17 | Indices and tables 18 | ================== 19 | 20 | * :ref:`genindex` 21 | * :ref:`modindex` 22 | * :ref:`search` 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | */__pycache__/ 2 | *.pyc 3 | build/ 4 | cover/* 5 | tests/test_integration/data 6 | tests/test_integration/__pycache__/ 7 | tests/test_unit/__pycache__/ 8 | .coverage 9 | clevercsv.egg-info/ 10 | clevercsv/.coverage 11 | clevercsv/*.so 12 | dist/* 13 | docs/_build 14 | docs/_static 15 | cgrep 16 | vgrep 17 | auxiliary/ 18 | notes/ 19 | _logo 20 | .logo.png 21 | docs/source/AUTOGENERATED 22 | comparison/ 23 | -------------------------------------------------------------------------------- /clevercsv/console/commands/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .code import CodeCommand 4 | from .detect import DetectCommand 5 | from .explore import ExploreCommand 6 | from .standardize import StandardizeCommand 7 | from .view import ViewCommand 8 | 9 | __all__ = [ 10 | "CodeCommand", 11 | "DetectCommand", 12 | "ExploreCommand", 13 | "StandardizeCommand", 14 | "ViewCommand", 15 | ] 16 | -------------------------------------------------------------------------------- /notes/date_regex/README.md: -------------------------------------------------------------------------------- 1 | # Date regex 2 | 3 | These are some files used to develop the date regular 
expression. It is based 4 | on individual regular expressions for each of the date formats generated by 5 | ``datefmt.py``, but merged into a single expression by hand. 6 | 7 | Note that this regex checks whether a string is a valid *date format*, not 8 | whether it's a valid *date* (i.e. 2019-02-31 is considered valid). This is for 9 | both speed and simplicity. 10 | -------------------------------------------------------------------------------- /stubs/regex/_regex.pyi: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | CODE_SIZE: int 4 | MAGIC: int 5 | copyright: str 6 | 7 | def compile(*args, **kwargs) -> Any: ... 8 | def fold_case(*args, **kwargs) -> Any: ... 9 | def get_all_cases(*args, **kwargs) -> Any: ... 10 | def get_code_size(*args, **kwargs) -> Any: ... 11 | def get_expand_on_folding(*args, **kwargs) -> Any: ... 12 | def get_properties(*args, **kwargs) -> Any: ... 13 | def has_property_value(*args, **kwargs) -> Any: ... 14 | -------------------------------------------------------------------------------- /stubs/termcolor/__init__.pyi: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | __ALL__: Any 4 | VERSION: Any 5 | ATTRIBUTES: Any 6 | HIGHLIGHTS: Any 7 | COLORS: Any 8 | RESET: str 9 | 10 | def colored( 11 | text, 12 | color: Any | None = ..., 13 | on_color: Any | None = ..., 14 | attrs: Any | None = ..., 15 | ): ... 16 | def cprint( 17 | text, 18 | color: Any | None = ..., 19 | on_color: Any | None = ..., 20 | attrs: Any | None = ..., 21 | **kwargs 22 | ) -> None: ... 
23 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include setup.py 2 | include README.md 3 | include CHANGELOG.md 4 | include LICENSE 5 | include requirements.txt 6 | recursive-include clevercsv *.py 7 | recursive-include src *.c 8 | recursive-include bin * 9 | recursive-include tests/test_unit *.py 10 | recursive-include man *.1 11 | prune tests/test_integration 12 | exclude Makefile 13 | exclude .gitignore 14 | exclude .travis.yml 15 | exclude .readthedocs.yml 16 | exclude make_release.py 17 | exclude cgrep 18 | exclude vgrep 19 | prune notes 20 | prune auxiliary 21 | -------------------------------------------------------------------------------- /clevercsv/__main__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Caller for the command line application. 5 | 6 | """ 7 | 8 | import sys 9 | 10 | from ._optional import import_optional_dependency 11 | 12 | 13 | def main() -> None: 14 | # Check that necessary dependencies are available 15 | import_optional_dependency("wilderness") 16 | 17 | # if so, load the actual main function and call it. 18 | from .console import main as realmain 19 | 20 | sys.exit(realmain()) 21 | 22 | 23 | if __name__ == "__main__": 24 | main() 25 | -------------------------------------------------------------------------------- /tests/test_fuzz/fuzz_sniffer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Script to run PythonFuzz to detect unhandled exceptions in the Sniffer 5 | 6 | This file is part of CleverCSV. 
7 | 8 | """ 9 | 10 | from pythonfuzz.main import PythonFuzz 11 | 12 | import clevercsv 13 | 14 | 15 | @PythonFuzz 16 | def fuzz(buf): 17 | try: 18 | string = buf.decode("utf-8") 19 | _ = clevercsv.Sniffer().sniff(string) 20 | except UnicodeDecodeError: 21 | pass 22 | except clevercsv.exceptions.Error: 23 | pass 24 | 25 | 26 | if __name__ == "__main__": 27 | fuzz() 28 | -------------------------------------------------------------------------------- /tests/test_unit/test_fuzzing.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Unit tests based on fuzzing 5 | 6 | """ 7 | 8 | import unittest 9 | 10 | import clevercsv 11 | 12 | 13 | class FuzzingTestCase(unittest.TestCase): 14 | def test_sniffer_fuzzing(self) -> None: 15 | strings = ['"""', "```", "\"'", "'@'", "'\"", "'''", "O##P~` "] 16 | for string in strings: 17 | with self.subTest(string=string): 18 | try: 19 | _ = clevercsv.Sniffer().sniff(string) 20 | except clevercsv.exceptions.Error: 21 | pass 22 | -------------------------------------------------------------------------------- /docs/source/clevercsv.console.rst: -------------------------------------------------------------------------------- 1 | clevercsv.console package 2 | ========================= 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | clevercsv.console.commands 11 | 12 | Submodules 13 | ---------- 14 | 15 | clevercsv.console.application module 16 | ------------------------------------ 17 | 18 | .. automodule:: clevercsv.console.application 19 | :members: 20 | :show-inheritance: 21 | :undoc-members: 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. 
automodule:: clevercsv.console 27 | :members: 28 | :show-inheritance: 29 | :undoc-members: 30 | -------------------------------------------------------------------------------- /notes/date_regex/dateregexmin.txt: -------------------------------------------------------------------------------- 1 | ((0[1-9]|1[0-2])((0[1-9]|[12]\d|3[01])([12]\d{3}|\d{2})|(?P[-\/. ])(0?[1-9]|[12]\d|3[01])(?P=sep1)([12]\d{3}|\d{2}))|(0[1-9]|[12]\d|3[01])((0[1-9]|1[0-2])([12]\d{3}|\d{2})|(?P[-\/. ])(0?[1-9]|1[0-2])(?P=sep2)([12]\d{3}|\d{2}))|([12]\d{3}|\d{2})((?P[-\/. ])(0?[1-9]|1[0-2])(?P=sep3)(0?[1-9]|[12]\d|3[01])|年(0?[1-9]|1[0-2])月(0?[1-9]|[12]\d|3[01])日|년(0?[1-9]|1[0-2])월(0?[1-9]|[12]\d|3[01])일|(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01]))|(([1-9]|1[0-2])(?P[-\/. ])(0?[1-9]|[12]\d|3[01])(?P=sep4)([12]\d{3}|\d{2})|([1-9]|[12]\d|3[01])(?P[-\/. ])(0?[1-9]|1[0-2])(?P=sep5)([12]\d{3}|\d{2}))) 2 | -------------------------------------------------------------------------------- /notes/date_regex/dateregex_formats.txt: -------------------------------------------------------------------------------- 1 | DDMMYY 2 | DDMMYYYY 3 | DDxMMxYY 4 | DDxMMxYYYY 5 | DDxMxYY 6 | DDxMxYYYY 7 | DxMMxYY 8 | DxMMxYYYY 9 | DxMxYY 10 | DxMxYYYY 11 | MMDDYY 12 | MMDDYYYY 13 | MMxDDxYY 14 | MMxDDxYYYY 15 | MMxDxYY 16 | MMxDxYYYY 17 | MxDDxYY 18 | MxDDxYYYY 19 | MxDxYY 20 | MxDxYYYY 21 | YYMMDD 22 | YYYYMMDD 23 | YYYYxMMxD 24 | YYYYxMMxDD 25 | YYYYxMxD 26 | YYYYxMxDD 27 | YYYY年MM月DD日 28 | YYYY年MM月D日 29 | YYYY年M月DD日 30 | YYYY年M月D日 31 | YYYY년MM월DD일 32 | YYYY년MM월D일 33 | YYYY년M월DD일 34 | YYYY년M월D일 35 | YYxMMxD 36 | YYxMMxDD 37 | YYxMxD 38 | YYxMxDD 39 | YY年MM月DD日 40 | YY年MM月D日 41 | YY年M月DD日 42 | YY年M月D日 43 | YY년MM월DD일 44 | YY년MM월D일 45 | YY년M월DD일 46 | YY년M월D일 47 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these 
variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # NOTE: Keep versions of tools in sync with Github Actions build.yml 2 | repos: 3 | - repo: https://github.com/psf/black 4 | rev: 23.3.0 5 | hooks: 6 | - id: black 7 | language_version: python3 8 | 9 | - repo: https://github.com/pycqa/isort 10 | rev: 5.12.0 11 | hooks: 12 | - id: isort 13 | name: isort (python) 14 | - id: isort 15 | name: isort (cython) 16 | types: [cython] 17 | - id: isort 18 | name: isort (pyi) 19 | types: [pyi] 20 | 21 | - repo: https://github.com/charliermarsh/ruff-pre-commit 22 | rev: "v0.0.261" 23 | hooks: 24 | - id: ruff 25 | name: ruff (python) 26 | args: [--fix, --exit-non-zero-on-fix] 27 | -------------------------------------------------------------------------------- /tests/test_integration/README.md: -------------------------------------------------------------------------------- 1 | # Integration Tests 2 | 3 | This directory is for the integration tests that evaluate the accuracy of 4 | dialect detection in CleverCSV. We have a ``data`` folder that contains 5 | annotated dialects for CSV files scraped from GitHub from repositories with 6 | the MIT license (allowing their redistribution and use). 
The 7 | ``test_dialect_detection.py`` script runs CleverCSV on each file for which 8 | ground truth is available, and writes the file hash to either ``success.log``, 9 | ``failed.log``, or ``error.log``. By keeping these files in Git we can keep 10 | track of CleverCSVs performance. 11 | 12 | Note that runtime should be interpreted very carefully and only when 13 | experimental conditions are constant, and then generally only as averages. 14 | -------------------------------------------------------------------------------- /tests/test_unit/test_c_file_naming.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import clevercsv 4 | 5 | 6 | class CNamingTestCase(unittest.TestCase): 7 | def test_name_cabstraction_module(self) -> None: 8 | self.assertEqual( 9 | clevercsv.cabstraction.__name__, "clevercsv.cabstraction" 10 | ) 11 | 12 | def test_name_cparser_module(self) -> None: 13 | self.assertEqual(clevercsv.cparser.__name__, "clevercsv.cparser") 14 | 15 | def test_name_cparser_error(self) -> None: 16 | self.assertEqual( 17 | clevercsv.cparser.Error.__module__, "clevercsv.cparser" 18 | ) 19 | 20 | def test_name_cparser_parser(self) -> None: 21 | self.assertEqual( 22 | clevercsv.cparser.Parser.__module__, "clevercsv.cparser" 23 | ) 24 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length=79 3 | 4 | [tool.isort] 5 | profile="black" 6 | sections=["FUTURE", "STDLIB", "TYPING", "THIRDPARTY", "FIRSTPARTY", "LOCALFOLDER"] 7 | known_typing=["typing"] 8 | force_single_line=true 9 | lines_between_types=1 10 | 11 | [tool.ruff] 12 | # Exclude stubs directory for now 13 | exclude = ["stubs"] 14 | 15 | [tool.mypy] 16 | python_version = 3.10 17 | warn_unused_configs = true 18 | warn_redundant_casts = true 19 | warn_unused_ignores = true 20 | strict_equality = true 21 
| strict_concatenate = true 22 | check_untyped_defs = true 23 | disallow_subclassing_any = true 24 | disallow_untyped_decorators = true 25 | disallow_any_generics = true 26 | disallow_untyped_calls = true 27 | disallow_incomplete_defs = true 28 | disallow_untyped_defs = false 29 | 30 | [[tool.mypy.overrides]] 31 | packages = ["stubs", "clevercsv"] 32 | disallow_incomplete_defs = true 33 | -------------------------------------------------------------------------------- /tests/test_unit/test_consistency.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Unit test for consistency score 5 | 6 | Author: G.J.J. van den Burg 7 | 8 | """ 9 | 10 | import unittest 11 | 12 | from clevercsv.consistency import ConsistencyDetector 13 | from clevercsv.consistency import ConsistencyScore 14 | from clevercsv.dialect import SimpleDialect 15 | 16 | 17 | class ConsistencyTestCase(unittest.TestCase): 18 | def test_get_best_set_1(self) -> None: 19 | scores = { 20 | SimpleDialect(",", None, None): ConsistencyScore(P=1, T=1, Q=1), 21 | SimpleDialect(";", None, None): ConsistencyScore( 22 | P=1, T=None, Q=None 23 | ), 24 | SimpleDialect("|", None, None): ConsistencyScore(P=2, T=1, Q=2), 25 | } 26 | H = ConsistencyDetector.get_best_dialects(scores) 27 | self.assertEqual(H, [SimpleDialect("|", None, None)]) 28 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /stubs/regex/__init__.pyi: -------------------------------------------------------------------------------- 1 | from .regex import * 2 | 3 | # Names in __all__ with no definition: 4 | # A 5 | # ASCII 6 | # B 7 | # BESTMATCH 8 | # D 9 | # DEBUG 10 | # DEFAULT_VERSION 11 | # DOTALL 12 | # E 13 | # ENHANCEMATCH 14 | # F 15 | # FULLCASE 16 | # I 17 | # IGNORECASE 18 | # L 19 | # LOCALE 20 | # M 21 | # MULTILINE 22 | # Match 23 | # P 24 | # POSIX 25 | # Pattern 26 | # R 27 | # REVERSE 28 | # Regex 29 | # S 30 | # Scanner 31 | # T 32 | # TEMPLATE 33 | # U 34 | # UNICODE 35 | # V0 36 | # V1 37 | # VERBOSE 38 | # VERSION0 39 | # VERSION1 40 | # W 41 | # WORD 42 | # X 43 | # __doc__ 44 | # __version__ 45 | # cache_all 46 | # compile 47 | # error 48 | # escape 49 | # findall 50 | # finditer 51 | # fullmatch 52 | # match 53 | # purge 54 | # search 55 | # split 56 | # splititer 57 | # sub 58 | # subf 59 | # subfn 60 | # subn 61 | # template 62 | -------------------------------------------------------------------------------- /example/README.md: -------------------------------------------------------------------------------- 1 | # Example files 2 | 3 | This directory contains some example files for the demo. 4 | 5 | These are the sources: 6 | 7 | - ``imdb.csv`` comes from https://www.kaggle.com/orgesleka/imdbmovies and is 8 | CC0 licensed. 
We only use the first 100 rows in this example. 9 | 10 | - ``airedale.csv`` comes from 11 | https://data.gov.uk/dataset/9c0b1334-dcaf-4a25-ad24-31425933afd9/spend-over-25-000-in-airedale-nhs-foundation-trust 12 | and is the file "2011 July Return". This data is made available under the 13 | [Open Government 14 | License](http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/). 15 | 16 | - ``milk.csv`` comes from 17 | https://data.gov.uk/dataset/b4674861-f2a0-4dcd-bd5e-687d01380259/utilisation-of-milk-by-dairies-in-england-and-wales 18 | and is the file " UK Availability, Disposals and Production of Milk and Milk 19 | Products January 1987 to June 2015 ". This data is made available under the 20 | [Open Government 21 | License](http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/). 22 | 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018 The Alan Turing Institute 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /clevercsv/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Various utilities 5 | 6 | Author: Gertjan van den Burg 7 | 8 | """ 9 | 10 | import hashlib 11 | 12 | from typing import Iterable 13 | from typing import Iterator 14 | from typing import Tuple 15 | from typing import TypeVar 16 | 17 | from clevercsv._types import AnyPath 18 | 19 | T = TypeVar("T") 20 | 21 | 22 | def pairwise(iterable: Iterable[T]) -> Iterator[Tuple[T, T]]: 23 | "s - > (s0, s1), (s1, s2), (s2, s3), ..." 24 | a = iter(iterable) 25 | b = iter(iterable) 26 | next(b, None) 27 | return zip(a, b) 28 | 29 | 30 | def sha1sum(filename: AnyPath) -> str: 31 | """Compute the SHA1 checksum of a given file 32 | 33 | Parameters 34 | ---------- 35 | filename : str 36 | Path to a file 37 | 38 | Returns 39 | ------- 40 | checksum : str 41 | The SHA1 checksum of the file contents. 
42 | """ 43 | blocksize = 1 << 16 44 | hasher = hashlib.sha1() 45 | with open(filename, "rb") as fp: 46 | buf = fp.read(blocksize) 47 | while len(buf) > 0: 48 | hasher.update(buf) 49 | buf = fp.read(blocksize) 50 | return hasher.hexdigest() 51 | -------------------------------------------------------------------------------- /tests/test_integration/failed.log: -------------------------------------------------------------------------------- 1 | 0069e7bfc8ca0884a84752226a8fb78d 2 | 026061a7526455946a3f983899d2f0c6 3 | 0349cc6c33ecda401bff13f23905ed72 4 | 04095bd80e50f90df503ff7d09ae8672 5 | 044196aa9f527ccd8aeff80cfa757dd1 6 | 0475a9956f5bcbb16b58f4a7ccaea973 7 | 060b4e623f38d193b16aaee0527a4f20 8 | 07675dd515504407b82f897ec886434f 9 | 080ca82bd5b56e99bd2a3db334a1a1aa 10 | 081dec2b2d490d2745b5ff99dcd98640 11 | 09af9825ae42e1cb4fb4f3609d5ec1e1 12 | 09cab00748d36387afd638c61ec077df 13 | 0a06a4a6b4151a5288bdb5f25f2754d5 14 | 0a677459b727fc7a7cc583054d7b0f42 15 | 0aded0a7e6428183e30b4dfe0edf476b 16 | 0b869b132594763bba8bb85b8f54688b 17 | 0e7a7f43c445ef171e5132372ff63601 18 | 104761c04f7278b2f5afce85c96db719 19 | 120b852c984ad304b3393c7beeea6491 20 | 1390ca6ccd8500cbbfbc5c7f64979004 21 | 13a6c86a18f053c593feda3d98755010 22 | 17c8007d6eb9baf19d075cb33759e313 23 | 17ccdf2fd0edef2d3bf5fca779cb2161 24 | 17e16b55d1d9ee2e13068db7cc69dbf9 25 | 1a53f0e394e74914659007cc5f153b9f 26 | 1a63a9d56584ec38adf5458adc4764f4 27 | 1a92ace99cdc356b862211df8c3ddc85 28 | 1ca7753332e716f667217edcd90efa83 29 | 1d0bf45700dccca4fa294b7c14fb578e 30 | 30e0f5bcbf1b29b01b27dea8353d1a62 31 | 62ea927849e53c95f0a6bff63ef26f82 32 | 74adf57365bf722eec497c8a2f306ca9 33 | -------------------------------------------------------------------------------- /notes/date_regex/dateregex.txt: -------------------------------------------------------------------------------- 1 | ( 2 | (0[1-9]|1[0-2]) 3 | ( 4 | (0[1-9]|[12]\d|3[01]) 5 | ( 6 | [12]\d{3} 7 | | 8 | \d{2} 9 | ) 10 | | 11 | (?P[-\/. 
]) 12 | (0?[1-9]|[12]\d|3[01]) 13 | (?P=sep1) 14 | ( 15 | [12]\d{3} 16 | | 17 | \d{2} 18 | ) 19 | ) 20 | | 21 | (0[1-9]|[12]\d|3[01]) 22 | ( 23 | (0[1-9]|1[0-2]) 24 | ( 25 | [12]\d{3} 26 | | 27 | \d{2} 28 | ) 29 | | 30 | (?P[-\/. ]) 31 | (0?[1-9]|1[0-2]) 32 | (?P=sep2) 33 | ( 34 | [12]\d{3} 35 | | 36 | \d{2} 37 | ) 38 | ) 39 | | 40 | ( 41 | [12]\d{3} 42 | | 43 | \d{2} 44 | ) 45 | ( 46 | (?P[-\/. ]) 47 | (0?[1-9]|1[0-2]) 48 | (?P=sep3) 49 | (0?[1-9]|[12]\d|3[01]) 50 | | 51 | 年 52 | (0?[1-9]|1[0-2]) 53 | 月 54 | (0?[1-9]|[12]\d|3[01]) 55 | 日 56 | | 57 | 년 58 | (0?[1-9]|1[0-2]) 59 | 월 60 | (0?[1-9]|[12]\d|3[01]) 61 | 일 62 | | 63 | (0[1-9]|1[0-2]) 64 | (0[1-9]|[12]\d|3[01]) 65 | ) 66 | | 67 | ( 68 | ([1-9]|1[0-2]) 69 | (?P[-\/. ]) 70 | (0?[1-9]|[12]\d|3[01]) 71 | (?P=sep4) 72 | ( 73 | [12]\d{3} 74 | | 75 | \d{2} 76 | ) 77 | | 78 | ([1-9]|[12]\d|3[01]) 79 | (?P[-\/. ]) 80 | (0?[1-9]|1[0-2]) 81 | (?P=sep5) 82 | ( 83 | [12]\d{3} 84 | | 85 | \d{2} 86 | ) 87 | ) 88 | ) 89 | -------------------------------------------------------------------------------- /docs/source/clevercsv.console.commands.rst: -------------------------------------------------------------------------------- 1 | clevercsv.console.commands package 2 | ================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | clevercsv.console.commands.code module 8 | -------------------------------------- 9 | 10 | .. automodule:: clevercsv.console.commands.code 11 | :members: 12 | :show-inheritance: 13 | :undoc-members: 14 | 15 | clevercsv.console.commands.detect module 16 | ---------------------------------------- 17 | 18 | .. automodule:: clevercsv.console.commands.detect 19 | :members: 20 | :show-inheritance: 21 | :undoc-members: 22 | 23 | clevercsv.console.commands.explore module 24 | ----------------------------------------- 25 | 26 | .. 
automodule:: clevercsv.console.commands.explore 27 | :members: 28 | :show-inheritance: 29 | :undoc-members: 30 | 31 | clevercsv.console.commands.standardize module 32 | --------------------------------------------- 33 | 34 | .. automodule:: clevercsv.console.commands.standardize 35 | :members: 36 | :show-inheritance: 37 | :undoc-members: 38 | 39 | clevercsv.console.commands.view module 40 | -------------------------------------- 41 | 42 | .. automodule:: clevercsv.console.commands.view 43 | :members: 44 | :show-inheritance: 45 | :undoc-members: 46 | 47 | Module contents 48 | --------------- 49 | 50 | .. automodule:: clevercsv.console.commands 51 | :members: 52 | :show-inheritance: 53 | :undoc-members: 54 | -------------------------------------------------------------------------------- /clevercsv/console/commands/_docs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | FLAG_DESCRIPTIONS = { 4 | "encoding": ( 5 | "The file encoding of the given CSV file is automatically " 6 | "detected using chardet. While chardet is incredibly " 7 | "accurate, it is not perfect. In the rare cases that it makes " 8 | "a mistake in detecting the file encoding, you can override " 9 | "the encoding by providing it through this flag. Moreover, " 10 | "when you have a number of CSV files with a known file " 11 | "encoding, you can use this option to speed up the code " 12 | "generation process." 13 | ), 14 | "num-chars": ( 15 | "On large CSV files, dialect detection can sometimes be a bit " 16 | "slow due to the large number of possible dialects to " 17 | "consider. To alleviate this, you can limit the number of " 18 | "characters to use for detection.\n\n" 19 | "One aspect to keep in mind is that CleverCSV may need to " 20 | "read a specific number of characters to be able to correctly " 21 | "infer the dialect. 
For example, in the ``imdb.csv`` file " 22 | "in the GitHub repository, the correct dialect can only " 23 | "be found after at least 66 lines of the file are read. " 24 | "Therefore, if there is availability to run CleverCSV on " 25 | "the entire file, that is generally recommended." 26 | ), 27 | } 28 | -------------------------------------------------------------------------------- /clevercsv/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from csv import QUOTE_ALL 4 | from csv import QUOTE_MINIMAL 5 | from csv import QUOTE_NONE 6 | from csv import QUOTE_NONNUMERIC 7 | 8 | from .__version__ import __version__ 9 | from .cparser_util import field_size_limit 10 | from .detect import Detector 11 | from .detect import Detector as Sniffer 12 | from .dialect import excel 13 | from .dialect import excel_tab 14 | from .dialect import unix_dialect 15 | from .dict_read_write import DictReader 16 | from .dict_read_write import DictWriter 17 | from .exceptions import Error 18 | from .read import reader 19 | from .wrappers import detect_dialect 20 | from .wrappers import read_dataframe 21 | from .wrappers import read_dicts 22 | from .wrappers import read_table 23 | from .wrappers import stream_dicts 24 | from .wrappers import stream_table 25 | from .wrappers import write_table 26 | from .write import writer 27 | 28 | __all__ = [ 29 | "QUOTE_ALL", 30 | "QUOTE_MINIMAL", 31 | "QUOTE_NONE", 32 | "QUOTE_NONNUMERIC", 33 | "__version__", 34 | "field_size_limit", 35 | "Detector", 36 | "Sniffer", 37 | "excel", 38 | "excel_tab", 39 | "unix_dialect", 40 | "DictReader", 41 | "DictWriter", 42 | "Error", 43 | "reader", 44 | "detect_dialect", 45 | "read_dataframe", 46 | "read_dicts", 47 | "read_table", 48 | "stream_dicts", 49 | "stream_table", 50 | "write_table", 51 | "writer", 52 | ] 53 | -------------------------------------------------------------------------------- /notes/date_regex/datefmt.py: 
def main():
    """Print, in sorted order, a template string for every supported date format.

    Placeholders: ``YYYY``/``YY`` for the year, ``MM``/``M`` for the month,
    ``DD``/``D`` for the day, and ``x`` standing in for a separator character.
    """
    year2 = "YY"
    year4 = "YYYY"
    month_leading = "MM"  # zero-padded month
    month_sparse = "M"  # month without leading zero
    day_leading = "DD"  # zero-padded day (was assigned "D" then "DD"; dead first assignment removed)
    day_sparse = "D"  # day without leading zero
    sep = "x"  # stand-in for the actual separator character
    pats = []
    # Separated formats: every combination of 2/4-digit year and padded or
    # unpadded month/day, in Y-M-D, D-M-Y and M-D-Y orders, plus CJK forms.
    for year in [year2, year4]:
        for month in [month_leading, month_sparse]:
            for day in [day_leading, day_sparse]:
                fmt = dict(year=year, month=month, day=day, sep=sep)
                pats.append("{year}{sep}{month}{sep}{day}".format(**fmt))
                pats.append("{day}{sep}{month}{sep}{year}".format(**fmt))
                pats.append("{month}{sep}{day}{sep}{year}".format(**fmt))
                pats.append("{year}年{month}月{day}日".format(**fmt))
                pats.append("{year}년{month}월{day}일".format(**fmt))

    # Compact (separator-less) formats are only generated with zero padding,
    # since "YMD" style strings would be ambiguous without it.
    for year in [year2, year4]:
        fmt = dict(year=year, month=month_leading, day=day_leading, sep="")
        pats.append("{year}{sep}{month}{sep}{day}".format(**fmt))
        pats.append("{day}{sep}{month}{sep}{year}".format(**fmt))
        pats.append("{month}{sep}{day}{sep}{year}".format(**fmt))

    for pat in sorted(pats):
        print(pat)
"os.PathLike[str]"] 21 | _OpenFile = Union[AnyPath, int] 22 | _DictRow = Mapping[str, Any] 23 | _DialectLike = Union[ 24 | str, 25 | csv.Dialect, 26 | _csv.Dialect, 27 | type[_csv.Dialect], 28 | SimpleDialect, 29 | ] 30 | _T = TypeVar("_T") 31 | 32 | if sys.version_info >= (3, 8): 33 | from typing import Dict as _DictReadMapping 34 | else: 35 | from collections import OrderedDict as _DictReadMapping 36 | 37 | 38 | if TYPE_CHECKING: 39 | from _typeshed import FileDescriptorOrPath # NOQA 40 | from _typeshed import SupportsIter # NOQA 41 | from _typeshed import SupportsWrite # NOQA 42 | 43 | __all__ = [ 44 | "SupportsWrite", 45 | "SupportsIter", 46 | "FileDescriptorOrPath", 47 | "AnyPath", 48 | "_OpenFile", 49 | "_DictRow", 50 | "_DialectLike", 51 | "_DictReadMapping", 52 | ] 53 | else: 54 | __all__ = [ 55 | "AnyPath", 56 | "_OpenFile", 57 | "_DictRow", 58 | "_DialectLike", 59 | "_DictReadMapping", 60 | ] 61 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: build 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | branches: 9 | - master 10 | schedule: 11 | - cron: 53 18 */10 * * 12 | 13 | jobs: 14 | code-quality: 15 | name: Code quality checks for CleverCSV 16 | runs-on: ubuntu-latest 17 | steps: 18 | - name: Checkout 19 | uses: actions/checkout@v3 20 | 21 | # NOTE: Keep versions in sync with .pre-commit-config.yaml 22 | 23 | - name: Run black 24 | uses: psf/black@stable 25 | with: 26 | version: "23.3.0" 27 | 28 | - name: Run isort 29 | uses: jamescurtin/isort-action@master 30 | with: 31 | isortVersion: "5.12.0" 32 | 33 | - name: Run ruff 34 | uses: chartboost/ruff-action@v1 35 | with: 36 | version: "v0.0.261" 37 | 38 | python-test: 39 | needs: [code-quality] 40 | name: Tests 41 | runs-on: ${{ matrix.os }} 42 | strategy: 43 | matrix: 44 | os: [ 'ubuntu-latest', 'macos-latest', 'windows-latest' ] 45 
| py: [ '3.9', '3.14' ] # minimal and latest 46 | steps: 47 | - name: Install Python ${{ matrix.py }} 48 | uses: actions/setup-python@v5 49 | with: 50 | python-version: ${{ matrix.py }} 51 | 52 | - name: Checkout 53 | uses: actions/checkout@v3 54 | 55 | - name: Install CleverCSV 56 | run: pip install -e .[full,tests] 57 | 58 | - name: Run unit tests 59 | run: python -m unittest discover -v -f -s ./tests/test_unit 60 | -------------------------------------------------------------------------------- /tests/test_integration/failed_partial.log: -------------------------------------------------------------------------------- 1 | 0069e7bfc8ca0884a84752226a8fb78d 2 | 0123de82b4e6f80b2da76a76611efd1a 3 | 026061a7526455946a3f983899d2f0c6 4 | 04095bd80e50f90df503ff7d09ae8672 5 | 044196aa9f527ccd8aeff80cfa757dd1 6 | 0475a9956f5bcbb16b58f4a7ccaea973 7 | 04c88c064919bd89510d1dd22cb8642e 8 | 060b4e623f38d193b16aaee0527a4f20 9 | 068ff5ebfca3329887fa6289880004d1 10 | 0747d24252451f9110b252294a1cfd75 11 | 07675dd515504407b82f897ec886434f 12 | 080ca82bd5b56e99bd2a3db334a1a1aa 13 | 081dec2b2d490d2745b5ff99dcd98640 14 | 090a15a2f7f10009ad43406b4f60fe04 15 | 0985985f5dd21760ec916d66e73546fd 16 | 09af9825ae42e1cb4fb4f3609d5ec1e1 17 | 09cab00748d36387afd638c61ec077df 18 | 0a06a4a6b4151a5288bdb5f25f2754d5 19 | 0a677459b727fc7a7cc583054d7b0f42 20 | 0aded0a7e6428183e30b4dfe0edf476b 21 | 0af0ffb351e75c7756d273fca2a2f82b 22 | 0b869b132594763bba8bb85b8f54688b 23 | 0cf0e1bc97595760217806686a32ff39 24 | 0d00e37b443633368e8f136cffed5730 25 | 0da1208093afc2b90f3f4df49e8d996c 26 | 0e51f0fe2a2ddd25ff7a0d1bac00d17b 27 | 0e7a7f43c445ef171e5132372ff63601 28 | 102a1c511703cee9bb3decd49764ab38 29 | 107d3dbbfa4a37773b24e7d095ddfce2 30 | 119cecf07dd8af6a3c2229fb35b3103f 31 | 1390ca6ccd8500cbbfbc5c7f64979004 32 | 13989b94a814dfe6b7b784a3a8c5c581 33 | 13a6c86a18f053c593feda3d98755010 34 | 13fa5d67c7315502e255e3f53672775a 35 | 30e0f5bcbf1b29b01b27dea8353d1a62 36 | 32a92fc9acf632202de46d25b1c0fc3b 37 | 
class PotentialDialectTestCase(unittest.TestCase):
    """Unit tests for the helpers used to select potential dialects."""

    def test_masked_by_quotechar(self) -> None:
        # '&' occurs only inside the quoted section, so it is masked.
        self.assertTrue(masked_by_quotechar('A"B&C"A', '"', "", "&"))
        # A second '&' outside the quotes means it is not fully masked.
        self.assertFalse(masked_by_quotechar('A"B&C"A&A', '"', "", "&"))
        self.assertFalse(masked_by_quotechar('A|"B&C"A', '"', "|", "&"))
        self.assertFalse(masked_by_quotechar('A"B"C', '"', "", ""))

    def test_filter_urls(self) -> None:
        # URLs are collapsed to a single 'U' placeholder character.
        data = "A,B\nwww.google.com,10\nhttps://gertjanvandenburg.com,25\n"
        exp = "A,B\nU,10\nU,25\n"
        self.assertEqual(exp, filter_urls(data))

    def test_get_quotechars(self) -> None:
        # Both quote styles present in the data are returned; the empty
        # string (no quoting) is always a candidate.
        data = "A,B,'A',B\"D\"E"
        exp = set(['"', "'", ""])
        out = get_quotechars(data)
        self.assertEqual(out, exp)

    def test_get_delimiters(self) -> None:
        # Candidate delimiters include punctuation, tab, and currency
        # symbols; the empty string (single-column) is always a candidate.
        data = "A,B|CD,E;F\tD123£123€10.,0"
        exp = set([",", "|", ";", "\t", "€", "£", ""])
        out = get_delimiters(data, "UTF-8")
        self.assertEqual(out, exp)
-------------------------------------------------------------------------------- 1 | '\" t 2 | .\" Title: clevercsv-help 3 | .\" Author: G.J.J. van den Burg 4 | .\" Generator: Wilderness 5 | .\" Date: 2025-10-30 6 | .\" Manual: clevercsv Manual 7 | .\" Source: clevercsv 0.8.4 8 | .\" Language: English 9 | .\" 10 | .TH "CLEVERCSV-HELP" "1" "2025\-10\-30" "Clevercsv 0\&.8\&.4" "Clevercsv Manual" 11 | .\" ----------------------------------------------------------------- 12 | .\" * Define some portability stuff 13 | .\" ----------------------------------------------------------------- 14 | .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 15 | .\" http://bugs.debian.org/507673 16 | .\" http://lists.gnu.org/archive/html/groff/2009-02/msg00013.html 17 | .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 18 | .ie \n(.g .ds Aq \(aq 19 | .el .ds Aq ' 20 | .\" ----------------------------------------------------------------- 21 | .\" * set default formatting * 22 | .\" ----------------------------------------------------------------- 23 | .\" disable hyphenation 24 | .nh 25 | .\" disable justification 26 | .ad l 27 | .\" ----------------------------------------------------------------- 28 | .\" * MAIN CONTENT STARTS HERE * 29 | .\" ----------------------------------------------------------------- 30 | .SH "NAME" 31 | clevercsv-help \- Display help information 32 | .SH "SYNOPSIS" 33 | .sp 34 | .nf 35 | \fIclevercsv help [command] 36 | .fi 37 | .sp 38 | .SH "DESCRIPTION" 39 | .sp 40 | Display help information 41 | .SH "OPTIONS" 42 | .sp 43 | .sp 44 | .sp 45 | \-h, \-\-help 46 | .RS 4 47 | show this help message and exit 48 | .RE 49 | .PP 50 | .sp -------------------------------------------------------------------------------- /clevercsv/cparser.pyi: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import annotations 4 | 5 | from typing import Final 6 | 
from typing import Generic 7 | from typing import Iterable 8 | from typing import List 9 | from typing import Literal 10 | from typing import Optional 11 | from typing import Tuple 12 | from typing import TypeVar 13 | from typing import overload 14 | 15 | _T = TypeVar("_T") 16 | 17 | class Parser(Generic[_T]): 18 | _return_quoted: Final[bool] 19 | 20 | @overload 21 | def __init__( 22 | self: Parser[List[Tuple[str, bool]]], 23 | delimiter: Optional[str] = "", 24 | quotechar: Optional[str] = "", 25 | escapechar: Optional[str] = "", 26 | field_limit: Optional[int] = 128 * 1024, 27 | strict: Optional[bool] = False, 28 | return_quoted: Literal[True] = ..., 29 | ) -> None: ... 30 | @overload 31 | def __init__( 32 | self: Parser[List[str]], 33 | delimiter: Optional[str] = "", 34 | quotechar: Optional[str] = "", 35 | escapechar: Optional[str] = "", 36 | field_limit: Optional[int] = 128 * 1024, 37 | strict: Optional[bool] = False, 38 | return_quoted: Literal[False] = ..., 39 | ) -> None: ... 40 | @overload 41 | def __init__( 42 | self, 43 | data: Iterable[str], 44 | delimiter: Optional[str] = "", 45 | quotechar: Optional[str] = "", 46 | escapechar: Optional[str] = "", 47 | field_limit: Optional[int] = 128 * 1024, 48 | strict: Optional[bool] = False, 49 | return_quoted: bool = ..., 50 | ) -> None: ... 51 | def __iter__(self) -> "Parser": ... 52 | def __next__(self) -> _T: ... 53 | 54 | class Error(Exception): ... 55 | -------------------------------------------------------------------------------- /clevercsv/encoding.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Functionality to detect file encodings 4 | 5 | Author: G.J.J. van den Burg 6 | License: See the LICENSE file 7 | 8 | This file is part of CleverCSV. 
def get_encoding(
    filename: _OpenFile, try_cchardet: bool = True
) -> Optional[str]:
    """Detect the character encoding of a file.

    Detection is performed with the chardet package, or with cChardet when
    it is installed and requested via ``try_cchardet``.

    Parameters
    ----------
    filename: str
        Path to a file

    try_cchardet: bool
        Whether to run detection using cChardet if it is available. This can
        be faster, but may give different results than using chardet.

    Returns
    -------
    encoding: str
        Encoding of the file.
    """
    cchardet = None
    if try_cchardet:
        cchardet = import_optional_dependency(
            "cchardet", raise_on_missing=False
        )

    # Fall back to chardet when cChardet is unavailable or not requested.
    module = chardet if cchardet is None else cchardet
    detector = module.UniversalDetector()

    chunk_size = 65536
    with open(filename, "rb") as handle:
        # Feed chunks until the detector is confident or the file ends; a
        # short read signals the final chunk.
        while not detector.done:
            block = handle.read(chunk_size)
            detector.feed(block)
            if len(block) < chunk_size:
                break
    detector.close()
    return detector.result.get("encoding", None)
def generate_code(
    filename: str,
    dialect: SimpleDialect,
    encoding: Optional[str],
    use_pandas: bool = False,
) -> List[str]:
    """Generate Python code that reads *filename* with the detected dialect.

    Parameters
    ----------
    filename : str
        Path of the CSV file; embedded verbatim in the generated code.
        (Previously the placeholder string "(unknown)" was emitted instead
        of the actual path, leaving this parameter unused.)

    dialect : SimpleDialect
        The detected dialect; ``quotechar`` must not be None.

    encoding : Optional[str]
        File encoding to embed in the generated ``open`` call, or None.

    use_pandas : bool
        If True, generate code based on ``read_dataframe`` instead of the
        plain ``clevercsv.reader``.

    Returns
    -------
    List[str]
        The lines of the generated program.
    """
    assert dialect.quotechar is not None
    # Render the dialect components as quoted Python string literals so they
    # can be pasted directly into the generated source.
    d = '"\\t"' if dialect.delimiter == "\t" else f'"{dialect.delimiter}"'
    q = '"%s"' % (dialect.quotechar.replace('"', '\\"'))
    e = repr(f"{dialect.escapechar}").replace("'", '"')
    base = [
        "",
        f"# Code generated with CleverCSV version {__version__}",
        "",
        "import clevercsv",
    ]
    if use_pandas:
        return [
            *base,
            "",
            f'df = clevercsv.read_dataframe("{filename}", delimiter={d}, '
            f"quotechar={q}, escapechar={e})",
            "",
        ]

    enc = "None" if encoding is None else f'"{encoding}"'
    lines = [
        *base,
        "",
        f'with open("{filename}", "r", newline="", encoding={enc}) as fp:',
        "    reader = clevercsv.reader(fp, "
        + f"delimiter={d}, quotechar={q}, escapechar={e})",
        "    rows = list(reader)",
        "",
    ]
    return lines
]) 31 | (0?[1-9]|1[0-2]) # M|MM 32 | (?P=sep2) 33 | ( 34 | [12]\d{3} # YYYY 35 | | 36 | \d{2} # YY 37 | ) 38 | ) 39 | | 40 | ( 41 | [12]\d{3} # YYYY 42 | | 43 | \d{2} # YY 44 | ) 45 | ( # YY(YY)xM(M)xD(D) 46 | (?P[-\/. ]) 47 | (0?[1-9]|1[0-2]) # MM|M 48 | (?P=sep3) 49 | (0?[1-9]|[12]\d|3[01]) # DD|D 50 | | 51 | 年 # YY(YY)年M(M)月D(D)日 52 | (0?[1-9]|1[0-2]) # MM|M 53 | 月 54 | (0?[1-9]|[12]\d|3[01]) # DD|D 55 | 日 56 | | 57 | 년 # YY(YY)년M(M)월D(D)일 58 | (0?[1-9]|1[0-2]) # MM|M 59 | 월 60 | (0?[1-9]|[12]\d|3[01]) # DD|D 61 | 일 62 | | # YY(YY)MMDD 63 | (0[1-9]|1[0-2]) # MM 64 | (0[1-9]|[12]\d|3[01]) # DD 65 | ) 66 | | 67 | ( # MxD(D)xYY(YY) 68 | ([1-9]|1[0-2]) # M 69 | (?P[-\/. ]) 70 | (0?[1-9]|[12]\d|3[01]) # DD|D 71 | (?P=sep4) 72 | ( 73 | [12]\d{3} # YYYY 74 | | 75 | \d{2} # YY 76 | ) 77 | | # DxM(M)xYY(YY) 78 | ([1-9]|[12]\d|3[01]) # D 79 | (?P[-\/. ]) 80 | (0?[1-9]|1[0-2]) # MM|M 81 | (?P=sep5) 82 | ( 83 | [12]\d{3} # YYYY 84 | | 85 | \d{2} # YY 86 | ) 87 | ) 88 | ) 89 | -------------------------------------------------------------------------------- /clevercsv/cparser_util.pyi: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from typing import Any 4 | from typing import Iterable 5 | from typing import Iterator 6 | from typing import List 7 | from typing import Literal 8 | from typing import Optional 9 | from typing import Tuple 10 | from typing import Union 11 | from typing import overload 12 | 13 | from .dialect import SimpleDialect 14 | 15 | def field_size_limit(*args: Any, **kwargs: Any) -> int: ... 16 | @overload 17 | def _parse_data( 18 | data: Iterable[str], 19 | delimiter: str, 20 | quotechar: str, 21 | escapechar: str, 22 | strict: bool, 23 | return_quoted: Literal[False] = ..., 24 | ) -> Iterator[List[str]]: ... 
25 | @overload 26 | def _parse_data( 27 | data: Iterable[str], 28 | delimiter: str, 29 | quotechar: str, 30 | escapechar: str, 31 | strict: bool, 32 | return_quoted: Literal[True], 33 | ) -> Iterator[List[Tuple[str, bool]]]: ... 34 | @overload 35 | def _parse_data( 36 | data: Iterable[str], 37 | delimiter: str, 38 | quotechar: str, 39 | escapechar: str, 40 | strict: bool, 41 | return_quoted: bool = ..., 42 | ) -> Iterator[Union[List[str], List[Tuple[str, bool]]]]: ... 43 | def parse_data( 44 | data: Iterable[str], 45 | dialect: Optional[SimpleDialect] = None, 46 | delimiter: Optional[str] = None, 47 | quotechar: Optional[str] = None, 48 | escapechar: Optional[str] = None, 49 | strict: Optional[bool] = None, 50 | return_quoted: bool = False, 51 | ) -> Iterator[Union[List[str], List[Tuple[str, bool]]]]: ... 52 | @overload 53 | def parse_string( 54 | data: str, 55 | dialect: SimpleDialect, 56 | return_quoted: Literal[False] = ..., 57 | ) -> Iterator[List[str]]: ... 58 | @overload 59 | def parse_string( 60 | data: str, 61 | dialect: SimpleDialect, 62 | return_quoted: Literal[True], 63 | ) -> Iterator[List[Tuple[str, bool]]]: ... 64 | @overload 65 | def parse_string( 66 | data: str, 67 | dialect: SimpleDialect, 68 | return_quoted: bool = ..., 69 | ) -> Iterator[Union[List[str], List[Tuple[str, bool]]]]: ... 70 | -------------------------------------------------------------------------------- /clevercsv/escape.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Common functions for dealing with escape characters. 
#: Characters that must never be considered as escape characters.
DEFAULT_BLOCK_CHARS: Set[str] = {
    "!",
    "?",
    '"',
    "'",
    ".",
    ",",
    ";",
    ":",
    "%",
    "*",
    "&",
    "#",
}

#: Every character whose Unicode category is "Punctuation, Other" (Po).
UNICODE_PO_CHARS: Set[str] = {
    chr(cp)
    for cp in range(sys.maxunicode + 1)
    if unicodedata.category(chr(cp)) == "Po"
}


def is_potential_escapechar(
    char: str, encoding: str, block_char: Optional[Iterable[str]] = None
) -> bool:
    """Check if a character is a potential escape character.

    A character qualifies as a potential escape character when it belongs
    to the "Punctuation, Other" Unicode category and is not explicitly
    blocked.

    Parameters
    ----------
    char: str
        The character to check

    encoding : str
        The encoding of the character

    block_char : Optional[Iterable[str]]
        Characters that are in the Punctuation Other category but that should
        not be considered as escape character. If None, the default set is
        used, which is defined in :py:data:`DEFAULT_BLOCK_CHARS`.

    Returns
    -------
    is_escape : bool
        Whether the character is considered a potential escape or not.

    """
    if encoding.lower() in ("utf-8", "ascii"):
        uchar = char
    else:
        # Round-trip through the declared encoding to obtain the proper
        # unicode character for the category/blocklist comparison below.
        uchar = codecs.decode(bytes(char, encoding), encoding=encoding)

    blocked = (
        set(block_char) if block_char is not None else DEFAULT_BLOCK_CHARS
    )
    return uchar in UNICODE_PO_CHARS and uchar not in blocked
40 | templates_path = ["_templates"] 41 | 42 | # List of patterns, relative to source directory, that match files and 43 | # directories to ignore when looking for source files. 44 | # This pattern also affects html_static_path and html_extra_path. 45 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 46 | 47 | 48 | # -- Options for HTML output ------------------------------------------------- 49 | 50 | # The theme to use for HTML and HTML Help pages. See the documentation for 51 | # a list of builtin themes. 52 | # 53 | html_theme = "furo" 54 | html_logo = "https://raw.githubusercontent.com/alan-turing-institute/CleverCSV/eea72549195e37bd4347d87fd82bc98be2f1383d/.logo.png" 55 | 56 | html_theme_options = { 57 | "sidebar_hide_name": True, 58 | "light_css_variables": { 59 | "color-brand-primary": "#336790", # "blue" 60 | "color-brand-content": "#336790", 61 | }, 62 | "dark_css_variables": { 63 | "color-brand-primary": "#c03232ff", # "red" 64 | "color-brand-content": "#c03232ff", 65 | }, 66 | } 67 | 68 | # Add any paths that contain custom static files (such as style sheets) here, 69 | # relative to this directory. They are copied after the builtin static files, 70 | # so a file named "default.css" will overwrite the builtin "default.css". 71 | html_static_path = ["_static"] 72 | -------------------------------------------------------------------------------- /clevercsv/read.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Drop-in replacement for the Python csv reader class. This is a wrapper for the 5 | Parser class, defined in :mod:`cparser`. 6 | 7 | Author: Gertjan van den Burg 8 | 9 | """ 10 | 11 | import csv 12 | 13 | from typing import Any 14 | from typing import Iterable 15 | from typing import Iterator 16 | from typing import List 17 | from typing import Optional 18 | 19 | from . 
class reader:
    """Drop-in replacement for the built-in :class:`csv.reader`.

    Wraps the C ``Parser`` so that iterating yields one list of strings
    per row. The dialect may be given as a builtin dialect name, a
    :class:`csv.Dialect` instance, or a :class:`SimpleDialect`; format
    parameters override individual dialect attributes.
    """

    def __init__(
        self,
        csvfile: Iterable[str],
        dialect: _DialectLike = "excel",
        **fmtparams: Any,
    ):
        self.csvfile = csvfile
        self.original_dialect = dialect
        self._dialect = self._make_simple_dialect(dialect, **fmtparams)
        # Number of rows returned so far, mirroring csv.reader.line_num.
        self.line_num: int = 0
        # Created lazily on first iteration (see __iter__/__next__).
        self.parser_gen: Optional[Parser] = None

    @property
    def dialect(self) -> csv.Dialect:
        """The dialect in use, converted to a :class:`csv.Dialect`."""
        return self._dialect.to_csv_dialect()

    def _make_simple_dialect(
        self, dialect: _DialectLike, **fmtparams: Any
    ) -> SimpleDialect:
        """Normalize any supported dialect specification to a SimpleDialect.

        Parameters
        ----------
        dialect : str, csv.Dialect, or SimpleDialect
            The dialect specification to normalize.
        **fmtparams : Any
            Overrides for individual dialect attributes; only the keys
            supported by SimpleDialect are applied, others are ignored.

        Raises
        ------
        ValueError
            If ``dialect`` is not one of the supported types.
        """
        if isinstance(dialect, str):
            sd = SimpleDialect.from_csv_dialect(csv.get_dialect(dialect))
        elif isinstance(dialect, csv.Dialect):
            sd = SimpleDialect.from_csv_dialect(dialect)
        elif isinstance(dialect, SimpleDialect):
            sd = dialect
        else:
            raise ValueError("Unknown dialect type: %r" % dialect)
        for key, value in fmtparams.items():
            # SimpleDialect only models these four attributes; other
            # csv fmtparams (e.g. doublequote) have no equivalent here.
            if key in ("delimiter", "quotechar", "escapechar", "strict"):
                setattr(sd, key, value)
        sd.validate()
        return sd

    def __iter__(self) -> Iterator[List[str]]:
        # (Re)create the parser each time iteration starts; parsing
        # resumes from wherever the underlying iterable currently is.
        self.parser_gen = Parser(
            self.csvfile,
            delimiter=self._dialect.delimiter,
            quotechar=self._dialect.quotechar,
            escapechar=self._dialect.escapechar,
            field_limit=field_size_limit(),
            strict=self._dialect.strict,
        )
        return self

    def __next__(self) -> List[str]:
        if self.parser_gen is None:
            self.__iter__()
        assert self.parser_gen is not None
        try:
            row = next(self.parser_gen)
        except ParserError as e:
            # Chain the low-level parser error so tracebacks retain the
            # original cause (PEP 3134) instead of hiding it.
            raise Error(str(e)) from e
        self.line_num += 1
        return row
class AbstractionTestCase(unittest.TestCase):
    """Data-driven tests for the pattern-abstraction pipeline.

    Cases are stored as gzipped JSON-lines next to this test file; each
    case records the expected output of every stage of the pipeline.
    """

    def setUp(self) -> None:
        # Locate the bundled test-case archive relative to this file.
        data_file = (
            Path(__file__).parent / "data" / "abstraction_testcases.json.gz"
        )
        self._cases = (
            self._load_cases(data_file) if data_file.exists() else []
        )

    @staticmethod
    def _load_cases(filename: Path) -> List[Dict[str, Any]]:
        # One JSON document per line of the gzipped file.
        with gzip.open(filename, "rt", newline="", encoding="utf-8") as fp:
            return [json.loads(line) for line in fp]

    def test_abstraction_multi(self) -> None:
        if not self._cases:
            self.skipTest("no abstraction test cases found")

        for case in self._cases:
            content = case["content"]
            dialect = case["dialect"]

            expected_base = case["base_abstraction"]
            expected_merge = case["after_merge_with_quotechar"]
            expected_empties = case["after_fill_empties"]
            expected_trailing = case["after_strip_trailing"]

            # Each pipeline stage feeds the next; assert them in order.
            with self.subTest(name=case["name"], kind="base"):
                base = base_abstraction(
                    content,
                    dialect["delimiter"],
                    dialect["quotechar"],
                    dialect["escapechar"],
                )
                self.assertEqual(base, expected_base)

            with self.subTest(name=case["name"], kind="merge"):
                merge = c_merge_with_quotechar(base)
                self.assertEqual(merge, expected_merge)

            with self.subTest(name=case["name"], kind="empties"):
                empties = fill_empties(merge)
                self.assertEqual(empties, expected_empties)

            with self.subTest(name=case["name"], kind="trailing"):
                trailing = strip_trailing(empties)
                self.assertEqual(trailing, expected_trailing)


if __name__ == "__main__":
    unittest.main()
class CodeCommand(Command):
    """CLI command that prints a ready-made Python snippet for a CSV file.

    Detects the dialect (and, unless overridden, the encoding) of the
    given file and emits import code so the user does not have to rerun
    detection every time.
    """

    # Shown in --help and rendered into the generated man page; keep
    # the exact wording in sync with man/clevercsv-code.1.
    _description = (
        "Generate Python code for importing a given CSV file. This is "
        "especially useful if you don't want to repeatedly detect the dialect "
        "of the same file. Simply run:\n\n"
        "\tclevercsv code your_csv_file.csv\n\n"
        "and copy the generated code to a Python script."
    )

    def __init__(self) -> None:
        super().__init__(
            name="code",
            title="Generate Python code to import a CSV file",
            description=self._description,
            extra_sections={"CleverCSV": "Part of the CleverCSV suite"},
        )

    def register(self) -> None:
        """Declare the command-line arguments for this command."""
        self.add_argument("path", help="Path to the CSV file")
        self.add_argument(
            "-e",
            "--encoding",
            help="Set the encoding of the file",
            description=FLAG_DESCRIPTIONS["encoding"],
        )
        self.add_argument(
            "-n",
            "--num-chars",
            type=int,
            help="Number of characters to use for detection",
            description=FLAG_DESCRIPTIONS["num-chars"],
        )
        self.add_argument(
            "-p",
            "--pandas",
            action="store_true",
            help="Write code that uses a Pandas DataFrame",
            description=(
                "By default, this command writes a small Python script to "
                "import the CSV file as a list of lists. By enabling this "
                "option the script will be written such that the file will be "
                "read as a Pandas DataFrame instead."
            ),
        )

    def handle(self) -> int:
        """Run the command; returns a process exit code (0 ok, 1 failure)."""
        filename = self.args.path
        # An explicit --encoding wins over automatic detection.
        encoding = self.args.encoding or get_encoding(filename)
        num_chars = parse_int(self.args.num_chars, "num-chars")
        dialect = detect_dialect(
            filename,
            num_chars=num_chars,
            encoding=encoding,
            verbose=self.args.verbose,
        )
        if dialect is None:
            print("Error: dialect detection failed.", file=sys.stderr)
            return 1

        code_lines = generate_code(
            filename, dialect, encoding, use_pandas=self.args.pandas
        )
        print("\n".join(code_lines))
        return 0
van den Burg 4 | .\" Generator: Wilderness 5 | .\" Date: 2025-10-30 6 | .\" Manual: clevercsv Manual 7 | .\" Source: clevercsv 0.8.4 8 | .\" Language: English 9 | .\" 10 | .TH "CLEVERCSV-VIEW" "1" "2025\-10\-30" "Clevercsv 0\&.8\&.4" "Clevercsv Manual" 11 | .\" ----------------------------------------------------------------- 12 | .\" * Define some portability stuff 13 | .\" ----------------------------------------------------------------- 14 | .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 15 | .\" http://bugs.debian.org/507673 16 | .\" http://lists.gnu.org/archive/html/groff/2009-02/msg00013.html 17 | .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 18 | .ie \n(.g .ds Aq \(aq 19 | .el .ds Aq ' 20 | .\" ----------------------------------------------------------------- 21 | .\" * set default formatting * 22 | .\" ----------------------------------------------------------------- 23 | .\" disable hyphenation 24 | .nh 25 | .\" disable justification 26 | .ad l 27 | .\" ----------------------------------------------------------------- 28 | .\" * MAIN CONTENT STARTS HERE * 29 | .\" ----------------------------------------------------------------- 30 | .SH "NAME" 31 | clevercsv-view \- View the CSV file on the command line using TabView 32 | .SH "SYNOPSIS" 33 | .sp 34 | .nf 35 | \fIclevercsv view [\-e ENCODING | \-\-encoding=ENCODING] 36 | [\-n NUM_CHARS | \-\-num\-chars=NUM_CHARS] [\-t | \-\-transpose] 37 | .fi 38 | .sp 39 | .SH "DESCRIPTION" 40 | .sp 41 | The view command is useful to quickly inspect a messy CSV file on the command line. 42 | .SH "OPTIONS" 43 | .sp 44 | .sp 45 | .sp 46 | \-h, \-\-help 47 | .RS 4 48 | show this help message and exit 49 | .RE 50 | .PP 51 | \-e, \-\-encoding 52 | .RS 4 53 | The file encoding of the given CSV file is automatically detected using chardet. While chardet is incredibly accurate, it is not perfect. 
In the rare cases that it makes a mistake in detecting the file encoding, you can override the encoding by providing it through this flag. Moreover, when you have a number of CSV files with a known file encoding, you can use this option to speed up the code generation process. 54 | .RE 55 | .PP 56 | \-n, \-\-num\-chars 57 | .RS 4 58 | On large CSV files, dialect detection can sometimes be a bit slow due to the large number of possible dialects to consider. To alleviate this, you can limit the number of characters to use for detection. 59 | .sp 60 | One aspect to keep in mind is that CleverCSV may need to read a specific number of characters to be able to correctly infer the dialect. For example, in the ``imdb.csv`` file in the GitHub repository, the correct dialect can only be found after at least 66 lines of the file are read. Therefore, if there is availability to run CleverCSV on the entire file, that is generally recommended. 61 | .RE 62 | .PP 63 | \-t, \-\-transpose 64 | .RS 4 65 | Transpose the columns of the input file before viewing 66 | .RE 67 | .PP 68 | 69 | .RS 4 70 | Path to the CSV file 71 | .RE 72 | .PP 73 | .sp 74 | .SH "CLEVERCSV" 75 | .sp 76 | Part of the CleverCSV suite -------------------------------------------------------------------------------- /tests/test_unit/test_encoding.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Unit tests for encoding detection 4 | 5 | Author: G.J.J. van den Burg 6 | License: See the LICENSE file. 7 | 8 | This file is part of CleverCSV. 
9 | 10 | """ 11 | 12 | import os 13 | import tempfile 14 | import unittest 15 | 16 | from dataclasses import dataclass 17 | 18 | from typing import Any 19 | from typing import List 20 | 21 | from clevercsv._optional import import_optional_dependency 22 | from clevercsv._types import AnyPath 23 | from clevercsv.encoding import get_encoding 24 | from clevercsv.write import writer 25 | 26 | 27 | class EncodingTestCase(unittest.TestCase): 28 | @dataclass 29 | class Instance: 30 | table: List[List[Any]] 31 | encoding: str 32 | cchardet_encoding: str 33 | 34 | cases: List[Instance] = [ 35 | Instance( 36 | table=[["Å", "B", "C"], [1, 2, 3], [4, 5, 6]], 37 | encoding="ISO-8859-1", 38 | cchardet_encoding="WINDOWS-1252", 39 | ), 40 | Instance( 41 | table=[["A", "B", "C"], [1, 2, 3], [4, 5, 6]], 42 | encoding="ascii", 43 | cchardet_encoding="ASCII", 44 | ), 45 | Instance( 46 | table=[["亜唖", "娃阿", "哀愛"], [1, 2, 3], ["挨", "姶", "葵"]], 47 | encoding="ISO-2022-JP", 48 | cchardet_encoding="ISO-2022-JP", 49 | ), 50 | ] 51 | 52 | def setUp(self) -> None: 53 | self._tmpfiles: List[AnyPath] = [] 54 | 55 | def tearDown(self) -> None: 56 | for f in self._tmpfiles: 57 | os.unlink(f) 58 | 59 | def _build_file(self, table: List[List[str]], encoding: str) -> str: 60 | tmpfd, tmpfname = tempfile.mkstemp( 61 | prefix="ccsv_", 62 | suffix=".csv", 63 | ) 64 | tmpfp = os.fdopen(tmpfd, "w", newline=None, encoding=encoding) 65 | w = writer(tmpfp, dialect="excel") 66 | w.writerows(table) 67 | tmpfp.close() 68 | self._tmpfiles.append(tmpfname) 69 | return tmpfname 70 | 71 | def test_encoding_chardet(self) -> None: 72 | for case in self.cases: 73 | table = case.table 74 | encoding = case.encoding 75 | with self.subTest(encoding=encoding): 76 | tmpfname = self._build_file(table, encoding) 77 | detected = get_encoding(tmpfname, try_cchardet=False) 78 | self.assertEqual(encoding, detected) 79 | 80 | def test_encoding_cchardet(self) -> None: 81 | try: 82 | _ = import_optional_dependency("cchardet") 83 | 
class ViewCommand(Command):
    """CLI command that opens a CSV file in the TabView curses viewer.

    Detects the dialect (and optionally the encoding), reads the full
    table, and hands the rows to TabView for interactive inspection.
    """

    # Shown in --help and rendered into the generated man page.
    _description = (
        "The view command is useful to quickly inspect a messy CSV file on "
        "the command line."
    )

    def __init__(self) -> None:
        super().__init__(
            name="view",
            title="View the CSV file on the command line using TabView",
            description=self._description,
            extra_sections={"CleverCSV": "Part of the CleverCSV suite"},
        )

    def register(self) -> None:
        """Declare the command-line arguments for this command."""
        self.add_argument("path", help="Path to the CSV file")
        self.add_argument(
            "-e",
            "--encoding",
            help="Set the encoding of the file",
            description=FLAG_DESCRIPTIONS["encoding"],
        )
        self.add_argument(
            "-n",
            "--num-chars",
            help="Number of characters to use for detection",
            type=int,
            description=FLAG_DESCRIPTIONS["num-chars"],
        )
        self.add_argument(
            "-t",
            "--transpose",
            action="store_true",
            help="Transpose the columns of the input file before viewing",
        )

    def _tabview(self, rows: List[List[str]]) -> None:
        """Display ``rows`` using TabView (unsupported on Windows)."""
        # TabView depends on curses, which is unavailable on Windows.
        if sys.platform == "win32":
            print(
                "Error: unfortunately Tabview is not available on Windows, so "
                "the clevercsv view command is not available",
                file=sys.stderr,
            )
            return

        import_optional_dependency("tabview", raise_on_missing=True)
        from tabview import view

        view(rows)

    def handle(self) -> int:
        """Run the command; returns a process exit code (0 ok, 1 failure)."""
        verbose = self.args.verbose
        num_chars = parse_int(self.args.num_chars, "num-chars")
        try:
            rows = read_table(
                self.args.path,
                encoding=self.args.encoding,
                num_chars=num_chars,
                verbose=verbose,
            )
        except NoDetectionResult:
            print("Error: dialect detection failed.", file=sys.stderr)
            return 1

        # Guard on `rows` being non-empty: max() on an empty sequence
        # raises ValueError, so transposing an empty table would crash.
        if self.args.transpose and rows:
            # Pad ragged rows with None to the longest row so that
            # zip() does not silently drop trailing fields.
            max_row_length = max(map(len, rows))
            fixed_rows: List[Sequence[Optional[str]]] = []
            for row in rows:
                if len(row) == max_row_length:
                    fixed_rows.append(row)
                else:
                    fixed_rows.append(
                        row + [None] * (max_row_length - len(row))
                    )
            rows = list(map(list, zip(*fixed_rows)))
        self._tabview(rows)
        return 0
68 | 69 | """ 70 | msg = ( 71 | f"\nOptional dependency '{name}' is missing. You can install it using " 72 | "pip or conda, or you can install CleverCSV with all of its optional " 73 | "dependencies by running: pip install clevercsv[full]" 74 | ) 75 | try: 76 | module = importlib.import_module(name) 77 | except ImportError: 78 | if raise_on_missing: 79 | raise ImportError(msg) from None 80 | else: 81 | return None 82 | 83 | opt_dependencies: Dict[str, OptionalDependency] = { 84 | d.import_name: d for d in OPTIONAL_DEPENDENCIES 85 | } 86 | 87 | dependency = opt_dependencies.get(name) 88 | if dependency is None: 89 | raise ImportError(f"No known optional dependency with name: {name}") 90 | 91 | version = getattr(module, "__version__", None) 92 | if version is None: 93 | return module 94 | 95 | if Version(version) < Version(dependency.min_version): 96 | msg = ( 97 | f"CleverCSV requires version '{dependency.min_version}' or newer " 98 | f"for optional dependency '{dependency.package_name}'. Please " 99 | "update the package or install CleverCSV with all its optional " 100 | "dependencies using: pip install clevercsv[full]" 101 | ) 102 | raise ImportError(msg) 103 | 104 | return module 105 | -------------------------------------------------------------------------------- /man/clevercsv-code.1: -------------------------------------------------------------------------------- 1 | '\" t 2 | .\" Title: clevercsv-code 3 | .\" Author: G.J.J. 
van den Burg 4 | .\" Generator: Wilderness 5 | .\" Date: 2025-10-30 6 | .\" Manual: clevercsv Manual 7 | .\" Source: clevercsv 0.8.4 8 | .\" Language: English 9 | .\" 10 | .TH "CLEVERCSV-CODE" "1" "2025\-10\-30" "Clevercsv 0\&.8\&.4" "Clevercsv Manual" 11 | .\" ----------------------------------------------------------------- 12 | .\" * Define some portability stuff 13 | .\" ----------------------------------------------------------------- 14 | .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 15 | .\" http://bugs.debian.org/507673 16 | .\" http://lists.gnu.org/archive/html/groff/2009-02/msg00013.html 17 | .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 18 | .ie \n(.g .ds Aq \(aq 19 | .el .ds Aq ' 20 | .\" ----------------------------------------------------------------- 21 | .\" * set default formatting * 22 | .\" ----------------------------------------------------------------- 23 | .\" disable hyphenation 24 | .nh 25 | .\" disable justification 26 | .ad l 27 | .\" ----------------------------------------------------------------- 28 | .\" * MAIN CONTENT STARTS HERE * 29 | .\" ----------------------------------------------------------------- 30 | .SH "NAME" 31 | clevercsv-code \- Generate Python code to import a CSV file 32 | .SH "SYNOPSIS" 33 | .sp 34 | .nf 35 | \fIclevercsv code [\-e ENCODING | \-\-encoding=ENCODING] 36 | [\-n NUM_CHARS | \-\-num\-chars=NUM_CHARS] [\-p | \-\-pandas] 37 | .fi 38 | .sp 39 | .SH "DESCRIPTION" 40 | .sp 41 | Generate Python code for importing a given CSV file. This is especially useful if you don't want to repeatedly detect the dialect of the same file. Simply run: 42 | .sp 43 | .RS 4 44 | clevercsv code your_csv_file.csv 45 | .RE 46 | .sp 47 | and copy the generated code to a Python script. 
48 | .SH "OPTIONS" 49 | .sp 50 | .sp 51 | .sp 52 | \-h, \-\-help 53 | .RS 4 54 | show this help message and exit 55 | .RE 56 | .PP 57 | \-e, \-\-encoding 58 | .RS 4 59 | The file encoding of the given CSV file is automatically detected using chardet. While chardet is incredibly accurate, it is not perfect. In the rare cases that it makes a mistake in detecting the file encoding, you can override the encoding by providing it through this flag. Moreover, when you have a number of CSV files with a known file encoding, you can use this option to speed up the code generation process. 60 | .RE 61 | .PP 62 | \-n, \-\-num\-chars 63 | .RS 4 64 | On large CSV files, dialect detection can sometimes be a bit slow due to the large number of possible dialects to consider. To alleviate this, you can limit the number of characters to use for detection. 65 | .sp 66 | One aspect to keep in mind is that CleverCSV may need to read a specific number of characters to be able to correctly infer the dialect. For example, in the ``imdb.csv`` file in the GitHub repository, the correct dialect can only be found after at least 66 lines of the file are read. Therefore, if there is availability to run CleverCSV on the entire file, that is generally recommended. 67 | .RE 68 | .PP 69 | \-p, \-\-pandas 70 | .RS 4 71 | By default, this command writes a small Python script to import the CSV file as a list of lists. By enabling this option the script will be written such that the file will be read as a Pandas DataFrame instead. 
72 | .RE 73 | .PP 74 | 75 | .RS 4 76 | Path to the CSV file 77 | .RE 78 | .PP 79 | .sp 80 | .SH "CLEVERCSV" 81 | .sp 82 | Part of the CleverCSV suite -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, 8 | body size, disability, ethnicity, sex characteristics, gender identity and 9 | expression, level of experience, education, socio-economic status, 10 | nationality, personal appearance, race, religion, or sexual identity and 11 | orientation. 12 | 13 | ## Our Standards 14 | 15 | Examples of behavior that contributes to creating a positive environment 16 | include: 17 | 18 | * Using welcoming and inclusive language 19 | * Being respectful of differing viewpoints and experiences 20 | * Gracefully accepting constructive criticism 21 | * Focusing on what is best for the community 22 | * Showing empathy towards other community members 23 | 24 | Examples of unacceptable behavior by participants include: 25 | 26 | * The use of sexualized language or imagery and unwelcome sexual attention or 27 | advances 28 | * Trolling, insulting/derogatory comments, and personal or political attacks 29 | * Public or private harassment 30 | * Publishing others' private information, such as a physical or electronic 31 | address, without explicit permission 32 | * Other conduct which could reasonably be considered inappropriate in a 33 | professional setting 34 | 35 | ## Our Responsibilities 36 | 37 | Project maintainers are responsible for clarifying the standards of acceptable 38 | behavior and are expected to take appropriate and fair corrective action in 
39 | response to any instances of unacceptable behavior. 40 | 41 | Project maintainers have the right and responsibility to remove, edit, or 42 | reject comments, commits, code, wiki edits, issues, and other contributions 43 | that are not aligned to this Code of Conduct, or to ban temporarily or 44 | permanently any contributor for other behaviors that they deem inappropriate, 45 | threatening, offensive, or harmful. 46 | 47 | ## Scope 48 | 49 | This Code of Conduct applies both within project spaces and in public spaces 50 | when an individual is representing the project or its community. Examples of 51 | representing a project or community include using an official project e-mail 52 | address, posting via an official social media account, or acting as an 53 | appointed representative at an online or offline event. Representation of a 54 | project may be further defined and clarified by project maintainers. 55 | 56 | ## Enforcement 57 | 58 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 59 | reported by contacting the project team at gvandenburg@turing.ac.uk. All 60 | complaints will be reviewed and investigated and will result in a response 61 | that is deemed necessary and appropriate to the circumstances. The project 62 | team is obligated to maintain confidentiality with regard to the reporter of 63 | an incident. Further details of specific enforcement policies may be posted 64 | separately. 65 | 66 | Project maintainers who do not follow or enforce the Code of Conduct in good 67 | faith may face temporary or permanent repercussions as determined by other 68 | members of the project's leadership. 
class ExploreCommand(Command):
    """CLI command that loads a CSV file into an interactive Python shell.

    Detects the file's dialect, generates the same import code as the
    ``code`` command, executes it in an :class:`code.InteractiveConsole`,
    and then drops the user into that console with the data pre-loaded.
    """

    # Shown in --help and rendered into the generated man page; keep
    # the exact wording in sync with man/clevercsv-explore.1.
    _description = (
        "The explore command allows you to quickly explore a CSV file in an "
        "interactive Python shell. This command detects the dialect of the "
        "CSV file and drops you into a Python interactive shell (REPL), "
        "with the CSV file already loaded. Simply run:\n\n"
        "\tclevercsv explore FILE\n\n"
        "to start working with the file loaded as a list of lists. "
        "Alternatively, you can run:\n\n"
        "\tclevercsv explore -p FILE\n\n"
        "to read the file as a Pandas dataframe."
    )

    def __init__(self) -> None:
        super().__init__(
            name="explore",
            title="Explore the CSV file in an interactive Python shell",
            description=self._description,
            extra_sections={"CleverCSV": "Part of the CleverCSV suite"},
        )

    def register(self) -> None:
        """Declare the command-line arguments for this command."""
        self.add_argument("path", help="Path to the CSV file")
        self.add_argument(
            "-e",
            "--encoding",
            help="Set the encoding of the file",
            description=FLAG_DESCRIPTIONS["encoding"],
        )
        self.add_argument(
            "-n",
            "--num-chars",
            help="Number of characters to use for detection",
            type=int,
            description=FLAG_DESCRIPTIONS["num-chars"],
        )
        self.add_argument(
            "-p",
            "--pandas",
            action="store_true",
            help="Read the file into a Pandas DataFrame",
            description=(
                "By default, this command imports the CSV file as a list of "
                "lists. By enabling this option the script will be written "
                "such that the file will be read as a Pandas DataFrame "
                "instead."
            ),
        )

    def handle(self) -> int:
        """Run the command; returns a process exit code (0 ok, 1 failure)."""
        filename = self.args.path
        # An explicit --encoding wins over automatic detection.
        encoding = self.args.encoding or get_encoding(filename)
        num_chars = parse_int(self.args.num_chars, "num-chars")
        dialect = detect_dialect(
            filename,
            num_chars=num_chars,
            encoding=encoding,
            verbose=self.args.verbose,
        )
        if dialect is None:
            print("Error: dialect detection failed.", file=sys.stderr)
            return 1

        code_lines = generate_code(
            filename, dialect, encoding, use_pandas=self.args.pandas
        )

        console = code.InteractiveConsole()
        for line in code_lines:
            # NOTE(review): InteractiveConsole.push returns True when more
            # input is needed (an incomplete statement), not on execution
            # errors — presumably every generated line is a complete
            # statement, so a truthy value here is treated as failure;
            # verify against generate_code's output format.
            retcode = console.push(line)
            if retcode:
                print(
                    "An error occurred starting the interactive console. "
                    "Printing commands instead:\n"
                )
                print("\n".join(code_lines))
                return 1

        print("Dropping you into an interactive shell.\n")
        banner = "CleverCSV has loaded the data into the variable: "
        banner += "df" if self.args.pandas else "rows"
        console.interact(banner=banner)
        return 0
NUM_CHARS | \-\-num\-chars=NUM_CHARS] [\-p | \-\-pandas] 37 | .fi 38 | .sp 39 | .SH "DESCRIPTION" 40 | .sp 41 | The explore command allows you to quickly explore a CSV file in an interactive Python shell. This command detects the dialect of the CSV file and drops you into a Python interactive shell (REPL), with the CSV file already loaded. Simply run: 42 | .sp 43 | .RS 4 44 | clevercsv explore FILE 45 | .RE 46 | .sp 47 | to start working with the file loaded as a list of lists. Alternatively, you can run: 48 | .sp 49 | .RS 4 50 | clevercsv explore \-p FILE 51 | .RE 52 | .sp 53 | to read the file as a Pandas dataframe. 54 | .SH "OPTIONS" 55 | .sp 56 | .sp 57 | .sp 58 | \-h, \-\-help 59 | .RS 4 60 | show this help message and exit 61 | .RE 62 | .PP 63 | \-e, \-\-encoding 64 | .RS 4 65 | The file encoding of the given CSV file is automatically detected using chardet. While chardet is incredibly accurate, it is not perfect. In the rare cases that it makes a mistake in detecting the file encoding, you can override the encoding by providing it through this flag. Moreover, when you have a number of CSV files with a known file encoding, you can use this option to speed up the code generation process. 66 | .RE 67 | .PP 68 | \-n, \-\-num\-chars 69 | .RS 4 70 | On large CSV files, dialect detection can sometimes be a bit slow due to the large number of possible dialects to consider. To alleviate this, you can limit the number of characters to use for detection. 71 | .sp 72 | One aspect to keep in mind is that CleverCSV may need to read a specific number of characters to be able to correctly infer the dialect. For example, in the ``imdb.csv`` file in the GitHub repository, the correct dialect can only be found after at least 66 lines of the file are read. Therefore, if there is availability to run CleverCSV on the entire file, that is generally recommended. 73 | .RE 74 | .PP 75 | \-p, \-\-pandas 76 | .RS 4 77 | By default, this command imports the CSV file as a list of lists. 
By enabling this option the script will be written such that the file will be read as a Pandas DataFrame instead. 78 | .RE 79 | .PP 80 | 81 | .RS 4 82 | Path to the CSV file 83 | .RE 84 | .PP 85 | .sp 86 | .SH "CLEVERCSV" 87 | .sp 88 | Part of the CleverCSV suite -------------------------------------------------------------------------------- /docs/source/clevercsv.rst: -------------------------------------------------------------------------------- 1 | clevercsv package 2 | ================= 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | clevercsv.console 11 | 12 | Submodules 13 | ---------- 14 | 15 | clevercsv.break\_ties module 16 | ---------------------------- 17 | 18 | .. automodule:: clevercsv.break_ties 19 | :members: 20 | :show-inheritance: 21 | :undoc-members: 22 | 23 | clevercsv.consistency module 24 | ---------------------------- 25 | 26 | .. automodule:: clevercsv.consistency 27 | :members: 28 | :show-inheritance: 29 | :undoc-members: 30 | 31 | clevercsv.cparser\_util module 32 | ------------------------------ 33 | 34 | .. automodule:: clevercsv.cparser_util 35 | :members: 36 | :show-inheritance: 37 | :undoc-members: 38 | 39 | clevercsv.detect module 40 | ----------------------- 41 | 42 | .. automodule:: clevercsv.detect 43 | :members: 44 | :show-inheritance: 45 | :undoc-members: 46 | 47 | clevercsv.detect\_pattern module 48 | -------------------------------- 49 | 50 | .. automodule:: clevercsv.detect_pattern 51 | :members: 52 | :show-inheritance: 53 | :undoc-members: 54 | 55 | clevercsv.detect\_type module 56 | ----------------------------- 57 | 58 | .. automodule:: clevercsv.detect_type 59 | :members: 60 | :show-inheritance: 61 | :undoc-members: 62 | 63 | clevercsv.dialect module 64 | ------------------------ 65 | 66 | .. automodule:: clevercsv.dialect 67 | :members: 68 | :show-inheritance: 69 | :undoc-members: 70 | 71 | clevercsv.dict\_read\_write module 72 | ---------------------------------- 73 | 74 | .. 
automodule:: clevercsv.dict_read_write 75 | :members: 76 | :show-inheritance: 77 | :undoc-members: 78 | 79 | clevercsv.encoding module 80 | ------------------------- 81 | 82 | .. automodule:: clevercsv.encoding 83 | :members: 84 | :show-inheritance: 85 | :undoc-members: 86 | 87 | clevercsv.escape module 88 | ----------------------- 89 | 90 | .. automodule:: clevercsv.escape 91 | :members: 92 | :show-inheritance: 93 | :undoc-members: 94 | 95 | clevercsv.exceptions module 96 | --------------------------- 97 | 98 | .. automodule:: clevercsv.exceptions 99 | :members: 100 | :show-inheritance: 101 | :undoc-members: 102 | 103 | clevercsv.method module 104 | ----------------------- 105 | 106 | .. automodule:: clevercsv.method 107 | :members: 108 | :show-inheritance: 109 | :undoc-members: 110 | 111 | clevercsv.normal\_form module 112 | ----------------------------- 113 | 114 | .. automodule:: clevercsv.normal_form 115 | :members: 116 | :show-inheritance: 117 | :undoc-members: 118 | 119 | clevercsv.potential\_dialects module 120 | ------------------------------------ 121 | 122 | .. automodule:: clevercsv.potential_dialects 123 | :members: 124 | :show-inheritance: 125 | :undoc-members: 126 | 127 | clevercsv.read module 128 | --------------------- 129 | 130 | .. automodule:: clevercsv.read 131 | :members: 132 | :show-inheritance: 133 | :undoc-members: 134 | 135 | clevercsv.utils module 136 | ---------------------- 137 | 138 | .. automodule:: clevercsv.utils 139 | :members: 140 | :show-inheritance: 141 | :undoc-members: 142 | 143 | clevercsv.wrappers module 144 | ------------------------- 145 | 146 | .. automodule:: clevercsv.wrappers 147 | :members: 148 | :show-inheritance: 149 | :undoc-members: 150 | 151 | clevercsv.write module 152 | ---------------------- 153 | 154 | .. automodule:: clevercsv.write 155 | :members: 156 | :show-inheritance: 157 | :undoc-members: 158 | 159 | Module contents 160 | --------------- 161 | 162 | .. 
automodule:: clevercsv 163 | :members: 164 | :show-inheritance: 165 | :undoc-members: 166 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for easier installation and cleanup. 2 | # 3 | # Uses self-documenting macros from here: 4 | # http://marmelab.com/blog/2016/02/29/auto-documented-makefile.html 5 | 6 | SHELL := bash 7 | .SHELLFLAGS := -eu -o pipefail -c 8 | MAKEFLAGS += --no-builtin-rules 9 | 10 | PACKAGE=clevercsv 11 | DOC_DIR=./docs/ 12 | VENV_DIR=/tmp/clevercsv_venv 13 | PYTHON ?= python 14 | 15 | .PHONY: help 16 | 17 | .DEFAULT_GOAL := help 18 | 19 | help: 20 | @grep -E '^[0-9a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) |\ 21 | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-15s\033[0m\ 22 | %s\n", $$1, $$2}' 23 | 24 | ################ 25 | # Installation # 26 | ################ 27 | 28 | .PHONY: inplace install 29 | 30 | inplace: 31 | $(PYTHON) setup.py build_ext -i 32 | 33 | install: dist ## Install for the current user using the default python command 34 | $(PYTHON) -m pip install --user ./dist/$(PACKAGE)-*.tar.gz 35 | 36 | ################ 37 | # Distribution # 38 | ################ 39 | 40 | .PHONY: release dist 41 | 42 | release: ## Make a release 43 | $(PYTHON) make_release.py 44 | 45 | dist: man ## Make Python source distribution 46 | $(PYTHON) setup.py sdist 47 | 48 | ########### 49 | # Testing # 50 | ########### 51 | 52 | .PHONY: test integration integration_partial 53 | 54 | test: mypy green pytest 55 | 56 | green: venv ## Run unit tests 57 | source $(VENV_DIR)/bin/activate && green -a -vv ./tests/test_unit 58 | 59 | pytest: venv ## Run unit tests with PyTest 60 | source $(VENV_DIR)/bin/activate && pytest -ra -m 'not network' 61 | 62 | mypy: venv ## Run type checks 63 | source $(VENV_DIR)/bin/activate && \ 64 | mypy --check-untyped-defs ./stubs $(PACKAGE) ./tests 65 | 66 | integration: venv ## Run integration 
tests 67 | source $(VENV_DIR)/bin/activate && python ./tests/test_integration/test_dialect_detection.py -v 68 | 69 | integration_partial: venv ## Run partial integration tests 70 | source $(VENV_DIR)/bin/activate && python ./tests/test_integration/test_dialect_detection.py -v --partial 71 | 72 | 73 | ################# 74 | # Documentation # 75 | ################# 76 | 77 | .PHONY: docs doc man 78 | 79 | docs: doc 80 | doc: venv ## Build documentation with Sphinx 81 | source $(VENV_DIR)/bin/activate && m2r2 README.md && mv README.rst $(DOC_DIR) 82 | source $(VENV_DIR)/bin/activate && m2r2 CHANGELOG.md && mv CHANGELOG.rst $(DOC_DIR) 83 | cd $(DOC_DIR) && \ 84 | rm source/* && \ 85 | source $(VENV_DIR)/bin/activate && \ 86 | sphinx-apidoc -H 'CleverCSV API Documentation' -o source ../$(PACKAGE) && \ 87 | touch source/AUTOGENERATED 88 | source $(VENV_DIR)/bin/activate && $(MAKE) -C $(DOC_DIR) html 89 | 90 | man: venv ## Build man pages using Wilderness 91 | source $(VENV_DIR)/bin/activate && \ 92 | python setup.py build_manpages 93 | 94 | ####################### 95 | # Virtual environment # 96 | ####################### 97 | 98 | .PHONY: venv clean_venv 99 | 100 | venv: $(VENV_DIR)/bin/activate 101 | 102 | $(VENV_DIR)/bin/activate: 103 | test -d $(VENV_DIR) || $(PYTHON) -m venv $(VENV_DIR) 104 | source $(VENV_DIR)/bin/activate && python -m pip install -e .[dev] 105 | touch $(VENV_DIR)/bin/activate 106 | 107 | clean_venv: 108 | rm -rf $(VENV_DIR) 109 | 110 | ############ 111 | # Clean up # 112 | ############ 113 | 114 | .PHONY: clean 115 | 116 | clean: clean_venv ## Clean build dist and egg directories left after install 117 | rm -rf ./dist 118 | rm -rf ./build 119 | rm -rf ./$(PACKAGE).egg-info 120 | rm -rf ./cover 121 | rm -f MANIFEST 122 | rm -f ./$(PACKAGE)/*.so 123 | rm -f ./*_valgrind.log* 124 | rm -f ./man/* 125 | find . -type f -iname '*.pyc' -delete 126 | find . 
# -*- coding: utf-8 -*-

"""
Unit tests for the CCSV write module.

Author: Gertjan van den Burg

"""


import csv
import tempfile
import unittest

from typing import Any
from typing import Iterable

import clevercsv

from clevercsv.dialect import SimpleDialect


class WriterTestCase(unittest.TestCase):
    # Tests mirror CPython's csv writer tests: each helper writes a single
    # row to a temporary file and compares the raw file contents.

    def _write_test(
        self, fields: Iterable[Any], expect: str, **kwargs: Any
    ) -> None:
        """Write ``fields`` as one row and assert the file equals ``expect``.

        ``kwargs`` are forwarded to ``clevercsv.writer`` (e.g. ``quoting``
        or ``dialect``). The expected string excludes the line terminator,
        which is appended here from the writer's dialect.
        """
        with tempfile.TemporaryFile("w+", newline="", prefix="ccsv_") as fp:
            writer = clevercsv.writer(fp, **kwargs)
            writer.writerow(fields)
            fp.seek(0)
            self.assertEqual(fp.read(), expect + writer.dialect.lineterminator)

    def _write_error_test(
        self, exc: type[Exception], fields: Any, **kwargs: Any
    ) -> None:
        """Assert that writing ``fields`` raises ``exc`` and writes nothing."""
        with tempfile.TemporaryFile("w+", newline="", prefix="ccsv_") as fp:
            writer = clevercsv.writer(fp, **kwargs)
            with self.assertRaises(exc):
                writer.writerow(fields)
            fp.seek(0)
            # The file must be empty: a failed writerow may not emit output.
            self.assertEqual(fp.read(), "")

    def test_write_arg_valid(self) -> None:
        """Validate argument handling: None rows, empty rows, None fields."""
        self._write_error_test(clevercsv.Error, None)
        self._write_test((), "")
        self._write_test([None], '""')
        # None cannot be represented without quoting.
        self._write_error_test(
            clevercsv.Error, [None], quoting=clevercsv.QUOTE_NONE
        )

        # Check that exceptions are passed up the chain
        class BadList:
            def __len__(self) -> int:
                return 10

            def __getitem__(self, i: int) -> None:
                if i > 2:
                    raise OSError

        self._write_error_test(OSError, BadList())

        class BadItem:
            def __str__(self) -> str:
                raise OSError

        self._write_error_test(OSError, [BadItem()])

    def test_write_bigfield(self) -> None:
        """Fields larger than typical buffer sizes must round-trip intact."""
        bigstring = "X" * 50000
        self._write_test(
            [bigstring, bigstring], "%s,%s" % (bigstring, bigstring)
        )

    def test_write_quoting(self) -> None:
        """Exercise each QUOTE_* mode on fields containing the delimiter."""
        self._write_test(["a", 1, "p,q"], 'a,1,"p,q"')
        self._write_error_test(
            clevercsv.Error, ["a", 1, "p,q"], quoting=clevercsv.QUOTE_NONE
        )
        self._write_test(
            ["a", 1, "p,q"], 'a,1,"p,q"', quoting=clevercsv.QUOTE_MINIMAL
        )
        self._write_test(
            ["a", 1, "p,q"], '"a",1,"p,q"', quoting=clevercsv.QUOTE_NONNUMERIC
        )
        self._write_test(
            ["a", 1, "p,q"], '"a","1","p,q"', quoting=clevercsv.QUOTE_ALL
        )
        self._write_test(
            ["a\nb", 1], '"a\nb","1"', quoting=clevercsv.QUOTE_ALL
        )

    def test_write_simpledialect(self) -> None:
        """The writer must accept CleverCSV's own SimpleDialect objects."""
        self._write_test(
            ["a", 1, "p,q"],
            "a,1,|p,q|",
            dialect=SimpleDialect(delimiter=",", quotechar="|", escapechar=""),
        )

    def test_write_csv_dialect(self) -> None:
        """The writer must accept stdlib csv dialects by name and by class."""
        self._write_test(
            ["a", 1, "p,q"],
            'a,1,"p,q"',
            dialect="excel",
        )
        self._write_test(
            ["a", 1, "p,q"],
            '"a","1","p,q"',
            dialect=csv.unix_dialect,
        )
        self._write_test(
            [1, 2, 3],
            "1\t2\t3",
            dialect=clevercsv.excel_tab,
        )


if __name__ == "__main__":
    unittest.main()
van den Burg 4 | .\" Generator: Wilderness 5 | .\" Date: 2025-10-30 6 | .\" Manual: clevercsv Manual 7 | .\" Source: clevercsv 0.8.4 8 | .\" Language: English 9 | .\" 10 | .TH "CLEVERCSV" "1" "2025\-10\-30" "Clevercsv 0\&.8\&.4" "Clevercsv Manual" 11 | .\" ----------------------------------------------------------------- 12 | .\" * Define some portability stuff 13 | .\" ----------------------------------------------------------------- 14 | .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 15 | .\" http://bugs.debian.org/507673 16 | .\" http://lists.gnu.org/archive/html/groff/2009-02/msg00013.html 17 | .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 18 | .ie \n(.g .ds Aq \(aq 19 | .el .ds Aq ' 20 | .\" ----------------------------------------------------------------- 21 | .\" * set default formatting * 22 | .\" ----------------------------------------------------------------- 23 | .\" disable hyphenation 24 | .nh 25 | .\" disable justification 26 | .ad l 27 | .\" ----------------------------------------------------------------- 28 | .\" * MAIN CONTENT STARTS HERE * 29 | .\" ----------------------------------------------------------------- 30 | .SH "NAME" 31 | clevercsv \- CleverCSV command line tool 32 | .SH "SYNOPSIS" 33 | .sp 34 | .nf 35 | \fIclevercsv [\-h | \-\-help] [\-V | \-\-version] [\-v | \-\-verbose] 36 | .fi 37 | .sp 38 | .SH "DESCRIPTION" 39 | .sp 40 | CleverCSV is a Python library and command line tool for dealing with messy CSV files. It consists of a number of commands that can be used to analyze, explore, or standardize a messy CSV file. 
41 | .sp 42 | Further help and documentation can be found online at https://github.com/alan\-turing\-institute/CleverCSV or https://clevercsv.readthedocs.io 43 | .SH "OPTIONS" 44 | .sp 45 | .sp 46 | .sp 47 | \-h, \-\-help 48 | .RS 4 49 | show this help message and exit 50 | .RE 51 | .PP 52 | \-V, \-\-version 53 | .RS 4 54 | Show version and exit 55 | .RE 56 | .PP 57 | \-v, \-\-verbose 58 | .RS 4 59 | Enable verbose mode 60 | .RE 61 | .PP 62 | .sp 63 | .SH "COMMANDS" 64 | .sp 65 | The following commands are available in CleverCSV: 66 | .sp 67 | clevercsv\-code(1) 68 | .RS 4 69 | Generate Python code to import a given CSV file 70 | .RE 71 | .sp 72 | clevercsv\-detect(1) 73 | .RS 4 74 | Detect the dialect of a CSV file 75 | .RE 76 | .sp 77 | clevercsv\-explore(1) 78 | .RS 4 79 | Infer the dialect and open the file in an interactive Python session 80 | .RE 81 | .sp 82 | clevercsv\-standardize(1) 83 | .RS 4 84 | Convert a messy CSV file to one that follows RFC\-4180 85 | .RE 86 | .sp 87 | clevercsv\-view(1) 88 | .RS 4 89 | Detect the dialect and open the CSV file using TabView 90 | .RE 91 | .SH "AUTHORS" 92 | .sp 93 | The CleverCSV package was originally written by Gerrit van den Burg and came out of scientific research on wrangling messy CSV files by Gerrit van den Burg, Alfredo Nazabal, and Charles Sutton. This research was conducted at and supported by The Alan Turing Institute. CleverCSV has since benefitted from a number of open\-source contributors on GitHub. 94 | .SH "REPORTING BUGS" 95 | .sp 96 | If you encounter an issue in CleverCSV, please open an issue or submit a pull request at https://github.com/alan\-turing\-institute/CleverCSV. Please don't hesitate, you're helping to make this project better for everyone! 97 | .SH "NOTES" 98 | .sp 99 | \fB1. \fRCleverCSV GitHub repository 100 | .br 101 | https://github.com/alan\-turing\-institute/CleverCSV 102 | .sp 103 | \fB2. 
# -*- coding: utf-8 -*-

"""
CleverCSV Command line application.

"""

from wilderness import Application

from .. import __version__
from .commands import CodeCommand
from .commands import DetectCommand
from .commands import ExploreCommand
from .commands import StandardizeCommand
from .commands import ViewCommand


class CleverCSVApplication(Application):
    """Top-level Wilderness application for the ``clevercsv`` CLI.

    The string constants below are rendered verbatim into the clevercsv(1)
    man page by Wilderness, so edits here must stay in sync with man/.
    """

    _description = (
        "CleverCSV is a Python library and command line tool for dealing "
        "with messy CSV files. It consists of a number of commands that can "
        "be used to analyze, explore, or standardize a messy CSV file.\n\n"
        "Further help and documentation can be found online at "
        "https://github.com/alan-turing-institute/CleverCSV or "
        "https://clevercsv.readthedocs.io"
    )
    # Extra man-page sections, keyed by section title.
    _extra = {
        "Commands": (
            "The following commands are available in CleverCSV:\n\n"
            "clevercsv-code(1)\n"
            "\tGenerate Python code to import a given CSV file\n\n"
            "clevercsv-detect(1)\n"
            "\tDetect the dialect of a CSV file\n\n"
            "clevercsv-explore(1)\n"
            "\tInfer the dialect and open the file in an interactive Python "
            "session\n\n"
            "clevercsv-standardize(1)\n"
            "\tConvert a messy CSV file to one that follows RFC-4180\n\n"
            "clevercsv-view(1)\n"
            "\tDetect the dialect and open the CSV file using TabView"
        ),
        "Authors": (
            "The CleverCSV package was originally written by Gerrit van den "
            "Burg and came out of scientific research on wrangling messy CSV "
            "files by Gerrit van den Burg, Alfredo Nazabal, and Charles "
            "Sutton. This research was conducted at and supported by The "
            "Alan Turing Institute. CleverCSV has since benefitted from a "
            "number of open-source contributors on GitHub."
        ),
        "Reporting Bugs": (
            "If you encounter an issue in CleverCSV, please open an issue "
            "or submit a pull request at "
            "https://github.com/alan-turing-institute/CleverCSV. Please don't "
            "hesitate, you're helping to make this project better for "
            "everyone!"
        ),
        "Notes": (
            "1. CleverCSV GitHub repository\n"
            "   https://github.com/alan-turing-institute/CleverCSV\n\n"
            "2. CleverCSV documentation\n"
            "   https://clevercsv.readthedocs.io\n\n"
            "3. Wrangling Messy CSV Files by Detecting Row and Type Patterns\n"
            "   https://gertjanvandenburg.com/papers/VandenBurg_Nazabal_Sutton_-_Wrangling_Messy_CSV_Files_by_Detecting_Row_and_Type_Patterns_2019.pdf"
        ),
    }

    def __init__(self) -> None:
        """Initialize the application with its metadata and man-page text."""
        super().__init__(
            "clevercsv",
            version=__version__,
            title="CleverCSV command line tool",
            author="G.J.J. van den Burg",
            description=self._description,
            extra_sections=self._extra,
        )

    def register(self) -> None:
        """Register the global (application-level) command line flags."""
        self.add_argument(
            "-V",
            "--version",
            help="Show version and exit",
            action="version",
            version=__version__,
        )
        self.add_argument(
            "-v", "--verbose", help="Enable verbose mode", action="store_true"
        )


def build_application() -> Application:
    """Construct the CLI application with all subcommands registered."""
    app = CleverCSVApplication()
    app.add(DetectCommand())
    app.add(ViewCommand())
    app.add(StandardizeCommand())
    app.add(CodeCommand())
    app.add(ExploreCommand())
    return app
The standardize command can be used to convert a non\-standard CSV file to the standard RFC\-4180 format [1]. When using the \-\-in\-place option, the return code of CleverCSV can be used to check whether a file was altered or not. The return code will be 2 when the file was altered and 0 otherwise.
If omitted, the output file encoding will be the same as that of the original file.
# -*- coding: utf-8 -*-

"""
Code for computing the pattern score.

Author: Gertjan van den Burg

"""

import collections
import re

from typing import Optional
from typing import Pattern

from .cabstraction import base_abstraction
from .cabstraction import c_merge_with_quotechar
from .dialect import SimpleDialect

# Minimum per-row contribution, so that single-cell rows do not zero out the
# product in the pattern score.
DEFAULT_EPS_PAT: float = 1e-3

# Matches runs of two or more consecutive cell markers ("C").
RE_MULTI_C: Pattern[str] = re.compile(r"C{2,}")


def pattern_score(
    data: str, dialect: SimpleDialect, eps: float = DEFAULT_EPS_PAT
) -> float:
    """
    Compute the pattern score for given data and a dialect.

    Parameters
    ----------

    data : str
        The data of the file as a raw character string

    dialect: dialect.Dialect
        The dialect object

    eps : float
        Lower bound on the contribution of a row pattern, used so that rows
        with a single cell still contribute a small positive amount.

    Returns
    -------
    score : float
        the pattern score

    """
    A = make_abstraction(data, dialect)
    # Count how often each distinct row pattern occurs.
    row_patterns = collections.Counter(A.split("R"))
    P = 0.0
    for pat_k, Nk in row_patterns.items():
        # Lk is the number of cells in this row pattern.
        Lk = len(pat_k.split("D"))
        P += Nk * (max(eps, Lk - 1) / Lk)
    P /= len(row_patterns)
    return P


def make_abstraction(data: str, dialect: SimpleDialect) -> str:
    """Create an abstract representation of the CSV file based on the dialect.

    This function constructs the basic abstraction used to compute the row
    patterns.

    Parameters
    ----------
    data : str
        The data of the file as a string.

    dialect : SimpleDialect
        A dialect to parse the file with.

    Returns
    -------
    abstraction : str
        An abstract representation of the CSV file.

    """
    A = base_abstraction(
        data, dialect.delimiter, dialect.quotechar, dialect.escapechar
    )
    A = merge_with_quotechar(A)
    A = fill_empties(A)
    A = strip_trailing(A)
    return A


def merge_with_quotechar(
    S: str, dialect: Optional[SimpleDialect] = None
) -> str:
    """Merge quoted blocks in the abstraction

    This function takes the abstract representation and merges quoted blocks
    (``QC...CQ``) to a single cell (``C``). The function takes nested quotes
    into account.

    Parameters
    ----------
    S : str
        The abstract representation of the file.

    dialect : SimpleDialect
        The dialect used to make the abstraction. This is not used but kept for
        backwards compatibility. Will be removed in a future version.

    Returns
    -------
    abstraction : str
        A simplified version of the abstraction with quoted blocks merged.

    """
    return c_merge_with_quotechar(S)


def fill_empties(abstract: str) -> str:
    """Fill empty cells in the abstraction

    The way the row patterns are constructed assumes that empty cells are
    marked by the letter ``C`` as well. This function fills those in. The
    function also removes duplicate occurrences of ``CC`` and replaces these
    with ``C``.

    Parameters
    ----------
    abstract : str
        The abstract representation of the file.

    Returns
    -------
    abstraction : str
        The abstract representation with empties filled.

    """
    # Insert an explicit empty cell between consecutive delimiters and around
    # row boundaries. Loops are needed because each substitution can create a
    # new adjacent pair (e.g. "DDD" -> "DCDD" -> "DCDCD").
    while "DD" in abstract:
        abstract = abstract.replace("DD", "DCD")

    while "DR" in abstract:
        abstract = abstract.replace("DR", "DCR")

    while "RD" in abstract:
        abstract = abstract.replace("RD", "RCD")

    # Collapse runs of cell markers to a single cell marker.
    abstract = RE_MULTI_C.sub("C", abstract)

    # A leading or trailing delimiter implies an empty first or last cell.
    if abstract.startswith("D"):
        abstract = "C" + abstract

    if abstract.endswith("D"):
        abstract += "C"

    return abstract


def strip_trailing(abstract: str) -> str:
    """Strip trailing row separators ("R") from the abstraction."""
    # rstrip removes *all* trailing occurrences, matching the original loop.
    return abstract.rstrip("R")
self.assertFalse(is_form_1('"A",,"C"'.split("\n"), dialect)) 32 | self.assertFalse(is_form_1('"A",C'.split("\n"), dialect)) 33 | self.assertFalse(is_form_1('"A"\n"b""A""c","B"'.split("\n"), dialect)) 34 | 35 | def test_form_2(self) -> None: 36 | dialect = SimpleDialect(delimiter=",", quotechar="", escapechar="") 37 | 38 | self.assertTrue(is_form_2("1,2,3".split("\n"), dialect)) 39 | self.assertTrue(is_form_2("1,2,3\na,b,c".split("\n"), dialect)) 40 | self.assertTrue(is_form_2("a@b.com,3".split("\n"), dialect)) 41 | self.assertTrue(is_form_2("a,,3\n1,2,3".split("\n"), dialect)) 42 | 43 | self.assertFalse(is_form_2("1,2,3\n1,2\n4,5,6".split("\n"), dialect)) 44 | self.assertFalse(is_form_2("1".split("\n"), dialect)) 45 | self.assertFalse(is_form_2('1,"a"'.split("\n"), dialect)) 46 | self.assertFalse(is_form_2("a;b,3".split("\n"), dialect)) 47 | self.assertFalse(is_form_2('"a,3,3\n1,2,3'.split("\n"), dialect)) 48 | self.assertFalse(is_form_2('a,"",3\n1,2,3'.split("\n"), dialect)) 49 | 50 | def test_form_3(self) -> None: 51 | A = SimpleDialect(delimiter=",", quotechar="'", escapechar="") 52 | Q = SimpleDialect(delimiter=",", quotechar='"', escapechar="") 53 | 54 | self.assertTrue(is_form_3('A,B\nC,"D"'.split("\n"), Q)) 55 | self.assertTrue(is_form_3('A,B\nC,"d,e"'.split("\n"), Q)) 56 | 57 | self.assertFalse(is_form_3('A,\nC,"d,e"'.split("\n"), Q)) 58 | self.assertFalse(is_form_3("3;4,B\nC,D".split("\n"), Q)) 59 | 60 | self.assertFalse(is_form_3('A,B\n"C",D'.split("\n"), A)) 61 | self.assertTrue(is_form_3('A,B\n"C",D'.split("\n"), Q)) 62 | 63 | def test_form_4(self) -> None: 64 | quoted = SimpleDialect(delimiter="", quotechar='"', escapechar="") 65 | unquoted = SimpleDialect(delimiter="", quotechar="", escapechar="") 66 | 67 | self.assertTrue(is_form_4("A\nB\nC".split("\n"), unquoted)) 68 | self.assertTrue(is_form_4("1\n2\n3".split("\n"), unquoted)) 69 | self.assertTrue(is_form_4("A_B\n1\n2".split("\n"), unquoted)) 70 | self.assertTrue(is_form_4("A&B\n1\n2".split("\n"), 
unquoted)) 71 | self.assertTrue(is_form_4("A&B\n-1\n2".split("\n"), unquoted)) 72 | self.assertTrue(is_form_4('"A"\n"B"\n"C"'.split("\n"), quoted)) 73 | 74 | self.assertFalse(is_form_4('"A", "B"\n"B"\n"C"'.split("\n"), quoted)) 75 | self.assertFalse(is_form_4('"A","B"\n"B"\n"C"'.split("\n"), quoted)) 76 | self.assertFalse(is_form_4('"A@b"\n"B"\n"C"'.split("\n"), quoted)) 77 | self.assertFalse(is_form_4('A\n"-1"\n2'.split("\n"), unquoted)) 78 | self.assertFalse(is_form_4("A B\n-1 3\n2 4".split("\n"), unquoted)) 79 | 80 | def test_form_5(self) -> None: 81 | dialect = SimpleDialect(delimiter=",", quotechar='"', escapechar="") 82 | 83 | self.assertTrue(is_form_5('"A,B"\n"1,2"\n"3,4"'.split("\n"), dialect)) 84 | self.assertTrue(is_form_5('"A,B"\n"1,"\n"2,3"'.split("\n"), dialect)) 85 | 86 | self.assertFalse(is_form_5("A,B\n1,2\n3,4".split("\n"), dialect)) 87 | self.assertFalse(is_form_5("A,B\n1,\n2,3".split("\n"), dialect)) 88 | self.assertFalse( 89 | is_form_5('"A,""B"""\n"1,"\n"2,3"'.split("\n"), dialect) 90 | ) 91 | 92 | 93 | if __name__ == "__main__": 94 | unittest.main() 95 | -------------------------------------------------------------------------------- /stubs/regex/regex.pyi: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from regex._regex_core import VERSION0 4 | 5 | def match( 6 | pattern, 7 | string, 8 | flags: int = ..., 9 | pos: Any | None = ..., 10 | endpos: Any | None = ..., 11 | partial: bool = ..., 12 | concurrent: Any | None = ..., 13 | timeout: Any | None = ..., 14 | ignore_unused: bool = ..., 15 | **kwargs 16 | ): ... 17 | def fullmatch( 18 | pattern, 19 | string, 20 | flags: int = ..., 21 | pos: Any | None = ..., 22 | endpos: Any | None = ..., 23 | partial: bool = ..., 24 | concurrent: Any | None = ..., 25 | timeout: Any | None = ..., 26 | ignore_unused: bool = ..., 27 | **kwargs 28 | ): ... 
29 | def search( 30 | pattern, 31 | string, 32 | flags: int = ..., 33 | pos: Any | None = ..., 34 | endpos: Any | None = ..., 35 | partial: bool = ..., 36 | concurrent: Any | None = ..., 37 | timeout: Any | None = ..., 38 | ignore_unused: bool = ..., 39 | **kwargs 40 | ): ... 41 | def sub( 42 | pattern, 43 | repl, 44 | string, 45 | count: int = ..., 46 | flags: int = ..., 47 | pos: Any | None = ..., 48 | endpos: Any | None = ..., 49 | concurrent: Any | None = ..., 50 | timeout: Any | None = ..., 51 | ignore_unused: bool = ..., 52 | **kwargs 53 | ): ... 54 | def subf( 55 | pattern, 56 | format, 57 | string, 58 | count: int = ..., 59 | flags: int = ..., 60 | pos: Any | None = ..., 61 | endpos: Any | None = ..., 62 | concurrent: Any | None = ..., 63 | timeout: Any | None = ..., 64 | ignore_unused: bool = ..., 65 | **kwargs 66 | ): ... 67 | def subn( 68 | pattern, 69 | repl, 70 | string, 71 | count: int = ..., 72 | flags: int = ..., 73 | pos: Any | None = ..., 74 | endpos: Any | None = ..., 75 | concurrent: Any | None = ..., 76 | timeout: Any | None = ..., 77 | ignore_unused: bool = ..., 78 | **kwargs 79 | ): ... 80 | def subfn( 81 | pattern, 82 | format, 83 | string, 84 | count: int = ..., 85 | flags: int = ..., 86 | pos: Any | None = ..., 87 | endpos: Any | None = ..., 88 | concurrent: Any | None = ..., 89 | timeout: Any | None = ..., 90 | ignore_unused: bool = ..., 91 | **kwargs 92 | ): ... 93 | def split( 94 | pattern, 95 | string, 96 | maxsplit: int = ..., 97 | flags: int = ..., 98 | concurrent: Any | None = ..., 99 | timeout: Any | None = ..., 100 | ignore_unused: bool = ..., 101 | **kwargs 102 | ): ... 103 | def splititer( 104 | pattern, 105 | string, 106 | maxsplit: int = ..., 107 | flags: int = ..., 108 | concurrent: Any | None = ..., 109 | timeout: Any | None = ..., 110 | ignore_unused: bool = ..., 111 | **kwargs 112 | ): ... 
113 | def findall( 114 | pattern, 115 | string, 116 | flags: int = ..., 117 | pos: Any | None = ..., 118 | endpos: Any | None = ..., 119 | overlapped: bool = ..., 120 | concurrent: Any | None = ..., 121 | timeout: Any | None = ..., 122 | ignore_unused: bool = ..., 123 | **kwargs 124 | ): ... 125 | def finditer( 126 | pattern, 127 | string, 128 | flags: int = ..., 129 | pos: Any | None = ..., 130 | endpos: Any | None = ..., 131 | overlapped: bool = ..., 132 | partial: bool = ..., 133 | concurrent: Any | None = ..., 134 | timeout: Any | None = ..., 135 | ignore_unused: bool = ..., 136 | **kwargs 137 | ): ... 138 | def compile( 139 | pattern, flags: int = ..., ignore_unused: bool = ..., **kwargs 140 | ): ... 141 | def purge() -> None: ... 142 | def cache_all(value: bool = ...): ... 143 | def template(pattern, flags: int = ...): ... 144 | def escape(pattern, special_only: bool = ..., literal_spaces: bool = ...): ... 145 | 146 | DEFAULT_VERSION = VERSION0 147 | Pattern: Any 148 | Match: Any 149 | Regex = compile 150 | 151 | # Names in __all__ with no definition: 152 | # A 153 | # ASCII 154 | # B 155 | # BESTMATCH 156 | # D 157 | # DEBUG 158 | # DOTALL 159 | # E 160 | # ENHANCEMATCH 161 | # F 162 | # FULLCASE 163 | # I 164 | # IGNORECASE 165 | # L 166 | # LOCALE 167 | # M 168 | # MULTILINE 169 | # P 170 | # POSIX 171 | # R 172 | # REVERSE 173 | # S 174 | # Scanner 175 | # T 176 | # TEMPLATE 177 | # U 178 | # UNICODE 179 | # V0 180 | # V1 181 | # VERBOSE 182 | # VERSION0 183 | # VERSION1 184 | # W 185 | # WORD 186 | # X 187 | # __doc__ 188 | # __version__ 189 | # error 190 | -------------------------------------------------------------------------------- /man/clevercsv-detect.1: -------------------------------------------------------------------------------- 1 | '\" t 2 | .\" Title: clevercsv-detect 3 | .\" Author: G.J.J. 
van den Burg 4 | .\" Generator: Wilderness 5 | .\" Date: 2025-10-30 6 | .\" Manual: clevercsv Manual 7 | .\" Source: clevercsv 0.8.4 8 | .\" Language: English 9 | .\" 10 | .TH "CLEVERCSV-DETECT" "1" "2025\-10\-30" "Clevercsv 0\&.8\&.4" "Clevercsv Manual" 11 | .\" ----------------------------------------------------------------- 12 | .\" * Define some portability stuff 13 | .\" ----------------------------------------------------------------- 14 | .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 15 | .\" http://bugs.debian.org/507673 16 | .\" http://lists.gnu.org/archive/html/groff/2009-02/msg00013.html 17 | .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 18 | .ie \n(.g .ds Aq \(aq 19 | .el .ds Aq ' 20 | .\" ----------------------------------------------------------------- 21 | .\" * set default formatting * 22 | .\" ----------------------------------------------------------------- 23 | .\" disable hyphenation 24 | .nh 25 | .\" disable justification 26 | .ad l 27 | .\" ----------------------------------------------------------------- 28 | .\" * MAIN CONTENT STARTS HERE * 29 | .\" ----------------------------------------------------------------- 30 | .SH "NAME" 31 | clevercsv-detect \- Detect the dialect of a CSV file 32 | .SH "SYNOPSIS" 33 | .sp 34 | .nf 35 | \fIclevercsv detect [\-c | \-\-consistency] [\-e ENCODING | \-\-encoding=ENCODING] 36 | [\-n NUM_CHARS | \-\-num\-chars=NUM_CHARS] [ \-p | \-\-plain | 37 | \-j | \-\-json ] [\-\-no\-skip] [\-\-add\-runtime] 38 | .fi 39 | .sp 40 | .SH "DESCRIPTION" 41 | .sp 42 | Detect the dialect of a CSV file. 43 | .SH "OPTIONS" 44 | .sp 45 | .sp 46 | .sp 47 | \-h, \-\-help 48 | .RS 4 49 | show this help message and exit 50 | .RE 51 | .PP 52 | \-c, \-\-consistency 53 | .RS 4 54 | By default, the dialect of CSV files is detected using a two\-step process. 
First, a strict set of checks is used to see if the file adheres to a very basic format (for example, when all cells in the file are integers). If none of these checks succeed, the data consistency measure of Van den Burg, et al. (2019) is used to detect the dialect. With this option, you can force the detection to always use the data consistency measure. This can be useful for testing or research purposes, for instance. 55 | .RE 56 | .PP 57 | \-e, \-\-encoding 58 | .RS 4 59 | The file encoding of the given CSV file is automatically detected using chardet. While chardet is incredibly accurate, it is not perfect. In the rare cases that it makes a mistake in detecting the file encoding, you can override the encoding by providing it through this flag. Moreover, when you have a number of CSV files with a known file encoding, you can use this option to speed up the code generation process. 60 | .RE 61 | .PP 62 | \-n, \-\-num\-chars 63 | .RS 4 64 | On large CSV files, dialect detection can sometimes be a bit slow due to the large number of possible dialects to consider. To alleviate this, you can limit the number of characters to use for detection. 65 | .sp 66 | One aspect to keep in mind is that CleverCSV may need to read a specific number of characters to be able to correctly infer the dialect. For example, in the ``imdb.csv`` file in the GitHub repository, the correct dialect can only be found after at least 66 lines of the file are read. Therefore, if there is availability to run CleverCSV on the entire file, that is generally recommended. 67 | .RE 68 | .PP 69 | \-p, \-\-plain 70 | .RS 4 71 | Print the components of the dialect on separate lines 72 | .RE 73 | .PP 74 | \-j, \-\-json 75 | .RS 4 76 | Print the dialect to standard output in the form of a JSON object. This object will always have the 'delimiter', 'quotechar', 'escapechar', and 'strict' keys. If \-\-add\-runtime is specified, it will also have a 'runtime' key. 
77 | .RE 78 | .PP 79 | \-\-no\-skip 80 | .RS 4 81 | The data consistency score used for dialect detection consists of two components: a pattern score and a type score. The type score lies between 0 and 1. When computing the data consistency measures for different dialects, we skip the computation of the type score if we see that the pattern score is lower than the best data consistency score we've seen so far. This option can be used to disable this behaviour and compute the type score for all dialects. This is mainly useful for debugging and testing purposes. 82 | .RE 83 | .PP 84 | \-\-add\-runtime 85 | .RS 4 86 | Add the runtime of the detection to the detection output. 87 | .RE 88 | .PP 89 | 90 | .RS 4 91 | Path to the CSV file 92 | .RE 93 | .PP 94 | .sp 95 | .SH "CLEVERCSV" 96 | .sp 97 | Part of the CleverCSV suite -------------------------------------------------------------------------------- /clevercsv/cparser_util.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Python utility functions that wrap the C parser. 5 | 6 | """ 7 | 8 | import io 9 | 10 | from typing import Any 11 | from typing import Iterable 12 | from typing import Iterator 13 | from typing import List 14 | from typing import Optional 15 | from typing import Tuple 16 | from typing import Union 17 | 18 | from .cparser import Error as ParserError 19 | from .cparser import Parser 20 | from .dialect import SimpleDialect 21 | from .exceptions import Error 22 | 23 | _FIELD_SIZE_LIMIT: int = 128 * 1024 24 | 25 | 26 | def field_size_limit(*args: Any, **kwargs: Any) -> int: 27 | """Get/Set the limit to the field size. 28 | 29 | This function is adapted from the one in the Python CSV module. See the 30 | documentation there. 
31 | """ 32 | global _FIELD_SIZE_LIMIT 33 | old_limit = _FIELD_SIZE_LIMIT 34 | all_args = list(args) + list(kwargs.values()) 35 | if not 0 <= len(all_args) <= 1: 36 | raise TypeError( 37 | "field_size_limit expected at most 1 arguments, got %i" 38 | % len(all_args) 39 | ) 40 | if len(all_args) == 0: 41 | return old_limit 42 | limit = all_args[0] 43 | if not isinstance(limit, int): 44 | raise TypeError("limit must be an integer") 45 | _FIELD_SIZE_LIMIT = int(limit) 46 | return old_limit 47 | 48 | 49 | def _parse_data( 50 | data: Iterable[str], 51 | delimiter: str, 52 | quotechar: str, 53 | escapechar: str, 54 | strict: bool, 55 | return_quoted: bool = False, 56 | ) -> Iterator[Union[List[str], List[Tuple[str, bool]]]]: 57 | parser = Parser( 58 | data, 59 | delimiter=delimiter, 60 | quotechar=quotechar, 61 | escapechar=escapechar, 62 | field_limit=field_size_limit(), 63 | strict=strict, 64 | return_quoted=return_quoted, 65 | ) 66 | try: 67 | for row in parser: 68 | yield row 69 | except ParserError as e: 70 | raise Error(str(e)) 71 | 72 | 73 | def parse_data( 74 | data: Iterable[str], 75 | dialect: Optional[SimpleDialect] = None, 76 | delimiter: Optional[str] = None, 77 | quotechar: Optional[str] = None, 78 | escapechar: Optional[str] = None, 79 | strict: Optional[bool] = None, 80 | return_quoted: bool = False, 81 | ) -> Iterator[Union[List[str], List[Tuple[str, bool]]]]: 82 | """Parse the data given a dialect using the C parser 83 | 84 | Parameters 85 | ---------- 86 | data : iterable 87 | The data of the CSV file as an iterable 88 | 89 | dialect : SimpleDialect 90 | The dialect to use for the parsing. If None, the dialect with each 91 | component set to the empty string is used. 92 | 93 | delimiter : str 94 | The delimiter to use. If not None, overwrites the delimiter in the 95 | dialect. 96 | 97 | quotechar : str 98 | The quote character to use. If not None, overwrites the quote character 99 | in the dialect. 
100 | 101 | escapechar : str 102 | The escape character to use. If not None, overwrites the escape 103 | character in the dialect. 104 | 105 | strict : bool 106 | Enable strict mode or not. If not None, overwrites the strict mode set 107 | in the dialect. 108 | 109 | return_quoted : bool 110 | For each cell, return a tuple "(field, is_quoted)" where the second 111 | element indicates whether the cell was a quoted cell or not. 112 | 113 | Yields 114 | ------ 115 | rows : list 116 | The rows of the file as a list of cells. 117 | 118 | Raises 119 | ------ 120 | Error : clevercsv.exceptions.Error 121 | When an error occurs during parsing. 122 | 123 | """ 124 | if dialect is None: 125 | dialect = SimpleDialect("", "", "") 126 | 127 | delimiter_ = delimiter if delimiter is not None else dialect.delimiter 128 | quotechar_ = quotechar if quotechar is not None else dialect.quotechar 129 | escapechar_ = escapechar if escapechar is not None else dialect.escapechar 130 | strict_ = strict if strict is not None else dialect.strict 131 | 132 | yield from _parse_data( 133 | data, 134 | delimiter_, 135 | quotechar_, 136 | escapechar_, 137 | strict_, 138 | return_quoted=return_quoted, 139 | ) 140 | 141 | 142 | def parse_string( 143 | data: str, 144 | dialect: SimpleDialect, 145 | return_quoted: bool = False, 146 | ) -> Iterator[Union[List[str], List[Tuple[str, bool]]]]: 147 | """Utility for when the CSV file is encoded as a single string""" 148 | return parse_data( 149 | iter(io.StringIO(data, newline="")), 150 | dialect=dialect, 151 | return_quoted=return_quoted, 152 | ) 153 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import glob 5 | import io 6 | import os 7 | 8 | from setuptools import Command 9 | from setuptools import Extension 10 | from setuptools import find_packages 11 | from 
setuptools import setup 12 | 13 | # Package meta-data. 14 | AUTHOR = "Gertjan van den Burg" 15 | DESCRIPTION = "A Python package for handling messy CSV files" 16 | EMAIL = "gertjanvandenburg@gmail.com" 17 | LICENSE = "MIT" 18 | LICENSE_TROVE = "License :: OSI Approved :: MIT License" 19 | NAME = "clevercsv" 20 | REQUIRES_PYTHON = ">=3.9.0" 21 | URL = "https://github.com/alan-turing-institute/CleverCSV" 22 | VERSION = None 23 | 24 | # What packages are required for this module to be executed? 25 | REQUIRED = [ 26 | "chardet>=3.0", 27 | "regex>=2018.11", 28 | "packaging>=23.0", 29 | ] 30 | 31 | # Dependencies only needed for pre-commit 32 | precommit_require = [ 33 | "wilderness>=0.1.5", 34 | ] 35 | 36 | # When these are changed, update clevercsv/_optional.py accordingly 37 | full_require = [ 38 | *precommit_require, 39 | "faust-cchardet>=2.1.18", 40 | "pandas>=1.0.0", 41 | "tabview>=1.4", 42 | ] 43 | 44 | docs_require = ["sphinx", "m2r2", "furo"] 45 | test_require = full_require 46 | dev_require = [ 47 | "green", 48 | # "pythonfuzz", 49 | "pytest>=2.6", 50 | "termcolor", 51 | "mypy", 52 | ] 53 | 54 | # What packages are optional? 55 | EXTRAS = { 56 | "full": full_require, 57 | "docs": docs_require, 58 | "tests": test_require, 59 | "dev": docs_require + test_require + dev_require, 60 | "precommit": precommit_require, 61 | } 62 | 63 | 64 | class build_manpages(Command): 65 | description = "Generate manpages" 66 | user_options = [] 67 | 68 | def initialize_options(self): 69 | pass 70 | 71 | def finalize_options(self): 72 | pass 73 | 74 | def run(self): 75 | from wilderness import build_manpages 76 | 77 | from clevercsv.console import build_application 78 | 79 | build_manpages(build_application()) 80 | 81 | 82 | # The rest you shouldn't have to touch too much :) 83 | # ------------------------------------------------ 84 | # Except, perhaps the License and Trove Classifiers! 85 | # If you do change the License, remember to change the Trove Classifier for that! 
86 | 87 | here = os.path.abspath(os.path.dirname(__file__)) 88 | 89 | # Import the README and use it as the long-description. 90 | # Note: this will only work if 'README.md' is present in your MANIFEST.in file! 91 | try: 92 | with io.open(os.path.join(here, "README.md"), encoding="utf-8") as f: 93 | long_description = "\n" + f.read() 94 | except FileNotFoundError: 95 | long_description = DESCRIPTION 96 | 97 | # Load the package's __version__.py module as a dictionary. 98 | about = {} 99 | if not VERSION: 100 | project_slug = NAME.lower().replace("-", "_").replace(" ", "_") 101 | with open(os.path.join(here, project_slug, "__version__.py")) as f: 102 | exec(f.read(), about) 103 | else: 104 | about["__version__"] = VERSION 105 | 106 | # Where the magic happens: 107 | setup( 108 | name=NAME, 109 | version=about["__version__"], 110 | description=DESCRIPTION, 111 | long_description=long_description, 112 | long_description_content_type="text/markdown", 113 | author=AUTHOR, 114 | author_email=EMAIL, 115 | python_requires=REQUIRES_PYTHON, 116 | url=URL, 117 | packages=find_packages( 118 | exclude=["tests", "*.tests", "*.tests.*", "tests.*"] 119 | ), 120 | install_requires=REQUIRED, 121 | extras_require=EXTRAS, 122 | include_package_data=True, 123 | package_data={"clevercsv": ["py.typed"]}, 124 | license=LICENSE, 125 | ext_modules=[ 126 | Extension("clevercsv.cparser", sources=["src/cparser.c"]), 127 | Extension("clevercsv.cabstraction", sources=["src/abstraction.c"]), 128 | ], 129 | entry_points={"console_scripts": ["clevercsv = clevercsv.__main__:main"]}, 130 | data_files=[("man/man1", glob.glob("man/*.1"))], 131 | cmdclass={"build_manpages": build_manpages}, 132 | classifiers=[ 133 | # Trove classifiers 134 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers 135 | LICENSE_TROVE, 136 | "Programming Language :: Python", 137 | "Programming Language :: Python :: 3", 138 | "Programming Language :: Python :: 3.9", 139 | "Programming Language :: Python :: 
3.10", 140 | "Programming Language :: Python :: 3.11", 141 | "Programming Language :: Python :: 3.12", 142 | "Programming Language :: Python :: 3.13", 143 | "Programming Language :: Python :: 3.14", 144 | "Programming Language :: Python :: Implementation :: CPython", 145 | "Programming Language :: Python :: Implementation :: PyPy", 146 | ], 147 | ) 148 | -------------------------------------------------------------------------------- /.github/workflows/deploy.yml: -------------------------------------------------------------------------------- 1 | name: Deploy to PyPI 2 | 3 | on: 4 | push: 5 | tags: 6 | - v* 7 | 8 | jobs: 9 | build_wheels: 10 | name: Build wheels on ${{ matrix.os }} 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | matrix: 14 | os: [ 'ubuntu-latest', 'windows-latest', 'macos-latest' ] 15 | 16 | steps: 17 | - name: Checkout repo 18 | uses: actions/checkout@v4 19 | 20 | - uses: actions/setup-python@v5 21 | name: Install Python 22 | with: 23 | python-version: '3.10' 24 | 25 | - name: Set up QEMU 26 | if: runner.os == 'Linux' 27 | uses: docker/setup-qemu-action@v3 28 | with: 29 | platforms: all 30 | 31 | - name: Build wheels 32 | uses: pypa/cibuildwheel@v3.2.1 33 | env: 34 | CIBW_TEST_COMMAND: "python -VV && python -m unittest discover -f -s {project}/tests/test_unit/" 35 | CIBW_TEST_EXTRAS: "full" 36 | CIBW_SKIP: "pp* cp27-* cp33-* cp34-* cp35-* cp36-* cp37-* cp38-* *-win32 *-musllinux_* *-manylinux_i686" 37 | CIBW_ARCHS_MACOS: x86_64 arm64 universal2 38 | CIBW_ARCHS_LINUX: auto aarch64 39 | 40 | - uses: actions/upload-artifact@v4 41 | with: 42 | path: ./wheelhouse/*.whl 43 | name: dist-${{ matrix.os }}-${{ matrix.python-version }} 44 | 45 | build_sdist: 46 | name: Build source distribution 47 | runs-on: ubuntu-latest 48 | steps: 49 | - uses: actions/checkout@v4 50 | name: Checkout repo 51 | 52 | - uses: actions/setup-python@v5 53 | name: Install Python 54 | with: 55 | python-version: '3.10' 56 | 57 | - name: Update setuptools 58 | run: pip install 
-U setuptools 59 | 60 | - name: Build sdist 61 | run: python setup.py sdist 62 | 63 | - uses: actions/upload-artifact@v4 64 | with: 65 | path: dist/*.tar.gz 66 | name: dist-source 67 | 68 | upload_testpypi: 69 | needs: [build_wheels, build_sdist] 70 | runs-on: ubuntu-latest 71 | # upload to PyPI on every tag starting with 'v' 72 | if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v') && contains(github.event.ref, '-rc.') 73 | 74 | environment: 75 | name: testpypi 76 | url: https://test.pypi.org/p/clevercsv 77 | 78 | permissions: 79 | id-token: write # IMPORTANT: mandatory for trusted publishing 80 | 81 | steps: 82 | - name: Download all the dists 83 | uses: actions/download-artifact@v4 84 | with: 85 | pattern: dist-* 86 | merge-multiple: true 87 | path: dist/ 88 | 89 | - name: Publish distribution to TestPyPI 90 | uses: pypa/gh-action-pypi-publish@release/v1 91 | with: 92 | repository-url: https://test.pypi.org/legacy/ 93 | verbose: true 94 | 95 | upload_pypi: 96 | needs: [build_wheels, build_sdist] 97 | runs-on: ubuntu-latest 98 | # upload to PyPI on tags starting with 'v' that don't contain '-rc.' 
99 | if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v') && !contains(github.event.ref, '-rc.') 100 | 101 | environment: 102 | name: pypi 103 | url: https://pypi.org/p/clevercsv 104 | 105 | permissions: 106 | id-token: write # IMPORTANT: mandatory for trusted publishing 107 | 108 | steps: 109 | - name: Download all the dists 110 | uses: actions/download-artifact@v4 111 | with: 112 | pattern: dist-* 113 | merge-multiple: true 114 | path: dist/ 115 | 116 | - name: Publish distribution to PyPI 117 | uses: pypa/gh-action-pypi-publish@release/v1 118 | with: 119 | verbose: true 120 | 121 | github-release: 122 | name: >- 123 | Sign the Python 🐍 distribution 📦 with Sigstore 124 | and upload the files to GitHub Release 125 | needs: 126 | - upload_pypi 127 | runs-on: ubuntu-latest 128 | 129 | permissions: 130 | contents: write # IMPORTANT: mandatory for making GitHub Releases 131 | id-token: write # IMPORTANT: mandatory for sigstore 132 | 133 | steps: 134 | - name: Checkout repo 135 | uses: actions/checkout@v4 136 | 137 | - name: Download all the dists 138 | uses: actions/download-artifact@v4 139 | with: 140 | pattern: dist-* 141 | merge-multiple: true 142 | path: dist/ 143 | 144 | - name: Sign the dists with Sigstore 145 | uses: sigstore/gh-action-sigstore-python@v3.0.1 146 | with: 147 | inputs: >- 148 | ./dist/*.tar.gz 149 | ./dist/*.whl 150 | 151 | - name: Create GitHub Release 152 | env: 153 | GITHUB_TOKEN: ${{ github.token }} 154 | run: >- 155 | gh release create 156 | '${{ github.ref_name }}' 157 | --notes "" 158 | 159 | - name: Upload artifact signatures to GitHub Release 160 | env: 161 | GITHUB_TOKEN: ${{ github.token }} 162 | # Upload to GitHub Release using the `gh` CLI. 163 | # `dist/` contains the built packages, and the 164 | # sigstore-produced signatures and certificates. 
165 | run: >- 166 | gh release upload 167 | '${{ github.ref_name }}' dist/** 168 | --repo '${{ github.repository }}' 169 | -------------------------------------------------------------------------------- /clevercsv/dict_read_write.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | DictReader and DictWriter. 5 | 6 | This code is entirely copied from the Python csv module. The only exception is 7 | that it uses the `reader` and `writer` classes from our package. 8 | 9 | Author: Gertjan van den Burg 10 | 11 | """ 12 | 13 | from __future__ import annotations 14 | 15 | import warnings 16 | 17 | from collections import OrderedDict 18 | from collections.abc import Collection 19 | 20 | from typing import TYPE_CHECKING 21 | from typing import Any 22 | from typing import Generic 23 | from typing import Iterable 24 | from typing import Iterator 25 | from typing import Literal 26 | from typing import Mapping 27 | from typing import Optional 28 | from typing import Sequence 29 | from typing import TypeVar 30 | from typing import Union 31 | from typing import cast 32 | 33 | from clevercsv.read import reader 34 | from clevercsv.write import writer 35 | 36 | if TYPE_CHECKING: 37 | from clevercsv._types import SupportsWrite 38 | from clevercsv._types import _DialectLike 39 | from clevercsv._types import _DictReadMapping 40 | 41 | _T = TypeVar("_T") 42 | 43 | 44 | class DictReader( 45 | Generic[_T], Iterator["_DictReadMapping[Union[_T, Any], Union[str, Any]]"] 46 | ): 47 | def __init__( 48 | self, 49 | f: Iterable[str], 50 | fieldnames: Optional[Sequence[_T]] = None, 51 | restkey: Optional[str] = None, 52 | restval: Optional[str] = None, 53 | dialect: "_DialectLike" = "excel", 54 | *args: Any, 55 | **kwds: Any, 56 | ) -> None: 57 | self._fieldnames = fieldnames 58 | self.restkey = restkey 59 | self.restval = restval 60 | self.reader: reader = reader(f, dialect, *args, **kwds) 61 | self.dialect = dialect 62 
| self.line_num = 0 63 | 64 | def __iter__(self) -> "DictReader[_T]": 65 | return self 66 | 67 | @property 68 | def fieldnames(self) -> Sequence[_T]: 69 | if self._fieldnames is None: 70 | try: 71 | fieldnames = next(self.reader) 72 | self._fieldnames = [cast(_T, f) for f in fieldnames] 73 | except StopIteration: 74 | pass 75 | 76 | assert self._fieldnames is not None 77 | 78 | # Note: this was added because I don't think it's expected that Python 79 | # simply drops information if there are duplicate headers. There is 80 | # discussion on this issue in the Python bug tracker here: 81 | # https://bugs.python.org/issue17537 (see linked thread therein). A 82 | # warning is easy enough to suppress and should ensure that the user 83 | # is at least aware of this behavior. 84 | if not len(self._fieldnames) == len(set(self._fieldnames)): 85 | warnings.warn( 86 | "fieldnames are not unique, some columns will be dropped." 87 | ) 88 | 89 | self.line_num = self.reader.line_num 90 | return self._fieldnames 91 | 92 | @fieldnames.setter 93 | def fieldnames(self, value: Sequence[_T]) -> None: 94 | self._fieldnames = value 95 | 96 | def __next__(self) -> "_DictReadMapping[Union[_T, Any], Union[str, Any]]": 97 | if self.line_num == 0: 98 | self.fieldnames 99 | row = next(self.reader) 100 | self.line_num = self.reader.line_num 101 | 102 | while row == []: 103 | row = next(self.reader) 104 | 105 | d: _DictReadMapping = OrderedDict(zip(self.fieldnames, row)) 106 | lf = len(self.fieldnames) 107 | lr = len(row) 108 | if lf < lr: 109 | d[self.restkey] = row[lf:] 110 | elif lf > lr: 111 | for key in self.fieldnames[lr:]: 112 | d[key] = self.restval 113 | return d 114 | 115 | 116 | class DictWriter(Generic[_T]): 117 | def __init__( 118 | self, 119 | f: SupportsWrite[str], 120 | fieldnames: Collection[_T], 121 | restval: Optional[Any] = "", 122 | extrasaction: Literal["raise", "ignore"] = "raise", 123 | dialect: "_DialectLike" = "excel", 124 | *args: Any, 125 | **kwds: Any, 126 | ): 127 | 
self.fieldnames = fieldnames 128 | self.restval = restval 129 | if extrasaction.lower() not in ("raise", "ignore"): 130 | raise ValueError( 131 | "extrasaction (%s) must be 'raise' or 'ignore'" % extrasaction 132 | ) 133 | self.extrasaction = extrasaction 134 | self.writer = writer(f, dialect, *args, **kwds) 135 | 136 | def writeheader(self) -> Any: 137 | header = dict(zip(self.fieldnames, self.fieldnames)) 138 | return self.writerow(header) 139 | 140 | def _dict_to_list(self, rowdict: Mapping[_T, Any]) -> Iterator[Any]: 141 | if self.extrasaction == "raise": 142 | wrong_fields = rowdict.keys() - self.fieldnames 143 | if wrong_fields: 144 | raise ValueError( 145 | "dict contains fields not in fieldnames: " 146 | + ", ".join([repr(x) for x in wrong_fields]) 147 | ) 148 | return (rowdict.get(key, self.restval) for key in self.fieldnames) 149 | 150 | def writerow(self, rowdict: Mapping[_T, Any]) -> Any: 151 | return self.writer.writerow(self._dict_to_list(rowdict)) 152 | 153 | def writerows(self, rowdicts: Iterable[Mapping[_T, Any]]) -> None: 154 | return self.writer.writerows(map(self._dict_to_list, rowdicts)) 155 | -------------------------------------------------------------------------------- /stubs/tabview/tabview.pyi: -------------------------------------------------------------------------------- 1 | import io 2 | 3 | from typing import Any 4 | 5 | basestring = str 6 | file = io.FileIO 7 | 8 | def KEY_CTRL(key): ... 9 | def addstr(*args): ... 10 | def insstr(*args): ... 11 | 12 | class ReloadException(Exception): 13 | start_pos: Any 14 | column_width_mode: Any 15 | column_gap: Any 16 | column_widths: Any 17 | search_str: Any 18 | def __init__( 19 | self, start_pos, column_width, column_gap, column_widths, search_str 20 | ) -> None: ... 21 | 22 | class QuitException(Exception): ... 
23 | 24 | class Viewer: 25 | scr: Any 26 | data: Any 27 | info: Any 28 | header_offset_orig: int 29 | header: Any 30 | header_offset: Any 31 | num_data_columns: Any 32 | column_width_mode: Any 33 | column_gap: Any 34 | trunc_char: Any 35 | num_columns: int 36 | vis_columns: int 37 | init_search: Any 38 | modifier: Any 39 | def __init__(self, *args, **kwargs) -> None: ... 40 | def column_xw(self, x): ... 41 | def quit(self) -> None: ... 42 | def reload(self) -> None: ... 43 | def consume_modifier(self, default: int = ...): ... 44 | def down(self) -> None: ... 45 | def up(self) -> None: ... 46 | def left(self) -> None: ... 47 | def right(self) -> None: ... 48 | y: Any 49 | win_y: Any 50 | def page_down(self) -> None: ... 51 | def page_up(self) -> None: ... 52 | x: Any 53 | win_x: Any 54 | def page_right(self) -> None: ... 55 | def page_left(self) -> None: ... 56 | def mark(self) -> None: ... 57 | def goto_mark(self) -> None: ... 58 | def home(self) -> None: ... 59 | def goto_y(self, y) -> None: ... 60 | def goto_row(self) -> None: ... 61 | def goto_x(self, x) -> None: ... 62 | def goto_col(self) -> None: ... 63 | def goto_yx(self, y, x) -> None: ... 64 | def line_home(self) -> None: ... 65 | def line_end(self) -> None: ... 66 | def show_cell(self) -> None: ... 67 | def show_info(self): ... 68 | textpad: Any 69 | search_str: Any 70 | def search(self) -> None: ... 71 | def search_results( 72 | self, rev: bool = ..., look_in_cur: bool = ... 73 | ) -> None: ... 74 | def search_results_prev( 75 | self, rev: bool = ..., look_in_cur: bool = ... 76 | ) -> None: ... 77 | def help(self) -> None: ... 78 | def toggle_header(self) -> None: ... 79 | def column_gap_down(self) -> None: ... 80 | def column_gap_up(self) -> None: ... 81 | column_width: Any 82 | def column_width_all_down(self) -> None: ... 83 | def column_width_all_up(self) -> None: ... 84 | def column_width_down(self) -> None: ... 85 | def column_width_up(self) -> None: ... 86 | def sort_by_column_numeric(self): ... 
87 | def sort_by_column_numeric_reverse(self): ... 88 | def sort_by_column(self) -> None: ... 89 | def sort_by_column_reverse(self) -> None: ... 90 | def sort_by_column_natural(self) -> None: ... 91 | def sort_by_column_natural_reverse(self) -> None: ... 92 | def sorted_nicely(self, ls, key, rev: bool = ...): ... 93 | def float_string_key(self, value): ... 94 | def toggle_column_width(self) -> None: ... 95 | def set_current_column_width(self) -> None: ... 96 | def yank_cell(self) -> None: ... 97 | keys: Any 98 | def define_keys(self) -> None: ... 99 | def run(self) -> None: ... 100 | def handle_keys(self) -> None: ... 101 | def handle_modifier(self, mod) -> None: ... 102 | def resize(self) -> None: ... 103 | def num_columns_fwd(self, x): ... 104 | def num_columns_rev(self, x): ... 105 | def recalculate_layout(self) -> None: ... 106 | def location_string(self, yp, xp): ... 107 | def display(self) -> None: ... 108 | def strpad(self, s, width): ... 109 | def hdrstr(self, x, width): ... 110 | def cellstr(self, y, x, width): ... 111 | def skip_to_row_change(self) -> None: ... 112 | def skip_to_row_change_reverse(self) -> None: ... 113 | def skip_to_col_change(self) -> None: ... 114 | def skip_to_col_change_reverse(self) -> None: ... 115 | 116 | class TextBox: 117 | scr: Any 118 | data: Any 119 | title: Any 120 | tdata: Any 121 | hid_rows: int 122 | def __init__(self, scr, data: str = ..., title: str = ...) -> None: ... 123 | def __call__(self) -> None: ... 124 | handlers: Any 125 | def setup_handlers(self) -> None: ... 126 | def run(self) -> None: ... 127 | def handle_key(self, key) -> None: ... 128 | def close(self) -> None: ... 129 | def scroll_down(self) -> None: ... 130 | def scroll_up(self) -> None: ... 131 | def display(self) -> None: ... 132 | 133 | def csv_sniff(data, enc): ... 134 | def fix_newlines(data): ... 135 | def adjust_space_delim(data, enc): ... 
136 | def process_data( 137 | data, 138 | enc: Any | None = ..., 139 | delim: Any | None = ..., 140 | quoting: Any | None = ..., 141 | quote_char=..., 142 | ): ... 143 | def data_list_or_file(data): ... 144 | def pad_data(d): ... 145 | def readme(): ... 146 | def detect_encoding(data: Any | None = ...): ... 147 | def main(stdscr, *args, **kwargs) -> None: ... 148 | def view( 149 | data, 150 | enc: Any | None = ..., 151 | start_pos=..., 152 | column_width: int = ..., 153 | column_gap: int = ..., 154 | trunc_char: str = ..., 155 | column_widths: Any | None = ..., 156 | search_str: Any | None = ..., 157 | double_width: bool = ..., 158 | delimiter: Any | None = ..., 159 | quoting: Any | None = ..., 160 | info: Any | None = ..., 161 | quote_char=..., 162 | ): ... 163 | def parse_path(path): ... 164 | -------------------------------------------------------------------------------- /clevercsv/console/commands/detect.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import json 4 | import sys 5 | import time 6 | 7 | from typing import Any 8 | from typing import Dict 9 | 10 | from wilderness import Command 11 | 12 | from clevercsv.wrappers import detect_dialect 13 | 14 | from ._docs import FLAG_DESCRIPTIONS 15 | from ._utils import parse_int 16 | 17 | 18 | class DetectCommand(Command): 19 | _description = "Detect the dialect of a CSV file." 
20 | 21 | def __init__(self) -> None: 22 | super().__init__( 23 | name="detect", 24 | title="Detect the dialect of a CSV file", 25 | description=self._description, 26 | extra_sections={"CleverCSV": "Part of the CleverCSV suite"}, 27 | ) 28 | 29 | def register(self) -> None: 30 | self.add_argument("path", help="Path to the CSV file") 31 | self.add_argument( 32 | "-c", 33 | "--consistency", 34 | action="store_true", 35 | help="Only use the consistency measure for detection.", 36 | description=( 37 | "By default, the dialect of CSV files is detected using " 38 | "atwo-step process. First, a strict set of checks is used to " 39 | "see if the file adheres to a very basic format (for example, " 40 | "when all cells in the file are integers). If none of these " 41 | "checks succeed, the data consistency measure of Van den " 42 | "Burg, et al. (2019) is used to detect the dialect. With this " 43 | "option, you can force the detection to always use the data " 44 | "consistency measure. This can be useful for testing or " 45 | "research purposes, for instance." 46 | ), 47 | ) 48 | self.add_argument( 49 | "-e", 50 | "--encoding", 51 | help="Set the encoding of the file", 52 | description=FLAG_DESCRIPTIONS["encoding"], 53 | ) 54 | self.add_argument( 55 | "-n", 56 | "--num-chars", 57 | help="Number of characters to use for detection", 58 | type=int, 59 | description=FLAG_DESCRIPTIONS["num-chars"], 60 | ) 61 | group = self.add_mutually_exclusive_group() 62 | group.add_argument( 63 | "-p", 64 | "--plain", 65 | action="store_true", 66 | help="Print the components of the dialect on separate lines", 67 | ) 68 | group.add_argument( 69 | "-j", 70 | "--json", 71 | action="store_true", 72 | help="Print the components of the dialect as a JSON object", 73 | description=( 74 | "Print the dialect to standard output in the form of a JSON " 75 | "object. This object will always have the 'delimiter', " 76 | "'quotechar', 'escapechar', and 'strict' keys. 
If " 77 | "--add-runtime is specified, it will also have a 'runtime' " 78 | "key." 79 | ), 80 | ) 81 | self.add_argument( 82 | "--no-skip", 83 | action="store_true", 84 | help="Don't skip type detection for dialects with a low pattern score", 85 | description=( 86 | "The data consistency score used for dialect detection " 87 | "consists of two components: a pattern score and a type " 88 | "score. The type score lies between 0 and 1. When computing " 89 | "the data consistency measures for different dialects, we " 90 | "skip the computation of the type score if we see that the " 91 | "pattern score is lower than the best data consistency score " 92 | "we've seen so far. This option can be used to disable this " 93 | "behaviour and compute the type score for all dialects. This " 94 | "is mainly useful for debugging and testing purposes." 95 | ), 96 | ) 97 | self.add_argument( 98 | "--add-runtime", 99 | action="store_true", 100 | help="Add the runtime of the detection to the detection output.", 101 | ) 102 | 103 | def handle(self) -> int: 104 | verbose = self.args.verbose 105 | num_chars = parse_int(self.args.num_chars, "num-chars") 106 | method = "consistency" if self.args.consistency else "auto" 107 | skip = not self.args.no_skip 108 | 109 | t_start = time.time() 110 | dialect = detect_dialect( 111 | self.args.path, 112 | num_chars=num_chars, 113 | encoding=self.args.encoding, 114 | verbose=verbose, 115 | method=method, 116 | skip=skip, 117 | ) 118 | runtime = time.time() - t_start 119 | 120 | if dialect is None: 121 | print("Error: Dialect detection failed.", file=sys.stderr) 122 | return 1 123 | 124 | if self.args.plain: 125 | print(f"delimiter = {dialect.delimiter}".strip()) 126 | print(f"quotechar = {dialect.quotechar}".strip()) 127 | print(f"escapechar = {dialect.escapechar}".strip()) 128 | if self.args.add_runtime: 129 | print(f"runtime = {runtime}") 130 | elif self.args.json: 131 | dialect_dict: Dict[str, Any] = dialect.to_dict() 132 | if 
self.args.add_runtime: 133 | dialect_dict["runtime"] = runtime 134 | print(json.dumps(dialect_dict)) 135 | else: 136 | print("Detected: " + str(dialect)) 137 | if self.args.add_runtime: 138 | print(f"Runtime: {runtime:.6f} seconds") 139 | return 0 140 | -------------------------------------------------------------------------------- /clevercsv/dialect.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Definitions for the dialect object. 6 | 7 | Author: Gertjan van den Burg 8 | 9 | """ 10 | 11 | import csv 12 | import functools 13 | import json 14 | 15 | from typing import Any 16 | from typing import Dict 17 | from typing import Optional 18 | from typing import Tuple 19 | from typing import Union 20 | 21 | import _csv 22 | 23 | excel = csv.excel 24 | excel_tab = csv.excel_tab 25 | unix_dialect = csv.unix_dialect 26 | 27 | 28 | @functools.total_ordering 29 | class SimpleDialect: 30 | """ 31 | The simplified dialect object. 32 | 33 | For the delimiter, quotechar, and escapechar the empty string means no 34 | delimiter/quotechar/escapechar in the file. None is used to mark it 35 | undefined. 36 | 37 | Parameters 38 | ---------- 39 | delimiter : str 40 | The delimiter of the CSV file. 41 | 42 | quotechar : str 43 | The quotechar of the file. 44 | 45 | escapechar : str 46 | The escapechar of the file. 47 | 48 | strict : bool 49 | Whether strict parsing should be enforced. Same as in the csv module. 

    """

    def __init__(
        self,
        delimiter: Optional[str],
        quotechar: Optional[str],
        escapechar: Optional[str],
        strict: bool = False,
    ):
        # None marks a component as undefined; the empty string means the
        # file has no such component (see the class docstring).
        self.delimiter = delimiter
        self.quotechar = quotechar
        self.escapechar = escapechar
        self.strict = strict

    def validate(self) -> None:
        """Raise ValueError unless every component is a defined str of
        length zero or one and strict is a bool."""
        if self.delimiter is None or len(self.delimiter) > 1:
            raise ValueError(
                "Delimiter should be zero or one characters, got: %r"
                % self.delimiter
            )
        if self.quotechar is None or len(self.quotechar) > 1:
            raise ValueError(
                "Quotechar should be zero or one characters, got: %r"
                % self.quotechar
            )
        if self.escapechar is None or len(self.escapechar) > 1:
            raise ValueError(
                "Escapechar should be zero or one characters, got: %r"
                % self.escapechar
            )
        # NOTE(review): since bool is an int subclass, 0/1 also pass this
        # membership test.
        if self.strict not in set([False, True]):
            raise ValueError(
                "Strict should be True or False, got: %r" % self.strict
            )

    @classmethod
    def from_dict(
        cls: type["SimpleDialect"], d: Dict[str, Any]
    ) -> "SimpleDialect":
        """Build a SimpleDialect from a dict with the keys 'delimiter',
        'quotechar', 'escapechar', and 'strict' (inverse of to_dict)."""
        dialect = cls(
            d["delimiter"], d["quotechar"], d["escapechar"], strict=d["strict"]
        )
        return dialect

    @classmethod
    def from_csv_dialect(
        cls: type["SimpleDialect"],
        d: Union[_csv.Dialect, csv.Dialect],
    ) -> "SimpleDialect":
        """Convert a stdlib csv dialect: None components become the empty
        string, and QUOTE_NONE maps to an empty quotechar."""
        delimiter = "" if d.delimiter is None else d.delimiter
        quotechar = "" if d.quoting == csv.QUOTE_NONE else d.quotechar
        escapechar = "" if d.escapechar is None else d.escapechar
        return cls(delimiter, quotechar, escapechar, strict=d.strict)

    def to_csv_dialect(self) -> csv.Dialect:
        """Convert to a stdlib csv.Dialect instance (inverse mapping of
        from_csv_dialect; empty quotechar becomes QUOTE_NONE)."""
        # The class body below evaluates at definition time, closing over
        # self; the assert rules out an undefined delimiter.
        class dialect(csv.Dialect):
            assert self.delimiter is not None
            delimiter = self.delimiter
            quotechar = '"' if self.quotechar == "" else self.quotechar
            escapechar = None if self.escapechar == "" else self.escapechar
            doublequote = True
            quoting = (
                csv.QUOTE_NONE if self.quotechar == "" else csv.QUOTE_MINIMAL
            )
            skipinitialspace = False
            # TODO: We need to set this because it can't be None anymore in
            # recent versions of Python
            lineterminator = "\n"

        return dialect()

    def to_dict(self) -> Dict[str, Union[str, bool, None]]:
        """Return the four components as a dict; validates first, so all
        components must be defined."""
        self.validate()
        d = dict(
            delimiter=self.delimiter,
            quotechar=self.quotechar,
            escapechar=self.escapechar,
            strict=self.strict,
        )
        return d

    def serialize(self) -> str:
        """Serialize dialect to a JSON object"""
        return json.dumps(self.to_dict())

    @classmethod
    def deserialize(cls: type["SimpleDialect"], obj: str) -> "SimpleDialect":
        """Deserialize dialect from a JSON object"""
        return cls.from_dict(json.loads(obj))

    def __repr__(self) -> str:
        # strict is deliberately omitted from the repr.
        return "SimpleDialect(%r, %r, %r)" % (
            self.delimiter,
            self.quotechar,
            self.escapechar,
        )

    def __key(
        self,
    ) -> Tuple[Optional[str], Optional[str], Optional[str], bool]:
        # Single source of truth for equality, hashing, and ordering.
        return (self.delimiter, self.quotechar, self.escapechar, self.strict)

    def __hash__(self) -> int:
        return hash(self.__key())

    def __eq__(self, other: Any) -> bool:
        if not isinstance(other, SimpleDialect):
            return False
        return self.__key() == other.__key()

    def __lt__(self, other: Any) -> bool:
        # This provides a partial order on dialect objects with the goal of
        # speeding up the consistency measure.
        # Dialects with delimiter "," sort first, then ";", then everything
        # else by tuple comparison of __key. Non-SimpleDialect operands
        # yield False rather than NotImplemented.
        if not isinstance(other, SimpleDialect):
            return False
        if self.delimiter == "," and not other.delimiter == ",":
            return True
        elif other.delimiter == "," and not self.delimiter == ",":
            return False
        if self.delimiter == ";" and not other.delimiter == ";":
            return True
        elif other.delimiter == ";" and not self.delimiter == ";":
            return False
        return self.__key() < other.__key()
--------------------------------------------------------------------------------
/stubs/pandas/__init__.pyi:
--------------------------------------------------------------------------------
from typing import Any

from pandas._config import describe_option as describe_option
from pandas._config import get_option as get_option
from pandas._config import option_context as option_context
from pandas._config import options as options
from pandas._config import reset_option as reset_option
from pandas._config import set_option as set_option
from pandas.core.api import NA as NA
from pandas.core.api import BooleanDtype as BooleanDtype
from pandas.core.api import Categorical as Categorical
from pandas.core.api import CategoricalDtype as CategoricalDtype
from pandas.core.api import CategoricalIndex as CategoricalIndex
from pandas.core.api import DataFrame as DataFrame
from pandas.core.api import DateOffset as DateOffset
from pandas.core.api import DatetimeIndex as DatetimeIndex
from pandas.core.api import DatetimeTZDtype as DatetimeTZDtype
from pandas.core.api import Flags as Flags
from pandas.core.api import Float32Dtype as Float32Dtype
from pandas.core.api import Float64Dtype as Float64Dtype
from pandas.core.api import Float64Index as Float64Index
from pandas.core.api import Grouper as Grouper
from pandas.core.api import Index as Index
from pandas.core.api import IndexSlice as IndexSlice
from pandas.core.api import
Int8Dtype as Int8Dtype 26 | from pandas.core.api import Int16Dtype as Int16Dtype 27 | from pandas.core.api import Int32Dtype as Int32Dtype 28 | from pandas.core.api import Int64Dtype as Int64Dtype 29 | from pandas.core.api import Int64Index as Int64Index 30 | from pandas.core.api import Interval as Interval 31 | from pandas.core.api import IntervalDtype as IntervalDtype 32 | from pandas.core.api import IntervalIndex as IntervalIndex 33 | from pandas.core.api import MultiIndex as MultiIndex 34 | from pandas.core.api import NamedAgg as NamedAgg 35 | from pandas.core.api import NaT as NaT 36 | from pandas.core.api import Period as Period 37 | from pandas.core.api import PeriodDtype as PeriodDtype 38 | from pandas.core.api import PeriodIndex as PeriodIndex 39 | from pandas.core.api import RangeIndex as RangeIndex 40 | from pandas.core.api import Series as Series 41 | from pandas.core.api import StringDtype as StringDtype 42 | from pandas.core.api import Timedelta as Timedelta 43 | from pandas.core.api import TimedeltaIndex as TimedeltaIndex 44 | from pandas.core.api import Timestamp as Timestamp 45 | from pandas.core.api import UInt8Dtype as UInt8Dtype 46 | from pandas.core.api import UInt16Dtype as UInt16Dtype 47 | from pandas.core.api import UInt32Dtype as UInt32Dtype 48 | from pandas.core.api import UInt64Dtype as UInt64Dtype 49 | from pandas.core.api import UInt64Index as UInt64Index 50 | from pandas.core.api import array as array 51 | from pandas.core.api import bdate_range as bdate_range 52 | from pandas.core.api import date_range as date_range 53 | from pandas.core.api import factorize as factorize 54 | from pandas.core.api import interval_range as interval_range 55 | from pandas.core.api import isna as isna 56 | from pandas.core.api import isnull as isnull 57 | from pandas.core.api import notna as notna 58 | from pandas.core.api import notnull as notnull 59 | from pandas.core.api import period_range as period_range 60 | from pandas.core.api import 
set_eng_float_format as set_eng_float_format 61 | from pandas.core.api import timedelta_range as timedelta_range 62 | from pandas.core.api import to_datetime as to_datetime 63 | from pandas.core.api import to_numeric as to_numeric 64 | from pandas.core.api import to_timedelta as to_timedelta 65 | from pandas.core.api import unique as unique 66 | from pandas.core.api import value_counts as value_counts 67 | from pandas.core.arrays.sparse import SparseDtype as SparseDtype 68 | from pandas.core.computation.api import eval as eval 69 | from pandas.core.reshape.api import concat as concat 70 | from pandas.core.reshape.api import crosstab as crosstab 71 | from pandas.core.reshape.api import cut as cut 72 | from pandas.core.reshape.api import get_dummies as get_dummies 73 | from pandas.core.reshape.api import lreshape as lreshape 74 | from pandas.core.reshape.api import melt as melt 75 | from pandas.core.reshape.api import merge as merge 76 | from pandas.core.reshape.api import merge_asof as merge_asof 77 | from pandas.core.reshape.api import merge_ordered as merge_ordered 78 | from pandas.core.reshape.api import pivot as pivot 79 | from pandas.core.reshape.api import pivot_table as pivot_table 80 | from pandas.core.reshape.api import qcut as qcut 81 | from pandas.core.reshape.api import wide_to_long as wide_to_long 82 | from pandas.io.api import ExcelFile as ExcelFile 83 | from pandas.io.api import ExcelWriter as ExcelWriter 84 | from pandas.io.api import HDFStore as HDFStore 85 | from pandas.io.api import read_clipboard as read_clipboard 86 | from pandas.io.api import read_csv as read_csv 87 | from pandas.io.api import read_excel as read_excel 88 | from pandas.io.api import read_feather as read_feather 89 | from pandas.io.api import read_fwf as read_fwf 90 | from pandas.io.api import read_gbq as read_gbq 91 | from pandas.io.api import read_hdf as read_hdf 92 | from pandas.io.api import read_html as read_html 93 | from pandas.io.api import read_json as read_json 94 | 
from pandas.io.api import read_orc as read_orc 95 | from pandas.io.api import read_parquet as read_parquet 96 | from pandas.io.api import read_pickle as read_pickle 97 | from pandas.io.api import read_sas as read_sas 98 | from pandas.io.api import read_spss as read_spss 99 | from pandas.io.api import read_sql as read_sql 100 | from pandas.io.api import read_sql_query as read_sql_query 101 | from pandas.io.api import read_sql_table as read_sql_table 102 | from pandas.io.api import read_stata as read_stata 103 | from pandas.io.api import read_table as read_table 104 | from pandas.io.api import to_pickle as to_pickle 105 | from pandas.tseries import offsets as offsets 106 | from pandas.tseries.api import infer_freq as infer_freq 107 | from pandas.util._print_versions import show_versions as show_versions 108 | from pandas.util._tester import test as test 109 | 110 | __docformat__: str 111 | hard_dependencies: Any 112 | missing_dependencies: Any 113 | module: Any 114 | v: Any 115 | __git_version__: Any 116 | 117 | def __getattr__(name: Any): ... 118 | 119 | # __doc__: str 120 | -------------------------------------------------------------------------------- /tests/test_unit/test_detect.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Unit tests for the dialect detection. 5 | 6 | Author: Gertjan van den Burg 7 | 8 | """ 9 | 10 | import unittest 11 | 12 | from clevercsv.detect import Detector 13 | 14 | 15 | class DetectorTestCase(unittest.TestCase): 16 | # Initially we copy the results from CPython test suite. 

    # Comma-delimited, unquoted (from the CPython csv test suite).
    sample1 = """\
Harry's, Arlington Heights, IL, 2/1/03, Kimi Hayes
Shark City, Glendale Heights, IL, 12/28/02, Prezence
Tommy's Place, Blue Island, IL, 12/28/02, Blue Sunday/White Crow
Stonecutters Seafood and Chop House, Lemont, IL, 12/19/02, Week Back
"""
    # Colon-delimited, single-quoted with doubled-quote escaping.
    sample2 = """\
'Harry''s':'Arlington Heights':'IL':'2/1/03':'Kimi Hayes'
'Shark City':'Glendale Heights':'IL':'12/28/02':'Prezence'
'Tommy''s Place':'Blue Island':'IL':'12/28/02':'Blue Sunday/White Crow'
'Stonecutters ''Seafood'' and Chop House':'Lemont':'IL':'12/19/02':'Week Back'
"""
    # Header row matching sample1's five columns.
    header1 = """\
"venue","city","state","date","performers"
"""
    # '?'-delimited; cells also contain '/', used to exercise the
    # delimiters= restriction in test_delimiters.
    sample3 = """\
05/05/03?05/05/03?05/05/03?05/05/03?05/05/03?05/05/03
05/05/03?05/05/03?05/05/03?05/05/03?05/05/03?05/05/03
05/05/03?05/05/03?05/05/03?05/05/03?05/05/03?05/05/03
"""

    # ';'-delimited numeric data.
    sample4 = """\
2147483648;43.0e12;17;abc;def
147483648;43.0e2;17;abc;def
47483648;43.0;170;abc;def
"""

    sample5 = "aaa\tbbb\r\nAAA\t\r\nBBB\t\r\n"
    sample6 = "a|b|c\r\nd|e|f\r\n"
    sample7 = "'a'|'b'|'c'\r\n'd'|e|f\r\n"

    # '+'-delimited header/data pair for the regex-special-delimiter tests.
    header2 = """\
"venue"+"city"+"state"+"date"+"performers"
"""
    sample8 = """\
Harry's+ Arlington Heights+ IL+ 2/1/03+ Kimi Hayes
Shark City+ Glendale Heights+ IL+ 12/28/02+ Prezence
Tommy's Place+ Blue Island+ IL+ 12/28/02+ Blue Sunday/White Crow
Stonecutters Seafood and Chop House+ Lemont+ IL+ 12/19/02+ Week Back
"""
    # adapted to be not broken
    sample9 = """\
'Harry''s'+ 'Arlington Heights'+ 'IL'+ '2/1/03'+ 'Kimi Hayes'
'Shark City'+ 'Glendale Heights'+' IL'+ '12/28/02'+ 'Prezence'
'Tommy''s Place'+ 'Blue Island'+ 'IL'+ '12/28/02'+ 'Blue Sunday/White Crow'
'Stonecutters ''Seafood'' and Chop House'+ 'Lemont'+ 'IL'+ '12/19/02'+ 'Week Back'
"""

    # Comma-delimited, unquoted, with parentheses/quotes inside cells.
    sample10 = """\
bytearray(b'fake data'),20:53:06,2019-09-01T19:28:21
bytearray(b'fake data'),19:33:15,2005-02-15T19:10:31
bytearray(b'fake data'),10:43:05,1992-10-12T14:49:24
bytearray(b'fake data'),10:36:49,1999-07-18T17:27:55
bytearray(b'fake data'),03:33:35,1982-04-24T17:38:45
bytearray(b'fake data'),14:49:47,1983-01-05T22:17:42
bytearray(b'fake data'),10:35:30,2006-10-27T02:30:45
"""

    # Double-quoted JSON cells with doubled-quote escaping.
    sample11 = """\
"{""fake"": ""json"", ""fake2"":""json2""}",13:31:38,06:00:04+01:00
"{""fake"": ""json"", ""fake2"":""json2""}",22:13:29,14:20:11+02:00
"{""fake"": ""json"", ""fake2"":""json2""}",04:37:27,22:04:28+03:00
"{""fake"": ""json"", ""fake2"":""json2""}",04:25:28,23:12:53+01:00
"{""fake"": ""json"", ""fake2"":""json2""}",21:04:15,08:23:58+02:00
"{""fake"": ""json"", ""fake2"":""json2""}",10:37:03,11:06:42+05:30
"{""fake"": ""json"", ""fake2"":""json2""}",10:17:24,23:38:47+06:00
"{""fake"": ""json"", ""fake2"":""json2""}",00:02:51,20:04:45-06:00
"""

    def test_detect(self) -> None:
        # Adapted from CPython
        # Basic detection: delimiter, quotechar, and escapechar for an
        # unquoted and a quoted sample.
        detector = Detector()
        dialect = detector.detect(self.sample1)
        assert dialect is not None
        self.assertEqual(dialect.delimiter, ",")
        self.assertEqual(dialect.quotechar, "")
        self.assertEqual(dialect.escapechar, "")

        dialect = detector.detect(self.sample2)
        assert dialect is not None
        self.assertEqual(dialect.delimiter, ":")
        self.assertEqual(dialect.quotechar, "'")
        self.assertEqual(dialect.escapechar, "")

    def test_delimiters(self) -> None:
        # Adapted from CPython
        # Detection across many delimiters; the delimiters= argument
        # restricts the candidate set (sample3 can resolve to '?' or '/').
        detector = Detector()
        dialect = detector.detect(self.sample3)
        assert dialect is not None
        self.assertIn(dialect.delimiter, self.sample3)
        dialect = detector.detect(self.sample3, delimiters="?,")
        assert dialect is not None
        self.assertEqual(dialect.delimiter, "?")
        dialect = detector.detect(self.sample3, delimiters="/,")
        assert dialect is not None
        self.assertEqual(dialect.delimiter, "/")
        dialect = detector.detect(self.sample4)
        assert dialect is not None
        self.assertEqual(dialect.delimiter, ";")
        dialect = detector.detect(self.sample5)
        assert dialect is not None
        self.assertEqual(dialect.delimiter, "\t")
        dialect = detector.detect(self.sample6)
        assert dialect is not None
        self.assertEqual(dialect.delimiter, "|")
        dialect = detector.detect(self.sample7)
        assert dialect is not None
        self.assertEqual(dialect.delimiter, "|")
        self.assertEqual(dialect.quotechar, "'")
        dialect = detector.detect(self.sample8)
        assert dialect is not None
        self.assertEqual(dialect.delimiter, "+")
        dialect = detector.detect(self.sample9)
        assert dialect is not None
        self.assertEqual(dialect.delimiter, "+")
        self.assertEqual(dialect.quotechar, "'")
        dialect = detector.detect(self.sample10)
        assert dialect is not None
        self.assertEqual(dialect.delimiter, ",")
        self.assertEqual(dialect.quotechar, "")
        dialect = detector.detect(self.sample11)
        assert dialect is not None
        self.assertEqual(dialect.delimiter, ",")
        self.assertEqual(dialect.quotechar, '"')

    def test_has_header(self) -> None:
        # Header detection: data-only sample vs. header + data.
        detector = Detector()
        self.assertEqual(detector.has_header(self.sample1), False)
        self.assertEqual(
            detector.has_header(self.header1 + self.sample1), True
        )

    def test_has_header_regex_special_delimiter(self) -> None:
        # Same check with '+' as delimiter, which is special in regexes.
        detector = Detector()
        self.assertEqual(detector.has_header(self.sample8), False)
        self.assertEqual(
            detector.has_header(self.header2 + self.sample8), True
        )


if __name__ == "__main__":
    unittest.main()
--------------------------------------------------------------------------------
/stubs/wilderness/__init__.pyi:
--------------------------------------------------------------------------------
import abc 2 | import argparse 3 | 4 | from typing import Dict 5 | from typing import List 6 | from typing import Optional 7 | from typing import TextIO 8 | 9 | class DocumentableMixin(metaclass=abc.ABCMeta): 10 | def __init__( 11 | self, 12 | description: Optional[str] = None, 13 | extra_sections: Optional[Dict[str, str]] = None, 14 | options_prolog: Optional[str] = None, 15 | options_epilog: Optional[str] = None, 16 | ) -> None: ... 17 | @property 18 | def description(self) -> Optional[str]: ... 19 | @property 20 | def parser(self) -> argparse.ArgumentParser: ... 21 | @parser.setter 22 | def parser(self, parser: argparse.ArgumentParser) -> None: ... 23 | @property 24 | def args(self) -> argparse.Namespace: ... 25 | @args.setter 26 | def args(self, args: argparse.Namespace) -> None: ... 27 | @property 28 | def argument_help(self) -> Dict[str, Optional[str]]: ... 29 | 30 | class Application(DocumentableMixin): 31 | def __init__( 32 | self, 33 | name: str, 34 | version: str, 35 | author: Optional[str] = None, 36 | title: Optional[str] = None, 37 | description: Optional[str] = None, 38 | default_command: Optional[str] = None, 39 | add_help: bool = True, 40 | extra_sections: Optional[Dict[str, str]] = None, 41 | prolog: Optional[str] = None, 42 | epilog: Optional[str] = None, 43 | options_prolog: Optional[str] = None, 44 | options_epilog: Optional[str] = None, 45 | add_commands_section: bool = False, 46 | ) -> None: ... 47 | @property 48 | def name(self) -> str: ... 49 | @property 50 | def author(self) -> str: ... 51 | @property 52 | def version(self) -> str: ... 53 | @property 54 | def commands(self) -> List[Command]: ... 55 | @property 56 | def groups(self) -> List[Group]: ... 57 | def add_argument(self, *args, **kwargs) -> argparse.Action: ... 58 | def add(self, command: Command): ... 59 | def add_group(self, title: str) -> Group: ... 60 | def register(self): ... 61 | def handle(self) -> int: ... 
62 | def run( 63 | self, 64 | args: Optional[List[str]] = None, 65 | namespace: Optional[argparse.Namespace] = None, 66 | exit_on_error: bool = True, 67 | ) -> int: ... 68 | def run_command(self, command: Command) -> int: ... 69 | def get_command(self, command_name: str) -> Command: ... 70 | def set_prolog(self, prolog: str) -> None: ... 71 | def set_epilog(self, epilog: str) -> None: ... 72 | def get_commands_text(self) -> str: ... 73 | def create_manpage(self) -> ManPage: ... 74 | def format_help(self) -> str: ... 75 | def print_help(self, file: Optional[TextIO] = None) -> None: ... 76 | 77 | class Group: 78 | def __init__( 79 | self, title: Optional[str] = None, is_root: bool = False 80 | ) -> None: ... 81 | @property 82 | def application(self) -> Optional[Application]: ... 83 | @property 84 | def title(self) -> Optional[str]: ... 85 | @property 86 | def commands(self) -> List[Command]: ... 87 | @property 88 | def is_root(self) -> bool: ... 89 | def commands_as_actions(self) -> List[argparse.Action]: ... 90 | def set_app(self, app: Application) -> None: ... 91 | def add(self, command: Command) -> None: ... 92 | def __len__(self) -> int: ... 93 | 94 | class Command(DocumentableMixin, metaclass=abc.ABCMeta): 95 | def __init__( 96 | self, 97 | name: str, 98 | title: Optional[str] = None, 99 | description: Optional[str] = None, 100 | add_help: bool = True, 101 | extra_sections: Optional[Dict[str, str]] = None, 102 | options_prolog: Optional[str] = None, 103 | options_epilog: Optional[str] = None, 104 | ) -> None: ... 105 | @property 106 | def application(self) -> Optional[Application]: ... 107 | @property 108 | def name(self) -> str: ... 109 | @property 110 | def title(self) -> Optional[str]: ... 111 | def add_argument(self, *args, **kwargs) -> None: ... 112 | def add_argument_group(self, *args, **kwargs) -> ArgumentGroup: ... 113 | def add_mutually_exclusive_group( 114 | self, *args, **kwargs 115 | ) -> MutuallyExclusiveGroup: ... 
116 | def register(self) -> None: ... 117 | @abc.abstractmethod 118 | def handle(self) -> int: ... 119 | def create_manpage(self) -> ManPage: ... 120 | 121 | class ManPage: 122 | def __init__( 123 | self, 124 | application_name: str, 125 | author: Optional[str] = "", 126 | command_name: Optional[str] = None, 127 | date: Optional[str] = None, 128 | title: Optional[str] = None, 129 | version: Optional[str] = "", 130 | ) -> None: ... 131 | @property 132 | def name(self) -> str: ... 133 | def metadata(self) -> List[str]: ... 134 | def preamble(self) -> List[str]: ... 135 | def header(self) -> str: ... 136 | def section_name(self) -> str: ... 137 | def add_section_synopsis(self, synopsis: str) -> None: ... 138 | def add_section(self, label: str, text: str) -> None: ... 139 | def groffify(self, text: str) -> str: ... 140 | def groffify_line(self, line: str) -> str: ... 141 | def export(self, output_dir: str) -> str: ... 142 | 143 | class ArgumentGroup: 144 | def __init__(self, group: argparse._ArgumentGroup) -> None: ... 145 | @property 146 | def command(self) -> Optional[Command]: ... 147 | @command.setter 148 | def command(self, command: Command) -> None: ... 149 | def add_argument(self, *args, **kwargs) -> None: ... 150 | 151 | class MutuallyExclusiveGroup: 152 | def __init__(self, meg: argparse._MutuallyExclusiveGroup) -> None: ... 153 | @property 154 | def command(self) -> Optional[Command]: ... 155 | @command.setter 156 | def command(self, command: Command) -> None: ... 157 | def add_argument(self, *args, **kwargs) -> None: ... 158 | 159 | class Tester: 160 | def __init__(self, app: Application) -> None: ... 161 | @property 162 | def application(self) -> Application: ... 163 | def clear(self) -> None: ... 164 | def get_return_code(self) -> Optional[int]: ... 165 | def get_stdout(self) -> Optional[str]: ... 166 | def get_stderr(self) -> Optional[str]: ... 167 | def test_command(self, cmd_name: str, args: List[str]) -> None: ... 
# -*- coding: utf-8 -*-

"""
Unit tests for the pattern score.

Author: Gertjan van den Burg

"""

import unittest

from clevercsv import detect_pattern
from clevercsv.dialect import SimpleDialect


class PatternTestCase(unittest.TestCase):
    """Tests for the row-pattern abstraction and the pattern score."""

    # --- Abstraction tests ---

    def test_abstraction_1(self) -> None:
        dialect = SimpleDialect(delimiter=",", quotechar="", escapechar="")
        result = detect_pattern.make_abstraction("A,B,C", dialect)
        self.assertEqual("CDCDC", result)

    def test_abstraction_2(self) -> None:
        dialect = SimpleDialect(delimiter=",", quotechar="", escapechar="")
        result = detect_pattern.make_abstraction("A,\rA,A,A\r", dialect)
        self.assertEqual("CDCRCDCDC", result)

    def test_abstraction_3(self) -> None:
        dialect = SimpleDialect(delimiter=",", quotechar="", escapechar="")
        result = detect_pattern.make_abstraction("a,a,\n,a,a\ra,a,a\r\n", dialect)
        self.assertEqual("CDCDCRCDCDCRCDCDC", result)

    def test_abstraction_4(self) -> None:
        dialect = SimpleDialect(delimiter=",", quotechar='"', escapechar="")
        result = detect_pattern.make_abstraction(
            'a,"bc""d""e""f""a",\r\n', dialect
        )
        self.assertEqual("CDCDC", result)

    def test_abstraction_5(self) -> None:
        dialect = SimpleDialect(delimiter=",", quotechar='"', escapechar="|")
        result = detect_pattern.make_abstraction('a,"bc""d"",|"f|""', dialect)
        self.assertEqual("CDC", result)

    def test_abstraction_6(self) -> None:
        dialect = SimpleDialect(delimiter=",", quotechar="", escapechar="")
        result = detect_pattern.make_abstraction(",,,", dialect)
        self.assertEqual("CDCDCDC", result)

    def test_abstraction_7(self) -> None:
        dialect = SimpleDialect(delimiter=",", quotechar='"', escapechar="")
        result = detect_pattern.make_abstraction(',"",,', dialect)
        self.assertEqual("CDCDCDC", result)

    def test_abstraction_8(self) -> None:
        dialect = SimpleDialect(delimiter=",", quotechar='"', escapechar="")
        result = detect_pattern.make_abstraction(',"",,\r\n', dialect)
        self.assertEqual("CDCDCDC", result)

    # --- Escape char tests ---

    def test_abstraction_9(self) -> None:
        dialect = SimpleDialect(delimiter=",", quotechar="", escapechar="|")
        result = detect_pattern.make_abstraction("A,B|,C", dialect)
        self.assertEqual("CDC", result)

    def test_abstraction_10(self) -> None:
        dialect = SimpleDialect(delimiter=",", quotechar='"', escapechar="|")
        result = detect_pattern.make_abstraction('A,"B,C|"D"', dialect)
        self.assertEqual("CDC", result)

    def test_abstraction_11(self) -> None:
        dialect = SimpleDialect(delimiter=",", quotechar="", escapechar="|")
        result = detect_pattern.make_abstraction("a,|b,c", dialect)
        self.assertEqual("CDCDC", result)

    def test_abstraction_12(self) -> None:
        dialect = SimpleDialect(delimiter=",", quotechar="", escapechar="|")
        result = detect_pattern.make_abstraction("a,b|,c", dialect)
        self.assertEqual("CDC", result)

    def test_abstraction_13(self) -> None:
        dialect = SimpleDialect(delimiter=",", quotechar='"', escapechar="|")
        result = detect_pattern.make_abstraction('a,"b,c|""', dialect)
        self.assertEqual("CDC", result)

    def test_abstraction_14(self) -> None:
        dialect = SimpleDialect(delimiter=",", quotechar="", escapechar="|")
        result = detect_pattern.make_abstraction("a,b||c", dialect)
        self.assertEqual("CDC", result)

    def test_abstraction_15(self) -> None:
        dialect = SimpleDialect(delimiter=",", quotechar='"', escapechar="|")
        result = detect_pattern.make_abstraction('a,"b|"c||d|"e"', dialect)
        self.assertEqual("CDC", result)

    def test_abstraction_16(self) -> None:
        dialect = SimpleDialect(delimiter=",", quotechar='"', escapechar="|")
        result = detect_pattern.make_abstraction('a,"b|"c||d","e"', dialect)
        self.assertEqual("CDCDC", result)

    # --- Fill empties ---

    def test_fill_empties_1(self) -> None:
        result = detect_pattern.fill_empties("DDD")
        self.assertEqual("CDCDCDC", result)

    # --- Pattern score tests ---

    def test_pattern_score_1(self) -> None:
        # theta_1 from paper
        data = (
            "7,5; Mon, Jan 12;6,40\n100; Fri, Mar 21;8,23\n8,2; Thu, Sep 17;"
            '2,71\n538,0;;7,26\n"NA"; Wed, Oct 4;6,93'
        )
        dialect = SimpleDialect(delimiter=",", quotechar="", escapechar="")
        result = detect_pattern.pattern_score(data, dialect)
        self.assertAlmostEqual(7 / 4, result)

    def test_pattern_score_2(self) -> None:
        # theta_2 from paper
        data = (
            "7,5; Mon, Jan 12;6,40\n100; Fri, Mar 21;8,23\n8,2; Thu, Sep 17;"
            '2,71\n538,0;;7,26\n"NA"; Wed, Oct 4;6,93'
        )
        dialect = SimpleDialect(delimiter=";", quotechar="", escapechar="")
        result = detect_pattern.pattern_score(data, dialect)
        self.assertAlmostEqual(10 / 3, result)

    def test_pattern_score_3(self) -> None:
        # theta_3 from paper
        data = (
            "7,5; Mon, Jan 12;6,40\n100; Fri, Mar 21;8,23\n8,2; Thu, Sep 17;"
            '2,71\n538,0;;7,26\n"NA"; Wed, Oct 4;6,93'
        )
        dialect = SimpleDialect(delimiter=";", quotechar='"', escapechar="")
        result = detect_pattern.pattern_score(data, dialect)
        self.assertAlmostEqual(10 / 3, result)


if __name__ == "__main__":
    unittest.main()
# -*- coding: utf-8 -*-

"""
Integration tests for dialect detection.

Author: G.J.J. van den Burg

"""

import argparse
import gzip
import json
import multiprocessing
import os
import time
import warnings

from typing import Any
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple

import chardet
import termcolor

import clevercsv

from clevercsv.dialect import SimpleDialect

THIS_DIR = os.path.abspath(os.path.dirname(__file__))
SOURCE_DIR = os.path.join(THIS_DIR, "data")
TEST_FILES = os.path.join(SOURCE_DIR, "files")
TEST_DIALECTS = os.path.join(SOURCE_DIR, "dialects")

LOG_SUCCESS = os.path.join(THIS_DIR, "success.log")
LOG_ERROR = os.path.join(THIS_DIR, "error.log")
LOG_FAILED = os.path.join(THIS_DIR, "failed.log")
LOG_METHOD = os.path.join(THIS_DIR, "method.log")
LOG_RUNTIME = os.path.join(THIS_DIR, "runtime.log")

LOG_SUCCESS_PARTIAL = os.path.join(THIS_DIR, "success_partial.log")
LOG_ERROR_PARTIAL = os.path.join(THIS_DIR, "error_partial.log")
LOG_FAILED_PARTIAL = os.path.join(THIS_DIR, "failed_partial.log")
LOG_METHOD_PARTIAL = os.path.join(THIS_DIR, "method_partial.log")
LOG_RUNTIME_PARTIAL = os.path.join(THIS_DIR, "runtime_partial.log")

TIMEOUT = 5 * 60
N_BYTES_PARTIAL = 10000


def log_result(name: str, kind: str, verbose: int, partial: bool) -> None:
    """Append *name* to the log file for outcome *kind*.

    kind must be one of "error", "success", or "failure". When *verbose* is
    truthy the name is also printed in the corresponding color.
    """
    table = {
        "error": (LOG_ERROR, LOG_ERROR_PARTIAL, "yellow"),
        "success": (LOG_SUCCESS, LOG_SUCCESS_PARTIAL, "green"),
        "failure": (LOG_FAILED, LOG_FAILED_PARTIAL, "red"),
    }
    assert kind in table
    outfull, outpartial, color = table[kind]
    fname = outpartial if partial else outfull

    with open(fname, "a") as fp:
        fp.write(name + "\n")
    if verbose:
        termcolor.cprint(name, color=color)


def log_method(name: str, method: str, partial: bool) -> None:
    """Record which detection method was used for test case *name*."""
    fname = LOG_METHOD_PARTIAL if partial else LOG_METHOD
    with open(fname, "a") as fp:
        fp.write(f"{name},{method}\n")


def log_runtime(name: str, runtime: float, partial: bool) -> None:
    """Record the detection runtime (in seconds) for test case *name*."""
    fname = LOG_RUNTIME_PARTIAL if partial else LOG_RUNTIME
    with open(fname, "a") as fp:
        fp.write(f"{name},{runtime}\n")


def worker(
    args: Tuple[Any, ...], return_dict: Dict[str, Any], **kwargs: Any
) -> None:
    """Run dialect detection in a child process.

    Results are communicated back through the manager dict *return_dict*.
    NOTE: the defaults are written into return_dict *before* any work is
    done, so the parent always finds the keys even if this process dies
    mid-detection. (Annotation fixed: callers pass a tuple, not a list.)
    """
    det = clevercsv.Detector()
    filename, encoding, partial = args
    return_dict["error"] = False
    return_dict["dialect"] = None
    return_dict["method"] = None
    return_dict["runtime"] = float("nan")
    with gzip.open(filename, "rt", newline="", encoding=encoding) as fp:
        data = fp.read(N_BYTES_PARTIAL) if partial else fp.read()
    try:
        t = time.time()
        return_dict["dialect"] = det.detect(data, **kwargs)
        return_dict["runtime"] = time.time() - t
        return_dict["method"] = det.method_.value
    except clevercsv.Error:
        return_dict["error"] = True


def run_with_timeout(
    args: Tuple[Any, ...], kwargs: Dict[str, Any], limit: Optional[int]
) -> Tuple[Optional[SimpleDialect], bool, Optional[str], float]:
    """Run :func:`worker` in a subprocess with a wall-clock time limit.

    Returns (dialect, error, method, runtime); on timeout the dialect and
    method are None and the runtime is NaN.
    """
    manager = multiprocessing.Manager()
    return_dict = manager.dict()
    p = multiprocessing.Process(
        target=worker, args=(args, return_dict), kwargs=kwargs
    )
    p.start()
    p.join(limit)
    if p.is_alive():
        p.terminate()
        # BUG FIX: join after terminate to reap the child process; without
        # this the terminated process lingers as a zombie for the remainder
        # of the (long) test run.
        p.join()
        return None, True, None, float("nan")
    return (
        return_dict["dialect"],
        return_dict["error"],
        return_dict["method"],
        return_dict["runtime"],
    )


def run_test(
    name: str,
    gz_filename: str,
    annotation: Dict[str, Any],
    verbose: int = 1,
    partial: bool = False,
) -> None:
    """Run dialect detection for one annotated file and log the outcome.

    The detected dialect is compared component-wise against the annotated
    ground-truth dialect; any mismatch counts as a failure.
    """
    if "encoding" in annotation:
        enc = annotation["encoding"]
    else:
        # No annotated encoding: fall back to automatic detection.
        with gzip.open(gz_filename, "rb") as fid:
            enc = chardet.detect(fid.read())["encoding"]

    true_dialect = annotation["dialect"]
    dialect, error, method, runtime = run_with_timeout(
        (gz_filename, enc, partial), {"verbose": verbose > 1}, TIMEOUT
    )
    if error:
        return log_result(name, "error", verbose, partial)

    if dialect is None:
        log_result(name, "failure", verbose, partial)
    elif dialect.delimiter != true_dialect["delimiter"]:
        log_result(name, "failure", verbose, partial)
    elif dialect.quotechar != true_dialect["quotechar"]:
        log_result(name, "failure", verbose, partial)
    elif dialect.escapechar != true_dialect["escapechar"]:
        log_result(name, "failure", verbose, partial)
    else:
        log_result(name, "success", verbose, partial)

    assert method is not None
    log_method(name, method, partial)
    log_runtime(name, runtime, partial)
def load_test_cases() -> List[Tuple[str, str, Dict[str, Any]]]:
    """Collect (name, gz_filename, annotation) triples for annotated files.

    Files without a matching dialect annotation, with a mismatching
    annotated filename, or annotated with status "skip" are excluded.
    """
    cases = []
    for f in sorted(os.listdir(TEST_FILES)):
        base = f[: -len(".csv.gz")]
        dialect_file = os.path.join(TEST_DIALECTS, base + ".json")
        if not os.path.exists(dialect_file):
            continue
        filename = os.path.join(TEST_FILES, f)
        with open(dialect_file, "r") as fid:
            annotation = json.load(fid)
        if not annotation["filename"] == f[: -len(".gz")]:
            warnings.warn(
                "filename doesn't match! Input file: %s\nDialect file: %s"
                % (filename, dialect_file)
            )
            continue
        if annotation["status"] == "skip":
            continue
        cases.append((base, filename, annotation))
    return cases


def clear_output_files(partial: bool) -> None:
    """Delete the log files of a previous (full or partial) run, if any."""
    files = {
        True: [
            LOG_SUCCESS_PARTIAL,
            LOG_FAILED_PARTIAL,
            LOG_ERROR_PARTIAL,
            LOG_METHOD_PARTIAL,
            LOG_RUNTIME_PARTIAL,
        ],
        False: [LOG_SUCCESS, LOG_FAILED, LOG_ERROR, LOG_METHOD, LOG_RUNTIME],
    }
    for filename in files[partial]:
        if os.path.exists(filename):
            os.unlink(filename)


def parse_args() -> argparse.Namespace:
    """Parse the command line arguments for the integration test runner."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--partial",
        help="Run test with partial file data",
        action="store_true",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        help="Be verbose",
        action="count",
        # BUG FIX: action="count" defaults to None, so running without -v
        # made run_test evaluate `verbose > 1` on None and crash with a
        # TypeError. Default to 0 so the count is always an int.
        default=0,
    )
    return parser.parse_args()


def main() -> None:
    """Entry point: clear old logs and run every annotated test case."""
    args = parse_args()
    clear_output_files(args.partial)
    cases = load_test_cases()
    for name, gz_filename, annotation in cases:
        run_test(
            name,
            gz_filename,
            annotation,
            verbose=args.verbose,
            partial=args.partial,
        )


if __name__ == "__main__":
    main()
from . import field_size_limit
from .break_ties import tie_breaker
from .cparser_util import parse_string
from .detect_pattern import pattern_score
from .detect_type import DEFAULT_EPS_TYPE
from .detect_type import TypeDetector
from .dialect import SimpleDialect
from .potential_dialects import get_dialects


@dataclass
class ConsistencyScore:
    """Container to track the consistency score calculation

    Parameters
    ----------
    P : float
        The pattern score

    T : Optional[float]
        The type score. Can be None if not computed for speed.

    Q : Optional[float]
        The consistency score. Can be None if not computed for speed.

    """

    P: float
    T: Optional[float]
    Q: Optional[float]


class ConsistencyDetector:
    """Detect the dialect with the data consistency measure

    This class uses the data consistency measure to detect the dialect. See the
    paper for details.

    Parameters
    ----------
    skip : bool
        Skip computation of the type score for dialects with a low pattern
        score.

    verbose : bool
        Print out the dialects considered and their scores.

    cache_capacity: int
        The size of the cache for type detection. Caching the type detection
        result greatly speeds up the computation of the consistency measure.
        The size of the cache can be changed to trade off memory use and speed.

    """

    def __init__(
        self,
        skip: bool = True,
        verbose: bool = False,
        cache_capacity: int = 100_000,
    ) -> None:
        self._skip = skip
        self._verbose = verbose
        self._type_detector = TypeDetector()
        self._cache_capacity = cache_capacity

        # NOTE: A bit ugly but allows setting the cache size dynamically.
        # Wrapping a closure (instead of decorating a method) also avoids
        # the lru_cache-on-method pitfall of keeping instances alive.
        @lru_cache(cache_capacity)
        def cached_is_known_type(cell: str, is_quoted: bool) -> bool:
            return self._type_detector.is_known_type(cell, is_quoted)

        self._cached_is_known_type = cached_is_known_type

    def detect(
        self, data: str, delimiters: Optional[List[str]] = None
    ) -> Optional[SimpleDialect]:
        """Detect the dialect using the consistency measure

        Parameters
        ----------
        data : str
            The data of the file as a string

        delimiters : iterable
            List of delimiters to consider. If None, the :func:`get_delimiters`
            function is used to automatically detect this (as described in the
            paper).

        Returns
        -------
        dialect : SimpleDialect
            The detected dialect. If no dialect could be detected, returns None.

        """
        self._cached_is_known_type.cache_clear()

        # TODO: probably some optimization there too
        dialects = get_dialects(data, delimiters=delimiters)

        # TODO: This is not thread-safe and this object can simply own a Parser
        # for each dialect and set the limit directly there (we can also cache
        # the best parsing result)
        old_limit = field_size_limit(len(data) + 1)
        try:
            scores = self.compute_consistency_scores(data, dialects)
            best_dialects = ConsistencyDetector.get_best_dialects(scores)
            result: Optional[SimpleDialect] = None
            if len(best_dialects) == 1:
                result = best_dialects[0]
            else:
                result = tie_breaker(data, best_dialects)
        finally:
            # BUG FIX: restore the previous field size limit even when scoring
            # or tie breaking raises; previously an exception here left the
            # global limit permanently modified for subsequent parses.
            field_size_limit(old_limit)
        return result

    def compute_consistency_scores(
        self, data: str, dialects: List[SimpleDialect]
    ) -> Dict[SimpleDialect, ConsistencyScore]:
        """Compute the consistency score for each dialect

        This function computes the consistency score for each dialect. This is
        done by first computing the pattern score for a dialect. If the class
        is instantiated with ``skip`` set to False, it also computes the type
        score for each dialect. If ``skip`` is True (the default), the type
        score is only computed if the pattern score is larger or equal to the
        current best combined score.

        Parameters
        ----------
        data : str
            The data of the file as a string

        dialects : Iterable[SimpleDialect]
            An iterable of dialects to consider.

        Returns
        -------
        scores : Dict[SimpleDialect, ConsistencyScore]
            A map with a :class:`ConsistencyScore` object for each dialect
            provided as input.

        """

        scores: Dict[SimpleDialect, ConsistencyScore] = {}
        incumbent_score = -float("inf")
        for dialect in sorted(dialects):
            P = pattern_score(data, dialect)
            # Since the type score T is at most 1, Q = P * T can never beat
            # the incumbent when P is already below it; skip the (expensive)
            # type score in that case.
            if P < incumbent_score and self._skip:
                scores[dialect] = ConsistencyScore(P, None, None)
                if self._verbose:
                    print("%15r:\tP = %15.6f\tskip." % (dialect, P))
                continue

            T = self.compute_type_score(data, dialect)
            Q = P * T
            incumbent_score = max(incumbent_score, Q)
            scores[dialect] = ConsistencyScore(P, T, Q)
            if self._verbose:
                print(
                    "%15r:\tP = %15.6f\tT = %15.6f\tQ = %15.6f"
                    % (dialect, P, T, Q)
                )
        return scores

    @staticmethod
    def get_best_dialects(
        scores: Dict[SimpleDialect, ConsistencyScore]
    ) -> List[SimpleDialect]:
        """Identify the dialects with the highest consistency score"""
        Qscores = [score.Q for score in scores.values()]
        Qmax = -float("inf")
        for q in Qscores:
            if q is None:
                # Skipped dialects have no Q; they can never be the best.
                continue
            Qmax = max(Qmax, q)
        return [d for d, score in scores.items() if score.Q == Qmax]

    def compute_type_score(
        self, data: str, dialect: SimpleDialect, eps: float = DEFAULT_EPS_TYPE
    ) -> float:
        """Compute the type score

        The type score is the fraction of cells (parsed with the given
        dialect) whose value has a known type; it is floored at ``eps`` so
        the product P * T never collapses to zero.
        """
        total = known = 0
        for row in parse_string(data, dialect, return_quoted=True):
            assert all(isinstance(cell, tuple) for cell in row)
            for cell, is_quoted in row:
                total += 1
                known += self._cached_is_known_type(cell, is_quoted=is_quoted)
        if not total:
            return eps
        return max(eps, known / total)


def detect_dialect_consistency(
    data: str,
    delimiters: Optional[Iterable[str]] = None,
    skip: bool = True,
    verbose: bool = False,
) -> Optional[SimpleDialect]:
    """Helper function that wraps ConsistencyDetector"""
    # Mostly kept for backwards compatibility
    consistency_detector = ConsistencyDetector(skip=skip, verbose=verbose)
    if delimiters is not None:
        delimiters = list(delimiters)
    return consistency_detector.detect(data, delimiters=delimiters)
ConsistencyDetector(skip=skip, verbose=verbose) 218 | if delimiters is not None: 219 | delimiters = list(delimiters) 220 | return consistency_detector.detect(data, delimiters=delimiters) 221 | -------------------------------------------------------------------------------- /example/airedale.csv: -------------------------------------------------------------------------------- 1 | Department Family,Entity,Payment Date,Expense Type,Expense Area,Supplier,Transaction No.,Amount 2 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,16/07/2010,COMPUTER SOFTWARE / LICENSE FEES,INFORMATION MANAGEMENT & TECHNOLOGY,ACCENTURE PACS,3003126885,"43,774.58" 3 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,16/07/2010,COMPUTER SOFTWARE / LICENSE FEES,INFORMATION MANAGEMENT & TECHNOLOGY,ACCENTURE PACS,3003126885,"43,774.58" 4 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,16/07/2010,COMPUTER SOFTWARE / LICENSE FEES,INFORMATION MANAGEMENT & TECHNOLOGY,ACCENTURE PACS,3003126885,"7,660.55" 5 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,16/07/2010,COMPUTER SOFTWARE / LICENSE FEES,INFORMATION MANAGEMENT & TECHNOLOGY,ACCENTURE PACS,3003126885,"7,660.55" 6 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,16/07/2010,COMPUTER SOFTWARE / LICENSE FEES,INFORMATION MANAGEMENT & TECHNOLOGY,ACCENTURE PACS,3003129243,"42,022.79" 7 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,16/07/2010,COMPUTER SOFTWARE / LICENSE FEES,INFORMATION MANAGEMENT & TECHNOLOGY,ACCENTURE PACS,3003129243,"42,022.79" 8 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,16/07/2010,COMPUTER SOFTWARE / LICENSE FEES,INFORMATION MANAGEMENT & TECHNOLOGY,ACCENTURE PACS,3003129243,"7,353.99" 9 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,16/07/2010,COMPUTER SOFTWARE / LICENSE FEES,INFORMATION MANAGEMENT & TECHNOLOGY,ACCENTURE PACS,3003129243,"7,353.99" 10 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,ADDITIONS - PLANT AND MACHINERY,BALANCE SHEET,CARDIAC SERVICES 
LTD,G-INV139216,"5,584.80" 11 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,ADDITIONS - PLANT AND MACHINERY,BALANCE SHEET,CARDIAC SERVICES LTD,G-INV139216,"5,584.80" 12 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,ADDITIONS - PLANT AND MACHINERY,BALANCE SHEET,CARDIAC SERVICES LTD,G-INV139216,"5,584.80" 13 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,ADDITIONS - PLANT AND MACHINERY,BALANCE SHEET,CARDIAC SERVICES LTD,G-INV139216,"5,584.80" 14 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,ADDITIONS - PLANT AND MACHINERY,BALANCE SHEET,CARDIAC SERVICES LTD,G-INV139216,"5,584.80" 15 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,ADDITIONS - PLANT AND MACHINERY,BALANCE SHEET,CARDIAC SERVICES LTD,G-INV139216,977.34 16 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,ADDITIONS - PLANT AND MACHINERY,BALANCE SHEET,CARDIAC SERVICES LTD,G-INV139216,977.34 17 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,ADDITIONS - PLANT AND MACHINERY,BALANCE SHEET,CARDIAC SERVICES LTD,G-INV139216,977.34 18 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,ADDITIONS - PLANT AND MACHINERY,BALANCE SHEET,CARDIAC SERVICES LTD,G-INV139216,977.34 19 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,ADDITIONS - PLANT AND MACHINERY,BALANCE SHEET,CARDIAC SERVICES LTD,G-INV139216,977.34 20 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,07/07/2010,CONTRACT : PREMISES SECURITY,SECURITY / CAR PARKING,CP PLUS LTD,11394/06,"25,028.08" 21 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,CONTRACT : PREMISES SECURITY,SECURITY / CAR PARKING,CP PLUS LTD,11524/07,"25,028.08" 22 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,COMPUTER SOFTWARE / LICENSE FEES,STRATEGY & DEV,DR FOSTER LTD,1006761,"27,000.00" 23 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,COMPUTER SOFTWARE / LICENSE FEES,STRATEGY & DEV,DR FOSTER 
LTD,1006761,"4,725.00" 24 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,30/07/2010,DRUGS,PHARMACY,HEALTHCARE AT HOME LTD,OP/2097110,"34,320.00" 25 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,07/07/2010,ADDITIONS - NON RESIDENTIAL BUILDINGS,BALANCE SHEET,INTERSERVE PROJECT SERVICES LTD,VYO06614,"32,602.01" 26 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,07/07/2010,ADDITIONS - NON RESIDENTIAL BUILDINGS,BALANCE SHEET,INTERSERVE PROJECT SERVICES LTD,VYO06614,"5,705.35" 27 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,07/07/2010,SPECIALIST REGISTRAR,NURSING MANAGEMENT,LEEDS TEACHING HOSPITALS NHS TRUST,334054,"25,000.00" 28 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,07/07/2010,SPECIALIST REGISTRAR,NURSING MANAGEMENT,LEEDS TEACHING HOSPITALS NHS TRUST,334054,"25,000.00" 29 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,07/07/2010,BLOOD PRODUCTS,PATHOLOGY,NHS BLOOD AND TRANSPLANT,795172,"64,926.70" 30 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,DRUGS,PHARMACY,NHS BUSINESS SERVICES AUTHORITY,PHS1000023817,"34,283.21" 31 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,N.H.S. CREDITORS < ONE YEAR,BALANCE SHEET,NHS SUPPLY CHAIN,WNE1785878,"60,520.87" 32 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,07/07/2010,N.H.S. CREDITORS < ONE YEAR,BALANCE SHEET,NHS SUPPLY CHAIN,WNEI771700,"44,419.66" 33 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,07/07/2010,N.H.S. CREDITORS < ONE YEAR,BALANCE SHEET,NHS SUPPLY CHAIN,WNEI775748,"51,157.92" 34 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,12/07/2010,N.H.S. CREDITORS < ONE YEAR,BALANCE SHEET,NHS SUPPLY CHAIN,WNEI779508,"63,683.90" 35 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,16/07/2010,N.H.S. 
CREDITORS < ONE YEAR,BALANCE SHEET,NHS SUPPLY CHAIN,WNEI783035,"70,963.90" 36 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,07/07/2010,COMPUTER MAINTENANCE,FINANCE DEPARTMENT,NORTHUMBRIA HEALTHCARE NHS FOUNDATION TRUST,44584,"27,212.46" 37 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,ELECTRICITY,UTILITIES,NPOWER LTD,LGUC8SSS,"40,057.57" 38 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,ELECTRICITY,UTILITIES,NPOWER LTD,LGUC8SSS,"7,010.08" 39 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,07/07/2010,COMPUTER HARDWARE PURCHASES,SURGICAL MANAGEMENT,RED EMBEDDED DESIGN LTD,80010,"48,000.00" 40 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,07/07/2010,COMPUTER HARDWARE PURCHASES,SURGICAL MANAGEMENT,RED EMBEDDED DESIGN LTD,80010,"8,400.00" 41 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,ADDITIONS - NON RESIDENTIAL BUILDINGS,BALANCE SHEET,RN WOOLER & CO LTD,20603,"28,547.50" 42 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,ADDITIONS - NON RESIDENTIAL BUILDINGS,BALANCE SHEET,RN WOOLER & CO LTD,20603,"4,995.81" 43 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,ADDITIONS - NON RESIDENTIAL BUILDINGS,BALANCE SHEET,RN WOOLER & CO LTD,20603,"1,502.50" 44 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,X-RAY EQUIPMENT : PURCHASES,RADIOLOGY,SIEMENS PLC,1019839343,"94,274.86" 45 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,12/07/2010,UTILISATION - OTHER PROVISIONS/LIABILITIES,BALANCE SHEET,SODEXO HEALTHCARE SERVICES LTD,9050725474,"142,313.25" 46 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,07/07/2010,EXTERNAL CONTRACTS : CATERING,CATERING,SODEXO HEALTHCARE SERVICES LTD,9050731742,"153,030.11" 47 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,12/07/2010,GAS,UTILITIES,TOTAL GAS AND POWER LTD,59355378/10,"29,184.61" 48 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,12/07/2010,GAS,UTILITIES,TOTAL GAS AND POWER 
LTD,59355378/10,"5,107.30" 49 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,HEALTHCARE - OTHER NHS BODIES,SUB CONTRACTED HEALTHCARE,YORKSHIRE CLINIC,20101,"18,126.00" 50 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,HEALTHCARE - OTHER NHS BODIES,SUB CONTRACTED HEALTHCARE,YORKSHIRE CLINIC,20101,"17,252.00" 51 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,HEALTHCARE - OTHER NHS BODIES,SUB CONTRACTED HEALTHCARE,YORKSHIRE CLINIC,20101,"8,275.00" 52 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,HEALTHCARE - OTHER NHS BODIES,SUB CONTRACTED HEALTHCARE,YORKSHIRE CLINIC,20101,"1,503.00" 53 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,23/07/2010,HEALTHCARE - OTHER NHS BODIES,MEDICAL MANAGEMENT,YORKSHIRE CLINIC,20101,193.00 54 | DEPARTMENT OF HEALTH,AIREDALE NHS FOUNDATION TRUST,30/07/2010,HEALTHCARE - OTHER NHS BODIES,SUB CONTRACTED HEALTHCARE,YORKSHIRE EYE HOSPITAL,200011-,"-25,728.00" 55 | 56 | Report created October 10 - RX7 RWW,,,,,,, 57 | --------------------------------------------------------------------------------