├── MANIFEST.in
├── pytest.ini
├── tests
    ├── __init__.py
    ├── documents
    │   ├── cleaned
    │   │   ├── sicilia.csv
    │   │   ├── padding.csv
    │   │   └── duplicated_schema.csv
    │   └── noisy
    │   │   ├── padding.csv
    │   │   ├── sicilia.csv
    │   │   └── duplicated_schema.csv
    ├── test_version.py
    ├── trim_correlation_simple_cleaned.csv
    ├── trim_correlation_simple.csv
    ├── test_trimming.py
    ├── test_cases.py
    ├── test_trim_with_correlation.py
    ├── expected_result.csv
    ├── test_cli.py
    ├── test_readme.py
    └── test.csv
├── .github
    ├── FUNDING.yml
    └── workflows
    │   └── python.yml
├── csv_trimming
    ├── __version__.py
    ├── __init__.py
    ├── logger.py
    ├── cli.py
    └── trim.py
├── conftest.py
├── .gitignore
├── LICENSE
├── setup.py
└── README.md


/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE


--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | addopts = --doctest-modules


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | """Test suite for the CSVTrimmer class."""


--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 | 
3 | github: LucaCappelletti94
4 | 


--------------------------------------------------------------------------------
/csv_trimming/__version__.py:
--------------------------------------------------------------------------------
1 | """Current version of package csv_trimming"""
2 | 
3 | __version__ = "1.1.1"
4 | 


--------------------------------------------------------------------------------
/csv_trimming/__init__.py:
--------------------------------------------------------------------------------
1 | """Package for cleaning & trimming CSV files."""
2 | 
3 | from csv_trimming.trim import CSVTrimmer
4 | 
5 | __all__ = ["CSVTrimmer"]
6 | 


--------------------------------------------------------------------------------
/tests/documents/cleaned/sicilia.csv:
--------------------------------------------------------------------------------
1 | ,region,province,surname
2 | 0,Calabria,Catanzaro,Rossi
3 | 1,Sicilia,Ragusa,Pinna
4 | 2,Lombardia,Varese,Sbrana
5 | 3,Lazio,Roma,Mair
6 | 4,Sicilia,Messina,Ferrari


--------------------------------------------------------------------------------
/tests/documents/cleaned/padding.csv:
--------------------------------------------------------------------------------
1 | ,region,province,surname
2 | 0,Campania,Caserta,Ferrero
3 | 1,Liguria,Imperia,Conti
4 | 2,Puglia,Bari,Fabris
5 | 3,Sardegna,Medio Campidano,Conti
6 | 4,Lazio,Roma,Fabbri


--------------------------------------------------------------------------------
/tests/test_version.py:
--------------------------------------------------------------------------------
1 | from validate_version_code import validate_version_code
2 | from csv_trimming.__version__ import __version__
3 | 
def test_version():
    """Check that the package version string is a valid version code."""
    version_is_valid = validate_version_code(__version__)
    assert version_is_valid


--------------------------------------------------------------------------------
/conftest.py:
--------------------------------------------------------------------------------
1 | """Script to setup the test_readme.py file in the tests directory."""
2 | 
3 | import os
4 | from pytest_readme import setup
5 | 
# Generate the test_readme.py module; judging by the move below, it is
# written into the current working directory.
setup()
# Move the generated module into the tests directory. os.replace is used
# instead of os.rename because it overwrites an existing destination on
# every platform, whereas os.rename raises FileExistsError on Windows when
# a tests/test_readme.py left by a previous run is already there.
os.replace("test_readme.py", "tests/test_readme.py")
8 | 


--------------------------------------------------------------------------------
/tests/trim_correlation_simple_cleaned.csv:
--------------------------------------------------------------------------------
1 | ,region,province,surname
2 | 0,Campania,Caserta,Ferrero
3 | 1,Liguria,Imperia,Conti
4 | 2,Puglia,Bari,Fabris
5 | 3,Sardegna,Medio Campidano,Conti
6 | 4,Lazio,Roma,Fabbri


--------------------------------------------------------------------------------
/tests/documents/cleaned/duplicated_schema.csv:
--------------------------------------------------------------------------------
1 | ,region,province,surname
2 | 0,Puglia,Bari,Zanetti
3 | 1,Piemonte,Alessandria,Fabbri
4 | 2,Sicilia,Agrigento,Ferretti
5 | 3,Campania,Napoli,Belotti
6 | 4,Liguria,Savona,Casini


--------------------------------------------------------------------------------
/tests/trim_correlation_simple.csv:
--------------------------------------------------------------------------------
 1 | ,region,province
 2 | 0,Campania,Caserta
 3 | 1,,Ferrero
 4 | 2,Liguria,Imperia
 5 | 3,,Conti
 6 | 4,Puglia,Bari
 7 | 5,,Fabris
 8 | 6,Sardegna,Medio Campidano
 9 | 7,,Conti
10 | 8,Lazio,Roma
11 | 9,,Fabbri


--------------------------------------------------------------------------------
/tests/documents/noisy/padding.csv:
--------------------------------------------------------------------------------
 1 | ,0,1,2,3
 2 | 0,,,,
 3 | 1,,,,
 4 | 2,,region,province,surname
 5 | 3,,Campania,Caserta,Ferrero
 6 | 4,,Liguria,Imperia,Conti
 7 | 5,,Puglia,Bari,Fabris
 8 | 6,,Sardegna,Medio Campidano,Conti
 9 | 7,,Lazio,Roma,Fabbri
10 | 8,,,,
11 | 9,,,,
12 | 10,,,,
13 | 11,,,,
14 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | **/__pycache__/*
 2 | */__pycache__/*
 3 | coverage.xml
 4 | __pycache__
 5 | .coverage*
 6 | \.DS_Store
 7 | .single_run
 8 | \.vscode/
 9 | .notipy.json
10 | .ipynb_checkpoints
11 | htmlcov
12 | DONOTUPLOAD.csv
13 | test_do_not_upload.py
14 | *.egg-info
15 | build
16 | .pytest_cache
17 | dist
18 | tests/output.tmp.csv


--------------------------------------------------------------------------------
/tests/test_trimming.py:
--------------------------------------------------------------------------------
 1 | """Test the CSVTrimmer class."""
 2 | 
 3 | import random
 4 | from random_csv_generator import random_csv
 5 | from ugly_csv_generator import uglify
 6 | from tqdm.auto import trange
 7 | from csv_trimming import CSVTrimmer
 8 | 
 9 | 
def test_trim():
    """Run the trimmer over 100 randomly uglified CSVs as a smoke test.

    No assertion is made on the output: the test only verifies that the
    trim method completes without raising on every generated sample.
    """
    rows_sampler = random.Random(1234)
    for step in trange(100):
        # Seeds are derived from the iteration number so runs are repeatable
        seed_factor = step + 1
        clean_csv = random_csv(
            number_of_rows=rows_sampler.randint(1, 100),
            random_state=seed_factor * 543678,
            localization="en_US.UTF-8",
        )
        noisy_csv = uglify(
            clean_csv,
            duplicate_schema=False,
            seed=seed_factor * 5443678,
        )
        CSVTrimmer().trim(noisy_csv)
26 | 


--------------------------------------------------------------------------------
/.github/workflows/python.yml:
--------------------------------------------------------------------------------
 1 | name: Python Package CI
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - master
 7 |   pull_request:
 8 |     branches:
 9 |       - master
10 | 
11 | jobs:
12 |   build:
13 | 
14 |     runs-on: ubuntu-latest
15 | 
16 |     steps:
17 |     - name: Checkout repository
18 |       uses: actions/checkout@v2
19 | 
20 |     - name: Set up Python
21 |       uses: actions/setup-python@v2
22 |       with:
23 |         python-version: 3.9
24 | 
25 |     - name: Install dependencies
26 |       run: |
27 |         python -m pip install --upgrade pip
28 |         pip install ".[test]"
29 | 
30 |     - name: Run tests
31 |       run: |
32 |         pytest
33 | 
34 |     - name: Build package
35 |       run: |
36 |         python -m pip install wheel
37 |         python setup.py sdist bdist_wheel


--------------------------------------------------------------------------------
/csv_trimming/logger.py:
--------------------------------------------------------------------------------
 1 | """Submodule for setting up the logger for the csv_trimming package."""
 2 | 
 3 | import sys
 4 | import logging
 5 | 
 6 | # Create the logger
 7 | logger = logging.getLogger(__name__)
 8 | # Change the levels names to that they are 4 chars long
 9 | logging.addLevelName(logging.DEBUG, "DEBG")
10 | logging.addLevelName(logging.WARNING, "WARN")
11 | logging.addLevelName(logging.ERROR, "ERRO")
12 | logging.addLevelName(logging.CRITICAL, "CRIT")
13 | # Set the default log level
14 | logger.setLevel(logging.INFO)
15 | # Set the format of the loger
16 | formatter = logging.Formatter("[%(levelname)s] %(asctime)-15s : %(message)s")
17 | 
18 | # Setup a stdout logger
19 | shandler = logging.StreamHandler(sys.stdout)
20 | shandler.setLevel(logging.INFO)
21 | shandler.setFormatter(formatter)
22 | logger.addHandler(shandler)
23 | 


--------------------------------------------------------------------------------
/tests/documents/noisy/sicilia.csv:
--------------------------------------------------------------------------------
 1 | ,0,1,2,3,4
 2 | 0,#RIF!,#RIF!,.........,///,-----
 3 | 1,"('surname',)('-',)(0,)",region,"(""('surname',)('-',)(0,)"",)(' ',)(1,)",province,surname
 4 | 2,------,#RIF!,#RIF!,"
 5 | 
 6 | 
 7 | ","
 8 | 
 9 | 
10 | 
11 | 
12 | 
13 | 
14 | "
15 | 3,#RIF!,  Calabria          ,-------,        Catanzaro  ,"  
16 |    
17 |  Rossi   
18 |    "
19 | 4,0, Sicilia      ,_____,"      
20 | Ragusa     ","  Pinna
21 | 
22 | 
23 | 
24 |      "
25 | 5,"
26 | 
27 | "," 
28 |   
29 |  
30 |  Lombardia
31 | 
32 | 
33 |   ",------,"
34 |     Varese
35 | 
36 |   
37 | 
38 | ","  
39 |     Sbrana 
40 |  
41 |     "
42 | 6,0,"      Lazio
43 |    
44 | ",__," 
45 | Roma    "," 
46 |   Mair       "
47 | 7,_," Sicilia  
48 | 
49 |    ",#RIF!,         Messina  ,"   Ferrari
50 |    
51 | 
52 | 
53 |   
54 | "
55 | 8,-----,..,"
56 | 
57 | 
58 | 
59 | ",0,--------


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 Luca Cappelletti
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/tests/test_cases.py:
--------------------------------------------------------------------------------
 1 | """Test cases from the documents collection."""
 2 | 
 3 | import os
 4 | from glob import glob
 5 | import pandas as pd
 6 | from tqdm.auto import tqdm
 7 | from csv_trimming import CSVTrimmer
 8 | 
 9 | 
def test_document_collection():
    """Trim every noisy document and compare it with its cleaned twin.

    Each CSV in ``tests/documents/cleaned`` is paired with the file of the
    same name in ``tests/documents/noisy``. On the first mismatch the
    trimmed output is dumped to ``tests/trimmed.csv`` for inspection and
    the assertion error is propagated. When every document matches, a dump
    left behind by a previous failing run is removed.
    """
    dump_path = "tests/trimmed.csv"
    cleaned_paths = glob("tests/documents/cleaned/*.csv")

    progress = tqdm(
        cleaned_paths,
        desc="Testing documents",
        leave=False,
        dynamic_ncols=True,
    )
    for cleaned_path in progress:
        # The noisy counterpart shares the file name of the cleaned document
        file_name = cleaned_path.split(os.sep)[-1]
        noisy_path = "tests/documents/noisy/{}".format(file_name)

        noisy_csv = pd.read_csv(noisy_path, index_col=0)
        expected_cleaned_csv = pd.read_csv(cleaned_path, index_col=0)

        trimmed_csv = CSVTrimmer().trim(noisy_csv)

        try:
            assert trimmed_csv.equals(expected_cleaned_csv)
        except AssertionError as mismatch:
            # Keep the faulty output on disk so it can be inspected
            trimmed_csv.to_csv(dump_path, index=False)
            raise mismatch

    if os.path.exists(dump_path):
        os.remove(dump_path)


--------------------------------------------------------------------------------
/tests/documents/noisy/duplicated_schema.csv:
--------------------------------------------------------------------------------
 1 | ,0,1,2,3,4,5,6,7,8
 2 | 0,#RIF!,////,#RIF!,#RIF!,0,....,0,0,          
 3 | 1,          ,"('surname',)('.',)(0,)",region,province,surname,"('province',)('_',)(1,)",,0,___
 4 | 2,0,////////,     region ,"         province 
 5 | ","  surname 
 6 |   
 7 | ",0,0,,..........
 8 | 3,_____,///////,"   region 
 9 |   
10 |  
11 |  ","     
12 |    province         ","
13 |  
14 |  
15 | surname ",#RIF!,#RIF!,,#RIF!
16 | 4,"
17 | 
18 | 
19 | 
20 | 
21 | 
22 | 
23 | ",,  Puglia      ,"
24 | 
25 | Bari  
26 |    
27 |  ","
28 |    
29 |   Zanetti   
30 |  
31 | 
32 |   ",0,--------,------,0
33 | 5,0,"
34 | ",   Piemonte ,  Alessandria  ,"  Fabbri   
35 | 
36 | 
37 | 
38 | ","
39 | 
40 | 
41 | 
42 | ",       ,"
43 | ",  
44 | 6,0,-------,,#RIF!,#RIF!,0,"
45 | 
46 | ",----,"
47 | 
48 | "
49 | 7,/////////,/////////,"
50 | Sicilia        ","   
51 |  
52 | 
53 |  Agrigento  
54 |     ",     Ferretti    ,//////////,,----------,#RIF!
55 | 8,__,---------,"   Campania      
56 | ","         Napoli 
57 |   ","
58 |  Belotti  ",,,///,"
59 | 
60 | 
61 | 
62 | 
63 | 
64 | 
65 | 
66 | 
67 | 
68 | "
69 | 9,,--------,0,/////,---,0,/////,----------,      
70 | 10,-----,#RIF!,"  Liguria
71 |     ", Savona   ," 
72 |  
73 |   Casini
74 | 
75 | ",0,,#RIF!,#RIF!
76 | 11,...,0,,-----,"
77 | 
78 | 
79 | 
80 | 
81 | 
82 | 
83 | ",--------,0,0,
84 | 


--------------------------------------------------------------------------------
/csv_trimming/cli.py:
--------------------------------------------------------------------------------
 1 | """CLI command and entry point"""
 2 | 
 3 | import argparse
 4 | import pandas as pd
 5 | from csv_trimming import CSVTrimmer
 6 | 
 7 | 
 8 | def main():
 9 |     """CLI command and entry point"""
10 |     parser = argparse.ArgumentParser(
11 |         description="Clean up malformed CSV files using heuristics."
12 |     )
13 | 
14 |     parser.add_argument("input_csv", help="Path to the input CSV file.")
15 |     parser.add_argument("output_csv", help="Path to save the cleaned CSV file.")
16 |     parser.add_argument(
17 |         "--no-restore-header",
18 |         action="store_true",
19 |         help="Does not attempt to restore the header.",
20 |         default=False,
21 |     )
22 |     parser.add_argument(
23 |         "--keep-padding",
24 |         action="store_true",
25 |         help="Does not attempt to drop padding.",
26 |         default=False,
27 |     )
28 |     parser.add_argument(
29 |         "--keep-duplicated-schema",
30 |         action="store_true",
31 |         help="Does not attempt to drop duplicated schema.",
32 |         default=False,
33 |     )
34 | 
35 |     args = parser.parse_args()
36 | 
37 |     # Load the CSV file
38 |     csv = pd.read_csv(args.input_csv)
39 | 
40 |     # Create the CSVTrimmer object
41 |     trimmer = CSVTrimmer()
42 | 
43 |     # Clean up the CSV using the options provided
44 |     cleaned_csv = trimmer.trim(
45 |         csv,
46 |         restore_header=not args.no_restore_header,
47 |         drop_padding=not args.keep_padding,
48 |         drop_duplicated_schema=not args.keep_duplicated_schema,
49 |     )
50 | 
51 |     # Save the cleaned CSV
52 |     cleaned_csv.to_csv(args.output_csv, index=False)
53 | 
54 | 
55 | if __name__ == "__main__":
56 |     main()
57 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | """Setup for the csv_trimming package."""
 2 | import os
 3 | import re
 4 | from setuptools import find_packages, setup
 5 | 
 6 | here = os.path.abspath(os.path.dirname(__file__))
 7 | 
 8 | # Get the long description from the relevant file
 9 | with open(os.path.join(here, 'README.md'), encoding='utf8') as f:
10 |     long_description = f.read()
11 | 
12 | 
def read(*parts):
    """Return the UTF-8 decoded content of a file located under ``here``.

    ``parts`` are the path components joined onto the directory that
    contains this setup script.
    """
    target = os.path.join(here, *parts)
    with open(target, 'r', encoding='utf8') as handle:
        content = handle.read()
    return content
16 | 
17 | 
def find_version(*file_paths):
    """Extract the ``__version__`` string from the given file.

    The file is read through ``read`` and searched for a line of the form
    ``__version__ = "x.y.z"`` (single or double quotes). A RuntimeError is
    raised when no such line exists.
    """
    match = re.search(
        r"^__version__ = ['\"]([^'\"]*)['\"]",
        read(*file_paths),
        re.M,
    )
    if match is None:
        raise RuntimeError("Unable to find version string.")
    return match.group(1)
25 | 
26 | 
27 | __version__ = find_version("csv_trimming", "__version__.py")
28 | 
29 | test_deps =[
30 |     "pytest",
31 |     "pytest-cov",
32 |     "pytest-readme",
33 |     "tqdm",
34 |     "validate_version_code",
35 |     "random_csv_generator",
36 |     "ugly_csv_generator"
37 | ]
38 | 
39 | extras = {
40 |     'test': test_deps,
41 | }
42 | 
43 | setup(
44 |     name='csv_trimming',
45 |     version=__version__,
46 |     description="Package python to remove common ugliness from a csv-like file",
47 |     long_description=long_description,
48 |     long_description_content_type='text/markdown',
49 |     url="https://github.com/LucaCappelletti94/csv_trimming",
50 |     author="LucaCappelletti94",
51 |     author_email="cappelletti.luca94@gmail.com",
52 |     license='MIT',
53 |     include_package_data=True,
54 |     classifiers=[
55 |         'Development Status :: 5 - Production/Stable',
56 |         'License :: OSI Approved :: MIT License',
57 |         'Programming Language :: Python :: 3'
58 |     ],
59 |     packages=find_packages(exclude=['contrib', 'docs', 'tests*']),
60 |     tests_require=test_deps,
61 |     python_requires='>=3.9',
62 |     install_requires=[
63 |         "pandas>=2.1.0",
64 |         "scipy",
65 |         "numpy",
66 |         "ugly_csv_generator>=1.1.4"
67 |     ],
68 |     extras_require=extras,
69 |     entry_points={
70 |         'console_scripts': [
71 |             'csv-trim = csv_trimming.cli:main',  # CLI command and entry point
72 |         ],
73 |     },
74 | )


--------------------------------------------------------------------------------
/tests/test_trim_with_correlation.py:
--------------------------------------------------------------------------------
 1 | """Test the trim method with correlation."""
 2 | 
 3 | from typing import Tuple
 4 | import pandas as pd
 5 | from csv_trimming import CSVTrimmer
 6 | 
 7 | 
 8 | def correlation_callback(
 9 |     current_row: pd.Series, next_row: pd.Series
10 | ) -> Tuple[bool, pd.Series]:
11 |     """Return the correlation between two rows.
12 | 
13 |     Parameters
14 |     --------------------------
15 |     current_row: pd.Series,
16 |         The current row.
17 |     next_row: pd.Series,
18 |         The next row.
19 |     """
20 |     for value in current_row:
21 |         if value == "Piemonte":
22 |             return True, pd.concat(
23 |                 [
24 |                     current_row,
25 |                     pd.Series(
26 |                         {f"correlated_{key}": value for key, value in next_row.items()}
27 |                     ),
28 |                 ]
29 |             )
30 |     return False, current_row
31 | 
32 | 
def test_trim_with_correlation():
    """Check the trimmed output of tests/test.csv against the stored CSV text."""
    noisy_csv = pd.read_csv("tests/test.csv", index_col=0)
    trimmed = CSVTrimmer(correlation_callback).trim(noisy_csv)
    with open("tests/expected_result.csv", "r", encoding="utf8") as expected_file:
        expected_text = expected_file.read()
    assert trimmed.to_csv() == expected_text
40 | 
41 | 
def simple_correlation_callback(
    current_row: pd.Series, next_row: pd.Series
) -> Tuple[bool, pd.Series]:
    """Attach the last cell of the next row to the current row as its surname.

    In the fixture this callback is written for, the rows followed by a
    correlated row are fully populated, while the correlated row itself
    always has its first cell empty. When both conditions hold, the result
    is True plus the current row extended with a "surname" entry holding
    the last cell of the next row; otherwise False plus the current row.
    """
    next_row_starts_empty = pd.isna(next_row.iloc[0])
    if not next_row_starts_empty or not all(pd.notna(current_row)):
        return False, current_row

    surname = pd.Series({"surname": next_row.iloc[-1]})
    return True, pd.concat([current_row, surname])
59 | 
60 | 
def test_trim_correlation_simple():
    """Check that surnames stored on alternate rows are merged back in place."""
    noisy_csv = pd.read_csv(
        "tests/trim_correlation_simple.csv", index_col=0, header=None
    )
    expected_csv = pd.read_csv(
        "tests/trim_correlation_simple_cleaned.csv", index_col=0
    )
    trimmed_csv = CSVTrimmer(simple_correlation_callback).trim(noisy_csv)
    assert trimmed_csv.equals(expected_csv)
68 | 


--------------------------------------------------------------------------------
/tests/expected_result.csv:
--------------------------------------------------------------------------------
1 | ,region,province,surname,name,sex,birth_municipality,birth_province,birth_region,birth_cap,birth_province_code,birthdate,address,house_number,cap,municipality,province_code,codice_fiscale,total_debit,payed_debit,correlated_region,correlated_province,correlated_surname,correlated_name,correlated_sex,correlated_birth_municipality,correlated_birth_province,correlated_birth_region,correlated_birth_cap,correlated_birth_province_code,correlated_birthdate,correlated_address,correlated_house_number,correlated_cap,correlated_municipality,correlated_province_code,correlated_codice_fiscale,correlated_total_debit,correlated_payed_debit
2 | 0,Campania,Napoli,Villa,Giangiacomo Maria,M,Busto Garolfo,Milano,Lombardia,20020,MI,1997-03-24,Via Epomeo,489,80126,Napoli,,VLLGGC97C24B301W,"Eu 83.294,00","Eu 68.537,00",,,,,,,,,,,,,,,,,,,
3 | 1,Lombardia,Bergamo,Ferrari,Farhat,F,Rivoli Veronese,Verona,Veneto,37010,VR,1925-03-26,Piazza Repubblica,1,24050,Zanica,BG,FRRFHT25C66H356T,"Eu 4.771,00","Eu 4.188,00",,,,,,,,,,,,,,,,,,,
4 | 2,Campania,Napoli,Venturelli,Francesco,M,Mirandola,Modena,Emilia Romagna,41037,MO,1959-10-29,Via Monteoliveto,1,80135,Napoli,,VNTFNC59R29F240C,"Eu 84.020,00","Eu 80.640,00",,,,,,,,,,,,,,,,,,,
5 | 3,Piemonte,Biella,Nocentini,Saadia,F,Castelfranco Di Sopra,Arezzo,Toscana,52020,AR,1933-12-08,Via Xxv Aprile,15,13851,Castelletto Cervo,BI,NCNSDA33T48C112S,"Eu 30.843,00","Eu 21.587,00",Emilia Romagna,Ravenna,Bruno,Francesca,F,Terranova Da Sibari,Cosenza,Calabria,87010,CS,1983-11-21,Via Matteotti,55,48010,Cotignola,RA,BRNFNC83S61L124W,"Eu 46.499,00","Eu 36.566,00"
6 | 4,Piemonte,Torino,Ricci,Mattia,M,Sante Marie,L'Aquila,Abruzzo,67067,AQ,1926-08-04,Corso Re Umberto,38,10128,Torino,TO,RCCMTT26M04I326A,"Eu 80.583,00","Eu 4.186,00",Lombardia,Milano,Caruso,Sara,F,San Giovanni La Punta,Catania,Sicilia,95037,CT,1970-03-25,Via Giambellino,64,20146,Milano,MI,CRSSRA70C65H922G,"Eu 85.595,00","Eu 78.088,00"
7 | 5,Emilia Romagna,Bologna,Piras,Sofia,F,San Basilio,Cagliari,Sardegna,09040,CA,1991-10-19,Via Appia,24/B,40026,Imola,BO,PRSSFO91R59H766W,"Eu 59.769,00","Eu 13.577,00",,,,,,,,,,,,,,,,,,,
8 | 6,Abruzzo,Chieti,Musso,Bouchaib,M,Loazzolo,Asti,Piemonte,14051,AT,1974-09-01,Piazza Vittorio Emanuele,6,66043,Casoli,CH,MSSBHB74P01E633Q,"Eu 39.475,00","Eu 13.796,00",Lombardia,Brescia,Gamper,Andrea,M,Chiusa,Bolzano,Trentino Alto Adige,39043,BZ,1964-01-03,Via Fossadelli,snc,25031,Capriolo,BS,GMPNDR64A03C652R,"Eu 72.610,00","Eu 68.475,00"
9 | 


--------------------------------------------------------------------------------
/tests/test_cli.py:
--------------------------------------------------------------------------------
 1 | """Test suite to verify that the CLI of the package works as expected.
 2 | 
 3 | The CLI commands are of the form:
 4 | 
 5 | ```bash
 6 | csv-trim input.csv output.csv
 7 | ```
 8 | 
 9 | where `input.csv` is the path to the input CSV file and `output.csv` is the path to save the cleaned CSV file.
10 | 
11 | The CLI also supports the following options:
12 | 
13 | - `--no-restore-header`: Does not attempt to restore the header.
14 | - `--keep-padding`: Does not attempt to drop padding.
15 | - `--keep-duplicated-schema`: Does not attempt to drop duplicated schema.
16 | 
17 | The tests in this module verify that the CLI commands work as expected.
18 | """
19 | import os
20 | import subprocess
21 | import pandas as pd
22 | from csv_trimming import CSVTrimmer
23 | 
def test_cli():
    """Test that the CLI produces the same output as the Python API.

    For every input file and every combination of the three trimming
    options, the CSV is cleaned both through ``CSVTrimmer.trim`` and through
    the ``csv-trim`` console script, and the two results are compared after
    being reloaded from disk. The temporary files are removed after each
    combination, including when the comparison or the subprocess fails.
    """
    # Local import: the module does not import itertools at file level
    from itertools import product

    paths = [
        "tests/test.csv",
        "tests/documents/noisy/padding.csv",
        "tests/documents/noisy/duplicated_schema.csv",
        "tests/documents/noisy/sicilia.csv",
    ]
    api_output_path = "tests/output.tmp.csv"
    cli_output_path = "tests/output.tmp.cli.csv"

    # product iterates in the same order as the equivalent nested loops
    for path, restore_header, drop_padding, drop_duplicated_schema in product(
        paths, (True, False), (True, False), (True, False)
    ):
        # The CLI flags are opt-outs, so one is passed for each disabled step
        cli_flags = []
        if not restore_header:
            cli_flags.append("--no-restore-header")
        if not drop_padding:
            cli_flags.append("--keep-padding")
        if not drop_duplicated_schema:
            cli_flags.append("--keep-duplicated-schema")

        try:
            trimmer = CSVTrimmer()
            csv = pd.read_csv(path)
            cleaned_csv = trimmer.trim(
                csv,
                restore_header=restore_header,
                drop_padding=drop_padding,
                drop_duplicated_schema=drop_duplicated_schema,
            )

            # Round-trip through a temporary file so that both results
            # have gone through the same CSV serialization
            cleaned_csv.to_csv(api_output_path, index=False)
            cleaned_csv = pd.read_csv(api_output_path)

            # check=True makes a non-zero exit status raise
            status = subprocess.run(
                ["csv-trim", path, cli_output_path, *cli_flags],
                check=True,
            )
            assert status.returncode == 0

            cli_cleaned_csv = pd.read_csv(cli_output_path)

            assert cleaned_csv.equals(cli_cleaned_csv)
        finally:
            # Always clean up, so a failure does not leave files behind
            for temporary_path in (cli_output_path, api_output_path):
                if os.path.exists(temporary_path):
                    os.remove(temporary_path)
77 | 
78 | 


--------------------------------------------------------------------------------
/csv_trimming/trim.py:
--------------------------------------------------------------------------------
  1 | """Module handling the cleaning up of malformed CSVs using heuristics."""
  2 | 
  3 | from typing import Tuple, Any, Optional, Callable
  4 | import pandas as pd
  5 | import numpy as np
  6 | from ugly_csv_generator.utils.add_nan_like_artefacts import (
  7 |     NAN_LIKE_ARTIFACTS,
  8 |     UNICODE_NAN_LIKE_ARTIFACTS,
  9 | )
 10 | from ugly_csv_generator.utils.add_random_spaces import (
 11 |     SPACES,
 12 |     UNICODE_SPACES
 13 | )
 14 | from csv_trimming.logger import logger
 15 | 
 16 | NAN_LIKE = NAN_LIKE_ARTIFACTS + UNICODE_NAN_LIKE_ARTIFACTS
 17 | SPACE_LIKE = sorted(SPACES + UNICODE_SPACES, key=lambda x: -len(x))
 18 | 
 19 | 
 20 | def is_nan(candidate: Any) -> bool:
 21 |     """Return True if the given candidate is NaN-like.
 22 | 
 23 |     Parameters
 24 |     ---------------------------
 25 |     candidate: object,
 26 |         candidate to be checked.
 27 | 
 28 |     Returns
 29 |     ---------------------------
 30 |     True if the given candidate is NaN-like.
 31 |     """
 32 |     return (
 33 |         pd.isna(candidate)
 34 |         or candidate in NAN_LIKE
 35 |         or isinstance(candidate, str)
 36 |         and len(candidate) > 1
 37 |         and all(is_nan(e) for e in candidate)
 38 |     )
 39 | 
 40 | 
 41 | class CSVTrimmer:
 42 |     """Class handling the cleaning up of malformed CSVs using heuristics."""
 43 | 
    def __init__(
        self,
        correlation_callback: Optional[
            Callable[[pd.Series, pd.Series], Tuple[bool, pd.Series]]
        ] = None,
    ):
        """Create new CSVTrimmer object.

        Parameters
        ---------------------------
        correlation_callback: Optional[Callable] = None,
            Callback used to check whether two rows need to be specially
            handled for correlations. It receives two rows as pandas Series
            and returns a tuple with a boolean and a Series. Judging by the
            test suite, these are the current and next row, and the result
            says whether they are correlated, along with the merged row.
        """
        # Stored as is; None means that no callback was provided
        self._correlation_callback = correlation_callback
 58 | 
 59 |     def _mask_edges(self, mask: np.ndarray) -> np.ndarray:
 60 |         """ "Return boolean array with only boolean True attached to sides.
 61 | 
 62 |         Parameters
 63 |         -------------------------------
 64 |         mask: np.ndarray,
 65 |             Boolean vector from which to extract borders.
 66 | 
 67 |         Returns
 68 |         -------------------------------
 69 |         Boolean array with only boolean True attached to array sides.
 70 |         """
 71 |         left, right = 0, 0
 72 |         for left, val in enumerate(mask):
 73 |             if not val:
 74 |                 break
 75 |         for right, val in enumerate(np.flip(mask, axis=0)):
 76 |             if not val:
 77 |                 break
 78 |         if right == 0:
 79 |             mask[left:] = False
 80 |         else:
 81 |             mask[left:-right] = False
 82 |         return mask
 83 | 
 84 |     def trim_padding(self, csv: pd.DataFrame) -> pd.DataFrame:
 85 |         """Return given CSV with trimmed rows and columns.
 86 | 
 87 |         Parameters
 88 |         -------------------------------
 89 |         csv: pd.DataFrame,
 90 |             DataFrame whose borders are to be cleaned up.
 91 | 
 92 |         Returns
 93 |         -------------------------------
 94 |         DataFrame wthout empty or near-empty border columns.
 95 |         """
 96 |         nan_mask = csv.map(is_nan)
 97 |         rows_threshold = np.logical_not(nan_mask).sum(axis=1).mean() / 2
 98 |         rows_mask = self._mask_edges((~nan_mask).sum(axis=1).values < rows_threshold)
 99 |         columns_mask = self._mask_edges(nan_mask.all(axis=0).values)
100 |         csv = csv[~rows_mask][csv.columns[~columns_mask]]
101 |         return csv
102 | 
103 |     def restore_header(self, csv: pd.DataFrame) -> pd.DataFrame:
104 |         """Return CSV with restored first row as header of CSV.
105 | 
106 |         Eventual double columns have added the term '.duplicated'.
107 |         Eventual columns without name are called 'column #n'
108 | 
109 |         Parameters
110 |         -------------------------------
111 |         csv: pd.DataFrame,
112 |             DataFrame where to restore the header.
113 | 
114 |         Returns
115 |         -------------------------------
116 |         DataFrame with restored header.
117 |         """
118 |         new_header = csv.iloc[0]  # grab the first row for the header
119 | 
120 |         new_sanitized_header = []
121 |         nan_values_count = 0
122 |         for value in new_header:
123 |             if is_nan(value):
124 |                 new_sanitized_header.append(f"column {nan_values_count}")
125 |                 nan_values_count += 1
126 |                 continue
127 | 
128 |             while value in new_sanitized_header:
129 |                 value = f"{value}.duplicated"
130 | 
131 |             new_sanitized_header.append(value)
132 | 
133 |         csv = csv.iloc[1:]  # take the data less the header row
134 |         csv.columns = new_sanitized_header  # set the header row as the csv header
135 |         return csv
136 | 
137 |     def drop_empty_columns(self, csv: pd.DataFrame) -> pd.DataFrame:
138 |         """Return DataFrame with removed empty columns.
139 | 
140 |         Parameters
141 |         ---------------------------
142 |         csv: pd.DataFrame,
143 |             DataFrame where to drop the empty columns.
144 | 
145 |         Returns
146 |         ---------------------------
147 |         DataFrame without empty columns.
148 |         """
149 |         nan_mask = csv.map(is_nan).all(axis=0)
150 |         return csv[csv.columns[~nan_mask]]
151 | 
152 |     def drop_duplicated_schema(self, csv: pd.DataFrame) -> pd.DataFrame:
153 |         """Return DataFrame with removed duplicated schema.
154 | 
155 |         Implementative details
156 |         ---------------------------
157 |         In some cases, such as when multiple CSVs are chained in a poor manner,
158 |         the same schema can be repeated multiple times. This method removes
159 |         the duplicated schema if it is detected.
160 |         """
161 |         # We detect the indices of all the rows that are equal to
162 |         # the header, and then we drop them.
163 |         header = csv.columns
164 | 
165 |         indices_to_drop = []
166 | 
167 |         for idx, row in csv.iterrows():
168 |             if all(row == header):
169 |                 indices_to_drop.append(idx)
170 | 
171 |         return csv.drop(index=indices_to_drop)
172 | 
173 |     def drop_empty_rows(self, csv: pd.DataFrame) -> pd.DataFrame:
174 |         """Return DataFrame with removed empty columns.
175 | 
176 |         Parameters
177 |         ---------------------------
178 |         csv: pd.DataFrame,
179 |             DataFrame where to drop the empty columns.
180 | 
181 |         Returns
182 |         ---------------------------
183 |         DataFrame without empty columns.
184 |         """
185 |         nan_mask = csv.map(is_nan).all(axis=1)
186 |         return csv[~nan_mask]
187 | 
188 |     def _deep_strip(self, string: str):
189 |         """Return string without continuos spaces.
190 | 
191 |         Parameters
192 |         ----------------------------
193 |         string: str,
194 |             Sanitized string.
195 | 
196 |         Returns
197 |         ----------------------------
198 |         String without duplicated spaces.
199 |         """
200 |         old_string = None
201 |         while old_string != string:
202 |             old_string = string
203 |             for char in SPACE_LIKE:
204 |                 if char in string:
205 |                     string = " ".join(e for e in string.split(char) if e)
206 |         return string.strip()
207 | 
208 |     def trim_spaces(self, csv: pd.DataFrame) -> pd.DataFrame:
209 |         """Return dataframe without multiple spaces.
210 | 
211 |         Parameters
212 |         ---------------------------
213 |         csv: pd.DataFrame,
214 |             DataFrame to be sanitized.
215 | 
216 |         Returns
217 |         ---------------------------
218 |         DataFrame without multiple spaces in strings.
219 |         """
220 |         return csv.map(lambda x: self._deep_strip(x) if isinstance(x, str) else x)
221 | 
222 |     def restore_true_nan(self, csv: pd.DataFrame) -> pd.DataFrame:
223 |         """Return CSV with restored True NaN values.
224 | 
225 |         Parameters
226 |         ----------------------------
227 |         csv: pd.DataFrame,
228 |             DataFrame where to restore the NaN values.
229 | 
230 |         Returns
231 |         ----------------------------
232 |         DataFrame with restored NaN values.
233 |         """
234 |         nan_mask = csv.map(is_nan)
235 |         return csv.where(np.logical_not(nan_mask))
236 | 
237 |     def normalize_correlated_rows(self, csv: pd.DataFrame) -> pd.DataFrame:
238 |         """Return normalized correlated rows.
239 | 
240 |         Parameters
241 |         --------------------------
242 |         csv: pd.DataFrame,
243 |             DataFrame to be normalized.
244 | 
245 |         Returns
246 |         --------------------------
247 |         The dataframe normalized correlated rows.
248 |         """
249 |         if self._correlation_callback is None:
250 |             return csv
251 | 
252 |         new_rows = []
253 |         skip_row = False
254 |         stored_next_row = None
255 | 
256 |         for (_, current_row), (_, next_row) in zip(
257 |             csv.iterrows(), csv.iloc[1:].iterrows()
258 |         ):
259 |             if skip_row:
260 |                 skip_row = False
261 |                 continue
262 |             skip_row, result = self._correlation_callback(current_row, next_row)
263 |             new_rows.append(result)
264 |             stored_next_row = next_row
265 | 
266 |         if not skip_row and stored_next_row is not None:
267 |             new_rows.append(stored_next_row)
268 | 
269 |         return pd.DataFrame(new_rows)
270 | 
271 |     def trim(
272 |         self,
273 |         csv: pd.DataFrame,
274 |         restore_header: bool = True,
275 |         drop_padding: bool = True,
276 |         drop_duplicated_schema: bool = True,
277 |     ) -> pd.DataFrame:
278 |         """Return sanitized version of given dataframe.
279 | 
280 |         Parameters
281 |         ----------------------------
282 |         csv: pd.DataFrame,
283 |             The dataframe to clean up.
284 |         restore_header: bool = True,
285 |             Whether to restore the header.
286 |         drop_padding: bool = True,
287 |             Whether to drop padding.
288 |         drop_duplicated_schema: bool = True,
289 |             Whether to drop duplicated schemas.
290 | 
291 |         Returns
292 |         ----------------------------
293 |         The cleaned up dataframe.
294 |         """
295 |         logger.info("Removing extra spaces within cells.")
296 |         csv = self.trim_spaces(csv)
297 |         if drop_padding:
298 |             logger.info("Removing empty space (or NaNs).")
299 |             csv = self.trim_padding(csv)
300 |         logger.info("Removing empty space rows.")
301 |         csv = self.drop_empty_rows(csv)
302 |         if restore_header:
303 |             logger.info("Restoring detected header.")
304 |             csv = self.restore_header(csv)
305 |         logger.info("Restoring true NaN values.")
306 |         csv = self.restore_true_nan(csv)
307 |         logger.info("Normalizing correlated rows (if lambda is provided).")
308 |         csv = self.normalize_correlated_rows(csv)
309 |         logger.info("Dropping empty columns.")
310 |         csv = self.drop_empty_columns(csv)
311 |         if drop_duplicated_schema:
312 |             logger.info("Dropping rows containing duplicated schema.")
313 |             csv = self.drop_duplicated_schema(csv)
314 | 
315 |         csv = csv.reset_index(drop=True)
316 |         csv.index.name = None
317 |         csv.columns.name = None
318 |         return csv
319 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # ✂️ CSV Trimming
  2 | 
  3 | [![PyPI](https://badge.fury.io/py/csv-trimming.svg)](https://badge.fury.io/py/csv-trimming)
  4 | [![python](https://img.shields.io/pypi/pyversions/csv-trimming)](https://pypi.org/project/csv-trimming/)
  5 | [![license](https://img.shields.io/pypi/l/csv-trimming)](https://pypi.org/project/csv-trimming/)
  6 | [![Downloads](https://pepy.tech/badge/csv-trimming)](https://pepy.tech/projects/csv-trimming)
  7 | [![Github Actions](https://github.com/LucaCappelletti94/csv_trimming/actions/workflows/python.yml/badge.svg)](https://github.com/LucaCappelletti94/csv_trimming/actions/)
  8 | [![Codacy Badge](https://app.codacy.com/project/badge/Grade/0968ff39b133475da3a9c528b8ae2c9d)](https://app.codacy.com/gh/LucaCappelletti94/csv_trimming/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade)
  9 | 
 10 | [CSV Trimming](https://github.com/LucaCappelletti94/csv_trimming) is a Python package designed to take messy CSVs — the kind you get from scraping websites, legacy systems, or poorly managed data — and transform them into clean, well-formatted CSVs with just one line of code. No need for complex setups or large language models. It’s simple, straightforward, and generally gets the job done.
 11 | 
 12 | ## How do I install this package?
 13 | 
 14 | As usual, just download it using pip:
 15 | 
 16 | ```shell
 17 | pip install csv_trimming
 18 | ```
 19 | 
 20 | ## How do I use this package?
 21 | The package is very simple to use: just load your CSV and pass it to the trimmer.
 22 | 
 23 | ```python
 24 | import pandas as pd
 25 | from csv_trimming import CSVTrimmer
 26 | 
 27 | # Load your csv
 28 | csv = pd.read_csv("tests/documents/noisy/sicilia.csv")
 29 | # Instantiate the trimmer
 30 | trimmer = CSVTrimmer()
 31 | # And trim it
 32 | trimmed_csv = trimmer.trim(csv)
 33 | # That's it!
 34 | ```
 35 | 
 36 | For instance, your input CSV to clean up may look like this at the beginning:
 37 | 
 38 | |   | 0   | 1                       | 2       | 3                                                | 4         |
 39 | |---|-----|-------------------------|---------|--------------------------------------------------|-----------|
 40 | | 0 | #RIF! | #RIF!                  | ......... | ///                                            | -----     |
 41 | | 1 | ('surname',)('-',)(0,) | region                  | (""('surname',)('-',)(0,"),)(' ',)(1,)       | province  | surname   |
 42 | | 2 | ------ | #RIF!                  | #RIF!    |                                                |           |
 43 | | 3 | #RIF! | Calabria               | -------  | Catanzaro                                      | Rossi     |
 44 | | 4 | 0     | Sicilia                | _____    | Ragusa                                         | Pinna     |
 45 | | 5 | ""    | Lombardia              | ------   | Varese                                         | Sbrana    |
 46 | | 6 | 0     | Lazio                  | __       | Roma                                           | Mair      |
 47 | | 7 | _     | Sicilia                | #RIF!    | Messina                                        | Ferrari   |
 48 | | 8 | ----- | ..                     | ""       | 0                                              | --------- |
 49 | 
 50 | And after the trimming, it will look like this:
 51 | 
 52 | |   | region    | province  | surname |
 53 | |---|-----------|-----------|---------|
 54 | | 0 | Calabria  | Catanzaro | Rossi   |
 55 | | 1 | Sicilia   | Ragusa    | Pinna   |
 56 | | 2 | Lombardia | Varese    | Sbrana  |
 57 | | 3 | Lazio     | Roma      | Mair    |
 58 | | 4 | Sicilia   | Messina   | Ferrari |
 59 | 
 60 | Magic!
 61 | 
 62 | ## Advanced trimming with row correlation
 63 | Sometimes, the CSVs you are working with may have a row correlation, meaning that part of a given row is inserted in the next row. Such cases are common when the data-entry clerk wants to make the whole table fit on their screen and, in order to do so, splits each row in two. While this is clearly an extremely bad practice, it happens in the real world and the CSV Trimmer can handle it with a little help.
 64 | 
 65 | You just need to provide a function that defines which rows are correlated, and the CSV Trimmer will take care of the rest. While in this example we are using a rather simple function and a relatively clean CSV, the package can handle more complex cases.
 66 | 
 67 | ```python
 68 | from typing import Tuple
 69 | import pandas as pd
 70 | from csv_trimming import CSVTrimmer
 71 | 
 72 | def simple_correlation_callback(
 73 |     current_row: pd.Series,
 74 |     next_row: pd.Series
 75 | ) -> Tuple[bool, pd.Series]:
 76 |     """Return the correlation between two rows.
 77 |     
 78 |     Parameters
 79 |     ----------
 80 |     current_row : pd.Series
 81 |         The current row being analyzed in the DataFrame.
 82 |     next_row : pd.Series
 83 |         The next row in the DataFrame.
 84 | 
 85 |     Returns
 86 |     -------
 87 |     Tuple[bool, pd.Series]
 88 |         A tuple with a boolean indicating if the rows are correlated
 89 |         and a Series with the merged row.
 90 |     """
 91 | 
 92 |     # All of the rows that have a subsequent correlated row are
 93 |     # non-empty, and the subsequent correlated rows are always
 94 |     # with the first cell empty.
 95 |     if pd.isna(next_row.iloc[0]) and all(pd.notna(current_row)):
 96 |         return True, pd.concat(
 97 |             [
 98 |                 current_row,
 99 |                 pd.Series({"surname": next_row.iloc[-1]}),
100 |             ]
101 |         )
102 | 
103 |     return False, current_row
104 | 
105 | csv = pd.read_csv("tests/test.csv")
106 | trimmer = CSVTrimmer(simple_correlation_callback)
107 | result = trimmer.trim(csv)
108 | ```
109 | 
110 | In this case, our CSV looked like this at the beginning:
111 | 
112 | |    | region   | province        |
113 | |----|----------|-----------------|
114 | | 0  | Campania | Caserta          |
115 | | 1  |          | Ferrero          |
116 | | 2  | Liguria  | Imperia          |
117 | | 3  |          | Conti            |
118 | | 4  | Puglia   | Bari             |
119 | | 5  |          | Fabris           |
120 | | 6  | Sardegna | Medio Campidano  |
121 | | 7  |          | Conti            |
122 | | 8  | Lazio    | Roma             |
123 | | 9  |          | Fabbri           |
124 | 
125 | 
126 | And after the trimming, it will look like this:
127 | 
128 | |    | region   | province        | surname |
129 | |----|----------|-----------------|---------|
130 | | 0  | Campania | Caserta          | Ferrero |
131 | | 1  | Liguria  | Imperia          | Conti   |
132 | | 2  | Puglia   | Bari             | Fabris  |
133 | | 3  | Sardegna | Medio Campidano  | Conti   |
134 | | 4  | Lazio    | Roma             | Fabbri  |
135 | 
136 | ## More examples
137 | Here follow some examples of the package in action.
138 | 
139 | ### Case with duplicated schemas
140 | Sometimes, when chaining multiple CSVs in a poor manner, you may end up with duplicated schemas.
141 | The CSV Trimmer detects rows that match the detected header, and it can (optionally) remove them.
142 | 
143 | ```python
144 | import pandas as pd
145 | from csv_trimming import CSVTrimmer
146 | 
147 | # Load your csv
148 | csv = pd.read_csv("tests/documents/noisy/duplicated_schema.csv")
149 | # Instantiate the trimmer
150 | trimmer = CSVTrimmer()
151 | # And trim it
152 | trimmed_csv = trimmer.trim(csv, drop_duplicated_schema=True)
153 | # That's it!
154 | ```
155 | 
156 | For instance, your input CSV to clean up may look like this at the beginning:
157 | 
158 | |    | 0          | 1                            | 2      | 3                                         | 4                             | 5                             | 6          | 7        |
159 | |----|------------|------------------------------|--------|-------------------------------------------|------------------------------|------------------------------|------------|----------|
160 | | 0  | #RIF!      | ////                         | #RIF!  | #RIF!                                     | 0                             | ....                         | 0          | 0        |
161 | | 1  |            | ('surname',)('.',)(0,)       | region | province                                  | surname                      | ('province',)('_',)(1,)      |            | 0        |
162 | | 2  | 0          | ////////                     | region | province                                  | surname                      | 0                             | 0          |          |
163 | | 3  | _____      | ///////                      | region | province                                  | surname                      | #RIF!                        | #RIF!      |          |
164 | | 4  |            |                              | Puglia                                    | Bari                         | Zanetti                      | 0          | -------- |
165 | | 5  | 0          |                              | Piemonte| Alessandria                               | Fabbri                       |                              |            |          |
166 | | 6  | 0          | -------                      |        | #RIF!                                     | #RIF!                        | 0                            |            | ----     |
167 | | 7  | /////////  | /////////                    | Sicilia| Agrigento                                  | Ferretti                     | //////////                   |            | ----------|
168 | | 8  | __         | --------                     | Campania| Napoli                                    | Belotti                      |                              | ///        |          |
169 | | 9  |            | --------                     | 0      | /////                                      | ---                          | 0                            | /////      | ----------|
170 | | 10 | -----      | #RIF!                        | Liguria| Savona                                    | Casini                       | 0                            |            | #RIF!    |
171 | | 11 | ...        | 0                            |        | -----                                     |                              | --------                     | 0          | 0        |
172 | 
173 | And after the trimming, it will look like this:
174 | 
175 | |   | region   | province    | surname |
176 | |---|----------|-------------|---------|
177 | | 0 | Puglia   | Bari        | Zanetti |
178 | | 1 | Piemonte | Alessandria | Fabbri  |
179 | | 2 | Sicilia  | Agrigento   | Ferretti|
180 | | 3 | Campania | Napoli      | Belotti |
181 | | 4 | Liguria  | Savona      | Casini  |
182 | 
183 | ### Case with only padding
184 | Sometimes, the data entry clerk may start filling a table offset from the top-left corner, and export it together with
185 | the empty cells all around it. We call such cells "padding". The CSV Trimmer can detect and remove them.
186 | 
187 | ```python
188 | import pandas as pd
189 | from csv_trimming import CSVTrimmer
190 | 
191 | # Load your csv
192 | csv = pd.read_csv("tests/documents/noisy/padding.csv")
193 | 
194 | # Instantiate the trimmer
195 | trimmer = CSVTrimmer()
196 | 
197 | # And trim it
198 | trimmed_csv = trimmer.trim(csv, drop_padding=True)
199 | ```
200 | 
201 | For instance, your input CSV to clean up may look like this at the beginning:
202 | 
203 | |   |   | region   | province       | surname |
204 | |---|---|----------|----------------|---------|
205 | | 0 |   |          |                |         |
206 | | 1 |   |          |                |         |
207 | | 2 |   | region   | province       | surname |
208 | | 3 |   | Campania | Caserta        | Ferrero |
209 | | 4 |   | Liguria  | Imperia        | Conti   |
210 | | 5 |   | Puglia   | Bari           | Fabris  |
211 | | 6 |   | Sardegna | Medio Campidano| Conti   |
212 | | 7 |   | Lazio    | Roma           | Fabbri  |
213 | | 8 |   |          |                |         |
214 | | 9 |   |          |                |         |
215 | | 10|   |          |                |         |
216 | | 11|   |          |                |         |
217 | 
218 | And after the trimming, it will look like this:
219 | 
220 | |   | region   | province       | surname |
221 | |---|----------|----------------|---------|
222 | | 0 | Campania | Caserta        | Ferrero |
223 | | 1 | Liguria  | Imperia        | Conti   |
224 | | 2 | Puglia   | Bari           | Fabris  |
225 | | 3 | Sardegna | Medio Campidano| Conti   |
226 | | 4 | Lazio    | Roma           | Fabbri  |
227 | 
228 | 
229 | ## Command Line Interface
230 | The package also provides a command line interface to trim CSVs. It is installed by the package's `setup.py`, so once you have installed the package with pip you can immediately use it from the command line.
231 | 
232 | You can use it by running the following command:
233 | 
234 | ```shell
235 | csv-trim tests/documents/noisy/sicilia.csv tests/documents/noisy/sicilia_trimmed.csv
236 | ```
237 | 
238 | It supports the following options to keep it from attempting some trimmings:
239 | 
240 | - `--keep-padding`: Do not attempt to remove padding.
241 | - `--keep-duplicated-schema`: Do not attempt to remove duplicated schemas.
242 | - `--no-restore-header`: Do not attempt to restore the header.
243 | 
244 | For instance:
245 |     
246 | ```shell
247 | csv-trim tests/documents/noisy/sicilia.csv tests/documents/noisy/sicilia_trimmed.csv --keep-padding
248 | ```
249 | 
250 | ## How do I contribute to this package?
251 | If you have identified some new corner case that the package does not handle, or you have a suggestion for a new feature, feel free to open an issue. If you want to contribute with code, open an issue describing the feature you intend to add and submit a pull request.
252 | 
253 | ## License
254 | This package is released under the MIT license.


--------------------------------------------------------------------------------
/tests/test_readme.py:
--------------------------------------------------------------------------------
  1 | # # ✂️ CSV Trimming
  2 | # 
  3 | # [![PyPI](https://badge.fury.io/py/csv-trimming.svg)](https://badge.fury.io/py/csv-trimming)
  4 | # [![python](https://img.shields.io/pypi/pyversions/csv-trimming)](https://pypi.org/project/csv-trimming/)
  5 | # [![license](https://img.shields.io/pypi/l/csv-trimming)](https://pypi.org/project/csv-trimming/)
  6 | # [![Downloads](https://pepy.tech/badge/csv-trimming)](https://pepy.tech/projects/csv-trimming)
  7 | # [![Github Actions](https://github.com/LucaCappelletti94/csv_trimming/actions/workflows/python.yml/badge.svg)](https://github.com/LucaCappelletti94/csv_trimming/actions/)
  8 | # [![Codacy Badge](https://app.codacy.com/project/badge/Grade/0968ff39b133475da3a9c528b8ae2c9d)](https://app.codacy.com/gh/LucaCappelletti94/csv_trimming/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade)
  9 | # 
 10 | # [CSV Trimming](https://github.com/LucaCappelletti94/csv_trimming) is a Python package designed to take messy CSVs — the kind you get from scraping websites, legacy systems, or poorly managed data — and transform them into clean, well-formatted CSVs with just one line of code. No need for complex setups or large language models. It’s simple, straightforward, and generally gets the job done.
 11 | # 
 12 | # ## How do I install this package?
 13 | # 
 14 | # As usual, just download it using pip:
 15 | # 
 16 | # ```shell
 17 | # pip install csv_trimming
 18 | 
 19 | # 
 20 | # ## How do I use this package?
 21 | # The package is very simple to use, just load your CSV and pass it to the trimmer.
 22 | # 
def test_line_22():
    """Run the README's basic usage snippet as a smoke test.

    The noisy sample CSV is loaded and trimmed with a default CSVTrimmer.
    There are no assertions: the test passes as long as nothing raises.
    The CSV path is relative, so this presumably has to be run from the
    repository root (to be confirmed against the test configuration).
    """
    import pandas as pd
    from csv_trimming import CSVTrimmer
    
    # Load your csv
    csv = pd.read_csv("tests/documents/noisy/sicilia.csv")
    # Instantiate the trimmer
    trimmer = CSVTrimmer()
    # And trim it
    trimmed_csv = trimmer.trim(csv)
    # That's it!
 34 | 
 35 | # 
 36 | # For instance, your input CSV to clean up may look like this at the beginning:
 37 | # 
 38 | # |   | 0   | 1                       | 2       | 3                                                | 4         |
 39 | # |---|-----|-------------------------|---------|--------------------------------------------------|-----------|
 40 | # | 0 | #RIF! | #RIF!                  | ......... | ///                                            | -----     |
 41 | # | 1 | ('surname',)('-',)(0,) | region                  | (""('surname',)('-',)(0,"),)(' ',)(1,)       | province  | surname   |
 42 | # | 2 | ------ | #RIF!                  | #RIF!    |                                                |           |
 43 | # | 3 | #RIF! | Calabria               | -------  | Catanzaro                                      | Rossi     |
 44 | # | 4 | 0     | Sicilia                | _____    | Ragusa                                         | Pinna     |
 45 | # | 5 | ""    | Lombardia              | ------   | Varese                                         | Sbrana    |
 46 | # | 6 | 0     | Lazio                  | __       | Roma                                           | Mair      |
 47 | # | 7 | _     | Sicilia                | #RIF!    | Messina                                        | Ferrari   |
 48 | # | 8 | ----- | ..                     | ""       | 0                                              | --------- |
 49 | # 
 50 | # And after the trimming, it will look like this:
 51 | # 
 52 | # |   | region    | province  | surname |
 53 | # |---|-----------|-----------|---------|
 54 | # | 0 | Calabria  | Catanzaro | Rossi   |
 55 | # | 1 | Sicilia   | Ragusa    | Pinna   |
 56 | # | 2 | Lombardia | Varese    | Sbrana  |
 57 | # | 3 | Lazio     | Roma      | Mair    |
 58 | # | 4 | Sicilia   | Messina   | Ferrari |
 59 | # 
 60 | # Magic!
 61 | # 
 62 | # ## Advanced trimming with row correlation
 63 | # Sometimes, the CSVs you are working with may have a row correlation, meaning part of a given row is inserted in the next row. Such cases are common when the data-entry clerk wants to make the whole table fit in their screen, and in order to do so, they split the row in two. While this is clearly an extremely bad practice, it happens in the real world and the CSV Trimmer can handle it with a little help.
 64 | # 
 65 | # You just need to provide a function that defines which rows are correlated, and the CSV Trimmer will take care of the rest. While in this example we are using a rather simple function and a relatively clean CSV, the package can handle more complex cases.
 66 | # 
def test_line_66():
    """Run the README's row-correlation snippet as a smoke test.

    A correlation callback is defined and handed to CSVTrimmer, which is
    then used to trim tests/test.csv. There are no assertions: the test
    passes as long as nothing raises.
    """
    from typing import Tuple
    import pandas as pd
    from csv_trimming import CSVTrimmer
    
    def simple_correlation_callback(
        current_row: pd.Series,
        next_row: pd.Series
    ) -> Tuple[bool, pd.Series]:
        """Return the correlation between two rows.
        
        Parameters
        ----------
        current_row : pd.Series
            The current row being analyzed in the DataFrame.
        next_row : pd.Series
            The next row in the DataFrame.
    
        Returns
        -------
        Tuple[bool, pd.Series]
            A tuple with a boolean indicating if the rows are correlated
            and a Series with the merged row.
        """
    
        # All of the rows that have a subsequent correlated row are
        # non-empty, and the subsequent correlated rows are always
        # with the first cell empty.
        if pd.isna(next_row.iloc[0]) and all(pd.notna(current_row)):
            # The merged row is the current row extended with a "surname"
            # field taken from the last cell of the next row.
            return True, pd.concat(
                [
                    current_row,
                    pd.Series({"surname": next_row.iloc[-1]}),
                ]
            )
    
        # Not correlated: the current row is returned unchanged.
        return False, current_row
    
    csv = pd.read_csv("tests/test.csv")
    trimmer = CSVTrimmer(simple_correlation_callback)
    result = trimmer.trim(csv)
108 | 
109 | # 
110 | # In this case, our CSV looked like this at the beginning:
111 | # 
112 | # |    | region   | province        |
113 | # |----|----------|-----------------|
114 | # | 0  | Campania | Caserta          |
115 | # | 1  |          | Ferrero          |
116 | # | 2  | Liguria  | Imperia          |
117 | # | 3  |          | Conti            |
118 | # | 4  | Puglia   | Bari             |
119 | # | 5  |          | Fabris           |
120 | # | 6  | Sardegna | Medio Campidano  |
121 | # | 7  |          | Conti            |
122 | # | 8  | Lazio    | Roma             |
123 | # | 9  |          | Fabbri           |
124 | # 
125 | # 
126 | # And after the trimming, it will look like this:
127 | # 
128 | # |    | region   | province        | surname |
129 | # |----|----------|-----------------|---------|
130 | # | 0  | Campania | Caserta          | Ferrero |
131 | # | 1  | Liguria  | Imperia          | Conti   |
132 | # | 2  | Puglia   | Bari             | Fabris  |
133 | # | 3  | Sardegna | Medio Campidano  | Conti   |
134 | # | 4  | Lazio    | Roma             | Fabbri  |
135 | # 
136 | # ## More examples
137 | # Here follow some examples of the package in action.
138 | # 
139 | # ### Case with duplicated schemas
140 | # Sometimes, when chaining multiple CSVs in a poor manner, you may end up with duplicated schemas.
141 | # The CSV Trimmer detects rows that match the detected header, and it can (optionally) remove them.
142 | # 
def test_line_142():
    """Run the README's duplicated-schema snippet as a smoke test.

    The noisy sample with repeated header rows is loaded and trimmed with
    `drop_duplicated_schema=True`. There are no assertions: the test
    passes as long as nothing raises.
    """
    import pandas as pd
    from csv_trimming import CSVTrimmer
    
    # Load your csv
    csv = pd.read_csv("tests/documents/noisy/duplicated_schema.csv")
    # Instantiate the trimmer
    trimmer = CSVTrimmer()
    # And trim it
    trimmed_csv = trimmer.trim(csv, drop_duplicated_schema=True)
    # That's it!
154 | 
155 | # 
156 | # For instance, your input CSV to clean up may look like this at the beginning:
157 | # 
158 | # |    | 0          | 1                            | 2      | 3                                         | 4                             | 5                             | 6          | 7        |
159 | # |----|------------|------------------------------|--------|-------------------------------------------|------------------------------|------------------------------|------------|----------|
160 | # | 0  | #RIF!      | ////                         | #RIF!  | #RIF!                                     | 0                             | ....                         | 0          | 0        |
161 | # | 1  |            | ('surname',)('.',)(0,)       | region | province                                  | surname                      | ('province',)('_',)(1,)      |            | 0        |
162 | # | 2  | 0          | ////////                     | region | province                                  | surname                      | 0                             | 0          |          |
163 | # | 3  | _____      | ///////                      | region | province                                  | surname                      | #RIF!                        | #RIF!      |          |
164 | # | 4  |            |                              | Puglia                                    | Bari                         | Zanetti                      | 0          | -------- |
165 | # | 5  | 0          |                              | Piemonte| Alessandria                               | Fabbri                       |                              |            |          |
166 | # | 6  | 0          | -------                      |        | #RIF!                                     | #RIF!                        | 0                            |            | ----     |
167 | # | 7  | /////////  | /////////                    | Sicilia| Agrigento                                  | Ferretti                     | //////////                   |            | ----------|
168 | # | 8  | __         | --------                     | Campania| Napoli                                    | Belotti                      |                              | ///        |          |
169 | # | 9  |            | --------                     | 0      | /////                                      | ---                          | 0                            | /////      | ----------|
170 | # | 10 | -----      | #RIF!                        | Liguria| Savona                                    | Casini                       | 0                            |            | #RIF!    |
171 | # | 11 | ...        | 0                            |        | -----                                     |                              | --------                     | 0          | 0        |
172 | # 
173 | # And after the trimming, it will look like this:
174 | # 
175 | # |   | region   | province    | surname |
176 | # |---|----------|-------------|---------|
177 | # | 0 | Puglia   | Bari        | Zanetti |
178 | # | 1 | Piemonte | Alessandria | Fabbri  |
179 | # | 2 | Sicilia  | Agrigento   | Ferretti|
180 | # | 3 | Campania | Napoli      | Belotti |
181 | # | 4 | Liguria  | Savona      | Casini  |
182 | # 
183 | # ### Case with only padding
184 | # Sometimes, the data entry clerk may start filling a table offset from the top-left corner and export it together with
185 | # the empty cells all around it. We call such cells "padding". The CSV Trimmer can detect and remove them.
186 | # 
187 | def test_line_186():
188 |     import pandas as pd
189 |     from csv_trimming import CSVTrimmer
190 |     
191 |     # Load your csv
192 |     csv = pd.read_csv("tests/documents/noisy/padding.csv")
193 |     
194 |     # Instantiate the trimmer
195 |     trimmer = CSVTrimmer()
196 |     
197 |     # And trim it
198 |     trimmed_csv = trimmer.trim(csv, drop_padding=True)
199 | 
200 | # 
201 | # For instance, your input CSV to clean up may look like this at the beginning:
202 | # 
203 | # |   |   | region   | province       | surname |
204 | # |---|---|----------|----------------|---------|
205 | # | 0 |   |          |                |         |
206 | # | 1 |   |          |                |         |
207 | # | 2 |   | region   | province       | surname |
208 | # | 3 |   | Campania | Caserta        | Ferrero |
209 | # | 4 |   | Liguria  | Imperia        | Conti   |
210 | # | 5 |   | Puglia   | Bari           | Fabris  |
211 | # | 6 |   | Sardegna | Medio Campidano| Conti   |
212 | # | 7 |   | Lazio    | Roma           | Fabbri  |
213 | # | 8 |   |          |                |         |
214 | # | 9 |   |          |                |         |
215 | # | 10|   |          |                |         |
216 | # | 11|   |          |                |         |
217 | # 
218 | # And after the trimming, it will look like this:
219 | # 
220 | # |   | region   | province       | surname |
221 | # |---|----------|----------------|---------|
222 | # | 0 | Campania | Caserta        | Ferrero |
223 | # | 1 | Liguria  | Imperia        | Conti   |
224 | # | 2 | Puglia   | Bari           | Fabris  |
225 | # | 3 | Sardegna | Medio Campidano| Conti   |
226 | # | 4 | Lazio    | Roma           | Fabbri  |
227 | # 
228 | # 
229 | # ## Command Line Interface
230 | # The package also provides a command line interface to trim CSVs. It is installed by the package's `setup.py`, so once you have installed the package with pip you can use it immediately from the command line.
231 | # 
232 | # You can use it by running the following command:
233 | # 
234 | # ```shell
235 | # csv-trim tests/documents/noisy/sicilia.csv tests/documents/noisy/sicilia_trimmed.csv
236 | # ```
237 | # 
238 | # It supports the following options to keep it from attempting some trimmings:
239 | # 
240 | # - `--keep-padding`: Do not attempt to remove padding.
241 | # - `--keep-duplicated-schema`: Do not attempt to remove duplicated schemas.
242 | # - `--no-restore-header`: Do not attempt to restore the header.
243 | # 
244 | # For instance:
245 | #     
246 | # ```shell
247 | # csv-trim tests/documents/noisy/sicilia.csv tests/documents/noisy/sicilia_trimmed.csv --keep-padding
248 | # ```
249 | # 
250 | # ## How do I contribute to this package?
251 | # If you have identified a new corner case that the package does not handle, or you have a suggestion for a new feature, feel free to open an issue. If you want to contribute code, open an issue describing the feature you intend to add and submit a pull request.
252 | # 
253 | # ## License
254 | # This package is released under the MIT license.


--------------------------------------------------------------------------------
/tests/test.csv:
--------------------------------------------------------------------------------
   1 | ,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41
   2 | 0,"
   3 | 
   4 | 
   5 | 
   6 | 
   7 | 
   8 | ",#RIF!,,random,#RIF!,"
   9 | 
  10 | 
  11 | 
  12 | 
  13 | 
  14 | 
  15 | ",         ,0,#RIF!,#RIF!,        ,"
  16 | ",____,"
  17 | 
  18 | 
  19 | 
  20 | 
  21 | 
  22 | 
  23 | ",#RIF!,....,#RIF!,///,,    ,      ,#RIF!,#RIF!,0,#RIF!,--,"
  24 | 
  25 | 
  26 | 
  27 | 
  28 | 
  29 | ",,.........,/////,,//////,"
  30 | 
  31 | 
  32 | 
  33 | 
  34 | 
  35 | 
  36 | 
  37 | 
  38 | ",#RIF!,/,"
  39 | 
  40 | 
  41 | 
  42 | 
  43 | 
  44 | 
  45 | 
  46 | 
  47 | 
  48 | ","
  49 | 
  50 | ",,--,#RIF!,///,
  51 | 1,,,random,..,#RIF!,/////////,"
  52 | ",/////////,#RIF!,,#RIF!,0,0,0,#RIF!,,..,----,0,0,"
  53 | 
  54 | 
  55 | ",________,//////////,..,0,0,#RIF!,.......,0,...,_______,,#RIF!,.,,"
  56 | 
  57 | 
  58 | 
  59 | ",0,________,#RIF!,#RIF!,0,    
  60 | 2,,caso,#RIF!,#RIF!,"
  61 | ",0,,"
  62 | 
  63 | ",0,0,_____,_,"
  64 | 
  65 | 
  66 | 
  67 | 
  68 | 
  69 | ",,0,0,///,0,"
  70 | 
  71 | ","
  72 | 
  73 | ",_____,#RIF!,0,"
  74 | 
  75 | ",--,        ,0,"
  76 | 
  77 | 
  78 | 
  79 | 
  80 | 
  81 | 
  82 | 
  83 | ",#RIF!,0,#RIF!,0,,,....,..,,,,---------,....,"
  84 | 
  85 | 
  86 | 
  87 | 
  88 | 
  89 | 
  90 | 
  91 | "
  92 | 3,,0,0,#RIF!,0,__________,........,birth_municipality_1,region,province,surname,name,codice_fiscale4,sex,birth_province 2,birth_municipality,cap_3,birth_province,birth_region,birth_cap,birth_province_code,birthdate,sex-6,address,house_number,cap,cap.0,birth_province 2-7,municipality,province_code,codice_fiscale,total_debit,payed_debit,municipality_5,0,__,_,......,#RIF!,------,"
  93 | 
  94 | 
  95 | 
  96 | 
  97 | 
  98 | 
  99 | 
 100 | ",#RIF!
 101 | 4,"
 102 | 
 103 | 
 104 | 
 105 | 
 106 | 
 107 | 
 108 | 
 109 | 
 110 | ",#RIF!,0,0,_____,--------,,-,"     
 111 | Campania  ","
 112 | Napoli
 113 |    
 114 |  ","  
 115 |   
 116 |  
 117 |   Villa  "," 
 118 |     Giangiacomo  
 119 | Maria 
 120 |    
 121 |    ",,"    
 122 |   M 
 123 | 
 124 | 
 125 | 
 126 |  ",0,"
 127 |  Busto        Garolfo   
 128 |   ",0,       Milano       ," 
 129 | 
 130 |      Lombardia    
 131 |   ",  20020 ,"  MI   
 132 | 
 133 |      ","  
 134 |     
 135 | 1997-03-24 
 136 | 
 137 |     ",/////,"  
 138 |  
 139 |    Via
 140 | Epomeo 
 141 | ","
 142 | 
 143 |  
 144 | 489
 145 |  
 146 | ","   
 147 |     
 148 | 80126   
 149 |   
 150 |   ",,"
 151 | ",   Napoli ,"
 152 | 
 153 | 
 154 | 
 155 | ","
 156 | VLLGGC97C24B301W  
 157 |    ","
 158 |   
 159 | Eu        
 160 | 83.294,00     ","    
 161 |  
 162 | 
 163 | Eu   
 164 |   
 165 | 68.537,00
 166 |   
 167 |    ",,"
 168 | 
 169 | 
 170 | 
 171 | 
 172 | ",0,........,,____,#RIF!, ,#RIF!
 173 | 5,...,,0,"
 174 | 
 175 | 
 176 | 
 177 | 
 178 | 
 179 | 
 180 | 
 181 | 
 182 | 
 183 | ",....,        ,-,0,"
 184 | 
 185 |  Lombardia
 186 |      ","
 187 |  
 188 |   
 189 |    Bergamo "," 
 190 |       Ferrari  ","  
 191 |   Farhat     ",__,"   
 192 |  F ",  ,"    
 193 | Rivoli 
 194 |        Veronese 
 195 |   ",#RIF!,"  
 196 |  
 197 |     Verona 
 198 | 
 199 | 
 200 |    
 201 |   ","
 202 |  Veneto     
 203 |    ","  
 204 |     37010   ","  
 205 |  
 206 | 
 207 | 
 208 |  
 209 | VR 
 210 |   ","    1925-03-26 
 211 |  ",------,"  
 212 |     
 213 |  Piazza   
 214 |     Repubblica   ","  1  
 215 |   ","          24050 
 216 |    ",,0,"   Zanica  
 217 |     ","         
 218 | BG
 219 | 
 220 |        ","  
 221 |     
 222 | FRRFHT25C66H356T
 223 |      
 224 | 
 225 |  ","  
 226 |   Eu 4.771,00     ","
 227 |  Eu     4.188,00
 228 |  
 229 |  
 230 | ",,..........,----,//////,0,0,.,#RIF!,0
 231 | 6,#RIF!,"
 232 | 
 233 | 
 234 | ",,,"
 235 | 
 236 | 
 237 | 
 238 | 
 239 | ",0,////,---------," 
 240 |  
 241 |      Campania    ","       
 242 | 
 243 | Napoli   
 244 | ","   
 245 |  
 246 | 
 247 | Venturelli 
 248 |   "," Francesco
 249 | 
 250 |  
 251 |   
 252 |  ",///,"      
 253 |  M 
 254 |  ",0,  Mirandola   ,#RIF!,"
 255 |  
 256 |  Modena       ","    Emilia  
 257 | 
 258 | 
 259 |      Romagna    ","   
 260 | 41037
 261 | ",   MO  ,    1959-10-29         ,0,  Via   Monteoliveto   ,"     1
 262 |      ","  
 263 |     
 264 |  80135   
 265 |   ",0,-,"   
 266 |  
 267 | 
 268 |   
 269 | Napoli
 270 |    ",0,"       
 271 |   VNTFNC59R29F240C      
 272 |    ","
 273 |     
 274 |  Eu
 275 |     
 276 | 84.020,00 
 277 |    ","  
 278 |      Eu  80.640,00
 279 |     
 280 | ",#RIF!,         ,"
 281 | 
 282 | 
 283 | 
 284 | ",0,---,0,         ,__,
 285 | 7,--,"
 286 | 
 287 | 
 288 | 
 289 | 
 290 | 
 291 | ",---------,0,#RIF!,----,0,,"  
 292 |     Piemonte    
 293 | ","
 294 |  Biella ","     Nocentini     
 295 |   "," 
 296 | 
 297 | 
 298 |    
 299 | 
 300 | Saadia 
 301 |      
 302 | ",0,"
 303 | 
 304 |     
 305 | 
 306 | F 
 307 | ",#RIF!,"  
 308 | 
 309 | Castelfranco
 310 | 
 311 | Di
 312 | 
 313 | Sopra   
 314 |   
 315 |    ",#RIF!,"    
 316 | 
 317 |  
 318 |  Arezzo 
 319 | 
 320 |   ","  
 321 |   Toscana
 322 |     
 323 | ","
 324 |  52020
 325 |      "," 
 326 | 
 327 |  
 328 |    
 329 |  AR
 330 |  ","   
 331 | 
 332 |  
 333 |   
 334 | 1933-12-08  
 335 |  
 336 |    ",   ,"  Via
 337 | Xxv
 338 | Aprile    ","     
 339 | 15
 340 |  ","  
 341 |       13851 ","
 342 | 
 343 | 
 344 | 
 345 | 
 346 | 
 347 | 
 348 | 
 349 | ",,  Castelletto    Cervo   ,   BI    ,"  
 350 |   
 351 |   NCNSDA33T48C112S  
 352 | "," Eu          30.843,00          ","  Eu   21.587,00
 353 |         ",...,0,--------,.......,"
 354 | 
 355 | 
 356 | 
 357 | 
 358 | 
 359 | 
 360 | ",____,         ,_,........
 361 | 8,#RIF!,"
 362 | ","
 363 | 
 364 | 
 365 | 
 366 | 
 367 | 
 368 | 
 369 | 
 370 | 
 371 | 
 372 | ",0,"
 373 | 
 374 | ",#RIF!,---------,#RIF!,"  
 375 |     
 376 |  Emilia       Romagna     ","
 377 | 
 378 | 
 379 | Ravenna  ",    Bruno    ,"
 380 |    
 381 |  
 382 | Francesca
 383 | 
 384 |    ","
 385 | 
 386 | ", F ,//////////," 
 387 | Terranova   Da   Sibari    ",     ,"  
 388 |  
 389 |  
 390 |    Cosenza
 391 |    ","
 392 |  Calabria 
 393 |   ","
 394 | 
 395 | 87010    ","      
 396 |    CS     
 397 |  ","
 398 | 1983-11-21     ","
 399 | 
 400 | "," Via    Matteotti
 401 |    ",        55         ," 48010  
 402 | 
 403 |     
 404 |  ",,0,"   
 405 |      Cotignola      "," 
 406 | 
 407 | RA  "," BRNFNC83S61L124W  
 408 | ","     Eu 
 409 |  
 410 |     46.499,00    
 411 |   ","
 412 |      
 413 | 
 414 |   Eu      36.566,00
 415 |  ",#RIF!,______,#RIF!,0,0,------,"
 416 | ",,0
 417 | 9,,0,"
 418 | 
 419 | ",,#RIF!,//////////,#RIF!,0," Piemonte   
 420 | ", Torino    ,"  
 421 |   
 422 | Ricci  ","   Mattia        
 423 | ",#RIF!,"  
 424 | 
 425 |    M   
 426 |  
 427 | 
 428 |    ",,"
 429 | 
 430 |  
 431 | 
 432 |  Sante Marie   
 433 |    
 434 | 
 435 | 
 436 | ",,"    L'Aquila
 437 | "," 
 438 | 
 439 | Abruzzo  ","  
 440 | 
 441 |       67067
 442 |    
 443 |  ","  
 444 |     AQ     ","
 445 |      1926-08-04  
 446 |  
 447 |  
 448 | 
 449 |   ","
 450 | 
 451 | 
 452 | 
 453 | 
 454 | 
 455 | 
 456 | 
 457 | 
 458 | 
 459 | "," 
 460 |    
 461 |    Corso  
 462 |     Re  
 463 |     Umberto        ","
 464 |     
 465 |   38    
 466 | 
 467 | ","
 468 |   10128   
 469 | ",_________,#RIF!,"     Torino      
 470 |   ","   
 471 |  
 472 | TO
 473 | 
 474 |    ","
 475 |  
 476 | 
 477 | 
 478 | 
 479 |  
 480 | RCCMTT26M04I326A          ","        Eu      80.583,00 
 481 |    
 482 |    
 483 | ","  
 484 | 
 485 |  Eu        4.186,00   
 486 | 
 487 |    
 488 | ",0,#RIF!,,          ,#RIF!,,------,0,"
 489 | 
 490 | 
 491 | 
 492 | 
 493 | 
 494 | 
 495 | 
 496 | 
 497 | "
 498 | 10,0,#RIF!,,0,,/,#RIF!,, Lombardia ," 
 499 |      Milano
 500 |  
 501 |    ","      
 502 |  Caruso  
 503 | 
 504 |   "," Sara
 505 | 
 506 |  
 507 |  ",#RIF!,"     
 508 | 
 509 | 
 510 |  
 511 | F   
 512 |    ",...,"   
 513 |       San 
 514 |   
 515 | Giovanni 
 516 |   
 517 | La 
 518 |   
 519 | Punta 
 520 |  ",,"
 521 |    
 522 |  Catania
 523 |       
 524 |   ","
 525 |  Sicilia    
 526 |  ",    95037 ,"        
 527 |  CT    
 528 | 
 529 |     ","   
 530 | 
 531 |    1970-03-25  ",0," Via
 532 |  Giambellino   
 533 |       "," 
 534 | 
 535 |      
 536 |  64
 537 |      ", 20146 ,#RIF!,, Milano     ,"        MI 
 538 | 
 539 | 
 540 |   ","      CRSSRA70C65H922G
 541 |   
 542 |     ","
 543 |   Eu    
 544 |      85.595,00 
 545 |      
 546 | ","  
 547 |      Eu 
 548 |     78.088,00   ",0,------,0,,          ,--------,__,"
 549 | 
 550 | 
 551 | 
 552 | 
 553 | 
 554 | ","
 555 | 
 556 | 
 557 | 
 558 | "
 559 | 11,0,#RIF!,      ,----,0,_,"
 560 | 
 561 | 
 562 | 
 563 | 
 564 | 
 565 | 
 566 | ",#RIF!,"   
 567 |   
 568 |  Emilia         
 569 | Romagna
 570 |   ",   Bologna ,"       Piras 
 571 |       ","     
 572 |     Sofia   
 573 |  ",,"     
 574 | F       
 575 | 
 576 | ","
 577 | 
 578 | "," San  
 579 | 
 580 | Basilio  ",0,"  Cagliari   
 581 |  
 582 |  
 583 |  ","      
 584 | Sardegna
 585 |  ","  
 586 | 
 587 | 09040   ","  CA
 588 | 
 589 | 
 590 | 
 591 |     
 592 | ","   
 593 |   1991-10-19  
 594 |       ",0,"
 595 |       Via   Appia  "," 
 596 |      24/B    
 597 | ","  
 598 |  
 599 | 40026  
 600 | 
 601 |    ",#RIF!,#RIF!," 
 602 |     
 603 | 
 604 |   Imola 
 605 |     
 606 |    ","
 607 |         BO     ","
 608 |   
 609 |   PRSSFO91R59H766W          "," 
 610 |  Eu
 611 |   59.769,00    ","      
 612 | Eu      
 613 | 13.577,00 ","
 614 | 
 615 | 
 616 | 
 617 | 
 618 | ",,///////,--,"
 619 | 
 620 | 
 621 | 
 622 | 
 623 | ",-----,----,,
 624 | 12,0,         ,-,,,#RIF!,"
 625 | 
 626 | 
 627 | 
 628 | 
 629 | 
 630 | 
 631 | 
 632 | ",#RIF!,"
 633 |  
 634 |        Abruzzo
 635 |      ","        
 636 | Chieti      ","
 637 |     Musso ","  
 638 | 
 639 |      Bouchaib  ",0," 
 640 | 
 641 |       M   ",..........,"
 642 |   
 643 |    Loazzolo   ",,"  
 644 |      Asti 
 645 |    ","   
 646 |       Piemonte
 647 |    
 648 | ","        
 649 |  14051 ",          AT         ,"      1974-09-01  
 650 | 
 651 | ",0,"       
 652 |  Piazza   Vittorio   Emanuele 
 653 | 
 654 |  ","      6
 655 |     ","  66043
 656 |  
 657 | 
 658 |      ","
 659 | ",....,     Casoli    ,"  
 660 | CH 
 661 | "," 
 662 |   
 663 | 
 664 |  
 665 | MSSBHB74P01E633Q       
 666 | 
 667 |  ","     Eu
 668 | 
 669 |    
 670 |    39.475,00
 671 |       
 672 | "," 
 673 |     Eu
 674 | 
 675 | 13.796,00   
 676 |      ","
 677 | 
 678 | 
 679 | 
 680 | 
 681 | 
 682 | 
 683 | 
 684 | 
 685 | ",/////,////////,"
 686 | 
 687 | 
 688 | 
 689 | 
 690 | 
 691 | 
 692 | 
 693 | ",,#RIF!,,"
 694 | 
 695 | 
 696 | ",.
 697 | 13,#RIF!,#RIF!,__,,--------,0,#RIF!,,    Lombardia     ,"
 698 | Brescia
 699 |  ", Gamper   ,         Andrea   ,#RIF!,"     M   
 700 | ",,"   
 701 | 
 702 | 
 703 |    
 704 | Chiusa    ",#RIF!,"
 705 |       Bolzano     ","      Trentino   
 706 | 
 707 |      Alto   
 708 | 
 709 |      Adige      "," 
 710 | 
 711 | 
 712 |    39043   
 713 | 
 714 |  ","  
 715 |   
 716 |     BZ  
 717 |   
 718 |    ","    
 719 | 1964-01-03         ",0,"  
 720 | 
 721 |       Via
 722 |        Fossadelli    
 723 | "," 
 724 |    snc  
 725 | 
 726 |    
 727 |  ","  
 728 |    25031 
 729 |    
 730 |   ",,   ,"
 731 | Capriolo  ",   BS   ,"      
 732 |   GMPNDR64A03C652R       "," 
 733 |       
 734 |  Eu 72.610,00  
 735 | 
 736 |    "," 
 737 | 
 738 |  
 739 |    Eu    68.475,00 
 740 |      ",#RIF!,#RIF!,,"
 741 | 
 742 | 
 743 | 
 744 | 
 745 | ",#RIF!,"
 746 | 
 747 | 
 748 | 
 749 | 
 750 | 
 751 | ","
 752 | 
 753 | 
 754 | 
 755 | 
 756 | 
 757 | 
 758 | ",0,____
 759 | 14,,"
 760 | 
 761 | 
 762 | 
 763 | 
 764 | 
 765 | 
 766 | ","
 767 | 
 768 | 
 769 | 
 770 | 
 771 | 
 772 | ",0,0,  ,#RIF!,"
 773 | 
 774 | ","
 775 | 
 776 | 
 777 | 
 778 | 
 779 | 
 780 | 
 781 | 
 782 | 
 783 | ",...,"
 784 | 
 785 | 
 786 | 
 787 | ",,---,0,///,0,--------,0,#RIF!,"
 788 | 
 789 | 
 790 | 
 791 | 
 792 | 
 793 | ",,........,........,"
 794 | 
 795 | 
 796 | 
 797 | 
 798 | 
 799 | 
 800 | ",,...,_____,#RIF!,----------,........,#RIF!,,...,0,#RIF!,,0,,0,#RIF!,_____,"
 801 | 
 802 | 
 803 | 
 804 | 
 805 | 
 806 | "
 807 | 15,"
 808 | 
 809 | 
 810 | 
 811 | 
 812 | 
 813 | 
 814 | 
 815 | ","
 816 | 
 817 | 
 818 | 
 819 | ",----------,#RIF!,0,........,--------,____,#RIF!,0,#RIF!,//////,#RIF!,0,,,---,,,#RIF!,...,#RIF!,0,......,,"
 820 | 
 821 | 
 822 | 
 823 | 
 824 | ",____,         ,////////,/,__,,,-----,"
 825 | 
 826 | 
 827 | 
 828 | ",-------,----------, ,0,#RIF!,----------,0
 829 | 16,,"
 830 | 
 831 | 
 832 | ",0,///////,#RIF!,----,..,     ,,_____,,,,,0,0,________,----,0,#RIF!,,"
 833 | ","
 834 | 
 835 | 
 836 | 
 837 | 
 838 | 
 839 | ",----,#RIF!,"
 840 | 
 841 | 
 842 | 
 843 | 
 844 | 
 845 | 
 846 | ",#RIF!,#RIF!,0,____,0,,_________,,#RIF!,"
 847 | ",,--,//////////,"
 848 | 
 849 | ",...,#RIF!
 850 | 17,...,,0,0,,"
 851 | 
 852 | 
 853 | 
 854 | 
 855 | 
 856 | ",,,,,#RIF!,#RIF!,"
 857 | 
 858 | 
 859 | 
 860 | 
 861 | 
 862 | ",,      ,#RIF!,"
 863 | 
 864 | 
 865 | 
 866 | 
 867 | ",0,,_______,0,.........,#RIF!,..,"
 868 | 
 869 | 
 870 | 
 871 | 
 872 | 
 873 | 
 874 | 
 875 | 
 876 | 
 877 | ",#RIF!,"
 878 | 
 879 | ","
 880 | 
 881 | 
 882 | 
 883 | 
 884 | 
 885 | ",0,#RIF!,0,#RIF!,0,__,#RIF!,......,#RIF!,"
 886 | 
 887 | 
 888 | 
 889 | 
 890 | ",0,..,#RIF!,#RIF!
 891 | 18,0,.,///////,,"
 892 | 
 893 | 
 894 | 
 895 | ",,,#RIF!,"
 896 | 
 897 | ",#RIF!,"
 898 | 
 899 | 
 900 | 
 901 | 
 902 | 
 903 | 
 904 | 
 905 | ","
 906 | 
 907 | 
 908 | 
 909 | ",,0,,#RIF!,,0,      ,,......,"
 910 | 
 911 | 
 912 | ",,"
 913 | 
 914 | 
 915 | 
 916 | ","
 917 | 
 918 | 
 919 | 
 920 | 
 921 | 
 922 | ",    ,      ,----,#RIF!,--------,____,        ,0,-----,#RIF!,-----,---------,/,,0,"
 923 | 
 924 | 
 925 | 
 926 | 
 927 | 
 928 | ","
 929 | "
 930 | 19,///////,..........,...,0,......,    ,       ,--,,"
 931 | 
 932 | ",////////,,_________,...,"
 933 | 
 934 | 
 935 | 
 936 | 
 937 | 
 938 | 
 939 | 
 940 | ",,    ,/////,,,.,"
 941 | ","
 942 | 
 943 | ",0,0,0,0,_____,,#RIF!,--,0,......,"
 944 | 
 945 | 
 946 | ",0,....,///,0,,.........,       ,
 947 | 20,......,,"
 948 | 
 949 | 
 950 | 
 951 | 
 952 | 
 953 | 
 954 | 
 955 | 
 956 | ",,0,#RIF!,0,0,___,0,/////,_________,#RIF!,,"
 957 | 
 958 | 
 959 | 
 960 | 
 961 | ",,,#RIF!,----------,,"
 962 | ",-------,,#RIF!,#RIF!,,#RIF!,....,"
 963 | 
 964 | 
 965 | 
 966 | 
 967 | 
 968 | ",#RIF!,"
 969 | 
 970 | 
 971 | 
 972 | ",,"
 973 | 
 974 | 
 975 | ",0,--,#RIF!,#RIF!,0,,-----,0,
 976 | 21,....,"
 977 | 
 978 | 
 979 | ","
 980 | 
 981 | 
 982 | 
 983 | 
 984 | 
 985 | 
 986 | ",#RIF!,"
 987 | 
 988 | 
 989 | 
 990 | ",----------,#RIF!,0,,#RIF!,      ,"
 991 | 
 992 | 
 993 | 
 994 | 
 995 | ",,0, ,0,#RIF!,////////,---,#RIF!,, ,0,0,0,,,"
 996 | 
 997 | ",0,__________,   ,#RIF!,,........,_______,,#RIF!,"
 998 | 
 999 | 
1000 | 
1001 | 
1002 | 
1003 | 
1004 | 
1005 | 
1006 | ","
1007 | 
1008 | 
1009 | 
1010 | 
1011 | 
1012 | 
1013 | ",#RIF!,#RIF!,
1014 | 


--------------------------------------------------------------------------------