├── MANIFEST.in ├── pytest.ini ├── tests ├── __init__.py ├── documents │ ├── cleaned │ │ ├── sicilia.csv │ │ ├── padding.csv │ │ └── duplicated_schema.csv │ └── noisy │ │ ├── padding.csv │ │ ├── sicilia.csv │ │ └── duplicated_schema.csv ├── test_version.py ├── trim_correlation_simple_cleaned.csv ├── trim_correlation_simple.csv ├── test_trimming.py ├── test_cases.py ├── test_trim_with_correlation.py ├── expected_result.csv ├── test_cli.py ├── test_readme.py └── test.csv ├── .github ├── FUNDING.yml └── workflows │ └── python.yml ├── csv_trimming ├── __version__.py ├── __init__.py ├── logger.py ├── cli.py └── trim.py ├── conftest.py ├── .gitignore ├── LICENSE ├── setup.py └── README.md /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = --doctest-modules -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | """Test suite for the CSVTrimmer class.""" -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: LucaCappelletti94 4 | -------------------------------------------------------------------------------- /csv_trimming/__version__.py: -------------------------------------------------------------------------------- 1 | """Current version of package csv_trimming""" 2 | 3 | __version__ = "1.1.1" 4 | -------------------------------------------------------------------------------- /csv_trimming/__init__.py: -------------------------------------------------------------------------------- 1 | 
"""Package for cleaning & trimming CSV files.""" 2 | 3 | from csv_trimming.trim import CSVTrimmer 4 | 5 | __all__ = ["CSVTrimmer"] 6 | -------------------------------------------------------------------------------- /tests/documents/cleaned/sicilia.csv: -------------------------------------------------------------------------------- 1 | ,region,province,surname 2 | 0,Calabria,Catanzaro,Rossi 3 | 1,Sicilia,Ragusa,Pinna 4 | 2,Lombardia,Varese,Sbrana 5 | 3,Lazio,Roma,Mair 6 | 4,Sicilia,Messina,Ferrari -------------------------------------------------------------------------------- /tests/documents/cleaned/padding.csv: -------------------------------------------------------------------------------- 1 | ,region,province,surname 2 | 0,Campania,Caserta,Ferrero 3 | 1,Liguria,Imperia,Conti 4 | 2,Puglia,Bari,Fabris 5 | 3,Sardegna,Medio Campidano,Conti 6 | 4,Lazio,Roma,Fabbri -------------------------------------------------------------------------------- /tests/test_version.py: -------------------------------------------------------------------------------- 1 | from validate_version_code import validate_version_code 2 | from csv_trimming.__version__ import __version__ 3 | 4 | def test_version(): 5 | assert validate_version_code(__version__) -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | """Script to setup the test_readme.py file in the tests directory.""" 2 | 3 | import os 4 | from pytest_readme import setup 5 | 6 | setup() 7 | os.rename("test_readme.py", "tests/test_readme.py") 8 | -------------------------------------------------------------------------------- /tests/trim_correlation_simple_cleaned.csv: -------------------------------------------------------------------------------- 1 | ,region,province,surname 2 | 0,Campania,Caserta,Ferrero 3 | 1,Liguria,Imperia,Conti 4 | 2,Puglia,Bari,Fabris 5 | 3,Sardegna,Medio Campidano,Conti 6 | 
4,Lazio,Roma,Fabbri -------------------------------------------------------------------------------- /tests/documents/cleaned/duplicated_schema.csv: -------------------------------------------------------------------------------- 1 | ,region,province,surname 2 | 0,Puglia,Bari,Zanetti 3 | 1,Piemonte,Alessandria,Fabbri 4 | 2,Sicilia,Agrigento,Ferretti 5 | 3,Campania,Napoli,Belotti 6 | 4,Liguria,Savona,Casini -------------------------------------------------------------------------------- /tests/trim_correlation_simple.csv: -------------------------------------------------------------------------------- 1 | ,region,province 2 | 0,Campania,Caserta 3 | 1,,Ferrero 4 | 2,Liguria,Imperia 5 | 3,,Conti 6 | 4,Puglia,Bari 7 | 5,,Fabris 8 | 6,Sardegna,Medio Campidano 9 | 7,,Conti 10 | 8,Lazio,Roma 11 | 9,,Fabbri -------------------------------------------------------------------------------- /tests/documents/noisy/padding.csv: -------------------------------------------------------------------------------- 1 | ,0,1,2,3 2 | 0,,,, 3 | 1,,,, 4 | 2,,region,province,surname 5 | 3,,Campania,Caserta,Ferrero 6 | 4,,Liguria,Imperia,Conti 7 | 5,,Puglia,Bari,Fabris 8 | 6,,Sardegna,Medio Campidano,Conti 9 | 7,,Lazio,Roma,Fabbri 10 | 8,,,, 11 | 9,,,, 12 | 10,,,, 13 | 11,,,, 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/__pycache__/* 2 | */__pycache__/* 3 | coverage.xml 4 | __pycache__ 5 | .coverage* 6 | \.DS_Store 7 | .single_run 8 | \.vscode/ 9 | .notipy.json 10 | .ipynb_checkpoints 11 | htmlcov 12 | DONOTUPLOAD.csv 13 | test_do_not_upload.py 14 | *.egg-info 15 | build 16 | .pytest_cache 17 | dist 18 | tests/output.tmp.csv -------------------------------------------------------------------------------- /tests/test_trimming.py: -------------------------------------------------------------------------------- 1 | """Test the CSVTrimmer class.""" 2 | 3 | 
import random 4 | from random_csv_generator import random_csv 5 | from ugly_csv_generator import uglify 6 | from tqdm.auto import trange 7 | from csv_trimming import CSVTrimmer 8 | 9 | 10 | def test_trim(): 11 | """Test the trim method.""" 12 | state = random.Random(1234) 13 | for iteration in trange(100): 14 | csv = random_csv( 15 | number_of_rows=state.randint(1, 100), 16 | random_state=(iteration + 1) * 543678, 17 | localization="en_US.UTF-8", 18 | ) 19 | ugly = uglify( 20 | csv, 21 | duplicate_schema=False, 22 | seed=(iteration + 1) * 5443678, 23 | ) 24 | trimmer = CSVTrimmer() 25 | trimmer.trim(ugly) 26 | -------------------------------------------------------------------------------- /.github/workflows/python.yml: -------------------------------------------------------------------------------- 1 | name: Python Package CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | branches: 9 | - master 10 | 11 | jobs: 12 | build: 13 | 14 | runs-on: ubuntu-latest 15 | 16 | steps: 17 | - name: Checkout repository 18 | uses: actions/checkout@v2 19 | 20 | - name: Set up Python 21 | uses: actions/setup-python@v2 22 | with: 23 | python-version: 3.9 24 | 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install ".[test]" 29 | 30 | - name: Run tests 31 | run: | 32 | pytest 33 | 34 | - name: Build package 35 | run: | 36 | python -m pip install wheel 37 | python setup.py sdist bdist_wheel -------------------------------------------------------------------------------- /csv_trimming/logger.py: -------------------------------------------------------------------------------- 1 | """Submodule for setting up the logger for the csv_trimming package.""" 2 | 3 | import sys 4 | import logging 5 | 6 | # Create the logger 7 | logger = logging.getLogger(__name__) 8 | # Change the levels names to that they are 4 chars long 9 | logging.addLevelName(logging.DEBUG, "DEBG") 10 | logging.addLevelName(logging.WARNING, "WARN") 11 | 
logging.addLevelName(logging.ERROR, "ERRO") 12 | logging.addLevelName(logging.CRITICAL, "CRIT") 13 | # Set the default log level 14 | logger.setLevel(logging.INFO) 15 | # Set the format of the loger 16 | formatter = logging.Formatter("[%(levelname)s] %(asctime)-15s : %(message)s") 17 | 18 | # Setup a stdout logger 19 | shandler = logging.StreamHandler(sys.stdout) 20 | shandler.setLevel(logging.INFO) 21 | shandler.setFormatter(formatter) 22 | logger.addHandler(shandler) 23 | -------------------------------------------------------------------------------- /tests/documents/noisy/sicilia.csv: -------------------------------------------------------------------------------- 1 | ,0,1,2,3,4 2 | 0,#RIF!,#RIF!,.........,///,----- 3 | 1,"('surname',)('-',)(0,)",region,"(""('surname',)('-',)(0,)"",)(' ',)(1,)",province,surname 4 | 2,------,#RIF!,#RIF!," 5 | 6 | 7 | "," 8 | 9 | 10 | 11 | 12 | 13 | 14 | " 15 | 3,#RIF!, Calabria ,-------, Catanzaro ," 16 | 17 | Rossi 18 | " 19 | 4,0, Sicilia ,_____," 20 | Ragusa "," Pinna 21 | 22 | 23 | 24 | " 25 | 5," 26 | 27 | "," 28 | 29 | 30 | Lombardia 31 | 32 | 33 | ",------," 34 | Varese 35 | 36 | 37 | 38 | "," 39 | Sbrana 40 | 41 | " 42 | 6,0," Lazio 43 | 44 | ",__," 45 | Roma "," 46 | Mair " 47 | 7,_," Sicilia 48 | 49 | ",#RIF!, Messina ," Ferrari 50 | 51 | 52 | 53 | 54 | " 55 | 8,-----,..," 56 | 57 | 58 | 59 | ",0,-------- -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Luca Cappelletti 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the 
Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /tests/test_cases.py: -------------------------------------------------------------------------------- 1 | """Test cases from the documents collection.""" 2 | 3 | import os 4 | from glob import glob 5 | import pandas as pd 6 | from tqdm.auto import tqdm 7 | from csv_trimming import CSVTrimmer 8 | 9 | 10 | def test_document_collection(): 11 | """Test the document collection.""" 12 | documents = glob("tests/documents/cleaned/*.csv") 13 | noisy_document_pattern = "tests/documents/noisy/{}" 14 | 15 | for document in tqdm( 16 | documents, 17 | desc="Testing documents", 18 | leave=False, 19 | dynamic_ncols=True, 20 | ): 21 | desinence = document.split(os.sep)[-1] 22 | noisy_document = noisy_document_pattern.format(desinence) 23 | 24 | noisy_csv = pd.read_csv(noisy_document, index_col=0) 25 | expected_cleaned_csv = pd.read_csv(document, index_col=0) 26 | 27 | trimmer = CSVTrimmer() 28 | trimmed_csv = trimmer.trim(noisy_csv) 29 | 30 | try: 31 | assert trimmed_csv.equals(expected_cleaned_csv) 32 | except AssertionError as exp: 33 | trimmed_csv.to_csv("tests/trimmed.csv", index=False) 34 | raise exp 35 | 36 | if os.path.exists("tests/trimmed.csv"): 37 | os.remove("tests/trimmed.csv") 
def main(argv=None):
    """CLI command and entry point.

    Parameters
    ---------------------------
    argv: Optional[Sequence[str]] = None,
        Command line arguments to parse, excluding the program name.
        When None (the default, used by the ``csv-trim`` console script)
        argparse falls back to ``sys.argv[1:]``. Accepting an explicit
        list makes it possible to call the CLI programmatically.

    Raises
    ---------------------------
    SystemExit
        Raised by argparse on ``--help`` (exit code 0) or on invalid
        or missing arguments (exit code 2).
    """
    parser = argparse.ArgumentParser(
        description="Clean up malformed CSV files using heuristics."
    )

    parser.add_argument("input_csv", help="Path to the input CSV file.")
    parser.add_argument("output_csv", help="Path to save the cleaned CSV file.")
    # `store_true` flags already default to False, so every cleaning step
    # is enabled unless its opt-out flag is explicitly passed.
    parser.add_argument(
        "--no-restore-header",
        action="store_true",
        help="Does not attempt to restore the header.",
    )
    parser.add_argument(
        "--keep-padding",
        action="store_true",
        help="Does not attempt to drop padding.",
    )
    parser.add_argument(
        "--keep-duplicated-schema",
        action="store_true",
        help="Does not attempt to drop duplicated schema.",
    )

    args = parser.parse_args(argv)

    # Load the CSV file
    csv = pd.read_csv(args.input_csv)

    # Clean up the CSV using the options provided: the flags are opt-outs,
    # hence the negation when forwarding them to the trimmer.
    cleaned_csv = CSVTrimmer().trim(
        csv,
        restore_header=not args.no_restore_header,
        drop_padding=not args.keep_padding,
        drop_duplicated_schema=not args.keep_duplicated_schema,
    )

    # Save the cleaned CSV
    cleaned_csv.to_csv(args.output_csv, index=False)


if __name__ == "__main__":
    main()
['\"]([^'\"]*)['\"]", 21 | version_file, re.M) 22 | if version_match: 23 | return version_match.group(1) 24 | raise RuntimeError("Unable to find version string.") 25 | 26 | 27 | __version__ = find_version("csv_trimming", "__version__.py") 28 | 29 | test_deps =[ 30 | "pytest", 31 | "pytest-cov", 32 | "pytest-readme", 33 | "tqdm", 34 | "validate_version_code", 35 | "random_csv_generator", 36 | "ugly_csv_generator" 37 | ] 38 | 39 | extras = { 40 | 'test': test_deps, 41 | } 42 | 43 | setup( 44 | name='csv_trimming', 45 | version=__version__, 46 | description="Package python to remove common ugliness from a csv-like file", 47 | long_description=long_description, 48 | long_description_content_type='text/markdown', 49 | url="https://github.com/LucaCappelletti94/csv_trimming", 50 | author="LucaCappelletti94", 51 | author_email="cappelletti.luca94@gmail.com", 52 | license='MIT', 53 | include_package_data=True, 54 | classifiers=[ 55 | 'Development Status :: 5 - Production/Stable', 56 | 'License :: OSI Approved :: MIT License', 57 | 'Programming Language :: Python :: 3' 58 | ], 59 | packages=find_packages(exclude=['contrib', 'docs', 'tests*']), 60 | tests_require=test_deps, 61 | python_requires='>=3.9', 62 | install_requires=[ 63 | "pandas>=2.1.0", 64 | "scipy", 65 | "numpy", 66 | "ugly_csv_generator>=1.1.4" 67 | ], 68 | extras_require=extras, 69 | entry_points={ 70 | 'console_scripts': [ 71 | 'csv-trim = csv_trimming.cli:main', # CLI command and entry point 72 | ], 73 | }, 74 | ) -------------------------------------------------------------------------------- /tests/test_trim_with_correlation.py: -------------------------------------------------------------------------------- 1 | """Test the trim method with correlation.""" 2 | 3 | from typing import Tuple 4 | import pandas as pd 5 | from csv_trimming import CSVTrimmer 6 | 7 | 8 | def correlation_callback( 9 | current_row: pd.Series, next_row: pd.Series 10 | ) -> Tuple[bool, pd.Series]: 11 | """Return the correlation between 
two rows. 12 | 13 | Parameters 14 | -------------------------- 15 | current_row: pd.Series, 16 | The current row. 17 | next_row: pd.Series, 18 | The next row. 19 | """ 20 | for value in current_row: 21 | if value == "Piemonte": 22 | return True, pd.concat( 23 | [ 24 | current_row, 25 | pd.Series( 26 | {f"correlated_{key}": value for key, value in next_row.items()} 27 | ), 28 | ] 29 | ) 30 | return False, current_row 31 | 32 | 33 | def test_trim_with_correlation(): 34 | """Test the trim method with correlation.""" 35 | csv = pd.read_csv("tests/test.csv", index_col=0) 36 | trimmer = CSVTrimmer(correlation_callback) 37 | result = trimmer.trim(csv) 38 | with open("tests/expected_result.csv", "r", encoding="utf8") as f: 39 | assert result.to_csv() == f.read() 40 | 41 | 42 | def simple_correlation_callback( 43 | current_row: pd.Series, next_row: pd.Series 44 | ) -> Tuple[bool, pd.Series]: 45 | """Return the correlation between two rows.""" 46 | 47 | # All of the rows that have a subsequent correlated row are 48 | # non-empty, and the subsequent correlated rows are always 49 | # with the first cell empty. 
50 | if pd.isna(next_row.iloc[0]) and all(pd.notna(current_row)): 51 | return True, pd.concat( 52 | [ 53 | current_row, 54 | pd.Series({"surname": next_row.iloc[-1]}), 55 | ] 56 | ) 57 | 58 | return False, current_row 59 | 60 | 61 | def test_trim_correlation_simple(): 62 | """Test the trim method with correlation.""" 63 | csv = pd.read_csv("tests/trim_correlation_simple.csv", index_col=0, header=None) 64 | expected = pd.read_csv("tests/trim_correlation_simple_cleaned.csv", index_col=0) 65 | trimmer = CSVTrimmer(simple_correlation_callback) 66 | result = trimmer.trim(csv) 67 | assert result.equals(expected) 68 | -------------------------------------------------------------------------------- /tests/expected_result.csv: -------------------------------------------------------------------------------- 1 | ,region,province,surname,name,sex,birth_municipality,birth_province,birth_region,birth_cap,birth_province_code,birthdate,address,house_number,cap,municipality,province_code,codice_fiscale,total_debit,payed_debit,correlated_region,correlated_province,correlated_surname,correlated_name,correlated_sex,correlated_birth_municipality,correlated_birth_province,correlated_birth_region,correlated_birth_cap,correlated_birth_province_code,correlated_birthdate,correlated_address,correlated_house_number,correlated_cap,correlated_municipality,correlated_province_code,correlated_codice_fiscale,correlated_total_debit,correlated_payed_debit 2 | 0,Campania,Napoli,Villa,Giangiacomo Maria,M,Busto Garolfo,Milano,Lombardia,20020,MI,1997-03-24,Via Epomeo,489,80126,Napoli,,VLLGGC97C24B301W,"Eu 83.294,00","Eu 68.537,00",,,,,,,,,,,,,,,,,,, 3 | 1,Lombardia,Bergamo,Ferrari,Farhat,F,Rivoli Veronese,Verona,Veneto,37010,VR,1925-03-26,Piazza Repubblica,1,24050,Zanica,BG,FRRFHT25C66H356T,"Eu 4.771,00","Eu 4.188,00",,,,,,,,,,,,,,,,,,, 4 | 2,Campania,Napoli,Venturelli,Francesco,M,Mirandola,Modena,Emilia Romagna,41037,MO,1959-10-29,Via Monteoliveto,1,80135,Napoli,,VNTFNC59R29F240C,"Eu 84.020,00","Eu 
80.640,00",,,,,,,,,,,,,,,,,,, 5 | 3,Piemonte,Biella,Nocentini,Saadia,F,Castelfranco Di Sopra,Arezzo,Toscana,52020,AR,1933-12-08,Via Xxv Aprile,15,13851,Castelletto Cervo,BI,NCNSDA33T48C112S,"Eu 30.843,00","Eu 21.587,00",Emilia Romagna,Ravenna,Bruno,Francesca,F,Terranova Da Sibari,Cosenza,Calabria,87010,CS,1983-11-21,Via Matteotti,55,48010,Cotignola,RA,BRNFNC83S61L124W,"Eu 46.499,00","Eu 36.566,00" 6 | 4,Piemonte,Torino,Ricci,Mattia,M,Sante Marie,L'Aquila,Abruzzo,67067,AQ,1926-08-04,Corso Re Umberto,38,10128,Torino,TO,RCCMTT26M04I326A,"Eu 80.583,00","Eu 4.186,00",Lombardia,Milano,Caruso,Sara,F,San Giovanni La Punta,Catania,Sicilia,95037,CT,1970-03-25,Via Giambellino,64,20146,Milano,MI,CRSSRA70C65H922G,"Eu 85.595,00","Eu 78.088,00" 7 | 5,Emilia Romagna,Bologna,Piras,Sofia,F,San Basilio,Cagliari,Sardegna,09040,CA,1991-10-19,Via Appia,24/B,40026,Imola,BO,PRSSFO91R59H766W,"Eu 59.769,00","Eu 13.577,00",,,,,,,,,,,,,,,,,,, 8 | 6,Abruzzo,Chieti,Musso,Bouchaib,M,Loazzolo,Asti,Piemonte,14051,AT,1974-09-01,Piazza Vittorio Emanuele,6,66043,Casoli,CH,MSSBHB74P01E633Q,"Eu 39.475,00","Eu 13.796,00",Lombardia,Brescia,Gamper,Andrea,M,Chiusa,Bolzano,Trentino Alto Adige,39043,BZ,1964-01-03,Via Fossadelli,snc,25031,Capriolo,BS,GMPNDR64A03C652R,"Eu 72.610,00","Eu 68.475,00" 9 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | """Test suite to verify that the CLI of the package works as expected. 2 | 3 | The CLI commands are of the form: 4 | 5 | ```bash 6 | csv-trim input.csv output.csv 7 | ``` 8 | 9 | where `input.csv` is the path to the input CSV file and `output.csv` is the path to save the cleaned CSV file. 10 | 11 | The CLI also supports the following options: 12 | 13 | - `--restore-header`: Attempts to restore the header. 14 | - `--drop-padding`: Attempts to drop padding. 15 | - `--drop-duplicated-schema`: Attempts to drop duplicated schema. 
def test_cli():
    """Test that the CLI produces the same output as the Python API.

    For every sample document and every combination of the three cleaning
    options, the document is trimmed once through `CSVTrimmer.trim` and once
    through the `csv-trim` executable, and the two CSVs (both reloaded from
    disk, so that they went through the same serialization) must be equal.
    """
    # Imported locally so that this function does not depend on names
    # other than the ones the module already imports.
    import itertools
    import tempfile

    paths = [
        "tests/test.csv",
        "tests/documents/noisy/padding.csv",
        "tests/documents/noisy/duplicated_schema.csv",
        "tests/documents/noisy/sicilia.csv",
    ]

    trimmer = CSVTrimmer()

    # The temporary directory is removed even when an assertion or the
    # subprocess fails, so no stray output file is left in the repository.
    with tempfile.TemporaryDirectory() as tmp_dir:
        api_output = os.path.join(tmp_dir, "output.tmp.csv")
        cli_output = os.path.join(tmp_dir, "output.tmp.cli.csv")

        for (
            path,
            restore_header,
            drop_padding,
            drop_duplicated_schema,
        ) in itertools.product(paths, (True, False), (True, False), (True, False)):
            cleaned_csv = trimmer.trim(
                pd.read_csv(path),
                restore_header=restore_header,
                drop_padding=drop_padding,
                drop_duplicated_schema=drop_duplicated_schema,
            )

            # We round-trip the API result through a file so that it is
            # parsed exactly like the CLI result.
            cleaned_csv.to_csv(api_output, index=False)
            cleaned_csv = pd.read_csv(api_output)

            # We create the same output with the CLI and compare the
            # results. The CLI flags are opt-outs of the API options.
            command = ["csv-trim", path, cli_output]
            if not restore_header:
                command.append("--no-restore-header")
            if not drop_padding:
                command.append("--keep-padding")
            if not drop_duplicated_schema:
                command.append("--keep-duplicated-schema")

            # check=True raises CalledProcessError on a non-zero exit code.
            subprocess.run(command, check=True)

            cli_cleaned_csv = pd.read_csv(cli_output)

            assert cleaned_csv.equals(cli_cleaned_csv)
def is_nan(candidate: Any) -> bool:
    """Return True if the given candidate is NaN-like.

    A candidate is NaN-like when pandas considers it missing, when it is
    contained in the module-level ``NAN_LIKE`` collection, or when it is a
    string longer than one character whose characters are all NaN-like.

    Parameters
    ---------------------------
    candidate: object,
        candidate to be checked.

    Returns
    ---------------------------
    True if the given candidate is NaN-like.
    """
    return (
        pd.isna(candidate)
        or candidate in NAN_LIKE
        or isinstance(candidate, str)
        and len(candidate) > 1
        and all(is_nan(e) for e in candidate)
    )


class CSVTrimmer:
    """Class handling the cleaning up of malformed CSVs using heuristics."""

    def __init__(
        self,
        correlation_callback: Optional[
            Callable[[pd.Series, pd.Series], Tuple[bool, pd.Series]]
        ] = None,
    ):
        """Create new CSVTrimmer object.

        Parameters
        ---------------------------
        correlation_callback: Optional[Callable] = None,
            Callback used to check whether two consecutive rows need to be
            specially handled for correlations. It receives the current and
            the next row and returns a tuple: whether the next row was
            merged into the current one, and the row to store.
        """
        self._correlation_callback = correlation_callback

    def _mask_edges(self, mask: np.ndarray) -> np.ndarray:
        """Return boolean array with only boolean True attached to sides.

        Only the leading and the trailing runs of True values are kept:
        every True value located between the first and the last False value
        is set to False. A mask without any False value is returned as is.

        Parameters
        -------------------------------
        mask: np.ndarray,
            Boolean vector from which to extract borders.

        Returns
        -------------------------------
        New boolean array with only boolean True attached to array sides.
        """
        # We work on a copy so that the caller's array (which may be
        # read-only) is never mutated.
        edges = np.array(mask, dtype=bool)
        false_positions = np.flatnonzero(np.logical_not(edges))
        if false_positions.size > 0:
            # Everything from the first to the last False is not an edge.
            edges[false_positions[0] : false_positions[-1] + 1] = False
        return edges

    def trim_padding(self, csv: pd.DataFrame) -> pd.DataFrame:
        """Return given CSV with trimmed rows and columns.

        Border rows are dropped when they contain less than half of the
        average number of non-NaN-like cells per row, border columns are
        dropped when all of their cells are NaN-like.

        Parameters
        -------------------------------
        csv: pd.DataFrame,
            DataFrame whose borders are to be cleaned up.

        Returns
        -------------------------------
        DataFrame without empty or near-empty border rows and columns.
        """
        nan_mask = csv.map(is_nan)
        filled_cells_per_row = np.logical_not(nan_mask).sum(axis=1)
        rows_threshold = filled_cells_per_row.mean() / 2
        rows_mask = self._mask_edges(
            (filled_cells_per_row < rows_threshold).to_numpy()
        )
        columns_mask = self._mask_edges(nan_mask.all(axis=0).to_numpy())
        # Positional boolean selection on both axes: unlike a selection by
        # column label it cannot duplicate columns sharing the same label.
        return csv.loc[~rows_mask, ~columns_mask]

    def restore_header(self, csv: pd.DataFrame) -> pd.DataFrame:
        """Return CSV with restored first row as header of CSV.

        Eventual double columns have added the term '.duplicated'.
        Eventual columns without name are called 'column n', where n is
        the number of nameless columns met before the current one.

        Parameters
        -------------------------------
        csv: pd.DataFrame,
            DataFrame where to restore the header.

        Returns
        -------------------------------
        DataFrame with restored header. A DataFrame without rows is
        returned unchanged, as it has no row to promote to header.
        """
        if len(csv) == 0:
            return csv

        new_header = csv.iloc[0]  # grab the first row for the header

        new_sanitized_header = []
        nan_values_count = 0
        for value in new_header:
            if is_nan(value):
                new_sanitized_header.append(f"column {nan_values_count}")
                nan_values_count += 1
                continue

            # Keep appending the suffix until the name is unique.
            while value in new_sanitized_header:
                value = f"{value}.duplicated"

            new_sanitized_header.append(value)

        # Take the data less the header row; the copy avoids renaming the
        # columns of a slice of the caller's DataFrame.
        csv = csv.iloc[1:].copy()
        csv.columns = new_sanitized_header  # set the header row as the csv header
        return csv

    def drop_empty_columns(self, csv: pd.DataFrame) -> pd.DataFrame:
        """Return DataFrame with removed empty columns.

        Parameters
        ---------------------------
        csv: pd.DataFrame,
            DataFrame where to drop the empty columns.

        Returns
        ---------------------------
        DataFrame without the columns only containing NaN-like values.
        """
        empty_columns = csv.map(is_nan).all(axis=0).to_numpy()
        return csv.loc[:, ~empty_columns]

    def drop_duplicated_schema(self, csv: pd.DataFrame) -> pd.DataFrame:
        """Return DataFrame with removed duplicated schema.

        Implementative details
        ---------------------------
        In some cases, such as when multiple CSVs are chained in a poor manner,
        the same schema can be repeated multiple times. This method removes
        the duplicated schema if it is detected.

        Parameters
        ---------------------------
        csv: pd.DataFrame,
            DataFrame where to drop the rows equal to the header.

        Returns
        ---------------------------
        DataFrame without the rows whose values are equal to the header.
        """
        header = csv.columns

        # We flag by position the rows that are equal to the header: dropping
        # them by index label would also drop unrelated rows whenever the
        # index contains repeated labels.
        is_schema_row = np.array(
            [bool((row == header).all()) for _, row in csv.iterrows()],
            dtype=bool,
        )

        return csv.loc[~is_schema_row]

    def drop_empty_rows(self, csv: pd.DataFrame) -> pd.DataFrame:
        """Return DataFrame with removed empty rows.

        Parameters
        ---------------------------
        csv: pd.DataFrame,
            DataFrame where to drop the empty rows.

        Returns
        ---------------------------
        DataFrame without the rows only containing NaN-like values.
        """
        empty_rows = csv.map(is_nan).all(axis=1).to_numpy()
        return csv.loc[~empty_rows]

    def _deep_strip(self, string: str):
        """Return string without continuous spaces.

        Every occurrence of the space-like sequences listed in the
        module-level ``SPACE_LIKE`` is replaced with a single space, until
        the string stops changing.

        Parameters
        ----------------------------
        string: str,
            String to be sanitized.

        Returns
        ----------------------------
        String without duplicated, leading or trailing spaces.
        """
        old_string = None
        # A replacement may create new repeated spaces, hence we iterate
        # until a fixed point is reached.
        while old_string != string:
            old_string = string
            for char in SPACE_LIKE:
                if char in string:
                    string = " ".join(e for e in string.split(char) if e)
        return string.strip()

    def trim_spaces(self, csv: pd.DataFrame) -> pd.DataFrame:
        """Return dataframe without multiple spaces.

        Parameters
        ---------------------------
        csv: pd.DataFrame,
            DataFrame to be sanitized.

        Returns
        ---------------------------
        DataFrame without multiple spaces in strings.
        Cells that are not strings are left untouched.
        """
        return csv.map(lambda x: self._deep_strip(x) if isinstance(x, str) else x)

    def restore_true_nan(self, csv: pd.DataFrame) -> pd.DataFrame:
        """Return CSV with restored True NaN values.

        Parameters
        ----------------------------
        csv: pd.DataFrame,
            DataFrame where to restore the NaN values.

        Returns
        ----------------------------
        DataFrame where the NaN-like cells are replaced by NaN values.
        """
        nan_mask = csv.map(is_nan)
        return csv.where(np.logical_not(nan_mask))

    def normalize_correlated_rows(self, csv: pd.DataFrame) -> pd.DataFrame:
        """Return normalized correlated rows.

        Each row is given to the correlation callback together with the row
        that follows it. The row returned by the callback is stored and,
        when the callback reports that the next row was merged into it,
        the next row is skipped. The last row, when it was not merged into
        the previous one, is stored as it is.

        Parameters
        --------------------------
        csv: pd.DataFrame,
            DataFrame to be normalized.

        Returns
        --------------------------
        The dataframe with normalized correlated rows. The dataframe is
        returned unchanged when no callback was provided or when it has
        less than two rows, as there is no pair of rows to correlate.
        """
        if self._correlation_callback is None or len(csv) < 2:
            return csv

        rows = [row for _, row in csv.iterrows()]
        new_rows = []
        position = 0

        while position < len(rows):
            current_row = rows[position]
            if position + 1 < len(rows):
                merged, result = self._correlation_callback(
                    current_row, rows[position + 1]
                )
            else:
                # The last row has no successor to be correlated with.
                merged, result = False, current_row
            new_rows.append(result)
            # A merged successor is already part of `result`.
            position += 2 if merged else 1

        return pd.DataFrame(new_rows)

    def trim(
        self,
        csv: pd.DataFrame,
        restore_header: bool = True,
        drop_padding: bool = True,
        drop_duplicated_schema: bool = True,
    ) -> pd.DataFrame:
        """Return sanitized version of given dataframe.

        Parameters
        ----------------------------
        csv: pd.DataFrame,
            The dataframe to clean up.
        restore_header: bool = True,
            Whether to restore the header.
        drop_padding: bool = True,
            Whether to drop padding.
        drop_duplicated_schema: bool = True,
            Whether to drop duplicated schemas.

        Returns
        ----------------------------
        The cleaned up dataframe, with a fresh positional index.
        """
        logger.info("Removing extra spaces within cells.")
        csv = self.trim_spaces(csv)
        if drop_padding:
            logger.info("Removing empty space (or NaNs).")
            csv = self.trim_padding(csv)
        logger.info("Removing empty space rows.")
        csv = self.drop_empty_rows(csv)
        if restore_header:
            logger.info("Restoring detected header.")
            csv = self.restore_header(csv)
        logger.info("Restoring true NaN values.")
        csv = self.restore_true_nan(csv)
        logger.info("Normalizing correlated rows (if lambda is provided).")
        csv = self.normalize_correlated_rows(csv)
        logger.info("Dropping empty columns.")
        csv = self.drop_empty_columns(csv)
        if drop_duplicated_schema:
            logger.info("Dropping rows containing duplicated schema.")
            csv = self.drop_duplicated_schema(csv)

        csv = csv.reset_index(drop=True)
        csv.index.name = None
        csv.columns.name = None
        return csv
[CSV Trimming](https://github.com/LucaCappelletti94/csv_trimming) is a Python package designed to take messy CSVs — the kind you get from scraping websites, legacy systems, or poorly managed data — and transform them into clean, well-formatted CSVs with just one line of code. No need for complex setups or large language models. It’s simple, straightforward, and generally gets the job done. 11 | 12 | ## How do I install this package? 13 | 14 | As usual, just download it using pip: 15 | 16 | ```shell 17 | pip install csv_trimming 18 | ``` 19 | 20 | ## How do I use this package? 21 | The package is very simple to use, just load your CSV and pass it to the trimmer. 22 | 23 | ```python 24 | import pandas as pd 25 | from csv_trimming import CSVTrimmer 26 | 27 | # Load your csv 28 | csv = pd.read_csv("tests/documents/noisy/sicilia.csv") 29 | # Instantiate the trimmer 30 | trimmer = CSVTrimmer() 31 | # And trim it 32 | trimmed_csv = trimmer.trim(csv) 33 | # That's it! 34 | ``` 35 | 36 | For instance, your input CSV to clean up may look like this at the beginning: 37 | 38 | | | 0 | 1 | 2 | 3 | 4 | 39 | |---|-----|-------------------------|---------|--------------------------------------------------|-----------| 40 | | 0 | #RIF! | #RIF! | ......... | /// | ----- | 41 | | 1 | ('surname',)('-',)(0,) | region | (""('surname',)('-',)(0,"),)(' ',)(1,) | province | surname | 42 | | 2 | ------ | #RIF! | #RIF! | | | 43 | | 3 | #RIF! | Calabria | ------- | Catanzaro | Rossi | 44 | | 4 | 0 | Sicilia | _____ | Ragusa | Pinna | 45 | | 5 | "" | Lombardia | ------ | Varese | Sbrana | 46 | | 6 | 0 | Lazio | __ | Roma | Mair | 47 | | 7 | _ | Sicilia | #RIF! | Messina | Ferrari | 48 | | 8 | ----- | .. 
| "" | 0 | --------- | 49 | 50 | And after the trimming, it will look like this: 51 | 52 | | | region | province | surname | 53 | |---|-----------|-----------|---------| 54 | | 0 | Calabria | Catanzaro | Rossi | 55 | | 1 | Sicilia | Ragusa | Pinna | 56 | | 2 | Lombardia | Varese | Sbrana | 57 | | 3 | Lazio | Roma | Mair | 58 | | 4 | Sicilia | Messina | Ferrari | 59 | 60 | Magic! 61 | 62 | ## Advanced trimming with row correlation 63 | Sometimes, the CSVs you are working with may have a row correlation, meaning part of a given row is inserted in the next row. Such cases are common when the data-entry clerk wants to make the whole table fit in their screen, and in order to do so, they split the row in two. While this is clearly an extremely bad practice, it happens in the real world and the CSV Trimmer can handle it with a little help. 64 | 65 | You just need to provide a function that defines which rows are correlated, and the CSV Trimmer will take care of the rest. While in this example we are using a rather simple function and a relatively clean CSV, the package can handle more complex cases. 66 | 67 | ```python 68 | from typing import Tuple 69 | import pandas as pd 70 | from csv_trimming import CSVTrimmer 71 | 72 | def simple_correlation_callback( 73 | current_row: pd.Series, 74 | next_row: pd.Series 75 | ) -> Tuple[bool, pd.Series]: 76 | """Return the correlation between two rows. 77 | 78 | Parameters 79 | ---------- 80 | current_row : pd.Series 81 | The current row being analyzed in the DataFrame. 82 | next_row : pd.Series 83 | The next row in the DataFrame. 84 | 85 | Returns 86 | ------- 87 | Tuple[bool, pd.Series] 88 | A tuple with a boolean indicating if the rows are correlated 89 | and a Series with the merged row. 90 | """ 91 | 92 | # All of the rows that have a subsequent correlated row are 93 | # non-empty, and the subsequent correlated rows are always 94 | # with the first cell empty. 
95 | if pd.isna(next_row.iloc[0]) and all(pd.notna(current_row)): 96 | return True, pd.concat( 97 | [ 98 | current_row, 99 | pd.Series({"surname": next_row.iloc[-1]}), 100 | ] 101 | ) 102 | 103 | return False, current_row 104 | 105 | csv = pd.read_csv("tests/test.csv") 106 | trimmer = CSVTrimmer(simple_correlation_callback) 107 | result = trimmer.trim(csv) 108 | ``` 109 | 110 | In this case, our CSV looked like this at the beginning: 111 | 112 | | | region | province | 113 | |----|----------|-----------------| 114 | | 0 | Campania | Caserta | 115 | | 1 | | Ferrero | 116 | | 2 | Liguria | Imperia | 117 | | 3 | | Conti | 118 | | 4 | Puglia | Bari | 119 | | 5 | | Fabris | 120 | | 6 | Sardegna | Medio Campidano | 121 | | 7 | | Conti | 122 | | 8 | Lazio | Roma | 123 | | 9 | | Fabbri | 124 | 125 | 126 | And after the trimming, it will look like this: 127 | 128 | | | region | province | surname | 129 | |----|----------|-----------------|---------| 130 | | 0 | Campania | Caserta | Ferrero | 131 | | 1 | Liguria | Imperia | Conti | 132 | | 2 | Puglia | Bari | Fabris | 133 | | 3 | Sardegna | Medio Campidano | Conti | 134 | | 4 | Lazio | Roma | Fabbri | 135 | 136 | ## More examples 137 | Here follow some examples of the package in action. 138 | 139 | ### Case with duplicated schemas 140 | Sometimes, when chaining multiple CSVs in a poor manner, you may end up with duplicated schemas. 141 | The CSV Trimmer detects rows that match the detected header, and it can (optionally) remove them. 142 | 143 | ```python 144 | import pandas as pd 145 | from csv_trimming import CSVTrimmer 146 | 147 | # Load your csv 148 | csv = pd.read_csv("tests/documents/noisy/duplicated_schema.csv") 149 | # Instantiate the trimmer 150 | trimmer = CSVTrimmer() 151 | # And trim it 152 | trimmed_csv = trimmer.trim(csv, drop_duplicated_schema=True) 153 | # That's it! 
154 | ``` 155 | 156 | For instance, your input CSV to clean up may look like this at the beginning: 157 | 158 | | | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 159 | |----|------------|------------------------------|--------|-------------------------------------------|------------------------------|------------------------------|------------|----------| 160 | | 0 | #RIF! | //// | #RIF! | #RIF! | 0 | .... | 0 | 0 | 161 | | 1 | | ('surname',)('.',)(0,) | region | province | surname | ('province',)('_',)(1,) | | 0 | 162 | | 2 | 0 | //////// | region | province | surname | 0 | 0 | | 163 | | 3 | _____ | /////// | region | province | surname | #RIF! | #RIF! | | 164 | | 4 | | | Puglia | Bari | Zanetti | 0 | -------- | 165 | | 5 | 0 | | Piemonte| Alessandria | Fabbri | | | | 166 | | 6 | 0 | ------- | | #RIF! | #RIF! | 0 | | ---- | 167 | | 7 | ///////// | ///////// | Sicilia| Agrigento | Ferretti | ////////// | | ----------| 168 | | 8 | __ | -------- | Campania| Napoli | Belotti | | /// | | 169 | | 9 | | -------- | 0 | ///// | --- | 0 | ///// | ----------| 170 | | 10 | ----- | #RIF! | Liguria| Savona | Casini | 0 | | #RIF! | 171 | | 11 | ... | 0 | | ----- | | -------- | 0 | 0 | 172 | 173 | And after the trimming, it will look like this: 174 | 175 | | | region | province | surname | 176 | |---|----------|-------------|---------| 177 | | 0 | Puglia | Bari | Zanetti | 178 | | 1 | Piemonte | Alessandria | Fabbri | 179 | | 2 | Sicilia | Agrigento | Ferretti| 180 | | 3 | Campania | Napoli | Belotti | 181 | | 4 | Liguria | Savona | Casini | 182 | 183 | ### Case with only padding 184 | Sometimes, the data entry clerk may start filling a table at an offset from the top-left corner, and export it together with the 185 | empty cells all around it. We call such cells "padding". The CSV Trimmer can detect and remove them. 
186 | 187 | ```python 188 | import pandas as pd 189 | from csv_trimming import CSVTrimmer 190 | 191 | # Load your csv 192 | csv = pd.read_csv("tests/documents/noisy/padding.csv") 193 | 194 | # Instantiate the trimmer 195 | trimmer = CSVTrimmer() 196 | 197 | # And trim it 198 | trimmed_csv = trimmer.trim(csv, drop_padding=True) 199 | ``` 200 | 201 | For instance, your input CSV to clean up may look like this at the beginning: 202 | 203 | | | | region | province | surname | 204 | |---|---|----------|----------------|---------| 205 | | 0 | | | | | 206 | | 1 | | | | | 207 | | 2 | | region | province | surname | 208 | | 3 | | Campania | Caserta | Ferrero | 209 | | 4 | | Liguria | Imperia | Conti | 210 | | 5 | | Puglia | Bari | Fabris | 211 | | 6 | | Sardegna | Medio Campidano| Conti | 212 | | 7 | | Lazio | Roma | Fabbri | 213 | | 8 | | | | | 214 | | 9 | | | | | 215 | | 10| | | | | 216 | | 11| | | | | 217 | 218 | And after the trimming, it will look like this: 219 | 220 | | | region | province | surname | 221 | |---|----------|----------------|---------| 222 | | 0 | Campania | Caserta | Ferrero | 223 | | 1 | Liguria | Imperia | Conti | 224 | | 2 | Puglia | Bari | Fabris | 225 | | 3 | Sardegna | Medio Campidano| Conti | 226 | | 4 | Lazio | Roma | Fabbri | 227 | 228 | 229 | ## Command Line Interface 230 | The package also provides a command line interface to trim CSVs. It is installed by the `setup.py` of the package, so once you have installed the package with pip, you can immediately use it from the command line. 231 | 232 | You can use it by running the following command: 233 | 234 | ```shell 235 | csv-trim tests/documents/noisy/sicilia.csv tests/documents/noisy/sicilia_trimmed.csv 236 | ``` 237 | 238 | It supports the following options to keep it from attempting some trimmings: 239 | 240 | - `--keep-padding`: Do not attempt to remove padding. 241 | - `--keep-duplicated-schema`: Do not attempt to remove duplicated schemas. 
242 | - `--no-restore-header`: Do not attempt to restore the header. 243 | 244 | For instance: 245 | 246 | ```shell 247 | csv-trim tests/documents/noisy/sicilia.csv tests/documents/noisy/sicilia_trimmed.csv --keep-padding 248 | ``` 249 | 250 | ## How do I contribute to this package? 251 | If you have identified some new corner case that the package does not handle, or you have a suggestion for a new feature, feel free to open an issue. If you want to contribute with code, open an issue describing the feature you intend to add and submit a pull request. 252 | 253 | ## License 254 | This package is released under MIT license. -------------------------------------------------------------------------------- /tests/test_readme.py: -------------------------------------------------------------------------------- 1 | # # ✂️ CSV Trimming 2 | # 3 | # [![PyPI](https://badge.fury.io/py/csv-trimming.svg)](https://badge.fury.io/py/csv-trimming) 4 | # [![python](https://img.shields.io/pypi/pyversions/csv-trimming)](https://pypi.org/project/csv-trimming/) 5 | # [![license](https://img.shields.io/pypi/l/csv-trimming)](https://pypi.org/project/csv-trimming/) 6 | # [![Downloads](https://pepy.tech/badge/csv-trimming)](https://pepy.tech/projects/csv-trimming) 7 | # [![Github Actions](https://github.com/LucaCappelletti94/csv_trimming/actions/workflows/python.yml/badge.svg)](https://github.com/LucaCappelletti94/csv_trimming/actions/) 8 | # [![Codacy Badge](https://app.codacy.com/project/badge/Grade/0968ff39b133475da3a9c528b8ae2c9d)](https://app.codacy.com/gh/LucaCappelletti94/csv_trimming/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade) 9 | # 10 | # [CSV Trimming](https://github.com/LucaCappelletti94/csv_trimming) is a Python package designed to take messy CSVs — the kind you get from scraping websites, legacy systems, or poorly managed data — and transform them into clean, well-formatted CSVs with just one line of code. 
No need for complex setups or large language models. It’s simple, straightforward, and generally gets the job done. 11 | # 12 | # ## How do I install this package? 13 | # 14 | # As usual, just download it using pip: 15 | # 16 | # ```shell 17 | # pip install csv_trimming 18 | 19 | # 20 | # ## How do I use this package? 21 | # The package is very simple to use, just load your CSV and pass it to the trimmer. 22 | # 23 | def test_line_22(): 24 | import pandas as pd 25 | from csv_trimming import CSVTrimmer 26 | 27 | # Load your csv 28 | csv = pd.read_csv("tests/documents/noisy/sicilia.csv") 29 | # Instantiate the trimmer 30 | trimmer = CSVTrimmer() 31 | # And trim it 32 | trimmed_csv = trimmer.trim(csv) 33 | # That's it! 34 | 35 | # 36 | # For instance, your input CSV to clean up may look like this at the beginning: 37 | # 38 | # | | 0 | 1 | 2 | 3 | 4 | 39 | # |---|-----|-------------------------|---------|--------------------------------------------------|-----------| 40 | # | 0 | #RIF! | #RIF! | ......... | /// | ----- | 41 | # | 1 | ('surname',)('-',)(0,) | region | (""('surname',)('-',)(0,"),)(' ',)(1,) | province | surname | 42 | # | 2 | ------ | #RIF! | #RIF! | | | 43 | # | 3 | #RIF! | Calabria | ------- | Catanzaro | Rossi | 44 | # | 4 | 0 | Sicilia | _____ | Ragusa | Pinna | 45 | # | 5 | "" | Lombardia | ------ | Varese | Sbrana | 46 | # | 6 | 0 | Lazio | __ | Roma | Mair | 47 | # | 7 | _ | Sicilia | #RIF! | Messina | Ferrari | 48 | # | 8 | ----- | .. | "" | 0 | --------- | 49 | # 50 | # And after the trimming, it will look like this: 51 | # 52 | # | | region | province | surname | 53 | # |---|-----------|-----------|---------| 54 | # | 0 | Calabria | Catanzaro | Rossi | 55 | # | 1 | Sicilia | Ragusa | Pinna | 56 | # | 2 | Lombardia | Varese | Sbrana | 57 | # | 3 | Lazio | Roma | Mair | 58 | # | 4 | Sicilia | Messina | Ferrari | 59 | # 60 | # Magic! 
61 | # 62 | # ## Advanced trimming with row correlation 63 | # Sometimes, the CSVs you are working with may have a row correlation, meaning part of a given row is inserted in the next row. Such cases are common when the data-entry clerk wants to make the whole table fit in their screen, and in order to do so, they split the row in two. While this is clearly an extremely bad practice, it happens in the real world and the CSV Trimmer can handle it with a little help. 64 | # 65 | # You just need to provide a function that defines which rows are correlated, and the CSV Trimmer will take care of the rest. While in this example we are using a rather simple function and a relatively clean CSV, the package can handle more complex cases. 66 | # 67 | def test_line_66(): 68 | from typing import Tuple 69 | import pandas as pd 70 | from csv_trimming import CSVTrimmer 71 | 72 | def simple_correlation_callback( 73 | current_row: pd.Series, 74 | next_row: pd.Series 75 | ) -> Tuple[bool, pd.Series]: 76 | """Return the correlation between two rows. 77 | 78 | Parameters 79 | ---------- 80 | current_row : pd.Series 81 | The current row being analyzed in the DataFrame. 82 | next_row : pd.Series 83 | The next row in the DataFrame. 84 | 85 | Returns 86 | ------- 87 | Tuple[bool, pd.Series] 88 | A tuple with a boolean indicating if the rows are correlated 89 | and a Series with the merged row. 90 | """ 91 | 92 | # All of the rows that have a subsequent correlated row are 93 | # non-empty, and the subsequent correlated rows are always 94 | # with the first cell empty. 
95 | if pd.isna(next_row.iloc[0]) and all(pd.notna(current_row)): 96 | return True, pd.concat( 97 | [ 98 | current_row, 99 | pd.Series({"surname": next_row.iloc[-1]}), 100 | ] 101 | ) 102 | 103 | return False, current_row 104 | 105 | csv = pd.read_csv("tests/test.csv") 106 | trimmer = CSVTrimmer(simple_correlation_callback) 107 | result = trimmer.trim(csv) 108 | 109 | # 110 | # In this case, our CSV looked like this at the beginning: 111 | # 112 | # | | region | province | 113 | # |----|----------|-----------------| 114 | # | 0 | Campania | Caserta | 115 | # | 1 | | Ferrero | 116 | # | 2 | Liguria | Imperia | 117 | # | 3 | | Conti | 118 | # | 4 | Puglia | Bari | 119 | # | 5 | | Fabris | 120 | # | 6 | Sardegna | Medio Campidano | 121 | # | 7 | | Conti | 122 | # | 8 | Lazio | Roma | 123 | # | 9 | | Fabbri | 124 | # 125 | # 126 | # And after the trimming, it will look like this: 127 | # 128 | # | | region | province | surname | 129 | # |----|----------|-----------------|---------| 130 | # | 0 | Campania | Caserta | Ferrero | 131 | # | 1 | Liguria | Imperia | Conti | 132 | # | 2 | Puglia | Bari | Fabris | 133 | # | 3 | Sardegna | Medio Campidano | Conti | 134 | # | 4 | Lazio | Roma | Fabbri | 135 | # 136 | # ## More examples 137 | # Here follow some examples of the package in action. 138 | # 139 | # ### Case with duplicated schemas 140 | # Sometimes, when chaining multiple CSVs in a poor manner, you may end up with duplicated schemas. 141 | # The CSV Trimmer detects rows that match the detected header, and it can (optionally) remove them. 142 | # 143 | def test_line_142(): 144 | import pandas as pd 145 | from csv_trimming import CSVTrimmer 146 | 147 | # Load your csv 148 | csv = pd.read_csv("tests/documents/noisy/duplicated_schema.csv") 149 | # Instantiate the trimmer 150 | trimmer = CSVTrimmer() 151 | # And trim it 152 | trimmed_csv = trimmer.trim(csv, drop_duplicated_schema=True) 153 | # That's it! 
154 | 155 | # 156 | # For instance, your input CSV to clean up may look like this at the beginning: 157 | # 158 | # | | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 159 | # |----|------------|------------------------------|--------|-------------------------------------------|------------------------------|------------------------------|------------|----------| 160 | # | 0 | #RIF! | //// | #RIF! | #RIF! | 0 | .... | 0 | 0 | 161 | # | 1 | | ('surname',)('.',)(0,) | region | province | surname | ('province',)('_',)(1,) | | 0 | 162 | # | 2 | 0 | //////// | region | province | surname | 0 | 0 | | 163 | # | 3 | _____ | /////// | region | province | surname | #RIF! | #RIF! | | 164 | # | 4 | | | Puglia | Bari | Zanetti | 0 | -------- | 165 | # | 5 | 0 | | Piemonte| Alessandria | Fabbri | | | | 166 | # | 6 | 0 | ------- | | #RIF! | #RIF! | 0 | | ---- | 167 | # | 7 | ///////// | ///////// | Sicilia| Agrigento | Ferretti | ////////// | | ----------| 168 | # | 8 | __ | -------- | Campania| Napoli | Belotti | | /// | | 169 | # | 9 | | -------- | 0 | ///// | --- | 0 | ///// | ----------| 170 | # | 10 | ----- | #RIF! | Liguria| Savona | Casini | 0 | | #RIF! | 171 | # | 11 | ... | 0 | | ----- | | -------- | 0 | 0 | 172 | # 173 | # And after the trimming, it will look like this: 174 | # 175 | # | | region | province | surname | 176 | # |---|----------|-------------|---------| 177 | # | 0 | Puglia | Bari | Zanetti | 178 | # | 1 | Piemonte | Alessandria | Fabbri | 179 | # | 2 | Sicilia | Agrigento | Ferretti| 180 | # | 3 | Campania | Napoli | Belotti | 181 | # | 4 | Liguria | Savona | Casini | 182 | # 183 | # ### Case with only padding 184 | # Sometimes, the data entry clerk may start filling a table offsetted from the top-left corner, and export it with also 185 | # empty cells all around. We call such cells "padding". The CSV Trimmer can detect and remove them. 
186 | # 187 | def test_line_186(): 188 | import pandas as pd 189 | from csv_trimming import CSVTrimmer 190 | 191 | # Load your csv 192 | csv = pd.read_csv("tests/documents/noisy/padding.csv") 193 | 194 | # Instantiate the trimmer 195 | trimmer = CSVTrimmer() 196 | 197 | # And trim it 198 | trimmed_csv = trimmer.trim(csv, drop_padding=True) 199 | 200 | # 201 | # For instance, your input CSV to clean up may look like this at the beginning: 202 | # 203 | # | | | region | province | surname | 204 | # |---|---|----------|----------------|---------| 205 | # | 0 | | | | | 206 | # | 1 | | | | | 207 | # | 2 | | region | province | surname | 208 | # | 3 | | Campania | Caserta | Ferrero | 209 | # | 4 | | Liguria | Imperia | Conti | 210 | # | 5 | | Puglia | Bari | Fabris | 211 | # | 6 | | Sardegna | Medio Campidano| Conti | 212 | # | 7 | | Lazio | Roma | Fabbri | 213 | # | 8 | | | | | 214 | # | 9 | | | | | 215 | # | 10| | | | | 216 | # | 11| | | | | 217 | # 218 | # And after the trimming, it will look like this: 219 | # 220 | # | | region | province | surname | 221 | # |---|----------|----------------|---------| 222 | # | 0 | Campania | Caserta | Ferrero | 223 | # | 1 | Liguria | Imperia | Conti | 224 | # | 2 | Puglia | Bari | Fabris | 225 | # | 3 | Sardegna | Medio Campidano| Conti | 226 | # | 4 | Lazio | Roma | Fabbri | 227 | # 228 | # 229 | # ## Command Line Interface 230 | # The package also provides a command line interface to trim CSVs. It comes installed with the `setup.py` of the package, therefore after having pip installed the package, you can immediately use it from the command line. 231 | # 232 | # You can use it by running the following command: 233 | # 234 | # ```shell 235 | # csv-trim tests/documents/noisy/sicilia.csv tests/documents/noisy/sicilia_trimmed.csv 236 | 237 | # 238 | # It supports the following options to keep it from attempting some trimmings: 239 | # 240 | # - `--keep-padding`: Do not attempt to remove padding. 
241 | # - `--keep-duplicated-schema`: Do not attempt to remove duplicated schemas. 242 | # - `--no-restore-header`: Do not attempt to restore the header. 243 | # 244 | # For instance: 245 | # 246 | # ```shell 247 | # csv-trim tests/documents/noisy/sicilia.csv tests/documents/noisy/sicilia_trimmed.csv --keep-padding 248 | 249 | # 250 | # ## How do I contribute to this package? 251 | # If you have identified some new corner case that the package does not handle, or you have a suggestion for a new feature, feel free to open an issue. If you want to contribute with code, open an issue describing the feature you intend to add and submit a pull request. 252 | # 253 | # ## License 254 | # This package is released under MIT license. -------------------------------------------------------------------------------- /tests/test.csv: -------------------------------------------------------------------------------- 1 | ,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41 2 | 0," 3 | 4 | 5 | 6 | 7 | 8 | ",#RIF!,,random,#RIF!," 9 | 10 | 11 | 12 | 13 | 14 | 15 | ", ,0,#RIF!,#RIF!, ," 16 | ",____," 17 | 18 | 19 | 20 | 21 | 22 | 23 | ",#RIF!,....,#RIF!,///,, , ,#RIF!,#RIF!,0,#RIF!,--," 24 | 25 | 26 | 27 | 28 | 29 | ",,.........,/////,,//////," 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | ",#RIF!,/," 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | "," 49 | 50 | ",,--,#RIF!,///, 51 | 1,,,random,..,#RIF!,/////////," 52 | ",/////////,#RIF!,,#RIF!,0,0,0,#RIF!,,..,----,0,0," 53 | 54 | 55 | ",________,//////////,..,0,0,#RIF!,.......,0,...,_______,,#RIF!,.,," 56 | 57 | 58 | 59 | ",0,________,#RIF!,#RIF!,0, 60 | 2,,caso,#RIF!,#RIF!," 61 | ",0,," 62 | 63 | ",0,0,_____,_," 64 | 65 | 66 | 67 | 68 | 69 | ",,0,0,///,0," 70 | 71 | "," 72 | 73 | ",_____,#RIF!,0," 74 | 75 | ",--, ,0," 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | ",#RIF!,0,#RIF!,0,,,....,..,,,,---------,....," 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | " 92 | 
3,,0,0,#RIF!,0,__________,........,birth_municipality_1,region,province,surname,name,codice_fiscale4,sex,birth_province 2,birth_municipality,cap_3,birth_province,birth_region,birth_cap,birth_province_code,birthdate,sex-6,address,house_number,cap,cap.0,birth_province 2-7,municipality,province_code,codice_fiscale,total_debit,payed_debit,municipality_5,0,__,_,......,#RIF!,------," 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | ",#RIF! 101 | 4," 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | ",#RIF!,0,0,_____,--------,,-," 111 | Campania "," 112 | Napoli 113 | 114 | "," 115 | 116 | 117 | Villa "," 118 | Giangiacomo 119 | Maria 120 | 121 | ",," 122 | M 123 | 124 | 125 | 126 | ",0," 127 | Busto Garolfo 128 | ",0, Milano ," 129 | 130 | Lombardia 131 | ", 20020 ," MI 132 | 133 | "," 134 | 135 | 1997-03-24 136 | 137 | ",/////," 138 | 139 | Via 140 | Epomeo 141 | "," 142 | 143 | 144 | 489 145 | 146 | "," 147 | 148 | 80126 149 | 150 | ",," 151 | ", Napoli ," 152 | 153 | 154 | 155 | "," 156 | VLLGGC97C24B301W 157 | "," 158 | 159 | Eu 160 | 83.294,00 "," 161 | 162 | 163 | Eu 164 | 165 | 68.537,00 166 | 167 | ",," 168 | 169 | 170 | 171 | 172 | ",0,........,,____,#RIF!, ,#RIF! 
173 | 5,...,,0," 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | ",...., ,-,0," 184 | 185 | Lombardia 186 | "," 187 | 188 | 189 | Bergamo "," 190 | Ferrari "," 191 | Farhat ",__," 192 | F ", ," 193 | Rivoli 194 | Veronese 195 | ",#RIF!," 196 | 197 | Verona 198 | 199 | 200 | 201 | "," 202 | Veneto 203 | "," 204 | 37010 "," 205 | 206 | 207 | 208 | 209 | VR 210 | "," 1925-03-26 211 | ",------," 212 | 213 | Piazza 214 | Repubblica "," 1 215 | "," 24050 216 | ",,0," Zanica 217 | "," 218 | BG 219 | 220 | "," 221 | 222 | FRRFHT25C66H356T 223 | 224 | 225 | "," 226 | Eu 4.771,00 "," 227 | Eu 4.188,00 228 | 229 | 230 | ",,..........,----,//////,0,0,.,#RIF!,0 231 | 6,#RIF!," 232 | 233 | 234 | ",,," 235 | 236 | 237 | 238 | 239 | ",0,////,---------," 240 | 241 | Campania "," 242 | 243 | Napoli 244 | "," 245 | 246 | 247 | Venturelli 248 | "," Francesco 249 | 250 | 251 | 252 | ",///," 253 | M 254 | ",0, Mirandola ,#RIF!," 255 | 256 | Modena "," Emilia 257 | 258 | 259 | Romagna "," 260 | 41037 261 | ", MO , 1959-10-29 ,0, Via Monteoliveto ," 1 262 | "," 263 | 264 | 80135 265 | ",0,-," 266 | 267 | 268 | 269 | Napoli 270 | ",0," 271 | VNTFNC59R29F240C 272 | "," 273 | 274 | Eu 275 | 276 | 84.020,00 277 | "," 278 | Eu 80.640,00 279 | 280 | ",#RIF!, ," 281 | 282 | 283 | 284 | ",0,---,0, ,__, 285 | 7,--," 286 | 287 | 288 | 289 | 290 | 291 | ",---------,0,#RIF!,----,0,," 292 | Piemonte 293 | "," 294 | Biella "," Nocentini 295 | "," 296 | 297 | 298 | 299 | 300 | Saadia 301 | 302 | ",0," 303 | 304 | 305 | 306 | F 307 | ",#RIF!," 308 | 309 | Castelfranco 310 | 311 | Di 312 | 313 | Sopra 314 | 315 | ",#RIF!," 316 | 317 | 318 | Arezzo 319 | 320 | "," 321 | Toscana 322 | 323 | "," 324 | 52020 325 | "," 326 | 327 | 328 | 329 | AR 330 | "," 331 | 332 | 333 | 334 | 1933-12-08 335 | 336 | ", ," Via 337 | Xxv 338 | Aprile "," 339 | 15 340 | "," 341 | 13851 "," 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | ",, Castelletto Cervo , BI ," 350 | 351 | NCNSDA33T48C112S 352 | "," Eu 
30.843,00 "," Eu 21.587,00 353 | ",...,0,--------,.......," 354 | 355 | 356 | 357 | 358 | 359 | 360 | ",____, ,_,........ 361 | 8,#RIF!," 362 | "," 363 | 364 | 365 | 366 | 367 | 368 | 369 | 370 | 371 | 372 | ",0," 373 | 374 | ",#RIF!,---------,#RIF!," 375 | 376 | Emilia Romagna "," 377 | 378 | 379 | Ravenna ", Bruno ," 380 | 381 | 382 | Francesca 383 | 384 | "," 385 | 386 | ", F ,//////////," 387 | Terranova Da Sibari ", ," 388 | 389 | 390 | Cosenza 391 | "," 392 | Calabria 393 | "," 394 | 395 | 87010 "," 396 | CS 397 | "," 398 | 1983-11-21 "," 399 | 400 | "," Via Matteotti 401 | ", 55 ," 48010 402 | 403 | 404 | ",,0," 405 | Cotignola "," 406 | 407 | RA "," BRNFNC83S61L124W 408 | "," Eu 409 | 410 | 46.499,00 411 | "," 412 | 413 | 414 | Eu 36.566,00 415 | ",#RIF!,______,#RIF!,0,0,------," 416 | ",,0 417 | 9,,0," 418 | 419 | ",,#RIF!,//////////,#RIF!,0," Piemonte 420 | ", Torino ," 421 | 422 | Ricci "," Mattia 423 | ",#RIF!," 424 | 425 | M 426 | 427 | 428 | ",," 429 | 430 | 431 | 432 | Sante Marie 433 | 434 | 435 | 436 | ",," L'Aquila 437 | "," 438 | 439 | Abruzzo "," 440 | 441 | 67067 442 | 443 | "," 444 | AQ "," 445 | 1926-08-04 446 | 447 | 448 | 449 | "," 450 | 451 | 452 | 453 | 454 | 455 | 456 | 457 | 458 | 459 | "," 460 | 461 | Corso 462 | Re 463 | Umberto "," 464 | 465 | 38 466 | 467 | "," 468 | 10128 469 | ",_________,#RIF!," Torino 470 | "," 471 | 472 | TO 473 | 474 | "," 475 | 476 | 477 | 478 | 479 | 480 | RCCMTT26M04I326A "," Eu 80.583,00 481 | 482 | 483 | "," 484 | 485 | Eu 4.186,00 486 | 487 | 488 | ",0,#RIF!,, ,#RIF!,,------,0," 489 | 490 | 491 | 492 | 493 | 494 | 495 | 496 | 497 | " 498 | 10,0,#RIF!,,0,,/,#RIF!,, Lombardia ," 499 | Milano 500 | 501 | "," 502 | Caruso 503 | 504 | "," Sara 505 | 506 | 507 | ",#RIF!," 508 | 509 | 510 | 511 | F 512 | ",...," 513 | San 514 | 515 | Giovanni 516 | 517 | La 518 | 519 | Punta 520 | ",," 521 | 522 | Catania 523 | 524 | "," 525 | Sicilia 526 | ", 95037 ," 527 | CT 528 | 529 | "," 530 | 531 | 1970-03-25 ",0," Via 
532 | Giambellino 533 | "," 534 | 535 | 536 | 64 537 | ", 20146 ,#RIF!,, Milano ," MI 538 | 539 | 540 | "," CRSSRA70C65H922G 541 | 542 | "," 543 | Eu 544 | 85.595,00 545 | 546 | "," 547 | Eu 548 | 78.088,00 ",0,------,0,, ,--------,__," 549 | 550 | 551 | 552 | 553 | 554 | "," 555 | 556 | 557 | 558 | " 559 | 11,0,#RIF!, ,----,0,_," 560 | 561 | 562 | 563 | 564 | 565 | 566 | ",#RIF!," 567 | 568 | Emilia 569 | Romagna 570 | ", Bologna ," Piras 571 | "," 572 | Sofia 573 | ",," 574 | F 575 | 576 | "," 577 | 578 | "," San 579 | 580 | Basilio ",0," Cagliari 581 | 582 | 583 | "," 584 | Sardegna 585 | "," 586 | 587 | 09040 "," CA 588 | 589 | 590 | 591 | 592 | "," 593 | 1991-10-19 594 | ",0," 595 | Via Appia "," 596 | 24/B 597 | "," 598 | 599 | 40026 600 | 601 | ",#RIF!,#RIF!," 602 | 603 | 604 | Imola 605 | 606 | "," 607 | BO "," 608 | 609 | PRSSFO91R59H766W "," 610 | Eu 611 | 59.769,00 "," 612 | Eu 613 | 13.577,00 "," 614 | 615 | 616 | 617 | 618 | ",,///////,--," 619 | 620 | 621 | 622 | 623 | ",-----,----,, 624 | 12,0, ,-,,,#RIF!," 625 | 626 | 627 | 628 | 629 | 630 | 631 | 632 | ",#RIF!," 633 | 634 | Abruzzo 635 | "," 636 | Chieti "," 637 | Musso "," 638 | 639 | Bouchaib ",0," 640 | 641 | M ",..........," 642 | 643 | Loazzolo ",," 644 | Asti 645 | "," 646 | Piemonte 647 | 648 | "," 649 | 14051 ", AT ," 1974-09-01 650 | 651 | ",0," 652 | Piazza Vittorio Emanuele 653 | 654 | "," 6 655 | "," 66043 656 | 657 | 658 | "," 659 | ",...., Casoli ," 660 | CH 661 | "," 662 | 663 | 664 | 665 | MSSBHB74P01E633Q 666 | 667 | "," Eu 668 | 669 | 670 | 39.475,00 671 | 672 | "," 673 | Eu 674 | 675 | 13.796,00 676 | "," 677 | 678 | 679 | 680 | 681 | 682 | 683 | 684 | 685 | ",/////,////////," 686 | 687 | 688 | 689 | 690 | 691 | 692 | 693 | ",,#RIF!,," 694 | 695 | 696 | ",. 
697 | 13,#RIF!,#RIF!,__,,--------,0,#RIF!,, Lombardia ," 698 | Brescia 699 | ", Gamper , Andrea ,#RIF!," M 700 | ",," 701 | 702 | 703 | 704 | Chiusa ",#RIF!," 705 | Bolzano "," Trentino 706 | 707 | Alto 708 | 709 | Adige "," 710 | 711 | 712 | 39043 713 | 714 | "," 715 | 716 | BZ 717 | 718 | "," 719 | 1964-01-03 ",0," 720 | 721 | Via 722 | Fossadelli 723 | "," 724 | snc 725 | 726 | 727 | "," 728 | 25031 729 | 730 | ",, ," 731 | Capriolo ", BS ," 732 | GMPNDR64A03C652R "," 733 | 734 | Eu 72.610,00 735 | 736 | "," 737 | 738 | 739 | Eu 68.475,00 740 | ",#RIF!,#RIF!,," 741 | 742 | 743 | 744 | 745 | ",#RIF!," 746 | 747 | 748 | 749 | 750 | 751 | "," 752 | 753 | 754 | 755 | 756 | 757 | 758 | ",0,____ 759 | 14,," 760 | 761 | 762 | 763 | 764 | 765 | 766 | "," 767 | 768 | 769 | 770 | 771 | 772 | ",0,0, ,#RIF!," 773 | 774 | "," 775 | 776 | 777 | 778 | 779 | 780 | 781 | 782 | 783 | ",...," 784 | 785 | 786 | 787 | ",,---,0,///,0,--------,0,#RIF!," 788 | 789 | 790 | 791 | 792 | 793 | ",,........,........," 794 | 795 | 796 | 797 | 798 | 799 | 800 | ",,...,_____,#RIF!,----------,........,#RIF!,,...,0,#RIF!,,0,,0,#RIF!,_____," 801 | 802 | 803 | 804 | 805 | 806 | " 807 | 15," 808 | 809 | 810 | 811 | 812 | 813 | 814 | 815 | "," 816 | 817 | 818 | 819 | ",----------,#RIF!,0,........,--------,____,#RIF!,0,#RIF!,//////,#RIF!,0,,,---,,,#RIF!,...,#RIF!,0,......,," 820 | 821 | 822 | 823 | 824 | ",____, ,////////,/,__,,,-----," 825 | 826 | 827 | 828 | ",-------,----------, ,0,#RIF!,----------,0 829 | 16,," 830 | 831 | 832 | ",0,///////,#RIF!,----,.., ,,_____,,,,,0,0,________,----,0,#RIF!,," 833 | "," 834 | 835 | 836 | 837 | 838 | 839 | ",----,#RIF!," 840 | 841 | 842 | 843 | 844 | 845 | 846 | ",#RIF!,#RIF!,0,____,0,,_________,,#RIF!," 847 | ",,--,//////////," 848 | 849 | ",...,#RIF! 
850 | 17,...,,0,0,," 851 | 852 | 853 | 854 | 855 | 856 | ",,,,,#RIF!,#RIF!," 857 | 858 | 859 | 860 | 861 | 862 | ",, ,#RIF!," 863 | 864 | 865 | 866 | 867 | ",0,,_______,0,.........,#RIF!,..," 868 | 869 | 870 | 871 | 872 | 873 | 874 | 875 | 876 | 877 | ",#RIF!," 878 | 879 | "," 880 | 881 | 882 | 883 | 884 | 885 | ",0,#RIF!,0,#RIF!,0,__,#RIF!,......,#RIF!," 886 | 887 | 888 | 889 | 890 | ",0,..,#RIF!,#RIF! 891 | 18,0,.,///////,," 892 | 893 | 894 | 895 | ",,,#RIF!," 896 | 897 | ",#RIF!," 898 | 899 | 900 | 901 | 902 | 903 | 904 | 905 | "," 906 | 907 | 908 | 909 | ",,0,,#RIF!,,0, ,,......," 910 | 911 | 912 | ",," 913 | 914 | 915 | 916 | "," 917 | 918 | 919 | 920 | 921 | 922 | ", , ,----,#RIF!,--------,____, ,0,-----,#RIF!,-----,---------,/,,0," 923 | 924 | 925 | 926 | 927 | 928 | "," 929 | " 930 | 19,///////,..........,...,0,......, , ,--,," 931 | 932 | ",////////,,_________,...," 933 | 934 | 935 | 936 | 937 | 938 | 939 | 940 | ",, ,/////,,,.," 941 | "," 942 | 943 | ",0,0,0,0,_____,,#RIF!,--,0,......," 944 | 945 | 946 | ",0,....,///,0,,........., , 947 | 20,......,," 948 | 949 | 950 | 951 | 952 | 953 | 954 | 955 | 956 | ",,0,#RIF!,0,0,___,0,/////,_________,#RIF!,," 957 | 958 | 959 | 960 | 961 | ",,,#RIF!,----------,," 962 | ",-------,,#RIF!,#RIF!,,#RIF!,....," 963 | 964 | 965 | 966 | 967 | 968 | ",#RIF!," 969 | 970 | 971 | 972 | ",," 973 | 974 | 975 | ",0,--,#RIF!,#RIF!,0,,-----,0, 976 | 21,....," 977 | 978 | 979 | "," 980 | 981 | 982 | 983 | 984 | 985 | 986 | ",#RIF!," 987 | 988 | 989 | 990 | ",----------,#RIF!,0,,#RIF!, ," 991 | 992 | 993 | 994 | 995 | ",,0, ,0,#RIF!,////////,---,#RIF!,, ,0,0,0,,," 996 | 997 | ",0,__________, ,#RIF!,,........,_______,,#RIF!," 998 | 999 | 1000 | 1001 | 1002 | 1003 | 1004 | 1005 | 1006 | "," 1007 | 1008 | 1009 | 1010 | 1011 | 1012 | 1013 | ",#RIF!,#RIF!, 1014 | --------------------------------------------------------------------------------