├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── cleaningbenchmark ├── CorruptionModels │ ├── AddressCorruptionModel.py │ ├── CSVCorruptionModel.py │ ├── CorruptionModel.py │ ├── ERCorruptionModel.py │ ├── EscapeCorruptionModel.py │ ├── TypoCorruptionModel.py │ ├── __init__.py │ └── data │ │ ├── Address.txt │ │ ├── Typos.txt │ │ └── wikipreprocess.py ├── ModelEval │ ├── EvalUtils.py │ └── __init__.py ├── NoiseModels │ ├── ConstraintModel.py │ ├── NoiseModel.py │ ├── RandomNoise.py │ ├── SystematicNoise.py │ └── __init__.py ├── Utils │ ├── Utils.py │ └── __init__.py ├── WorkflowModels │ ├── WorkFlowModel.py │ └── __init__.py └── __init__.py ├── setup.cfg ├── setup.py └── test └── testcleaningbenchmark.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sjyk/datacleaning-benchmark/HEAD/LICENSE -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sjyk/datacleaning-benchmark/HEAD/README.md -------------------------------------------------------------------------------- /cleaningbenchmark/CorruptionModels/AddressCorruptionModel.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sjyk/datacleaning-benchmark/HEAD/cleaningbenchmark/CorruptionModels/AddressCorruptionModel.py -------------------------------------------------------------------------------- /cleaningbenchmark/CorruptionModels/CSVCorruptionModel.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sjyk/datacleaning-benchmark/HEAD/cleaningbenchmark/CorruptionModels/CSVCorruptionModel.py -------------------------------------------------------------------------------- /cleaningbenchmark/CorruptionModels/CorruptionModel.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sjyk/datacleaning-benchmark/HEAD/cleaningbenchmark/CorruptionModels/CorruptionModel.py -------------------------------------------------------------------------------- /cleaningbenchmark/CorruptionModels/ERCorruptionModel.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sjyk/datacleaning-benchmark/HEAD/cleaningbenchmark/CorruptionModels/ERCorruptionModel.py -------------------------------------------------------------------------------- /cleaningbenchmark/CorruptionModels/EscapeCorruptionModel.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sjyk/datacleaning-benchmark/HEAD/cleaningbenchmark/CorruptionModels/EscapeCorruptionModel.py -------------------------------------------------------------------------------- /cleaningbenchmark/CorruptionModels/TypoCorruptionModel.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sjyk/datacleaning-benchmark/HEAD/cleaningbenchmark/CorruptionModels/TypoCorruptionModel.py -------------------------------------------------------------------------------- /cleaningbenchmark/CorruptionModels/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cleaningbenchmark/CorruptionModels/data/Address.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sjyk/datacleaning-benchmark/HEAD/cleaningbenchmark/CorruptionModels/data/Address.txt -------------------------------------------------------------------------------- /cleaningbenchmark/CorruptionModels/data/Typos.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sjyk/datacleaning-benchmark/HEAD/cleaningbenchmark/CorruptionModels/data/Typos.txt -------------------------------------------------------------------------------- /cleaningbenchmark/CorruptionModels/data/wikipreprocess.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sjyk/datacleaning-benchmark/HEAD/cleaningbenchmark/CorruptionModels/data/wikipreprocess.py -------------------------------------------------------------------------------- /cleaningbenchmark/ModelEval/EvalUtils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sjyk/datacleaning-benchmark/HEAD/cleaningbenchmark/ModelEval/EvalUtils.py -------------------------------------------------------------------------------- /cleaningbenchmark/ModelEval/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cleaningbenchmark/NoiseModels/ConstraintModel.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sjyk/datacleaning-benchmark/HEAD/cleaningbenchmark/NoiseModels/ConstraintModel.py -------------------------------------------------------------------------------- /cleaningbenchmark/NoiseModels/NoiseModel.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sjyk/datacleaning-benchmark/HEAD/cleaningbenchmark/NoiseModels/NoiseModel.py -------------------------------------------------------------------------------- /cleaningbenchmark/NoiseModels/RandomNoise.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sjyk/datacleaning-benchmark/HEAD/cleaningbenchmark/NoiseModels/RandomNoise.py -------------------------------------------------------------------------------- /cleaningbenchmark/NoiseModels/SystematicNoise.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sjyk/datacleaning-benchmark/HEAD/cleaningbenchmark/NoiseModels/SystematicNoise.py -------------------------------------------------------------------------------- /cleaningbenchmark/NoiseModels/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cleaningbenchmark/Utils/Utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sjyk/datacleaning-benchmark/HEAD/cleaningbenchmark/Utils/Utils.py -------------------------------------------------------------------------------- /cleaningbenchmark/Utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cleaningbenchmark/WorkflowModels/WorkFlowModel.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sjyk/datacleaning-benchmark/HEAD/cleaningbenchmark/WorkflowModels/WorkFlowModel.py -------------------------------------------------------------------------------- /cleaningbenchmark/WorkflowModels/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cleaningbenchmark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sjyk/datacleaning-benchmark/HEAD/cleaningbenchmark/__init__.py -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sjyk/datacleaning-benchmark/HEAD/setup.py -------------------------------------------------------------------------------- /test/testcleaningbenchmark.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sjyk/datacleaning-benchmark/HEAD/test/testcleaningbenchmark.py --------------------------------------------------------------------------------