├── .flake8 ├── .gitignore ├── .travis.yml ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── diffino ├── __init__.py ├── cli.py ├── constants.py ├── exceptions.py └── models.py ├── doc └── README.md ├── docker-compose-test.yml ├── docker-compose.yml ├── requirements.txt ├── requirements ├── requirements-lint.txt └── requirements-test.txt ├── setup.py └── tests ├── __init__.py ├── sample_left.csv ├── sample_right.csv ├── test_cli.py ├── test_md5.py ├── test_models.py └── test_pandas.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 100 3 | 4 | ignore = E501 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | shippable 2 | .DS_Store 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | env/ 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *,cover 49 | .hypothesis/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # IPython Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # dotenv 82 | .env 83 | 84 | # virtualenv 85 | .venv/ 86 | venv/ 87 | venv27/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | 93 | # Rope project settings 94 | .ropeproject 95 | 96 | # Editors 97 | .idea/ 98 | .vscode/ 99 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | jobs: 4 | include: 5 | - stage: lint 6 | script: black diffino tests --check 7 | python: 8 | - 3.6 9 | install: 10 | - pip install -e . 11 | - pip install -r requirements/requirements-lint.txt 12 | - stage: test 13 | before_script: 14 | - export BOTO_CONFIG=/dev/null 15 | script: pytest tests/test_models.py 16 | python: 17 | - 2.7 18 | - 3.6 19 | install: 20 | - pip install -e . 
21 | - pip install -r requirements/requirements-test.txt 22 | 23 | stages: 24 | - lint 25 | - test -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | Coming soon 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2010-2017 Google, Inc. http://angularjs.org 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | diffino 2 | ==== 3 | [![Build Status](https://travis-ci.com/IntuitiveWebSolutions/diffino.svg?branch=master)](https://travis-ci.com/IntuitiveWebSolutions/diffino) 4 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/ambv/black) 5 | 6 | Diffing tools for comparing datasets in CSV, XLSX and other formats available as CLI app, API, web app and module. Powered by the awesome Pandas library for Python. 7 | 8 | ### Done 9 | - Install as CLI app 10 | - Install and use as python module 11 | - Compare two CSV datasets using Pandas where you can output differences row by row 12 | - Use the following inputs for your datasets: 13 | - Local file in CSV pandas modes 14 | - File in S3 pandas mode 15 | - Define a subset of columns to use for comparing/diffing (only works with pandas mode, not supported for MD5 comparison) 16 | - Output differences to: 17 | - Console (print) 18 | - CSV file 19 | 20 | ### To-Do (ROADMAP) 21 | - Compare one or more CSV datasets using MD5 hash of the files 22 | - Compare one or more XLSX datasets using Pandas where you can output differences row by row 23 | - Use the following inputs for your datasets: 24 | - Local file in CSV MD5 25 | - Local file in XLSX (only for pandas mode) 26 | - Local directory with CSVs or XLSX files (for both MD5 and pandas modes) 27 | - ZIP file with CSVs or XLSX files (only for pandas mode) 28 | - File in S3 for MD5 29 | - Bucket in S3 (for both MD5 and pandas modes) 30 | - Output differences to: 31 | - XLSX file 32 | - JSON file 33 | 34 | ## Install 35 | 36 | To install as module and CLI: 37 | 38 | ``` 39 | pip install diffino 40 | ``` 41 | 42 | ## CLI 43 | 44 | Diffino will try its best to guess your input storage mechanisms; for that you need to include `s3://` in the input argument 
and/or the `.csv`, `.xls` and `.xlsx` extensions. 45 | 46 | ### Compare using pandas 47 | 48 | MD5 is only useful for knowing two CSV datasets are not the same but it's not useful for knowing what the actual differences between them are. For that you can use the pandas mode which will output the differences row by row. 49 | The same commands shown earlier for MD5 are available; you need to pass the `--mode pandas` argument for using pandas. **By default Pandas mode is used so this argument can be omitted**: 50 | 51 | ``` 52 | diffino before_dataset.csv after_dataset.csv --mode pandas 53 | ``` 54 | 55 | When using pandas mode, by default Diffino will try to convert numeric columns; you can change this behavior with: 56 | 57 | ``` 58 | diffino before_dataset.csv after_dataset.csv --convert-numeric false 59 | ``` 60 | 61 | You can define the columns to be used for checking the diffs: 62 | 63 | ``` 64 | diffino before_dataset.csv after_dataset.csv --cols id name 65 | ``` 66 | 67 | #### Compare two CSV files in an S3 bucket using pandas mode 68 | 69 | ``` 70 | diffino s3://bucket/before_dataset.csv s3://bucket/after_dataset.csv --mode pandas 71 | ``` 72 | 73 | ### Output diff results to file 74 | 75 | Diffino will try its best to guess your output storage mechanism; for that you need to include `s3://` in the output argument or use the `.csv`, `.xls` and `.xlsx` extensions. 76 | 77 | #### Output to a local CSV file 78 | ``` 79 | diffino file_1.csv file_2.csv --output diff.csv 80 | ``` 81 | 82 | Note: Two files are going to be generated, comparing the left argument file to the right argument file. 
For the example above, 2 files are going to be created: 83 | 84 | * `diff_not_in_right.csv` (rows found only in the left file) 85 | * `diff_not_in_left.csv` (rows found only in the right file) 86 | 87 | #### Avoid creating unnecessary files 88 | 89 | If you want to avoid unnecessary noise, you can prevent diffino from creating resulting files if there are no actual differences with the `--output-only-diffs` flag, like: 90 | ``` 91 | diffino file_1.csv file_2.csv --output diff.csv --output-only-diffs 92 | ``` 93 | 94 | For the above example, if `file_1` has some extra rows that are not present in `file_2`, but `file_2` only has rows that are present in `file_1`, then we are going to end up only with a resulting `diff_not_in_right.csv` file. 95 | 96 | 97 | #### Output to a local Excel file 98 | 99 | When using Excel, output will contain different sheets as well as one summary sheet containing all differences: 100 | 101 | ``` 102 | diffino file_1.csv file_2.csv --output diff.xlsx 103 | ``` 104 | 105 | #### Output to a local JSON file 106 | 107 | ``` 108 | diffino file_1.csv file_2.csv --output diff.json 109 | ``` 110 | 111 | #### Output to a CSV file in S3 112 | 113 | ``` 114 | diffino file_1.csv file_2.csv --output s3://bucket/diff.csv 115 | ``` 116 | 117 | #### Output to an Excel file in S3 118 | When using Excel, output will contain different sheets as well as one summary sheet containing all differences: 119 | 120 | ``` 121 | diffino file_1.csv file_2.csv --output s3://bucket/diff.xlsx 122 | ``` 123 | 124 | #### Output to a JSON file in S3 125 | 126 | ``` 127 | diffino file_1.csv file_2.csv --output s3://bucket/diff.json 128 | ``` 129 | 130 | ## Python module 131 | 132 | Useful if you want to integrate as part of your ETL or as part of your Continuous Integration (CI) builds. 
133 | 134 | ### Get the difference counts using pandas mode 135 | For using all columns: 136 | 137 | ```python 138 | from diffino.models import Diffino 139 | 140 | diffino = Diffino(left='s3://bucket/one.csv', right='s3://bucket/two.csv', mode='pandas') 141 | results = diffino.build_diff() 142 | ``` 143 | 144 | In the above example, the `results` variable contains a tuple with the first index containing 145 | the left differences count and the second index with the right differences count: 146 | 147 | ```python 148 | results[0] 149 | results[1] 150 | ``` 151 | 152 | And for using a subset of columns you can pass a Python list of the column names you want to include: 153 | 154 | ```python 155 | from diffino.models import Diffino 156 | 157 | diffino = Diffino( 158 | left='one.csv', 159 | right='two.csv', 160 | mode='pandas', 161 | cols=['id', 'name'] 162 | ) 163 | results = diffino.build_diff() 164 | ``` 165 | 166 | ## COMING SOON 167 | Different column names? No problemo, that works too! 
168 | 169 | ```python 170 | from diffino.models import Diffino 171 | 172 | diffino = Diffino( 173 | left='one.xlsx', 174 | right='two.xlsx', 175 | mode='pandas', 176 | left_cols=['myColumn'], 177 | right_cols=['my_column'], 178 | ) 179 | results = diffino.build_diff() 180 | ``` 181 | 182 | ## Web App 183 | 184 | Coming soon 185 | 186 | ## API 187 | 188 | Coming soon 189 | -------------------------------------------------------------------------------- /diffino/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntuitiveWebSolutions/diffino/64d2b8830299c1b1ecca8639b882d0562c30d00a/diffino/__init__.py -------------------------------------------------------------------------------- /diffino/cli.py: -------------------------------------------------------------------------------- 1 | from models import Diffino 2 | import argparse 3 | 4 | 5 | def main(): 6 | parser = argparse.ArgumentParser(description="") 7 | parser.add_argument( 8 | "left_dataset", help="Path or S3 loaction of the left data set (CSV, JSON, etc)" 9 | ) 10 | parser.add_argument( 11 | "right_dataset", 12 | help="Path or S3 loaction of the right data set (CSV, JSON, etc)", 13 | ) 14 | parser.add_argument( 15 | "--mode", default="pandas", choices=["pandas", "md5"], help="Pandas or md5" 16 | ) 17 | parser.add_argument( 18 | "--convert-numeric", 19 | action="store_true", 20 | default=False, 21 | help="Whether to convert numeric columns", 22 | ) 23 | parser.add_argument( 24 | "--cols", nargs="+", default=None, help="Columns to be used for comparing" 25 | ) 26 | parser.add_argument( 27 | "--output-only-diffs", 28 | action="store_true", 29 | help="Output only when a difference exists", 30 | ) 31 | parser.add_argument("--output", help="Output file") 32 | 33 | args = parser.parse_args() 34 | 35 | diffino = Diffino( 36 | left=args.left_dataset, 37 | right=args.right_dataset, 38 | output=args.output, 39 | cols=args.cols, 40 | 
convert_numeric=args.convert_numeric, 41 | output_only_diffs=args.output_only_diffs, 42 | ) 43 | 44 | diffino.build_diff() 45 | 46 | 47 | if __name__ == "__main__": 48 | main() 49 | -------------------------------------------------------------------------------- /diffino/constants.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntuitiveWebSolutions/diffino/64d2b8830299c1b1ecca8639b882d0562c30d00a/diffino/constants.py -------------------------------------------------------------------------------- /diffino/exceptions.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntuitiveWebSolutions/diffino/64d2b8830299c1b1ecca8639b882d0562c30d00a/diffino/exceptions.py -------------------------------------------------------------------------------- /diffino/models.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | import logging 3 | import boto3 4 | import pandas as pd 5 | from urlparse import urlparse 6 | 7 | logging.basicConfig(format="%(asctime)s %(message)s", level=logging.INFO) 8 | 9 | 10 | def get_bucket_and_key_from_s3_path(path): 11 | o = urlparse(path, allow_fragments=False) 12 | return (o.netloc, o.path.lstrip("/")) 13 | 14 | 15 | class DataSet: 16 | dfs = [] 17 | md5_hashes = [] 18 | 19 | def __init__(self, location, cols, convert_numeric): 20 | self.location = location 21 | self.cols = cols 22 | self.convert_numeric = convert_numeric 23 | 24 | # Private methods 25 | def _get_from_local_file(self): 26 | logging.info("Reading local file %s", self.location) 27 | return pd.read_csv(self.location, usecols=self.cols) 28 | 29 | def _get_from_local_dir(self): 30 | return self._get_from_local_file() 31 | 32 | def _get_from_s3_file(self): 33 | logging.info("Reading from S3 %s", self.location) 34 | bucket_key = get_bucket_and_key_from_s3_path(self.location) 35 | 36 | s3 = 
boto3.client("s3") 37 | obj = s3.get_object(Bucket=bucket_key[0], Key=bucket_key[1]) 38 | return pd.read_csv(obj["Body"], usecols=self.cols) 39 | 40 | def _get_from_s3_bucket(self): 41 | raise NotImplementedError 42 | 43 | def _get_from_zip_local_file(self): 44 | # Unzip 45 | raise NotImplementedError 46 | 47 | # Public methods 48 | def read(self): 49 | df = None 50 | if "s3://" in self.location: 51 | if self.location.endswith("/"): 52 | df = self._get_from_s3_bucket() 53 | else: 54 | df = self._get_from_s3_file() 55 | else: 56 | if "/" in self.location: 57 | df = self._get_from_local_dir() 58 | else: 59 | df = self._get_from_local_file() 60 | 61 | if self.convert_numeric: 62 | logging.info("Converting to numeric for file %s", self.location) 63 | df.apply(pd.to_numeric, errors="ignore") 64 | return df 65 | 66 | 67 | class Diffino: 68 | """ 69 | Main class that provides the diff functionalities. Specific dataset types (CSV, XLSX, etc) 70 | are provided by classes inheriting from DataSet 71 | 72 | @param left: String with the input dataset to be used (.csv, .xlsx, .xls for local files and s3 url plus extension for AWS S3) 73 | @param right: String with the other input dataset to compare against (.csv, .xlsx, .xls for local files and s3 url plus extension for AWS S3) 74 | @param output: String with the output location (.csv, .xlsx, .xls, .json for local files and s3 url plus extension for AWS S3) 75 | @param convert_numeric: Boolean indicating whether numeric columns should be treated as numbers (in pandas mode). 76 | @param mode: String with the diff mode: 'pandas' or 'md5' 77 | @param cols: List with subset of columns to be used for the diff check. 
78 | @param index_col: Column to be used as index 79 | @return: Nothing is returned 80 | """ 81 | 82 | def __init__(self, **kwargs): 83 | self.left, self._left_dataset = kwargs.get("left"), None 84 | self.right, self._right_dataset = kwargs.get("right"), None 85 | self.output, self._output_dataset = kwargs.get("output"), None 86 | self.convert_numeric = kwargs.get("convert_numeric", True) 87 | self.mode = kwargs.get("mode", "pandas") 88 | self.cols = kwargs.get("cols") 89 | self.cols_left = kwargs.get("cols_left") 90 | self.cols_right = kwargs.get("cols_right") 91 | self.output_only_diffs = kwargs.get("output_only_diffs") 92 | 93 | self.diff_result_left = {} 94 | self.diff_result_right = {} 95 | 96 | # Private methods 97 | def _build_inputs(self): 98 | logging.info("Building inputs") 99 | self._left_dataset = self._build_input(self.left) 100 | self._right_dataset = self._build_input(self.right) 101 | 102 | def _build_input(self, dataset_location): 103 | logging.info("Building dataset for %s", dataset_location) 104 | return DataSet(dataset_location, self.cols, self.convert_numeric).read() 105 | 106 | def _should_print_left(self): 107 | return not self.diff_result_left.empty or ( 108 | self.diff_result_left.empty and not self.output_only_diffs 109 | ) 110 | 111 | def _should_print_right(self): 112 | return not self.diff_result_right.empty or ( 113 | self.diff_result_right.empty and not self.output_only_diffs 114 | ) 115 | 116 | def _save_csv(self, df, output_file, s3=False): 117 | if not s3: 118 | logging.info("Saving result csv file %s", output_file) 119 | df.to_csv(output_file, index=False) 120 | return 121 | 122 | logging.info("Saving result csv file %s to S3", output_file) 123 | 124 | bucket_key = get_bucket_and_key_from_s3_path(output_file) 125 | csv_buffer = BytesIO() 126 | df.to_csv(csv_buffer, index=False) 127 | s3client = boto3.client("s3") 128 | response = s3client.put_object( 129 | Body=csv_buffer.getvalue(), 130 | ContentType="application/vnd.ms-excel", 
131 | Bucket=bucket_key[0], 132 | Key=bucket_key[1], 133 | ) 134 | 135 | def to_csv(self, s3=False): 136 | output_name = self.output.replace(".csv", "") 137 | 138 | if self._should_print_left(): 139 | output_left = output_name + "_not_in_right.csv" 140 | self._save_csv(self.diff_result_left, output_left, s3) 141 | 142 | if self._should_print_right(): 143 | output_right = output_name + "_not_in_left.csv" 144 | self._save_csv(self.diff_result_right, output_right, s3) 145 | 146 | def to_excel(self, s3=False): 147 | raise NotImplementedError 148 | 149 | def to_json(self, s3=False): 150 | raise NotImplementedError 151 | 152 | def to_console(self): 153 | if self._should_print_left(): 154 | print("=============== Differences found on left file ===============") 155 | print(self.diff_result_left.to_string()) 156 | 157 | if self._should_print_right(): 158 | print("=============== Differences found on right file ===============") 159 | print(self.diff_result_right.to_string()) 160 | 161 | def _build_output(self): 162 | logging.info("Building output started") 163 | if not self.output: 164 | logging.info("Building output to console") 165 | self.to_console() 166 | return 167 | if ".csv" in self.output: 168 | logging.info("Building output to csv") 169 | if "s3://" in self.output: 170 | self.to_csv(s3=True) 171 | else: 172 | self.to_csv(s3=False) 173 | elif ".xslx" in self.output or ".xls" in self.output: 174 | logging.info("Building output to Excel") 175 | if "s3://" in self.output: 176 | self.to_excel(s3=True) 177 | else: 178 | self.to_excel(s3=False) 179 | elif ".json" in self.output: 180 | logging.info("Building output to json") 181 | if "s3://" in self.output: 182 | self.to_json(s3=True) 183 | else: 184 | self.to_json(s3=False) 185 | else: 186 | raise UserWarning("Invalid output format") 187 | self._output_dataset = None 188 | 189 | # Public methods 190 | def build_diff(self): 191 | if not self.left or not self.right: 192 | print("{}, {}".format(self.left, self.right)) 193 | 
raise UserWarning("Left and right datasets are both required") 194 | 195 | self._build_inputs() 196 | 197 | logging.info("Performing merge of datasets in preparation for diff") 198 | merged_dataset = pd.merge( 199 | left=self._left_dataset, 200 | right=self._right_dataset, 201 | how="outer", 202 | indicator="exists", 203 | ) 204 | 205 | exists_left = merged_dataset["exists"] == "left_only" 206 | exists_right = merged_dataset["exists"] == "right_only" 207 | 208 | logging.info("Creating diff result left") 209 | self.diff_result_left = merged_dataset[exists_left].drop(["exists"], axis=1) 210 | 211 | logging.info("Creating diff result right") 212 | self.diff_result_right = merged_dataset[exists_right].drop(["exists"], axis=1) 213 | 214 | self._build_output() 215 | 216 | return (len(self.diff_result_left.index), len(self.diff_result_right.index)) 217 | -------------------------------------------------------------------------------- /doc/README.md: -------------------------------------------------------------------------------- 1 | ## Models 2 | 3 | DataSet 4 | - location 5 | - read() 6 | 7 | CsvDataSet < Dataset 8 | - separator 9 | 10 | ExcelDataSet < Dataset 11 | - sheet_name 12 | - skip_cols 13 | 14 | Diffino 15 | - left 16 | - right 17 | - output 18 | - convert_numeric 19 | - mode 20 | - cols 21 | - _build_inputs() 22 | - build_diff() 23 | -------------------------------------------------------------------------------- /docker-compose-test.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntuitiveWebSolutions/diffino/64d2b8830299c1b1ecca8639b882d0562c30d00a/docker-compose-test.yml -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntuitiveWebSolutions/diffino/64d2b8830299c1b1ecca8639b882d0562c30d00a/docker-compose.yml 
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas==0.19.2 2 | boto3==1.7.3 -------------------------------------------------------------------------------- /requirements/requirements-lint.txt: -------------------------------------------------------------------------------- 1 | black==19.3b0 2 | -r ../requirements.txt 3 | -------------------------------------------------------------------------------- /requirements/requirements-test.txt: -------------------------------------------------------------------------------- 1 | numpy==1.16.2 2 | pytest==4.4.0 3 | pytest-cov==2.6.1 4 | moto==1.3.8 5 | -r ../requirements.txt 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup 3 | 4 | VERSION = "0.2.1" 5 | 6 | def read(fname): 7 | return open(os.path.join(os.path.dirname(__file__), fname)).read() 8 | 9 | setup( 10 | name="diffino", 11 | version=VERSION, 12 | packages=["diffino"], 13 | include_package_data=True, 14 | install_requires=[ 15 | "pandas==0.19.2", 16 | "boto3==1.7.3" 17 | ], 18 | entry_points={'console_scripts': ['diffino = diffino.cli:main']}, 19 | author="BriteCore", 20 | description="Diffing tools for comparing datasets in CSV, XLSX and other formats", 21 | long_description=read('README.md'), 22 | long_description_content_type="text/markdown", 23 | keywords="diffing comparing csv excel json", 24 | url="https://github.com/IntuitiveWebSolutions/diffino" 25 | ) 26 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/IntuitiveWebSolutions/diffino/64d2b8830299c1b1ecca8639b882d0562c30d00a/tests/__init__.py -------------------------------------------------------------------------------- /tests/sample_left.csv: -------------------------------------------------------------------------------- 1 | address,state,zip,name,id 2 | one st,CA,66661,name one,1 3 | two st,CA,66662,name two,2 4 | three st,CA,66663,name three,3 5 | four st,CA,66664,name four,4 6 | five st,CA,66665,name five,5 7 | six st,CA,66666,name six,6 8 | seven st,CA,66667,name seven,7 9 | eight st,CA,66668,name eight,8 10 | nine st,CA,66669,name nine,9 11 | ten st,CA,66610,name ten,10 -------------------------------------------------------------------------------- /tests/sample_right.csv: -------------------------------------------------------------------------------- 1 | address,state,zip,name,id 2 | one st,CA,66661,name one,1 3 | two st,CA,66662,name two,2 4 | three st,CA,66663,name three,3 5 | four st,CA,66664,name four,4 6 | five st,CA,66665,name five,5 7 | six st,CA,66666,name six,6 8 | seven st,CA,66667,name seven,7 9 | eight st,CA,66668,name eight,8 10 | nine st,CA,66669,name nine,9 11 | eleven st,CA,66611,name eleven,11 -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | from subprocess import Popen, PIPE 2 | 3 | 4 | def test_md5_csv(): 5 | p = Popen( 6 | ["diffino", "before_dataset.csv", "after_dataset.csv", "--mode", "md5"], 7 | stdout=PIPE, 8 | stderr=PIPE, 9 | ) 10 | stdout, stderr = p.communicate() 11 | 12 | raise Exception("Finish test!") 13 | 14 | 15 | def test_md5_zip(): 16 | p = Popen( 17 | ["diffino", "before_dataset.zip", "after_dataset.zip", "--mode", "md5"], 18 | stdout=PIPE, 19 | stderr=PIPE, 20 | ) 21 | stdout, stderr = p.communicate() 22 | 23 | raise Exception("Finish test!") 24 | 25 | 26 | def test_s3_csv(): 27 | p = Popen( 28 | [ 
29 | "diffino", 30 | "s3://bucket/before_dataset.csv", 31 | "s3://bucket/after_dataset.csv", 32 | "--mode", 33 | "md5", 34 | ], 35 | stdout=PIPE, 36 | stderr=PIPE, 37 | ) 38 | stdout, stderr = p.communicate() 39 | 40 | raise Exception("Finish test!") 41 | 42 | 43 | def test_s3_bucket_md5(): 44 | p = Popen( 45 | [ 46 | "diffino", 47 | "s3://bucket/before_dataset", 48 | "s3://bucket/after_dataset", 49 | "--mode", 50 | "md5", 51 | ], 52 | stdout=PIPE, 53 | stderr=PIPE, 54 | ) 55 | stdout, stderr = p.communicate() 56 | 57 | raise Exception("Finish test!") 58 | 59 | 60 | def test_pandas_csv(): 61 | p = Popen( 62 | ["diffino", "before_dataset.csv", "after_dataset.csv", "--mode", "pandas"], 63 | stdout=PIPE, 64 | stderr=PIPE, 65 | ) 66 | stdout, stderr = p.communicate() 67 | 68 | raise Exception("Finish test!") 69 | 70 | 71 | def test_pandas_csv_numeric_false(): 72 | p = Popen( 73 | [ 74 | "diffino", 75 | "before_dataset.csv", 76 | "after_dataset.csv", 77 | "--mode", 78 | "pandas", 79 | "--convert-numeric", 80 | "false", 81 | ], 82 | stdout=PIPE, 83 | stderr=PIPE, 84 | ) 85 | stdout, stderr = p.communicate() 86 | 87 | raise Exception("Finish test!") 88 | 89 | 90 | def test_pandas_csv_cols(): 91 | p = Popen( 92 | [ 93 | "diffino before_dataset.csv", 94 | "after_dataset.csv", 95 | "--mode pandas", 96 | "--cols", 97 | "id", 98 | "name", 99 | ], 100 | stdout=PIPE, 101 | stderr=PIPE, 102 | ) 103 | stdout, stderr = p.communicate() 104 | 105 | raise Exception("Finish test!") 106 | 107 | 108 | def test_pandas_output_csv_local(): 109 | p = Popen( 110 | ["diffino", "file_1.csv", "file_2.csv", "--output", "diff.csv"], 111 | stdout=PIPE, 112 | stderr=PIPE, 113 | ) 114 | stdout, stderr = p.communicate() 115 | 116 | raise Exception("Finish test!") 117 | 118 | 119 | def test_pandas_output_xlsx_local(): 120 | p = Popen( 121 | ["diffino", "file_1.csv", "file_2.csv", "--output", "diff.xlsx"], 122 | stdout=PIPE, 123 | stderr=PIPE, 124 | ) 125 | stdout, stderr = p.communicate() 126 | 127 | 
raise Exception("Finish test!") 128 | 129 | 130 | def test_pandas_output_json_local(): 131 | p = Popen( 132 | ["diffino", "file_1.csv", "file_2.csv", "--output", "diff.json"], 133 | stdout=PIPE, 134 | stderr=PIPE, 135 | ) 136 | stdout, stderr = p.communicate() 137 | 138 | raise Exception("Finish test!") 139 | 140 | 141 | def test_pandas_output_csv_s3(): 142 | p = Popen( 143 | ["diffino", "file_1.csv", "file_2.csv", "--output", "s3://bucket/diff.csv"], 144 | stdout=PIPE, 145 | stderr=PIPE, 146 | ) 147 | stdout, stderr = p.communicate() 148 | 149 | raise Exception("Finish test!") 150 | 151 | 152 | def test_pandas_output_xlsx_s3(): 153 | p = Popen( 154 | ["diffino", "file_1.csv", "file_2.csv", "--output", "s3://bucket/diff.xlsx"], 155 | stdout=PIPE, 156 | stderr=PIPE, 157 | ) 158 | stdout, stderr = p.communicate() 159 | 160 | raise Exception("Finish test!") 161 | 162 | 163 | def test_pandas_output_json_s3(): 164 | p = Popen( 165 | ["diffino", "file_1.csv", "file_2.csv", "--output", "s3://bucket/diff.json"], 166 | stdout=PIPE, 167 | stderr=PIPE, 168 | ) 169 | stdout, stderr = p.communicate() 170 | 171 | raise Exception("Finish test!") 172 | -------------------------------------------------------------------------------- /tests/test_md5.py: -------------------------------------------------------------------------------- 1 | from diffino.models import Diffino 2 | 3 | 4 | def test_single_file_csv_local_md5(): 5 | diff = Diffino(mode="md5", left="/tmp/one.csv", right="/tmp/two.csv") 6 | results = diff.build_diff() 7 | assert results 8 | 9 | 10 | def test_single_file_excel_local_md5(): 11 | diff = Diffino(mode="md5", left="/tmp/one.xlsx", right="/tmp/two.xlsx") 12 | results = diff.build_diff() 13 | assert results 14 | 15 | 16 | def test_single_file_csv_s3_md5(): 17 | diff = Diffino( 18 | mode="md5", left="s3://fake-bucket/one.csv", right="s3://fake-bucket/two.csv" 19 | ) 20 | results = diff.build_diff() 21 | assert results 22 | 23 | 24 | def test_single_file_excel_s3_md5(): 
25 | diff = Diffino( 26 | mode="md5", left="s3://fake-bucket/one.xlsx", right="s3://fake-bucket/two.xlsx" 27 | ) 28 | results = diff.build_diff() 29 | assert results 30 | 31 | 32 | def test_multiple_files_dir_md5(): 33 | diff = Diffino(mode="md5", left="/tmp/one", right="/tmp/two") 34 | results = diff.build_diff() 35 | assert results 36 | 37 | 38 | def test_multiple_files_zip_md5(): 39 | diff = Diffino(mode="md5", left="/tmp/one.zip", right="/tmp/two.zip") 40 | results = diff.build_diff() 41 | assert results 42 | 43 | 44 | def test_multiple_files_s3_md5(): 45 | diff = Diffino( 46 | mode="md5", left="s3://fake-bucket/one", right="s3://fake-bucket/two" 47 | ) 48 | results = diff.build_diff() 49 | assert results 50 | 51 | 52 | def test_output_csv_md5(): 53 | diff = Diffino( 54 | mode="md5", left="/tmp/one.csv", right="/tmp/two.csv", output="/tmp/diff.csv" 55 | ) 56 | diff.build_diff() 57 | raise Exception("Finish test!") 58 | 59 | 60 | def test_output_xlsx_md5(): 61 | diff = Diffino( 62 | mode="md5", left="/tmp/one.csv", right="/tmp/two.csv", output="/tmp/diff.xslx" 63 | ) 64 | diff.build_diff() 65 | raise Exception("Finish test!") 66 | 67 | 68 | def test_output_json_md5(): 69 | diff = Diffino( 70 | mode="md5", left="/tmp/one.csv", right="/tmp/two.csv", output="/tmp/diff.json" 71 | ) 72 | diff.build_diff() 73 | raise Exception("Finish test!") 74 | 75 | 76 | def test_output_in_s3_md5(): 77 | diff = Diffino( 78 | mode="md5", 79 | left="/tmp/one.csv", 80 | right="/tmp/two.csv", 81 | output="s3://fake-bucket/diff.json", 82 | ) 83 | diff.build_diff() 84 | raise Exception("Finish test!") 85 | -------------------------------------------------------------------------------- /tests/test_models.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import io 3 | import os 4 | import numpy as np 5 | import numpy.testing as npt 6 | import pandas as pd 7 | 8 | from diffino.models import DataSet, Diffino 9 | from moto import 
def assert_frames_equal(actual, expected):
    """
    Compare DataFrame items by index and column and
    raise AssertionError if any item is not equal.

    Ordering is unimportant, items are compared only by label.
    NaN and infinite values are supported.

    Only the labels present in ``expected`` are visited, so rows or columns
    that exist solely in ``actual`` are not checked.

    Parameters
    ----------
    actual : pandas.DataFrame
    expected : pandas.DataFrame

    Raises
    ------
    AssertionError
        If either input is not a DataFrame, if a row or column label of
        ``expected`` is missing from ``actual``, or if a pair of items
        differs (the message names the offending column and row).

    """
    assert isinstance(actual, pd.DataFrame) and isinstance(
        expected, pd.DataFrame
    ), "Inputs must both be pandas DataFrames."

    for i, exp_row in expected.iterrows():
        assert i in actual.index, "Expected row {!r} not found.".format(i)

        act_row = actual.loc[i]

        # Series.items() works on old and new pandas alike; the former
        # iteritems() spelling was removed in pandas 2.0.
        for j, exp_item in exp_row.items():
            assert j in act_row.index, "Expected column {!r} not found.".format(j)

            act_item = act_row[j]

            try:
                # numpy's assert_equal treats NaN == NaN as a match.
                npt.assert_equal(act_item, exp_item)
            except AssertionError as e:
                # str(e) instead of e.message: the attribute does not exist
                # on Python 3 exceptions and would mask the real failure
                # with an AttributeError.
                raise AssertionError(
                    str(e) + "\n\nColumn: {!r}\nRow: {!r}".format(j, i)
                )
def test_dataset_read_from_local_file(self):
    """DataSet.read() on the bundled sample_left.csv returns a non-empty DataFrame."""
    location = os.path.join(os.path.dirname(__file__), "sample_left.csv")
    dataset = DataSet(location, None, False)
    df = dataset.read()
    assert isinstance(df, pd.DataFrame)
    # Truthiness check rather than an identity comparison against True.
    assert not df.empty

def test_diffino_diff_is_working(self, tmpdir):
    """Diff the two sample CSVs and compare both output files with the expected rows."""
    outputs = self._create_diff(str(tmpdir))

    expected_data_not_in_left = u"""address,state,zip,name,id
eleven st,CA,66611,name eleven,11"""

    expected_data_not_in_right = u"""address,state,zip,name,id
ten st,CA,66610,name ten,10"""

    expected_df_not_in_left = pd.read_csv(io.StringIO(expected_data_not_in_left))
    expected_df_not_in_right = pd.read_csv(io.StringIO(expected_data_not_in_right))

    # outputs[1] / outputs[2] are the "not in left" / "not in right" file paths.
    result_not_in_left = pd.read_csv(outputs[1])
    result_not_in_right = pd.read_csv(outputs[2])

    assert_frames_equal(expected_df_not_in_left, result_not_in_left)
    assert_frames_equal(expected_df_not_in_right, result_not_in_right)

def test_diffino_no_diff(self, tmpdir):
    """Diff sample_left.csv against itself; both output files must hold no rows beyond the header."""
    outputs = self._create_diff(str(tmpdir), right_csv="sample_left.csv")

    expected_data = u"address,state,zip,name,id"

    expected_df = pd.read_csv(io.StringIO(expected_data))
    resulting_left_csv = pd.read_csv(outputs[1])
    resulting_right_csv = pd.read_csv(outputs[2])

    assert_frames_equal(expected_df, resulting_left_csv)
    assert_frames_equal(expected_df, resulting_right_csv)

def test_diffino_build_output_to_console(self, tmpdir, capsys):
    """With console output selected, both "Differences found" banners appear on stdout."""
    self._create_diff(str(tmpdir), to_console=True)
    captured = capsys.readouterr()
    assert "Differences found on left file" in captured.out
    assert "Differences found on right file" in captured.out

def test_diffino_diff_with_selected_columns(self, tmpdir):
    """Restrict the diff to the address and id columns and check both output files."""
    outputs = self._create_diff(str(tmpdir), cols=["address", "id"])

    expected_data_right = u"""address,id
ten st,10"""
    expected_data_left = u"""address,id
eleven st,11"""

    expected_df_left = pd.read_csv(io.StringIO(expected_data_left))
    expected_df_right = pd.read_csv(io.StringIO(expected_data_right))
    resulting_left_csv = pd.read_csv(outputs[1])
    resulting_right_csv = pd.read_csv(outputs[2])

    assert_frames_equal(expected_df_left, resulting_left_csv)
    assert_frames_equal(expected_df_right, resulting_right_csv)

def test_diffino_output_only_diffs_console(self, tmpdir, capsys):
    """With output_only_diffs and identical inputs, neither banner is printed."""
    self._create_diff(
        str(tmpdir),
        to_console=True,
        right_csv="sample_left.csv",
        output_only_diffs=True,
    )
    captured = capsys.readouterr()
    assert "Differences found on left file" not in captured.out
    assert "Differences found on right file" not in captured.out

def test_diffino_output_only_diffs_csv(self, tmpdir):
    """With output_only_diffs and identical inputs, neither output file is created."""
    outputs = self._create_diff(
        str(tmpdir), right_csv="sample_left.csv", output_only_diffs=True
    )
    assert not os.path.isfile(outputs[1])
    assert not os.path.isfile(outputs[2])

def test_diffino_return_diff_count(self, tmpdir):
    """build_diff() reports one differing row for each side of the sample files."""
    outputs = self._create_diff(str(tmpdir))
    # outputs[3] is whatever build_diff() returned.
    # NOTE(review): presumably a (left_count, right_count) pair - confirm
    # against Diffino.build_diff.
    rows_count = outputs[3]
    # Compare by value: `is 1` only worked through CPython's small-int cache.
    assert rows_count[0] == 1
    # The original checked index 0 twice, leaving the second count unverified.
    assert rows_count[1] == 1
def test_single_file_csv_local_pandas():
    """Diff two local CSV paths in pandas mode and expect a truthy result."""
    diff = Diffino(mode="pandas", left="/tmp/one.csv", right="/tmp/two.csv")
    results = diff.build_diff()
    assert results


def test_single_file_excel_local_pandas():
    """Diff two local .xlsx paths in pandas mode and expect a truthy result."""
    diff = Diffino(mode="pandas", left="/tmp/one.xlsx", right="/tmp/two.xlsx")
    results = diff.build_diff()
    assert results


def test_single_file_csv_s3_pandas():
    """Diff two s3:// CSV locations in pandas mode and expect a truthy result."""
    diff = Diffino(
        mode="pandas", left="s3://fake-bucket/one.csv", right="s3://fake-bucket/two.csv"
    )
    results = diff.build_diff()
    assert results


def test_multiple_files_dir_pandas():
    """Diff two local directory paths in pandas mode and expect a truthy result."""
    diff = Diffino(mode="pandas", left="/tmp/one", right="/tmp/two")
    results = diff.build_diff()
    assert results


def test_multiple_files_zip_pandas():
    """Diff two local .zip paths in pandas mode and expect a truthy result."""
    diff = Diffino(mode="pandas", left="/tmp/one.zip", right="/tmp/two.zip")
    results = diff.build_diff()
    assert results


def test_multiple_files_s3_pandas():
    """Diff two extension-less s3:// locations in pandas mode and expect a truthy result."""
    diff = Diffino(
        mode="pandas", left="s3://fake-bucket/one", right="s3://fake-bucket/two"
    )
    results = diff.build_diff()
    assert results


def test_specific_cols():
    """Diff two local CSVs restricted to the id and name columns; expect a truthy result."""
    diff = Diffino(
        mode="pandas", left="/tmp/one.csv", right="/tmp/two.csv", cols=["id", "name"]
    )
    results = diff.build_diff()
    assert results


def test_convert_numeric():
    """Diff two local CSVs with convert_numeric disabled; expect a truthy result."""
    diff = Diffino(
        mode="pandas",
        left="/tmp/one_specific_cols.csv",
        right="/tmp/two_specific_cols.csv",
        convert_numeric=False,
    )
    results = diff.build_diff()
    assert results


def test_output_csv_pandas():
    """Placeholder: build a pandas diff with a local CSV output target.

    Fails unconditionally until real assertions on the output are written.
    """
    diff = Diffino(
        mode="pandas", left="/tmp/one.csv", right="/tmp/two.csv", output="/tmp/diff.csv"
    )
    diff.build_diff()
    raise Exception("Finish test!")


def test_output_xlsx_pandas():
    """Placeholder: build a pandas diff with a local .xlsx output target.

    Fails unconditionally until real assertions on the output are written.
    """
    # The extension used to be spelled ".xslx", so the xlsx output named by
    # this test was never the one being requested.
    diff = Diffino(
        mode="pandas",
        left="/tmp/one.csv",
        right="/tmp/two.csv",
        output="/tmp/diff.xlsx",
    )
    diff.build_diff()
    raise Exception("Finish test!")


def test_output_json_pandas():
    """Placeholder: build a pandas diff with a local JSON output target.

    Fails unconditionally until real assertions on the output are written.
    """
    diff = Diffino(
        mode="pandas",
        left="/tmp/one.csv",
        right="/tmp/two.csv",
        output="/tmp/diff.json",
    )
    diff.build_diff()
    raise Exception("Finish test!")


def test_output_in_s3_pandas():
    """Placeholder: build a pandas diff whose output target is an s3:// JSON key.

    Fails unconditionally until real assertions on the output are written.
    """
    diff = Diffino(
        mode="pandas",
        left="/tmp/one.csv",
        right="/tmp/two.csv",
        output="s3://fake-bucket/diff.json",
    )
    diff.build_diff()
    raise Exception("Finish test!")