├── .flake8
├── .gitignore
├── .travis.yml
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── diffino
    ├── __init__.py
    ├── cli.py
    ├── constants.py
    ├── exceptions.py
    └── models.py
├── doc
    └── README.md
├── docker-compose-test.yml
├── docker-compose.yml
├── requirements.txt
├── requirements
    ├── requirements-lint.txt
    └── requirements-test.txt
├── setup.py
└── tests
    ├── __init__.py
    ├── sample_left.csv
    ├── sample_right.csv
    ├── test_cli.py
    ├── test_md5.py
    ├── test_models.py
    └── test_pandas.py


/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 100
3 | 
4 | ignore = E501
5 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | shippable
 2 | .DS_Store
 3 | 
 4 | # Byte-compiled / optimized / DLL files
 5 | __pycache__/
 6 | *.py[cod]
 7 | *$py.class
 8 | 
 9 | # C extensions
10 | *.so
11 | 
12 | # Distribution / packaging
13 | .Python
14 | env/
15 | build/
16 | develop-eggs/
17 | dist/
18 | downloads/
19 | eggs/
20 | .eggs/
21 | lib/
22 | lib64/
23 | parts/
24 | sdist/
25 | var/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 | 
30 | # PyInstaller
31 | #  Usually these files are written by a python script from a template
32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 | 
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 | 
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *,cover
49 | .hypothesis/
50 | 
51 | # Translations
52 | *.mo
53 | *.pot
54 | 
55 | # Django stuff:
56 | *.log
57 | local_settings.py
58 | 
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 | 
63 | # Scrapy stuff:
64 | .scrapy
65 | 
66 | # Sphinx documentation
67 | docs/_build/
68 | 
69 | # PyBuilder
70 | target/
71 | 
72 | # IPython Notebook
73 | .ipynb_checkpoints
74 | 
75 | # pyenv
76 | .python-version
77 | 
78 | # celery beat schedule file
79 | celerybeat-schedule
80 | 
81 | # dotenv
82 | .env
83 | 
84 | # virtualenv
85 | .venv/
86 | venv/
87 | venv27/
88 | ENV/
89 | 
90 | # Spyder project settings
91 | .spyderproject
92 | 
93 | # Rope project settings
94 | .ropeproject
95 | 
96 | # Editors
97 | .idea/
98 | .vscode/
99 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | 
 3 | jobs:
 4 |   include:
 5 |     - stage: lint
 6 |       script: black diffino tests --check
 7 |       python:
 8 |         - 3.6
 9 |       install:
10 |         - pip install -e .
11 |         - pip install -r requirements/requirements-lint.txt
12 |     - stage: test
13 |       before_script:
14 |         - export BOTO_CONFIG=/dev/null
15 |       script: pytest tests/test_models.py
16 |       python:
17 |         - 2.7
18 |         - 3.6
19 |       install:
20 |         - pip install -e .
21 |         - pip install -r requirements/requirements-test.txt
22 | 
23 | stages:
24 |   - lint
25 |   - test


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | Coming soon
2 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License
 2 | 
 3 | Copyright (c) 2010-2017 Google, Inc. http://angularjs.org
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | diffino
  2 | ====
  3 | [![Build Status](https://travis-ci.com/IntuitiveWebSolutions/diffino.svg?branch=master)](https://travis-ci.com/IntuitiveWebSolutions/diffino)
  4 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/ambv/black)
  5 | 
  6 | Diffing tools for comparing datasets in CSV, XLSX and other formats available as CLI app, API, web app and module. Powered by the awesome Pandas library for Python.
  7 | 
  8 | ### Done
  9 | - Install as CLI app
 10 | - Install and use as python module
 11 | - Compare two CSV datasets using Pandas where you can output differences row by row
 12 | - Use the following inputs for your datasets:
 13 |   - Local CSV file (pandas mode)
 14 |   - CSV file in S3 (pandas mode)
 15 | - Define a subset of columns to use for comparing/diffing (only works with pandas mode, not supported for MD5 comparison)
 16 | - Output differences to:
 17 |   - Console (print)
 18 |   - CSV file
 19 | 
 20 | ### To-Do (ROADMAP)
 21 | - Compare one or more CSV datasets using MD5 hash of the files
 22 | - Compare one or more XLSX datasets using Pandas where you can output differences row by row
 23 | - Use the following inputs for your datasets:
 24 |   - Local file in CSV MD5
 25 |   - Local file in XLSX (only for pandas mode)
 26 |   - Local directory with CSVs or XLSX files (for both MD5 and pandas modes)
 27 |   - ZIP file with CSVs or XLSX files (only for pandas mode)
 28 |   - File in S3 for MD5
 29 |   - Bucket in S3 (for both MD5 and pandas modes)
 30 | - Output differences to:
 31 |   - XLSX file
 32 |   - JSON file
 33 | 
 34 | ## Install
 35 | 
 36 | To install as module and CLI:
 37 | 
 38 | ```
 39 | pip install diffino
 40 | ```
 41 | 
 42 | ## CLI
 43 | 
 44 | Diffino will try its best to guess your input storage mechanisms; for that you need to include `s3://` in the input argument and/or use the `.csv`, `.xls` and `.xlsx` extensions.
 45 | 
 46 | ### Compare using pandas
 47 | 
 48 | MD5 is only useful for knowing two CSV datasets are not the same but it's not useful for knowing which are the actual differences among those. For that you can use the pandas mode which will output the differences row by row.
 49 | The same commands shown earlier for MD5 are available, you need to pass the `--mode pandas` argument for using pandas. **By default Pandas mode is used so this argument can be omitted**:
 50 | 
 51 | ```
 52 | diffino before_dataset.csv after_dataset.csv --mode pandas
 53 | ```
 54 | 
 55 | When using pandas mode, Diffino can try to convert numeric columns; from the CLI this is enabled by passing the `--convert-numeric` flag:
 56 | 
 57 | ```
 58 | diffino before_dataset.csv after_dataset.csv --convert-numeric
 59 | ```
 60 | 
 61 | You can define the columns to be used for checking the diffs:
 62 | 
 63 | ```
 64 | diffino before_dataset.csv after_dataset.csv --cols id name
 65 | ```
 66 | 
 67 | #### Compare two CSV files in an S3 bucket using pandas mode
 68 | 
 69 | ```
 70 | diffino s3://bucket/before_dataset.csv s3://bucket/after_dataset.csv --mode pandas
 71 | ```
 72 | 
 73 | ### Output diff results to file
 74 | 
 75 | Diffino will try its best to guess your output storage mechanism; for that you need to include `s3://` in the output argument or use the `.csv`, `.xls` and `.xlsx` extensions.
 76 | 
 77 | #### Output to a local CSV file
 78 | ```
 79 | diffino file_1.csv file_2.csv --output diff.csv
 80 | ```
 81 | 
 82 | Note: Two files are going to be generated, comparing the left argument file to the right argument file. For the example above, 2 files are going to be created:
 83 | 
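 84 | * `diff_not_in_right.csv` (rows of the left file that are missing from the right file)
 85 | * `diff_not_in_left.csv` (rows of the right file that are missing from the left file)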
 86 | 
 87 | #### Avoid creating unnecessary files
 88 | 
 89 | If you want to avoid unnecessary noise, you can prevent diffino from creating resulting files if there are no actual differences with the `--output-only-diffs` flag, like
 90 | ```
 91 | diffino file_1.csv file_2.csv --output diff.csv --output-only-diffs
 92 | ```
 93 | 
 94 | For the above example, if `file_1` has some extra rows that are not present in `file_2`, but `file_2` only has rows that are present in `file_1`, then we are going to end up only with a resulting `diff_not_in_right.csv` file.
 95 | 
 96 | 
 97 | #### Output to a local Excel file
 98 | 
 99 | When using Excel, output will contain different sheets as well as one summary sheet containing all differences:
100 | 
101 | ```
102 | diffino file_1.csv file_2.csv --output diff.xlsx
103 | ```
104 | 
105 | #### Output to a local JSON file
106 | 
107 | ```
108 | diffino file_1.csv file_2.csv --output diff.json
109 | ```
110 | 
111 | #### Output to a CSV file in S3
112 | 
113 | ```
114 | diffino file_1.csv file_2.csv --output s3://bucket/diff.csv
115 | ```
116 | 
117 | #### Output to an Excel file in S3
118 | When using Excel, output will contain different sheets as well as one summary sheet containing all differences:
119 | 
120 | ```
121 | diffino file_1.csv file_2.csv --output s3://bucket/diff.xlsx
122 | ```
123 | 
124 | #### Output to a JSON file in S3
125 | 
126 | ```
127 | diffino file_1.csv file_2.csv --output s3://bucket/diff.json
128 | ```
129 | 
130 | ## Python module
131 | 
132 | Useful if you want to integrate as part of your ETL or as part of your Continuous Integration (CI) builds.
133 | 
134 | ### Get a dictionary with differences using pandas mode
135 | For using all columns:
136 | 
137 | ```python
138 | from diffino.models import Diffino
139 | 
140 | diffino = Diffino(left='s3://bucket/one.csv', right='s3://bucket/two.csv', mode='pandas')
141 | results = diffino.build_diff()
142 | ```
143 | 
144 | In the above example, the `results` variable contains a tuple whose first element is
145 | the left differences count and whose second element is the right differences count:
146 | 
147 | ```python
148 | results[0]
149 | results[1]
150 | ```
151 | 
152 | And for using a subset of columns you can pass a Python list with the names of the columns you want to include:
153 | 
154 | ```python
155 | from diffino.models import Diffino
156 | 
157 | diffino = Diffino(
158 |   left='one.csv',
159 |   right='two.csv',
160 |   mode='pandas',
161 |   cols=['id', 'name']
162 | )
163 | results = diffino.build_diff()
164 | ```
165 | 
166 | ## COMING SOON
167 | Different column names? No problemo that works too! 
168 | 
169 | ```python
170 | from diffino.models import Diffino
171 | 
172 | diffino = Diffino(
173 |   left='one.xlsx',
174 |   right='two.xlsx',
175 |   mode='pandas',
176 |   left_cols=['myColumn'],
177 |   right_cols=['my_column'],
178 | )
179 | results = diffino.build_diff()
180 | ```
181 | 
182 | ## Web App
183 | 
184 | Coming soon
185 | 
186 | ## API
187 | 
188 | Coming soon
189 | 


--------------------------------------------------------------------------------
/diffino/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IntuitiveWebSolutions/diffino/64d2b8830299c1b1ecca8639b882d0562c30d00a/diffino/__init__.py


--------------------------------------------------------------------------------
/diffino/cli.py:
--------------------------------------------------------------------------------
 1 | from models import Diffino
 2 | import argparse
 3 | 
 4 | 
 5 | def main():
 6 |     parser = argparse.ArgumentParser(description="")
 7 |     parser.add_argument(
 8 |         "left_dataset", help="Path or S3 loaction of the left data set (CSV, JSON, etc)"
 9 |     )
10 |     parser.add_argument(
11 |         "right_dataset",
12 |         help="Path or S3 loaction of the right data set (CSV, JSON, etc)",
13 |     )
14 |     parser.add_argument(
15 |         "--mode", default="pandas", choices=["pandas", "md5"], help="Pandas or md5"
16 |     )
17 |     parser.add_argument(
18 |         "--convert-numeric",
19 |         action="store_true",
20 |         default=False,
21 |         help="Whether to convert numeric columns",
22 |     )
23 |     parser.add_argument(
24 |         "--cols", nargs="+", default=None, help="Columns to be used for comparing"
25 |     )
26 |     parser.add_argument(
27 |         "--output-only-diffs",
28 |         action="store_true",
29 |         help="Output only when a difference exists",
30 |     )
31 |     parser.add_argument("--output", help="Output file")
32 | 
33 |     args = parser.parse_args()
34 | 
35 |     diffino = Diffino(
36 |         left=args.left_dataset,
37 |         right=args.right_dataset,
38 |         output=args.output,
39 |         cols=args.cols,
40 |         convert_numeric=args.convert_numeric,
41 |         output_only_diffs=args.output_only_diffs,
42 |     )
43 | 
44 |     diffino.build_diff()
45 | 
46 | 
47 | if __name__ == "__main__":
48 |     main()
49 | 


--------------------------------------------------------------------------------
/diffino/constants.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IntuitiveWebSolutions/diffino/64d2b8830299c1b1ecca8639b882d0562c30d00a/diffino/constants.py


--------------------------------------------------------------------------------
/diffino/exceptions.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IntuitiveWebSolutions/diffino/64d2b8830299c1b1ecca8639b882d0562c30d00a/diffino/exceptions.py


--------------------------------------------------------------------------------
/diffino/models.py:
--------------------------------------------------------------------------------
import logging
from io import BytesIO

import boto3
import pandas as pd

try:
    from urllib.parse import urlparse  # Python 3
except ImportError:
    from urlparse import urlparse  # Python 2
  6 | 
  7 | logging.basicConfig(format="%(asctime)s %(message)s", level=logging.INFO)
  8 | 
  9 | 
 10 | def get_bucket_and_key_from_s3_path(path):
 11 |     o = urlparse(path, allow_fragments=False)
 12 |     return (o.netloc, o.path.lstrip("/"))
 13 | 
 14 | 
 15 | class DataSet:
 16 |     dfs = []
 17 |     md5_hashes = []
 18 | 
 19 |     def __init__(self, location, cols, convert_numeric):
 20 |         self.location = location
 21 |         self.cols = cols
 22 |         self.convert_numeric = convert_numeric
 23 | 
 24 |     # Private methods
 25 |     def _get_from_local_file(self):
 26 |         logging.info("Reading local file %s", self.location)
 27 |         return pd.read_csv(self.location, usecols=self.cols)
 28 | 
 29 |     def _get_from_local_dir(self):
 30 |         return self._get_from_local_file()
 31 | 
 32 |     def _get_from_s3_file(self):
 33 |         logging.info("Reading from S3 %s", self.location)
 34 |         bucket_key = get_bucket_and_key_from_s3_path(self.location)
 35 | 
 36 |         s3 = boto3.client("s3")
 37 |         obj = s3.get_object(Bucket=bucket_key[0], Key=bucket_key[1])
 38 |         return pd.read_csv(obj["Body"], usecols=self.cols)
 39 | 
 40 |     def _get_from_s3_bucket(self):
 41 |         raise NotImplementedError
 42 | 
 43 |     def _get_from_zip_local_file(self):
 44 |         # Unzip
 45 |         raise NotImplementedError
 46 | 
 47 |     # Public methods
 48 |     def read(self):
 49 |         df = None
 50 |         if "s3://" in self.location:
 51 |             if self.location.endswith("/"):
 52 |                 df = self._get_from_s3_bucket()
 53 |             else:
 54 |                 df = self._get_from_s3_file()
 55 |         else:
 56 |             if "/" in self.location:
 57 |                 df = self._get_from_local_dir()
 58 |             else:
 59 |                 df = self._get_from_local_file()
 60 | 
 61 |         if self.convert_numeric:
 62 |             logging.info("Converting to numeric for file %s", self.location)
 63 |             df.apply(pd.to_numeric, errors="ignore")
 64 |         return df
 65 | 
 66 | 
 67 | class Diffino:
 68 |     """
 69 |     Main class that provides the diff functionalities. Specific dataset types (CSV, XLSX, etc)
 70 |     are provided by classes inheriting from DataSet
 71 | 
 72 |     @param left: String with the input dataset to be used (.csv, .xlsx, .xls for local files and s3 url plus extension for AWS S3)
 73 |     @param right: String with the other input dataset to compare against (.csv, .xlsx, .xls for local files and s3 url plus extension for AWS S3)
 74 |     @param output: String with the output location (.csv, .xlsx, .xls, .json for local files and s3 url plus extension for AWS S3)
 75 |     @param convert_numeric: Boolean indicating whether numeric columns should be treated as numbers (in pandas mode).
 76 |     @param mode: String with the diff mode: 'pandas' or 'md5'
 77 |     @param cols: List with subset of columns to be used for the diff check.
 78 |     @param index_col: Column to be used as index
 79 |     @return: Nothing is returned
 80 |     """
 81 | 
 82 |     def __init__(self, **kwargs):
 83 |         self.left, self._left_dataset = kwargs.get("left"), None
 84 |         self.right, self._right_dataset = kwargs.get("right"), None
 85 |         self.output, self._output_dataset = kwargs.get("output"), None
 86 |         self.convert_numeric = kwargs.get("convert_numeric", True)
 87 |         self.mode = kwargs.get("mode", "pandas")
 88 |         self.cols = kwargs.get("cols")
 89 |         self.cols_left = kwargs.get("cols_left")
 90 |         self.cols_right = kwargs.get("cols_right")
 91 |         self.output_only_diffs = kwargs.get("output_only_diffs")
 92 | 
 93 |         self.diff_result_left = {}
 94 |         self.diff_result_right = {}
 95 | 
 96 |     # Private methods
 97 |     def _build_inputs(self):
 98 |         logging.info("Building inputs")
 99 |         self._left_dataset = self._build_input(self.left)
100 |         self._right_dataset = self._build_input(self.right)
101 | 
102 |     def _build_input(self, dataset_location):
103 |         logging.info("Building dataset for %s", dataset_location)
104 |         return DataSet(dataset_location, self.cols, self.convert_numeric).read()
105 | 
106 |     def _should_print_left(self):
107 |         return not self.diff_result_left.empty or (
108 |             self.diff_result_left.empty and not self.output_only_diffs
109 |         )
110 | 
111 |     def _should_print_right(self):
112 |         return not self.diff_result_right.empty or (
113 |             self.diff_result_right.empty and not self.output_only_diffs
114 |         )
115 | 
116 |     def _save_csv(self, df, output_file, s3=False):
117 |         if not s3:
118 |             logging.info("Saving result csv file %s", output_file)
119 |             df.to_csv(output_file, index=False)
120 |             return
121 | 
122 |         logging.info("Saving result csv file %s to S3", output_file)
123 | 
124 |         bucket_key = get_bucket_and_key_from_s3_path(output_file)
125 |         csv_buffer = BytesIO()
126 |         df.to_csv(csv_buffer, index=False)
127 |         s3client = boto3.client("s3")
128 |         response = s3client.put_object(
129 |             Body=csv_buffer.getvalue(),
130 |             ContentType="application/vnd.ms-excel",
131 |             Bucket=bucket_key[0],
132 |             Key=bucket_key[1],
133 |         )
134 | 
135 |     def to_csv(self, s3=False):
136 |         output_name = self.output.replace(".csv", "")
137 | 
138 |         if self._should_print_left():
139 |             output_left = output_name + "_not_in_right.csv"
140 |             self._save_csv(self.diff_result_left, output_left, s3)
141 | 
142 |         if self._should_print_right():
143 |             output_right = output_name + "_not_in_left.csv"
144 |             self._save_csv(self.diff_result_right, output_right, s3)
145 | 
146 |     def to_excel(self, s3=False):
147 |         raise NotImplementedError
148 | 
149 |     def to_json(self, s3=False):
150 |         raise NotImplementedError
151 | 
152 |     def to_console(self):
153 |         if self._should_print_left():
154 |             print("=============== Differences found on left file ===============")
155 |             print(self.diff_result_left.to_string())
156 | 
157 |         if self._should_print_right():
158 |             print("=============== Differences found on right file ===============")
159 |             print(self.diff_result_right.to_string())
160 | 
161 |     def _build_output(self):
162 |         logging.info("Building output started")
163 |         if not self.output:
164 |             logging.info("Building output to console")
165 |             self.to_console()
166 |             return
167 |         if ".csv" in self.output:
168 |             logging.info("Building output to csv")
169 |             if "s3://" in self.output:
170 |                 self.to_csv(s3=True)
171 |             else:
172 |                 self.to_csv(s3=False)
173 |         elif ".xslx" in self.output or ".xls" in self.output:
174 |             logging.info("Building output to Excel")
175 |             if "s3://" in self.output:
176 |                 self.to_excel(s3=True)
177 |             else:
178 |                 self.to_excel(s3=False)
179 |         elif ".json" in self.output:
180 |             logging.info("Building output to json")
181 |             if "s3://" in self.output:
182 |                 self.to_json(s3=True)
183 |             else:
184 |                 self.to_json(s3=False)
185 |         else:
186 |             raise UserWarning("Invalid output format")
187 |         self._output_dataset = None
188 | 
189 |     # Public methods
190 |     def build_diff(self):
191 |         if not self.left or not self.right:
192 |             print("{}, {}".format(self.left, self.right))
193 |             raise UserWarning("Left and right datasets are both required")
194 | 
195 |         self._build_inputs()
196 | 
197 |         logging.info("Performing merge of datasets in preparation for diff")
198 |         merged_dataset = pd.merge(
199 |             left=self._left_dataset,
200 |             right=self._right_dataset,
201 |             how="outer",
202 |             indicator="exists",
203 |         )
204 | 
205 |         exists_left = merged_dataset["exists"] == "left_only"
206 |         exists_right = merged_dataset["exists"] == "right_only"
207 | 
208 |         logging.info("Creating diff result left")
209 |         self.diff_result_left = merged_dataset[exists_left].drop(["exists"], axis=1)
210 | 
211 |         logging.info("Creating diff result right")
212 |         self.diff_result_right = merged_dataset[exists_right].drop(["exists"], axis=1)
213 | 
214 |         self._build_output()
215 | 
216 |         return (len(self.diff_result_left.index), len(self.diff_result_right.index))
217 | 


--------------------------------------------------------------------------------
/doc/README.md:
--------------------------------------------------------------------------------
 1 | ## Models
 2 | 
 3 | DataSet
 4 | - location
 5 | - read()
 6 | 
 7 | CsvDataSet < Dataset
 8 | - separator
 9 | 
10 | ExcelDataSet < Dataset
11 | - sheet_name
12 | - skip_cols
13 | 
14 | Diffino
15 | - left
16 | - right
17 | - output
18 | - convert_numeric
19 | - mode
20 | - cols
21 | - _build_inputs()
22 | - build_diff()
23 | 


--------------------------------------------------------------------------------
/docker-compose-test.yml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IntuitiveWebSolutions/diffino/64d2b8830299c1b1ecca8639b882d0562c30d00a/docker-compose-test.yml


--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IntuitiveWebSolutions/diffino/64d2b8830299c1b1ecca8639b882d0562c30d00a/docker-compose.yml


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas==0.19.2
2 | boto3==1.7.3


--------------------------------------------------------------------------------
/requirements/requirements-lint.txt:
--------------------------------------------------------------------------------
1 | black==19.3b0
2 | -r ../requirements.txt
3 | 


--------------------------------------------------------------------------------
/requirements/requirements-test.txt:
--------------------------------------------------------------------------------
1 | numpy==1.16.2
2 | pytest==4.4.0
3 | pytest-cov==2.6.1
4 | moto==1.3.8
5 | -r ../requirements.txt
6 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from setuptools import setup
 3 | 
 4 | VERSION = "0.2.1"
 5 | 
 6 | def read(fname):
 7 |     return open(os.path.join(os.path.dirname(__file__), fname)).read()
 8 | 
# Package metadata and build configuration for the diffino distribution.
setup(
    name="diffino",
    version=VERSION,
    packages=["diffino"],
    include_package_data=True,
    # Runtime dependencies, pinned to exact versions.
    install_requires=[
        "pandas==0.19.2",
        "boto3==1.7.3"
    ],
    # Installs a `diffino` command that calls main() in diffino/cli.py.
    entry_points={'console_scripts': ['diffino = diffino.cli:main']},
    author="BriteCore",
    description="Diffing tools for comparing datasets in CSV, XLSX and other formats",
    # The README is used as the long description and is declared as Markdown.
    long_description=read('README.md'),
    long_description_content_type="text/markdown",
    keywords="diffing comparing csv excel json",
    url="https://github.com/IntuitiveWebSolutions/diffino"
)
26 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/IntuitiveWebSolutions/diffino/64d2b8830299c1b1ecca8639b882d0562c30d00a/tests/__init__.py


--------------------------------------------------------------------------------
/tests/sample_left.csv:
--------------------------------------------------------------------------------
 1 | address,state,zip,name,id
 2 | one st,CA,66661,name one,1
 3 | two st,CA,66662,name two,2
 4 | three st,CA,66663,name three,3
 5 | four st,CA,66664,name four,4
 6 | five st,CA,66665,name five,5
 7 | six st,CA,66666,name six,6
 8 | seven st,CA,66667,name seven,7
 9 | eight st,CA,66668,name eight,8
10 | nine st,CA,66669,name nine,9
11 | ten st,CA,66610,name ten,10


--------------------------------------------------------------------------------
/tests/sample_right.csv:
--------------------------------------------------------------------------------
 1 | address,state,zip,name,id
 2 | one st,CA,66661,name one,1
 3 | two st,CA,66662,name two,2
 4 | three st,CA,66663,name three,3
 5 | four st,CA,66664,name four,4
 6 | five st,CA,66665,name five,5
 7 | six st,CA,66666,name six,6
 8 | seven st,CA,66667,name seven,7
 9 | eight st,CA,66668,name eight,8
10 | nine st,CA,66669,name nine,9
11 | eleven st,CA,66611,name eleven,11


--------------------------------------------------------------------------------
/tests/test_cli.py:
--------------------------------------------------------------------------------
  1 | from subprocess import Popen, PIPE
  2 | 
  3 | 
  4 | def test_md5_csv():
  5 |     p = Popen(
  6 |         ["diffino", "before_dataset.csv", "after_dataset.csv", "--mode", "md5"],
  7 |         stdout=PIPE,
  8 |         stderr=PIPE,
  9 |     )
 10 |     stdout, stderr = p.communicate()
 11 | 
 12 |     raise Exception("Finish test!")
 13 | 
 14 | 
 15 | def test_md5_zip():
 16 |     p = Popen(
 17 |         ["diffino", "before_dataset.zip", "after_dataset.zip", "--mode", "md5"],
 18 |         stdout=PIPE,
 19 |         stderr=PIPE,
 20 |     )
 21 |     stdout, stderr = p.communicate()
 22 | 
 23 |     raise Exception("Finish test!")
 24 | 
 25 | 
 26 | def test_s3_csv():
 27 |     p = Popen(
 28 |         [
 29 |             "diffino",
 30 |             "s3://bucket/before_dataset.csv",
 31 |             "s3://bucket/after_dataset.csv",
 32 |             "--mode",
 33 |             "md5",
 34 |         ],
 35 |         stdout=PIPE,
 36 |         stderr=PIPE,
 37 |     )
 38 |     stdout, stderr = p.communicate()
 39 | 
 40 |     raise Exception("Finish test!")
 41 | 
 42 | 
 43 | def test_s3_bucket_md5():
 44 |     p = Popen(
 45 |         [
 46 |             "diffino",
 47 |             "s3://bucket/before_dataset",
 48 |             "s3://bucket/after_dataset",
 49 |             "--mode",
 50 |             "md5",
 51 |         ],
 52 |         stdout=PIPE,
 53 |         stderr=PIPE,
 54 |     )
 55 |     stdout, stderr = p.communicate()
 56 | 
 57 |     raise Exception("Finish test!")
 58 | 
 59 | 
 60 | def test_pandas_csv():
 61 |     p = Popen(
 62 |         ["diffino", "before_dataset.csv", "after_dataset.csv", "--mode", "pandas"],
 63 |         stdout=PIPE,
 64 |         stderr=PIPE,
 65 |     )
 66 |     stdout, stderr = p.communicate()
 67 | 
 68 |     raise Exception("Finish test!")
 69 | 
 70 | 
 71 | def test_pandas_csv_numeric_false():
 72 |     p = Popen(
 73 |         [
 74 |             "diffino",
 75 |             "before_dataset.csv",
 76 |             "after_dataset.csv",
 77 |             "--mode",
 78 |             "pandas",
 79 |             "--convert-numeric",
 80 |             "false",
 81 |         ],
 82 |         stdout=PIPE,
 83 |         stderr=PIPE,
 84 |     )
 85 |     stdout, stderr = p.communicate()
 86 | 
 87 |     raise Exception("Finish test!")
 88 | 
 89 | 
 90 | def test_pandas_csv_cols():
 91 |     p = Popen(
 92 |         [
 93 |             "diffino before_dataset.csv",
 94 |             "after_dataset.csv",
 95 |             "--mode pandas",
 96 |             "--cols",
 97 |             "id",
 98 |             "name",
 99 |         ],
100 |         stdout=PIPE,
101 |         stderr=PIPE,
102 |     )
103 |     stdout, stderr = p.communicate()
104 | 
105 |     raise Exception("Finish test!")
106 | 
107 | 
108 | def test_pandas_output_csv_local():
109 |     p = Popen(
110 |         ["diffino", "file_1.csv", "file_2.csv", "--output", "diff.csv"],
111 |         stdout=PIPE,
112 |         stderr=PIPE,
113 |     )
114 |     stdout, stderr = p.communicate()
115 | 
116 |     raise Exception("Finish test!")
117 | 
118 | 
119 | def test_pandas_output_xlsx_local():
120 |     p = Popen(
121 |         ["diffino", "file_1.csv", "file_2.csv", "--output", "diff.xlsx"],
122 |         stdout=PIPE,
123 |         stderr=PIPE,
124 |     )
125 |     stdout, stderr = p.communicate()
126 | 
127 |     raise Exception("Finish test!")
128 | 
129 | 
def test_pandas_output_json_local():
    """
    Unfinished CLI test: invoke ``diffino`` with ``--output diff.json``.

    The process output is collected but not inspected; the trailing exception
    deliberately fails the test.
    """
    command = ["diffino", "file_1.csv", "file_2.csv", "--output", "diff.json"]
    process = Popen(command, stdout=PIPE, stderr=PIPE)
    out, err = process.communicate()

    raise Exception("Finish test!")
139 | 
140 | 
def test_pandas_output_csv_s3():
    """
    Unfinished CLI test: invoke ``diffino`` with an ``s3://`` CSV output
    location.

    The process output is collected but not inspected; the trailing exception
    deliberately fails the test.
    """
    command = ["diffino", "file_1.csv", "file_2.csv", "--output", "s3://bucket/diff.csv"]
    process = Popen(command, stdout=PIPE, stderr=PIPE)
    out, err = process.communicate()

    raise Exception("Finish test!")
150 | 
151 | 
def test_pandas_output_xlsx_s3():
    """
    Unfinished CLI test: invoke ``diffino`` with an ``s3://`` XLSX output
    location.

    The process output is collected but not inspected; the trailing exception
    deliberately fails the test.
    """
    command = ["diffino", "file_1.csv", "file_2.csv", "--output", "s3://bucket/diff.xlsx"]
    process = Popen(command, stdout=PIPE, stderr=PIPE)
    out, err = process.communicate()

    raise Exception("Finish test!")
161 | 
162 | 
def test_pandas_output_json_s3():
    """
    Unfinished CLI test: invoke ``diffino`` with an ``s3://`` JSON output
    location.

    The process output is collected but not inspected; the trailing exception
    deliberately fails the test.
    """
    command = ["diffino", "file_1.csv", "file_2.csv", "--output", "s3://bucket/diff.json"]
    process = Popen(command, stdout=PIPE, stderr=PIPE)
    out, err = process.communicate()

    raise Exception("Finish test!")
172 | 


--------------------------------------------------------------------------------
/tests/test_md5.py:
--------------------------------------------------------------------------------
 1 | from diffino.models import Diffino
 2 | 
 3 | 
 4 | def test_single_file_csv_local_md5():
 5 |     diff = Diffino(mode="md5", left="/tmp/one.csv", right="/tmp/two.csv")
 6 |     results = diff.build_diff()
 7 |     assert results
 8 | 
 9 | 
def test_single_file_excel_local_md5():
    """An md5-mode diff of two local ``.xlsx`` paths must return a truthy result."""
    comparison = Diffino(mode="md5", left="/tmp/one.xlsx", right="/tmp/two.xlsx")
    outcome = comparison.build_diff()
    assert outcome
14 | 
15 | 
def test_single_file_csv_s3_md5():
    """An md5-mode diff of two ``s3://`` ``.csv`` locations must return a truthy result."""
    left_uri = "s3://fake-bucket/one.csv"
    right_uri = "s3://fake-bucket/two.csv"
    comparison = Diffino(mode="md5", left=left_uri, right=right_uri)
    outcome = comparison.build_diff()
    assert outcome
22 | 
23 | 
def test_single_file_excel_s3_md5():
    """An md5-mode diff of two ``s3://`` ``.xlsx`` locations must return a truthy result."""
    left_uri = "s3://fake-bucket/one.xlsx"
    right_uri = "s3://fake-bucket/two.xlsx"
    comparison = Diffino(mode="md5", left=left_uri, right=right_uri)
    outcome = comparison.build_diff()
    assert outcome
30 | 
31 | 
def test_multiple_files_dir_md5():
    """
    An md5-mode diff of two extension-less local paths must return a truthy
    result (the paths are presumably directories, per the test name).
    """
    comparison = Diffino(mode="md5", left="/tmp/one", right="/tmp/two")
    outcome = comparison.build_diff()
    assert outcome
36 | 
37 | 
def test_multiple_files_zip_md5():
    """An md5-mode diff of two local ``.zip`` paths must return a truthy result."""
    comparison = Diffino(mode="md5", left="/tmp/one.zip", right="/tmp/two.zip")
    outcome = comparison.build_diff()
    assert outcome
42 | 
43 | 
def test_multiple_files_s3_md5():
    """An md5-mode diff of two extension-less ``s3://`` locations must return a truthy result."""
    left_uri = "s3://fake-bucket/one"
    right_uri = "s3://fake-bucket/two"
    comparison = Diffino(mode="md5", left=left_uri, right=right_uri)
    outcome = comparison.build_diff()
    assert outcome
50 | 
51 | 
def test_output_csv_md5():
    """
    Unfinished test: run an md5-mode diff with a local ``.csv`` output path.

    Nothing is verified yet; the trailing exception deliberately fails the test.
    """
    comparison = Diffino(
        mode="md5",
        left="/tmp/one.csv",
        right="/tmp/two.csv",
        output="/tmp/diff.csv",
    )
    comparison.build_diff()
    raise Exception("Finish test!")
58 | 
59 | 
def test_output_xlsx_md5():
    """
    Unfinished test: run an md5-mode diff with a local ``.xlsx`` output path.

    The output extension is spelled ``.xlsx`` (the earlier ``.xslx`` was a
    typo that did not match the format named by this test).

    Nothing is verified yet; the trailing exception deliberately fails the test.
    """
    diff = Diffino(
        mode="md5", left="/tmp/one.csv", right="/tmp/two.csv", output="/tmp/diff.xlsx"
    )
    diff.build_diff()
    raise Exception("Finish test!")
66 | 
67 | 
def test_output_json_md5():
    """
    Unfinished test: run an md5-mode diff with a local ``.json`` output path.

    Nothing is verified yet; the trailing exception deliberately fails the test.
    """
    comparison = Diffino(
        mode="md5",
        left="/tmp/one.csv",
        right="/tmp/two.csv",
        output="/tmp/diff.json",
    )
    comparison.build_diff()
    raise Exception("Finish test!")
74 | 
75 | 
def test_output_in_s3_md5():
    """
    Unfinished test: run an md5-mode diff of local CSV paths with an
    ``s3://`` JSON output location.

    Nothing is verified yet; the trailing exception deliberately fails the test.
    """
    settings = {
        "mode": "md5",
        "left": "/tmp/one.csv",
        "right": "/tmp/two.csv",
        "output": "s3://fake-bucket/diff.json",
    }
    Diffino(**settings).build_diff()
    raise Exception("Finish test!")
85 | 


--------------------------------------------------------------------------------
/tests/test_models.py:
--------------------------------------------------------------------------------
  1 | import boto3
  2 | import io
  3 | import os
  4 | import numpy as np
  5 | import numpy.testing as npt
  6 | import pandas as pd
  7 | 
  8 | from diffino.models import DataSet, Diffino
  9 | from moto import mock_s3
 10 | 
 11 | 
 12 | def assert_frames_equal(actual, expected):
 13 |     """
 14 |     Compare DataFrame items by index and column and
 15 |     raise AssertionError if any item is not equal.
 16 | 
 17 |     Ordering is unimportant, items are compared only by label.
 18 |     NaN and infinite values are supported.
 19 |     
 20 |     Parameters
 21 |     ----------
 22 |     actual : pandas.DataFrame
 23 |     expected : pandas.DataFrame
 24 | 
 25 |     """
 26 |     comp = npt.assert_equal
 27 | 
 28 |     assert isinstance(actual, pd.DataFrame) and isinstance(
 29 |         expected, pd.DataFrame
 30 |     ), "Inputs must both be pandas DataFrames."
 31 | 
 32 |     for i, exp_row in expected.iterrows():
 33 |         assert i in actual.index, "Expected row {!r} not found.".format(i)
 34 | 
 35 |         act_row = actual.loc[i]
 36 | 
 37 |         for j, exp_item in exp_row.iteritems():
 38 |             assert j in act_row.index, "Expected column {!r} not found.".format(j)
 39 | 
 40 |             act_item = act_row[j]
 41 | 
 42 |             try:
 43 |                 comp(act_item, exp_item)
 44 |             except AssertionError as e:
 45 |                 raise AssertionError(
 46 |                     e.message + "\n\nColumn: {!r}\nRow: {!r}".format(j, i)
 47 |                 )
 48 | 
 49 | 
 50 | class TestModels(object):
 51 |     def _create_diff(
 52 |         self,
 53 |         target_dir,
 54 |         left_csv="sample_left.csv",
 55 |         right_csv="sample_right.csv",
 56 |         to_console=False,
 57 |         cols=None,
 58 |         output_only_diffs=False,
 59 |     ):
 60 |         output_location = (
 61 |             False if to_console else os.path.join(target_dir, "output.csv")
 62 |         )
 63 |         output_left = os.path.join(target_dir, "output_not_in_left.csv")
 64 |         output_right = os.path.join(target_dir, "output_not_in_right.csv")
 65 | 
 66 |         location_left = fname = os.path.join(os.path.dirname(__file__), left_csv)
 67 |         location_right = fname = os.path.join(os.path.dirname(__file__), right_csv)
 68 |         diffino = Diffino(
 69 |             left=location_left,
 70 |             right=location_right,
 71 |             output=output_location,
 72 |             cols=cols,
 73 |             output_only_diffs=output_only_diffs,
 74 |         )
 75 | 
 76 |         rows_count = diffino.build_diff()
 77 | 
 78 |         if not to_console and not output_only_diffs:
 79 |             assert os.path.isfile(output_left)
 80 |             assert os.path.isfile(output_right)
 81 |         return output_location, output_left, output_right, rows_count
 82 | 
 83 |     def test_dataset_read_from_local_file(self):
 84 |         location = fname = os.path.join(os.path.dirname(__file__), "sample_left.csv")
 85 |         dataset = DataSet(location, None, False)
 86 |         df = dataset.read()
 87 |         assert isinstance(df, pd.DataFrame)
 88 |         assert df.empty is not True
 89 | 
 90 |     def test_diffino_diff_is_working(self, tmpdir):
 91 |         outputs = self._create_diff(str(tmpdir))
 92 | 
 93 |         expected_data_not_in_left = u"""address,state,zip,name,id
 94 | eleven st,CA,66611,name eleven,11"""
 95 | 
 96 |         expected_data_not_in_right = u"""address,state,zip,name,id
 97 | ten st,CA,66610,name ten,10"""
 98 | 
 99 |         expected_df_not_in_left = pd.read_csv(io.StringIO(expected_data_not_in_left))
100 |         expected_df_not_in_right = pd.read_csv(io.StringIO(expected_data_not_in_right))
101 | 
102 |         result_not_in_left = pd.read_csv(outputs[1])
103 |         result_not_in_right = pd.read_csv(outputs[2])
104 | 
105 |         assert_frames_equal(expected_df_not_in_left, result_not_in_left)
106 |         assert_frames_equal(expected_df_not_in_right, result_not_in_right)
107 | 
108 |     def test_diffino_no_diff(self, tmpdir):
109 |         outputs = self._create_diff(str(tmpdir), right_csv="sample_left.csv")
110 | 
111 |         expected_data = u"address,state,zip,name,id"
112 | 
113 |         expected_df = pd.read_csv(io.StringIO(expected_data))
114 |         resulting_left_csv = pd.read_csv(outputs[1])
115 |         resulting_right_csv = pd.read_csv(outputs[2])
116 | 
117 |         assert_frames_equal(expected_df, resulting_left_csv)
118 |         assert_frames_equal(expected_df, resulting_right_csv)
119 | 
120 |     def test_diffino_build_output_to_console(self, tmpdir, capsys):
121 |         self._create_diff(str(tmpdir), to_console=True)
122 |         captured = capsys.readouterr()
123 |         assert "Differences found on left file" in captured.out
124 |         assert "Differences found on right file" in captured.out
125 | 
126 |     def test_diffino_diff_with_selected_columns(self, tmpdir):
127 |         outputs = self._create_diff(str(tmpdir), cols=["address", "id"])
128 | 
129 |         expected_data_right = u"""address,id
130 | ten st,10"""
131 |         expected_data_left = u"""address,id
132 | eleven st,11"""
133 | 
134 |         expected_df_left = pd.read_csv(io.StringIO(expected_data_left))
135 |         expected_df_right = pd.read_csv(io.StringIO(expected_data_right))
136 |         resulting_left_csv = pd.read_csv(outputs[1])
137 |         resulting_right_csv = pd.read_csv(outputs[2])
138 | 
139 |         assert_frames_equal(expected_df_left, resulting_left_csv)
140 |         assert_frames_equal(expected_df_right, resulting_right_csv)
141 | 
142 |     def test_diffino_output_only_diffs_console(self, tmpdir, capsys):
143 |         self._create_diff(
144 |             str(tmpdir),
145 |             to_console=True,
146 |             right_csv="sample_left.csv",
147 |             output_only_diffs=True,
148 |         )
149 |         captured = capsys.readouterr()
150 |         assert "Differences found on left file" not in captured.out
151 |         assert "Differences found on right file" not in captured.out
152 | 
153 |     def test_diffino_output_only_diffs_csv(self, tmpdir):
154 |         outputs = self._create_diff(
155 |             str(tmpdir), right_csv="sample_left.csv", output_only_diffs=True
156 |         )
157 |         assert not os.path.isfile(outputs[1])
158 |         assert not os.path.isfile(outputs[2])
159 | 
160 |     def test_diffino_return_diff_count(self, tmpdir):
161 |         outputs = self._create_diff(str(tmpdir))
162 |         assert outputs[3][0] is 1
163 |         assert outputs[3][0] is 1
164 | 
165 |     @mock_s3
166 |     def test_diffino_s3_support(self, tmpdir):
167 |         conn = boto3.resource("s3")
168 |         # We need to create the bucket since this is all in Moto's 'virtual' AWS account
169 |         bucket = "britedata-diff"
170 |         conn.create_bucket(Bucket=bucket)
171 |         s3 = boto3.client("s3")
172 | 
173 |         key_current = "current.csv"
174 |         key_new = "new.csv"
175 |         value = u"""address,state,zip,name,id
176 | eleven st,CA,66611,name eleven,11"""
177 |         s3.put_object(Bucket=bucket, Key=key_current, Body=value)
178 |         s3.put_object(Bucket=bucket, Key=key_new, Body=value)
179 | 
180 |         location_left = "s3://" + bucket + "/" + key_current
181 |         location_right = "s3://" + bucket + "/" + key_new
182 |         output_location = "s3://" + bucket + "/output.csv"
183 |         diffino = Diffino(
184 |             left=location_left, right=location_right, output=output_location
185 |         )
186 |         diffino.build_diff()
187 | 
188 |         body_left = (
189 |             conn.Object(bucket, "output_not_in_left.csv")
190 |             .get()["Body"]
191 |             .read()
192 |             .decode("utf-8")
193 |         )
194 |         body_right = (
195 |             conn.Object(bucket, "output_not_in_right.csv")
196 |             .get()["Body"]
197 |             .read()
198 |             .decode("utf-8")
199 |         )
200 | 
201 |         expected_result = u"""address,state,zip,name,id\n"""
202 |         assert body_left == expected_result
203 |         assert body_right == expected_result
204 | 


--------------------------------------------------------------------------------
/tests/test_pandas.py:
--------------------------------------------------------------------------------
  1 | from diffino.models import Diffino
  2 | 
  3 | 
  4 | def test_single_file_csv_local_pandas():
  5 |     diff = Diffino(mode="pandas", left="/tmp/one.csv", right="/tmp/two.csv")
  6 |     results = diff.build_diff()
  7 |     assert results
  8 | 
  9 | 
 10 | def test_single_file_excel_local_pandas():
 11 |     diff = Diffino(mode="pandas", left="/tmp/one.xlsx", right="/tmp/two.xlsx")
 12 |     results = diff.build_diff()
 13 |     assert results
 14 | 
 15 | 
 16 | def test_single_file_csv_s3_pandas():
 17 |     diff = Diffino(
 18 |         mode="pandas", left="s3://fake-bucket/one.csv", right="s3://fake-bucket/two.csv"
 19 |     )
 20 |     results = diff.build_diff()
 21 |     assert results
 22 | 
 23 | 
 24 | def test_multiple_files_dir_pandas():
 25 |     diff = Diffino(mode="pandas", left="/tmp/one", right="/tmp/two")
 26 |     results = diff.build_diff()
 27 |     assert results
 28 | 
 29 | 
 30 | def test_multiple_files_zip_pandas():
 31 |     diff = Diffino(mode="pandas", left="/tmp/one.zip", right="/tmp/two.zip")
 32 |     results = diff.build_diff()
 33 |     assert results
 34 | 
 35 | 
 36 | def test_multiple_files_s3_pandas():
 37 |     diff = Diffino(
 38 |         mode="pandas", left="s3://fake-bucket/one", right="s3://fake-bucket/two"
 39 |     )
 40 |     results = diff.build_diff()
 41 |     assert results
 42 | 
 43 | 
 44 | def test_specific_cols():
 45 |     diff = Diffino(
 46 |         mode="pandas", left="/tmp/one.csv", right="/tmp/two.csv", cols=["id", "name"]
 47 |     )
 48 |     results = diff.build_diff()
 49 |     assert results
 50 | 
 51 | 
 52 | def test_convert_numeric():
 53 |     diff = Diffino(
 54 |         mode="pandas",
 55 |         left="/tmp/one_specific_cols.csv",
 56 |         right="/tmp/two_specific_cols.csv",
 57 |         convert_numeric=False,
 58 |     )
 59 |     results = diff.build_diff()
 60 |     assert results
 61 | 
 62 | 
 63 | def test_output_csv_pandas():
 64 |     diff = Diffino(
 65 |         mode="pandas", left="/tmp/one.csv", right="/tmp/two.csv", output="/tmp/diff.csv"
 66 |     )
 67 |     diff.build_diff()
 68 |     raise Exception("Finish test!")
 69 | 
 70 | 
 71 | def test_output_xlsx_pandas():
 72 |     diff = Diffino(
 73 |         mode="pandas",
 74 |         left="/tmp/one.csv",
 75 |         right="/tmp/two.csv",
 76 |         output="/tmp/diff.xslx",
 77 |     )
 78 |     diff.build_diff()
 79 |     raise Exception("Finish test!")
 80 | 
 81 | 
 82 | def test_output_json_pandas():
 83 |     diff = Diffino(
 84 |         mode="pandas",
 85 |         left="/tmp/one.csv",
 86 |         right="/tmp/two.csv",
 87 |         output="/tmp/diff.json",
 88 |     )
 89 |     diff.build_diff()
 90 |     raise Exception("Finish test!")
 91 | 
 92 | 
 93 | def test_output_in_s3_pandas():
 94 |     diff = Diffino(
 95 |         mode="pandas",
 96 |         left="/tmp/one.csv",
 97 |         right="/tmp/two.csv",
 98 |         output="s3://fake-bucket/diff.json",
 99 |     )
100 |     diff.build_diff()
101 |     raise Exception("Finish test!")
102 | 


--------------------------------------------------------------------------------