├── .github └── workflows │ ├── publish.yml │ └── test.yml ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── csv_diff ├── __init__.py └── cli.py ├── setup.py └── tests ├── __init__.py ├── test_cli.py ├── test_csv_diff.py └── test_human_text.py /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish Python Package 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | permissions: 8 | contents: read 9 | 10 | jobs: 11 | test: 12 | runs-on: ubuntu-latest 13 | strategy: 14 | matrix: 15 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 16 | steps: 17 | - uses: actions/checkout@v4 18 | - name: Set up Python ${{ matrix.python-version }} 19 | uses: actions/setup-python@v5 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | cache: pip 23 | cache-dependency-path: setup.py 24 | - name: Install dependencies 25 | run: | 26 | pip install '.[test]' 27 | - name: Run tests 28 | run: | 29 | pytest 30 | deploy: 31 | runs-on: ubuntu-latest 32 | needs: [test] 33 | environment: release 34 | permissions: 35 | id-token: write 36 | steps: 37 | - uses: actions/checkout@v4 38 | - name: Set up Python 39 | uses: actions/setup-python@v5 40 | with: 41 | python-version: "3.12" 42 | cache: pip 43 | cache-dependency-path: setup.py 44 | - name: Install dependencies 45 | run: | 46 | pip install setuptools wheel build 47 | - name: Build 48 | run: | 49 | python -m build 50 | - name: Publish 51 | uses: pypa/gh-action-pypi-publish@release/v1 52 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: [push, pull_request] 4 | 5 | permissions: 6 | contents: read 7 | 8 | jobs: 9 | test: 10 | runs-on: ubuntu-latest 11 | strategy: 12 | matrix: 13 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 14 | steps: 15 | - uses: actions/checkout@v4 
16 | - name: Set up Python ${{ matrix.python-version }} 17 | uses: actions/setup-python@v5 18 | with: 19 | python-version: ${{ matrix.python-version }} 20 | cache: pip 21 | cache-dependency-path: setup.py 22 | - name: Install dependencies 23 | run: | 24 | pip install '.[test]' 25 | - name: Run tests 26 | run: | 27 | pytest 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .venv 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | venv 6 | .eggs 7 | .pytest_cache 8 | *.egg-info 9 | .DS_Store 10 | .schema 11 | .vscode 12 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.12-alpine 2 | RUN pip install csv-diff 3 | WORKDIR /files 4 | ENTRYPOINT ["csv-diff"] 5 | CMD ["--help"] 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # csv-diff 2 | 3 | [![PyPI](https://img.shields.io/pypi/v/csv-diff.svg)](https://pypi.org/project/csv-diff/) 4 | [![Changelog](https://img.shields.io/github/v/release/simonw/csv-diff?include_prereleases&label=changelog)](https://github.com/simonw/csv-diff/releases) 5 | [![Tests](https://github.com/simonw/csv-diff/workflows/Test/badge.svg)](https://github.com/simonw/csv-diff/actions?query=workflow%3ATest) 6 | [![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/simonw/csv-diff/blob/main/LICENSE) 7 | 8 | Tool for viewing the difference between two CSV, TSV or JSON files. See [Generating a commit log for San Francisco’s official list of trees](https://simonwillison.net/2019/Mar/13/tree-history/) (and the [sf-tree-history repo commit log](https://github.com/simonw/sf-tree-history/commits)) for background information on this project. 
9 | 10 | ## Installation 11 | 12 | pip install csv-diff 13 | 14 | ## Usage 15 | 16 | Consider two CSV files: 17 | 18 | `one.csv` 19 | 20 | id,name,age 21 | 1,Cleo,4 22 | 2,Pancakes,2 23 | 24 | `two.csv` 25 | 26 | id,name,age 27 | 1,Cleo,5 28 | 3,Bailey,1 29 | 30 | `csv-diff` can show a human-readable summary of differences between the files: 31 | 32 | $ csv-diff one.csv two.csv --key=id 33 | 1 row changed, 1 row added, 1 row removed 34 | 35 | 1 row changed 36 | 37 | id: 1 38 | age: "4" => "5" 39 | 40 | 1 row added 41 | 42 | id: 3 43 | name: Bailey 44 | age: 1 45 | 46 | 1 row removed 47 | 48 | id: 2 49 | name: Pancakes 50 | age: 2 51 | 52 | The `--key=id` option means that the `id` column should be treated as the unique key, to identify which records have changed. 53 | 54 | The tool will automatically detect if your files are comma-, tab- or semicolon-separated. You can override this automatic detection and force the tool to use a specific format using `--format=tsv` or `--format=csv`. 55 | 56 | You can also feed it JSON files, provided they are a JSON array of objects where each object has the same keys. Use `--format=json` if your input files are JSON. 
57 | 58 | Use `--show-unchanged` to include full details of the unchanged values for rows with at least one change in the diff output: 59 | 60 | % csv-diff one.csv two.csv --key=id --show-unchanged 61 | 1 row changed 62 | 63 | id: 1 64 | age: "4" => "5" 65 | 66 | Unchanged: 67 | name: "Cleo" 68 | 69 | ### JSON output 70 | 71 | You can use the `--json` option to get a machine-readable difference: 72 | 73 | $ csv-diff one.csv two.csv --key=id --json 74 | { 75 | "added": [ 76 | { 77 | "id": "3", 78 | "name": "Bailey", 79 | "age": "1" 80 | } 81 | ], 82 | "removed": [ 83 | { 84 | "id": "2", 85 | "name": "Pancakes", 86 | "age": "2" 87 | } 88 | ], 89 | "changed": [ 90 | { 91 | "key": "1", 92 | "changes": { 93 | "age": [ 94 | "4", 95 | "5" 96 | ] 97 | } 98 | } 99 | ], 100 | "columns_added": [], 101 | "columns_removed": [] 102 | } 103 | 104 | ### Adding templated extras 105 | 106 | You can specify additional keys to be displayed in the human-readable format using the `--extra` option: 107 | 108 | --extra name "Python format string with {id} for variables" 109 | 110 | For example, to output a link to `https://news.ycombinator.com/latest?id={id}` for each item with an ID, you could use this: 111 | 112 | ```bash 113 | csv-diff one.csv two.csv --key=id \ 114 | --extra latest "https://news.ycombinator.com/latest?id={id}" 115 | ``` 116 | These extras display something like this: 117 | ``` 118 | 1 row changed 119 | 120 | id: 41459472 121 | points: "24" => "25" 122 | numComments: "5" => "6" 123 | extras: 124 | latest: https://news.ycombinator.com/latest?id=41459472 125 | ``` 126 | 127 | ## As a Python library 128 | 129 | You can also import the Python library into your own code like so: 130 | 131 | from csv_diff import load_csv, compare 132 | diff = compare( 133 | load_csv(open("one.csv"), key="id"), 134 | load_csv(open("two.csv"), key="id") 135 | ) 136 | 137 | `diff` will now contain the same data structure as the output in the `--json` example above. 
def _row_key_function(key):
    """Return a callable mapping a row dict to its identifying key.

    When ``key`` is truthy the row's value in that column is used.  Otherwise
    the row is identified by the SHA-1 hex digest of its JSON serialisation
    (with sorted keys), so identical rows share an identifier.
    """
    if key:
        return lambda row: row[key]
    return lambda row: hashlib.sha1(
        json.dumps(row, sort_keys=True).encode("utf8")
    ).hexdigest()


def load_csv(fp, key=None, dialect=None):
    """Load delimited text from ``fp`` into a ``{row_key: row_dict}`` mapping.

    Args:
        fp: Text file object positioned at the header line.
        key: Column whose value identifies each row.  If omitted, rows are
            identified by a hash of their content.
        dialect: ``csv`` dialect (name or class).  If ``None`` and ``fp`` is
            seekable, the dialect is sniffed from the first 1MB, accepting
            comma, tab or semicolon as delimiter.

    Returns:
        Dict mapping each row's key to a dict of ``{heading: value}``.  Rows
        sharing a key overwrite earlier ones.  An input with no header line
        yields an empty dict.

    Raises:
        KeyError: If ``key`` is given but a row has no such column.
    """
    if dialect is None and fp.seekable():
        # Peek at first 1MB to sniff the delimiter and other dialect details
        peek = fp.read(1024**2)
        fp.seek(0)
        try:
            dialect = csv.Sniffer().sniff(peek, delimiters=",\t;")
        except csv.Error:
            # Sniffing is best-effort; fall back to the "excel" default below.
            dialect = None
    reader = csv.reader(fp, dialect=(dialect or "excel"))
    headings = next(reader, None)
    if headings is None:
        # Completely empty input: no header, therefore no rows.
        return {}
    keyfn = _row_key_function(key)
    loaded = {}
    for line in reader:
        row = dict(zip(headings, line))
        loaded[keyfn(row)] = row
    return loaded


def load_json(fp, key=None):
    """Load a JSON array of objects from ``fp`` into ``{row_key: row_dict}``.

    Nested list/dict values are converted to JSON strings and every row is
    padded with ``None`` for keys that appear in other rows, so all returned
    rows expose the same set of columns.

    Args:
        fp: File object containing a JSON array of objects.
        key: Property whose value identifies each row.  If omitted, rows are
            identified by a hash of their (original, unsimplified) content.

    Raises:
        AssertionError: If the top-level JSON value is not a list.
        KeyError: If ``key`` is given but an object lacks that property.
    """
    raw_list = json.load(fp)
    if not isinstance(raw_list, list):
        # Raised explicitly (rather than via ``assert``) so the check still
        # runs under ``python -O`` while keeping the same exception type.
        raise AssertionError("JSON input must be an array of objects")
    common_keys = set()
    for item in raw_list:
        common_keys.update(item.keys())
    keyfn = _row_key_function(key)
    loaded = {}
    for row in raw_list:
        # Derive the key before the row is simplified in place.
        row_key = keyfn(row)
        loaded[row_key] = _simplify_json_row(row, common_keys)
    return loaded


def _simplify_json_row(r, common_keys):
    """Flatten ``r`` in place and return it.

    List, tuple and dict values are replaced by their JSON serialisation, and
    any key from ``common_keys`` missing from ``r`` is added with ``None``.
    Missing keys are added in sorted order so the resulting column order is
    reproducible between runs.
    """
    # Convert list/dict values into JSON serialized strings
    for name, value in r.items():
        if isinstance(value, (dict, tuple, list)):
            r[name] = json.dumps(value)
    for name in sorted(common_keys):
        if name not in r:
            r[name] = None
    return r


def _field_changes(diffs):
    """Convert dictdiffer entries for one row into ``{field: [old, new]}``.

    ``change`` entries carry the field and an ``(old, new)`` pair.  ``add`` and
    ``remove`` entries (a field present on only one side, e.g. a CSV line with
    fewer cells than the header) carry a list of ``(field, value)`` pairs and
    are reported with ``None`` standing in for the absent side.
    """
    changes = {}
    for action, field, values in diffs:
        if action == "change":
            # field can be a list if the column name contained '.' - #7
            name = field[0] if isinstance(field, list) else field
            prev_value, current_value = values
            changes[name] = [prev_value, current_value]
        elif action == "add":
            for name, value in values:
                changes[name] = [None, value]
        elif action == "remove":
            for name, value in values:
                changes[name] = [value, None]
    return changes


def compare(previous, current, show_unchanged=False):
    """Compute the difference between two loaded datasets.

    Args:
        previous: ``{row_key: row_dict}`` mapping for the old data.
        current: ``{row_key: row_dict}`` mapping for the new data.
        show_unchanged: If true, each changed entry also gets an
            ``"unchanged"`` dict holding the previous row's fields that did
            not change (the field literally named ``"id"`` is always left out).

    Returns:
        Dict with the keys ``added``, ``removed``, ``changed``,
        ``columns_added`` and ``columns_removed``.  Columns are taken from the
        first row of each side and reported in that row's column order; if
        either side has no rows, no column comparison is made.  Added or
        removed columns are ignored when computing per-row changes.
    """
    result = {
        "added": [],
        "removed": [],
        "changed": [],
        "columns_added": [],
        "columns_removed": [],
    }
    # Have the columns changed?  Only answerable when both sides have a row.
    ignore_columns = None
    if previous and current:
        previous_first = next(iter(previous.values()))
        current_first = next(iter(current.values()))
        previous_columns = set(previous_first)
        current_columns = set(current_first)
        if previous_columns != current_columns:
            # Walk the rows (not the sets) so the reported order is stable.
            result["columns_added"] = [
                c for c in current_first if c not in previous_columns
            ]
            result["columns_removed"] = [
                c for c in previous_first if c not in current_columns
            ]
            ignore_columns = current_columns.symmetric_difference(previous_columns)
    # Have any rows been removed or added?
    result["removed"] = [
        row for row_key, row in previous.items() if row_key not in current
    ]
    result["added"] = [
        row for row_key, row in current.items() if row_key not in previous
    ]
    # How about changed?
    for row_key, current_row in current.items():
        if row_key not in previous:
            continue
        previous_row = previous[row_key]
        if current_row == previous_row:
            continue
        field_changes = _field_changes(
            diff(previous_row, current_row, ignore=ignore_columns)
        )
        if not field_changes:
            # The rows differ only in added/removed columns.
            continue
        changes = {"key": row_key, "changes": field_changes}
        if show_unchanged:
            changes["unchanged"] = {
                field: value
                for field, value in previous_row.items()
                if field not in field_changes and field != "id"
            }
        result["changed"].append(changes)
    return result
summary.extend( 135 | [fragment, ""] 136 | + [" {}".format(c) for c in sorted(result["columns_removed"])] 137 | + [""] 138 | ) 139 | if result["changed"]: 140 | fragment = "{} {} changed".format( 141 | len(result["changed"]), singular if len(result["changed"]) == 1 else plural 142 | ) 143 | title.append(fragment) 144 | if show_headers: 145 | summary.append(fragment + "\n") 146 | change_blocks = [] 147 | for details in result["changed"]: 148 | block = [] 149 | block.append(" {}: {}".format(key, details["key"])) 150 | for field, (prev_value, current_value) in details["changes"].items(): 151 | block.append( 152 | ' {}: "{}" => "{}"'.format(field, prev_value, current_value) 153 | ) 154 | if extras: 155 | current_item = current[details["key"]] 156 | block.append(human_extras(current_item, extras)) 157 | block.append("") 158 | change_blocks.append("\n".join(block)) 159 | if details.get("unchanged"): 160 | block = [] 161 | block.append(" Unchanged:") 162 | for field, value in details["unchanged"].items(): 163 | block.append(' {}: "{}"'.format(field, value)) 164 | block.append("") 165 | change_blocks.append("\n".join(block)) 166 | summary.append("\n".join(change_blocks)) 167 | if result["added"]: 168 | fragment = "{} {} added".format( 169 | len(result["added"]), singular if len(result["added"]) == 1 else plural 170 | ) 171 | title.append(fragment) 172 | if show_headers: 173 | summary.append(fragment + "\n") 174 | rows = [] 175 | for row in result["added"]: 176 | to_append = human_row(row, prefix=" ") 177 | if extras: 178 | to_append += "\n" + human_extras(row, extras) 179 | rows.append(to_append) 180 | summary.append("\n\n".join(rows)) 181 | summary.append("") 182 | if result["removed"]: 183 | fragment = "{} {} removed".format( 184 | len(result["removed"]), singular if len(result["removed"]) == 1 else plural 185 | ) 186 | title.append(fragment) 187 | if show_headers: 188 | summary.append(fragment + "\n") 189 | rows = [] 190 | for row in result["removed"]: 191 | to_append = 
@click.command()
@click.version_option()
@click.argument(
    "previous",
    type=click.Path(exists=True, file_okay=True, dir_okay=False, allow_dash=False),
)
@click.argument(
    "current",
    type=click.Path(exists=True, file_okay=True, dir_okay=False, allow_dash=False),
)
@click.option(
    "--key", type=str, default=None, help="Column to use as a unique ID for each row"
)
@click.option(
    "--format",
    type=click.Choice(["csv", "tsv", "json"]),
    default=None,
    help="Explicitly specify input format (csv, tsv, json) instead of auto-detecting",
)
@click.option(
    "--json", type=bool, default=False, help="Output changes as JSON", is_flag=True
)
@click.option(
    "--singular",
    type=str,
    default=None,
    help="Singular word to use, e.g. 'tree' for '1 tree'",
)
@click.option(
    "--plural",
    type=str,
    default=None,
    help="Plural word to use, e.g. 'trees' for '2 trees'",
)
@click.option(
    "--show-unchanged",
    is_flag=True,
    help="Show unchanged fields for rows with at least one change",
)
@click.option(
    "extras",
    "--extra",
    type=(str, str),
    multiple=True,
    help="key: format string - define extra fields to display",
)
def cli(previous, current, key, format, json, singular, plural, show_unchanged, extras):
    "Diff two CSV or JSON files"
    # NOTE: the docstring above is kept short and unchanged because click
    # shows a command's docstring as its --help text.
    #
    # previous / current: paths of the two files to compare.
    # format: "csv", "tsv", "json" or None (sniff the CSV dialect).
    # json: emit the diff as JSON instead of the human-readable summary.
    # extras: (name, format string) pairs; human-readable mode only.

    # Map the --format choice onto a csv dialect name; any other value
    # (including None) yields no dialect, leaving load_csv to sniff one.
    dialect = {
        "csv": "excel",
        "tsv": "excel-tab",
    }

    if extras and json:
        raise click.UsageError(
            "Extra fields are not supported in JSON output mode",
            ctx=click.get_current_context(),
        )

    def load(filename):
        # Context managers make sure each input file is closed once it has
        # been read, instead of leaking the handle until garbage collection.
        if format == "json":
            with open(filename) as fp:
                return load_json(fp, key=key)
        # newline="" lets the csv module handle line endings itself.
        with open(filename, newline="") as fp:
            return load_csv(fp, key=key, dialect=dialect.get(format))

    previous_data = load(previous)
    current_data = load(current)

    diff = compare(previous_data, current_data, show_unchanged)
    if json:
        # The --json flag shadows the json module here, hence std_json.
        print(std_json.dumps(diff, indent=4))
    else:
        print(
            human_text(diff, key, singular, plural, current=current_data, extras=extras)
        )
Version 2.0", 24 | packages=find_packages(), 25 | install_requires=["click", "dictdiffer"], 26 | setup_requires=["pytest-runner"], 27 | extras_require={"test": ["pytest"]}, 28 | entry_points=""" 29 | [console_scripts] 30 | csv-diff=csv_diff.cli:cli 31 | """, 32 | tests_require=["csv-diff[test]"], 33 | url="https://github.com/simonw/csv-diff", 34 | classifiers=[ 35 | "Development Status :: 4 - Beta", 36 | "Intended Audience :: Developers", 37 | "Intended Audience :: Science/Research", 38 | "Intended Audience :: End Users/Desktop", 39 | "License :: OSI Approved :: Apache Software License", 40 | "Programming Language :: Python :: 3.6", 41 | "Programming Language :: Python :: 3.7", 42 | ], 43 | ) 44 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simonw/csv-diff/26903b74eefcd65be761810f51b0e55c033bde66/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | from click.testing import CliRunner 2 | from csv_diff import cli, load_csv 3 | import csv 4 | import pytest 5 | from .test_csv_diff import ONE, ONE_TSV, TWO, TWO_TSV, THREE, FIVE 6 | import io 7 | import json 8 | from textwrap import dedent 9 | 10 | 11 | @pytest.fixture 12 | def tsv_files(tmpdir): 13 | one = tmpdir / "one.tsv" 14 | one.write(ONE_TSV) 15 | two = tmpdir / "two.tsv" 16 | two.write(TWO_TSV) 17 | return str(one), str(two) 18 | 19 | 20 | @pytest.fixture 21 | def json_files(tmpdir): 22 | one = tmpdir / "one.json" 23 | one.write( 24 | json.dumps( 25 | [ 26 | {"id": 1, "name": "Cleo", "nested": {"foo": 3}, "extra": 1}, 27 | {"id": 2, "name": "Pancakes", "nested": {"foo": 3}}, 28 | ] 29 | ) 30 | ) 31 | two = tmpdir / "two.json" 32 | two.write( 33 | json.dumps( 34 | [ 35 | {"id": 1, "name": "Cleo", 
"nested": {"foo": 3, "bar": 5}, "extra": 1}, 36 | {"id": 2, "name": "Pancakes!", "nested": {"foo": 3}, "extra": 1}, 37 | ] 38 | ) 39 | ) 40 | return str(one), str(two) 41 | 42 | 43 | def test_human_cli(tmpdir): 44 | one = tmpdir / "one.csv" 45 | one.write(ONE) 46 | two = tmpdir / "two.csv" 47 | two.write(TWO) 48 | result = CliRunner().invoke(cli.cli, [str(one), str(two), "--key", "id"]) 49 | assert 0 == result.exit_code 50 | assert ( 51 | dedent( 52 | """ 53 | 1 row changed 54 | 55 | id: 1 56 | age: "4" => "5" 57 | """ 58 | ).strip() 59 | == result.output.strip() 60 | ) 61 | 62 | 63 | def test_human_cli_alternative_names(tmpdir): 64 | one = tmpdir / "one.csv" 65 | one.write(ONE) 66 | five = tmpdir / "five.csv" 67 | five.write(FIVE) 68 | result = CliRunner().invoke( 69 | cli.cli, 70 | [str(one), str(five), "--key", "id", "--singular", "tree", "--plural", "trees"], 71 | ) 72 | assert 0 == result.exit_code, result.output 73 | assert ( 74 | dedent( 75 | """ 76 | 1 tree changed, 2 trees added 77 | 78 | 1 tree changed 79 | 80 | id: 1 81 | age: "4" => "5" 82 | 83 | 2 trees added 84 | 85 | id: 3 86 | name: Bailey 87 | age: 1 88 | 89 | id: 4 90 | name: Carl 91 | age: 7 92 | """ 93 | ).strip() 94 | == result.output.strip() 95 | ) 96 | 97 | 98 | def test_human_cli_json(tmpdir): 99 | one = tmpdir / "one.csv" 100 | one.write(ONE) 101 | two = tmpdir / "two.csv" 102 | two.write(TWO) 103 | result = CliRunner().invoke(cli.cli, [str(one), str(two), "--key", "id", "--json"]) 104 | assert 0 == result.exit_code 105 | assert { 106 | "added": [], 107 | "removed": [], 108 | "changed": [{"key": "1", "changes": {"age": ["4", "5"]}}], 109 | "columns_added": [], 110 | "columns_removed": [], 111 | } == json.loads(result.output.strip()) 112 | 113 | 114 | def test_tsv_files(tsv_files): 115 | one, two = tsv_files 116 | result = CliRunner().invoke( 117 | cli.cli, [one, two, "--key", "id", "--json", "--format", "tsv"] 118 | ) 119 | assert 0 == result.exit_code 120 | assert { 121 | "added": [], 122 
| "removed": [], 123 | "changed": [{"key": "1", "changes": {"age": ["4", "5"]}}], 124 | "columns_added": [], 125 | "columns_removed": [], 126 | } == json.loads(result.output.strip()) 127 | 128 | 129 | def test_json_files(json_files): 130 | one, two = json_files 131 | result = CliRunner().invoke( 132 | cli.cli, 133 | [one, two, "--key", "id", "--json", "--format", "json"], 134 | catch_exceptions=False, 135 | ) 136 | assert 0 == result.exit_code 137 | assert { 138 | "added": [], 139 | "removed": [], 140 | "changed": [ 141 | {"key": 1, "changes": {"nested": ['{"foo": 3}', '{"foo": 3, "bar": 5}']}}, 142 | { 143 | "key": 2, 144 | "changes": {"name": ["Pancakes", "Pancakes!"], "extra": [None, 1]}, 145 | }, 146 | ], 147 | "columns_added": [], 148 | "columns_removed": [], 149 | } == json.loads(result.output.strip()) 150 | 151 | 152 | def test_sniff_format(tsv_files): 153 | one, two = tsv_files 154 | result = CliRunner().invoke(cli.cli, [one, two, "--key", "id", "--json"]) 155 | assert 0 == result.exit_code 156 | assert { 157 | "added": [], 158 | "removed": [], 159 | "changed": [{"key": "1", "changes": {"age": ["4", "5"]}}], 160 | "columns_added": [], 161 | "columns_removed": [], 162 | } == json.loads(result.output.strip()) 163 | 164 | 165 | def test_format_overrides_sniff(tsv_files): 166 | one, two = tsv_files 167 | result = CliRunner().invoke( 168 | cli.cli, [one, two, "--key", "id", "--json", "--format", "csv"] 169 | ) 170 | assert 1 == result.exit_code 171 | 172 | 173 | def test_column_containing_dot(tmpdir): 174 | # https://github.com/simonw/csv-diff/issues/7 175 | one = tmpdir / "one.csv" 176 | two = tmpdir / "two.csv" 177 | one.write( 178 | dedent( 179 | """ 180 | id,foo.bar,foo.baz 181 | 1,Dog,Cat 182 | """ 183 | ).strip() 184 | ) 185 | two.write( 186 | dedent( 187 | """ 188 | id,foo.bar,foo.baz 189 | 1,Dog,Beaver 190 | """ 191 | ).strip() 192 | ) 193 | result = CliRunner().invoke( 194 | cli.cli, [str(one), str(two), "--key", "id", "--json"], catch_exceptions=False 
195 | ) 196 | assert 0 == result.exit_code 197 | assert { 198 | "added": [], 199 | "removed": [], 200 | "changed": [{"key": "1", "changes": {"foo.baz": ["Cat", "Beaver"]}}], 201 | "columns_added": [], 202 | "columns_removed": [], 203 | } == json.loads(result.output.strip()) 204 | 205 | 206 | def test_semicolon_delimited(tmpdir): 207 | # https://github.com/simonw/csv-diff/issues/6 208 | one = tmpdir / "one.csv" 209 | two = tmpdir / "two.csv" 210 | one.write( 211 | dedent( 212 | """ 213 | id;name 214 | 1;Mark 215 | """ 216 | ).strip() 217 | ) 218 | two.write( 219 | dedent( 220 | """ 221 | id;name 222 | 1;Brian 223 | """ 224 | ).strip() 225 | ) 226 | result = CliRunner().invoke( 227 | cli.cli, [str(one), str(two), "--key", "id", "--json"], catch_exceptions=False 228 | ) 229 | assert 0 == result.exit_code 230 | assert { 231 | "added": [], 232 | "removed": [], 233 | "changed": [{"key": "1", "changes": {"name": ["Mark", "Brian"]}}], 234 | "columns_added": [], 235 | "columns_removed": [], 236 | } == json.loads(result.output.strip()) 237 | 238 | 239 | def test_diff_with_extras(tmpdir): 240 | one = tmpdir / "one.json" 241 | two = tmpdir / "two.json" 242 | one.write( 243 | json.dumps( 244 | [ 245 | {"id": 1, "name": "Cleo", "type": "dog"}, 246 | {"id": 2, "name": "Suna", "type": "chicken"}, 247 | ] 248 | ) 249 | ) 250 | two.write( 251 | json.dumps( 252 | [ 253 | {"id": 2, "name": "Suna", "type": "pretty chicken"}, 254 | {"id": 3, "name": "Artie", "type": "bunny"}, 255 | ] 256 | ) 257 | ) 258 | result = CliRunner().invoke( 259 | cli.cli, 260 | [ 261 | str(one), 262 | str(two), 263 | "--key", 264 | "id", 265 | "--format", 266 | "json", 267 | "--extra", 268 | "search", 269 | "https://www.google.com/search?q={name}", 270 | ], 271 | catch_exceptions=False, 272 | ) 273 | assert result.exit_code == 0 274 | expected = dedent( 275 | """ 276 | 1 row changed, 1 row added, 1 row removed 277 | 278 | 1 row changed 279 | 280 | id: 2 281 | type: "chicken" => "pretty chicken" 282 | extras: 
283 | search: https://www.google.com/search?q=Suna 284 | 285 | 1 row added 286 | 287 | id: 3 288 | name: Artie 289 | type: bunny 290 | extras: 291 | search: https://www.google.com/search?q=Artie 292 | 293 | 1 row removed 294 | 295 | id: 1 296 | name: Cleo 297 | type: dog 298 | extras: 299 | search: https://www.google.com/search?q=Cleo 300 | """ 301 | ).strip() 302 | assert result.output.strip() == expected 303 | -------------------------------------------------------------------------------- /tests/test_csv_diff.py: -------------------------------------------------------------------------------- 1 | from csv_diff import load_csv, compare 2 | import io 3 | 4 | ONE = """id,name,age 5 | 1,Cleo,4 6 | 2,Pancakes,2""" 7 | 8 | ONE_TSV = """id\tname\tage 9 | 1\tCleo\t4 10 | 2\tPancakes\t2""" 11 | 12 | TWO = """id,name,age 13 | 1,Cleo,5 14 | 2,Pancakes,2""" 15 | 16 | TWO_TSV = """id\tname\tage 17 | 1\tCleo\t5 18 | 2\tPancakes\t2""" 19 | 20 | THREE = """id,name,age 21 | 1,Cleo,5""" 22 | 23 | FOUR = """id,name,age 24 | 1,Cleo,5 25 | 2,Pancakes,2, 26 | 3,Bailey,1""" 27 | 28 | FIVE = """id,name,age 29 | 1,Cleo,5 30 | 2,Pancakes,2, 31 | 3,Bailey,1 32 | 4,Carl,7""" 33 | 34 | SIX = """id,name,age 35 | 1,Cleo,5 36 | 3,Bailey,1""" 37 | 38 | SEVEN = """id,name,weight 39 | 1,Cleo,48 40 | 3,Bailey,20""" 41 | 42 | EIGHT = """id,name,age,length 43 | 3,Bailee,1,100 44 | 4,Bob,7,422""" 45 | 46 | NINE = """id,name,age 47 | 1,Cleo,5 48 | 2,Pancakes,4""" 49 | 50 | TEN = """id,name,age 51 | 1,Cleo,5 52 | 2,Pancakes,3""" 53 | 54 | 55 | def test_row_changed(): 56 | diff = compare( 57 | load_csv(io.StringIO(ONE), key="id"), load_csv(io.StringIO(TWO), key="id") 58 | ) 59 | assert { 60 | "added": [], 61 | "removed": [], 62 | "changed": [{"key": "1", "changes": {"age": ["4", "5"]}}], 63 | "columns_added": [], 64 | "columns_removed": [], 65 | } == diff 66 | 67 | 68 | def test_row_added(): 69 | diff = compare( 70 | load_csv(io.StringIO(THREE), key="id"), load_csv(io.StringIO(TWO), key="id") 71 | ) 72 
| assert { 73 | "changed": [], 74 | "removed": [], 75 | "added": [{"age": "2", "id": "2", "name": "Pancakes"}], 76 | "columns_added": [], 77 | "columns_removed": [], 78 | } == diff 79 | 80 | 81 | def test_row_removed(): 82 | diff = compare( 83 | load_csv(io.StringIO(TWO), key="id"), load_csv(io.StringIO(THREE), key="id") 84 | ) 85 | assert { 86 | "changed": [], 87 | "removed": [{"age": "2", "id": "2", "name": "Pancakes"}], 88 | "added": [], 89 | "columns_added": [], 90 | "columns_removed": [], 91 | } == diff 92 | 93 | 94 | def test_columns_changed(): 95 | diff = compare( 96 | load_csv(io.StringIO(SIX), key="id"), load_csv(io.StringIO(SEVEN), key="id") 97 | ) 98 | assert { 99 | "changed": [], 100 | "removed": [], 101 | "added": [], 102 | "columns_added": ["weight"], 103 | "columns_removed": ["age"], 104 | } == diff 105 | 106 | 107 | def test_tsv(): 108 | diff = compare( 109 | load_csv(io.StringIO(ONE), key="id"), load_csv(io.StringIO(TWO_TSV), key="id") 110 | ) 111 | assert { 112 | "added": [], 113 | "removed": [], 114 | "changed": [{"key": "1", "changes": {"age": ["4", "5"]}}], 115 | "columns_added": [], 116 | "columns_removed": [], 117 | } == diff 118 | -------------------------------------------------------------------------------- /tests/test_human_text.py: -------------------------------------------------------------------------------- 1 | from csv_diff import load_csv, compare, human_text 2 | from .test_csv_diff import ONE, TWO, THREE, FOUR, FIVE, SIX, SEVEN, EIGHT, NINE, TEN 3 | from textwrap import dedent 4 | import io 5 | 6 | 7 | def test_row_changed(): 8 | diff = compare( 9 | load_csv(io.StringIO(ONE), key="id"), load_csv(io.StringIO(TWO), key="id") 10 | ) 11 | assert ( 12 | dedent( 13 | """ 14 | 1 row changed 15 | 16 | id: 1 17 | age: "4" => "5" 18 | """ 19 | ).strip() 20 | == human_text(diff, "id") 21 | ) 22 | 23 | 24 | def test_row_changed_show_unchanged(): 25 | diff = compare( 26 | load_csv(io.StringIO(ONE), key="id"), 27 | load_csv(io.StringIO(TWO), 
key="id"), 28 | show_unchanged=True, 29 | ) 30 | assert ( 31 | dedent( 32 | """ 33 | 1 row changed 34 | 35 | id: 1 36 | age: "4" => "5" 37 | 38 | Unchanged: 39 | name: "Cleo" 40 | """ 41 | ).strip() 42 | == human_text(diff, "id") 43 | ) 44 | 45 | 46 | def test_row_added(): 47 | diff = compare( 48 | load_csv(io.StringIO(THREE), key="id"), load_csv(io.StringIO(TWO), key="id") 49 | ) 50 | assert ( 51 | dedent( 52 | """ 53 | 1 row added 54 | 55 | id: 2 56 | name: Pancakes 57 | age: 2 58 | """ 59 | ).strip() 60 | == human_text(diff, "id") 61 | ) 62 | 63 | 64 | def test_rows_added(): 65 | diff = compare( 66 | load_csv(io.StringIO(THREE), key="id"), load_csv(io.StringIO(FIVE), key="id") 67 | ) 68 | assert ( 69 | dedent( 70 | """ 71 | 3 rows added 72 | 73 | id: 2 74 | name: Pancakes 75 | age: 2 76 | 77 | id: 3 78 | name: Bailey 79 | age: 1 80 | 81 | id: 4 82 | name: Carl 83 | age: 7 84 | """ 85 | ).strip() 86 | == human_text(diff, "id") 87 | ) 88 | 89 | 90 | def test_row_removed(): 91 | diff = compare( 92 | load_csv(io.StringIO(TWO), key="id"), load_csv(io.StringIO(THREE), key="id") 93 | ) 94 | assert ( 95 | dedent( 96 | """ 97 | 1 row removed 98 | 99 | id: 2 100 | name: Pancakes 101 | age: 2 102 | """ 103 | ).strip() 104 | == human_text(diff, "id") 105 | ) 106 | 107 | 108 | def test_row_changed_and_row_added_and_row_deleted(): 109 | "Should have headers for each section here" 110 | diff = compare( 111 | load_csv(io.StringIO(ONE), key="id"), load_csv(io.StringIO(SIX), key="id") 112 | ) 113 | assert ( 114 | dedent( 115 | """ 116 | 1 row changed, 1 row added, 1 row removed 117 | 118 | 1 row changed 119 | 120 | id: 1 121 | age: "4" => "5" 122 | 123 | 1 row added 124 | 125 | id: 3 126 | name: Bailey 127 | age: 1 128 | 129 | 1 row removed 130 | 131 | id: 2 132 | name: Pancakes 133 | age: 2 134 | """ 135 | ).strip() 136 | == human_text(diff, "id") 137 | ) 138 | 139 | 140 | def test_columns_changed(): 141 | diff = compare( 142 | load_csv(io.StringIO(SIX), key="id"), 
load_csv(io.StringIO(SEVEN), key="id") 143 | ) 144 | assert ( 145 | dedent( 146 | """ 147 | 1 column added, 1 column removed 148 | 149 | 1 column added 150 | 151 | weight 152 | 153 | 1 column removed 154 | 155 | age 156 | """ 157 | ).strip() 158 | == human_text(diff, "id") 159 | ) 160 | 161 | 162 | def test_columns_and_rows_changed(): 163 | diff = compare( 164 | load_csv(io.StringIO(SEVEN), key="id"), load_csv(io.StringIO(EIGHT), key="id") 165 | ) 166 | assert ( 167 | dedent( 168 | """ 169 | 2 columns added, 1 column removed, 1 row changed, 1 row added, 1 row removed 170 | 171 | 2 columns added 172 | 173 | age 174 | length 175 | 176 | 1 column removed 177 | 178 | weight 179 | 180 | 1 row changed 181 | 182 | id: 3 183 | name: "Bailey" => "Bailee" 184 | 185 | 1 row added 186 | 187 | id: 4 188 | name: Bob 189 | age: 7 190 | length: 422 191 | 192 | 1 row removed 193 | 194 | id: 1 195 | name: Cleo 196 | weight: 48 197 | """ 198 | ).strip() 199 | == human_text(diff, "id") 200 | ) 201 | 202 | 203 | def test_no_key(): 204 | diff = compare(load_csv(io.StringIO(NINE)), load_csv(io.StringIO(TEN))) 205 | assert ( 206 | dedent( 207 | """ 208 | 1 row added, 1 row removed 209 | 210 | 1 row added 211 | 212 | id: 2 213 | name: Pancakes 214 | age: 3 215 | 216 | 1 row removed 217 | 218 | id: 2 219 | name: Pancakes 220 | age: 4 221 | """ 222 | ).strip() 223 | == human_text(diff) 224 | ) 225 | --------------------------------------------------------------------------------