# -*- encoding: utf-8
"""
Packaging script for filecmp2.

filecmp2 is distributed as a single module (``filecmp2.py``) rather than a
package, so the module is declared explicitly through ``py_modules``.
"""

import codecs
from setuptools import setup

# The README doubles as the long description shown on the package index.
with codecs.open("README.rst", encoding="utf8") as infile:
    readme_contents = infile.read()

setup(
    name="filecmp2",
    version="1.0.0",
    license="MIT",
    description="Are these two files the same? Explicit file comparisons.",
    long_description=readme_contents,
    author="Alex Chan",
    author_email="alex@alexwlchan.net",
    url="https://github.com/alexwlchan/filecmp2",
    # Without this (or ``packages``) the setup() call never names
    # filecmp2.py as part of the distribution; declare it explicitly
    # instead of relying on any automatic discovery.
    py_modules=["filecmp2"],
    classifiers=[
        "Development Status :: 5 - Production/Stable",

        "License :: OSI Approved :: MIT License",

        "Operating System :: OS Independent",

        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.5",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
    ],
)
#!/usr/bin/env python
# -*- encoding: utf-8
"""
Explicit file comparisons.

"Two files are the same" can mean several different things; each function
in this module implements exactly one of those meanings.
"""

import filecmp
import os


# Default number of bytes/characters read from each stream per comparison.
_DEFAULT_BUFFER_SIZE = 8192


def _read_chunk(stream, size):
    """
    Read up to ``size`` bytes/characters from ``stream``, retrying after
    short reads.

    ``read(size)`` is allowed to return fewer items than requested even when
    the stream is not exhausted.  Comparing such short reads against full
    reads from another stream would report a difference that does not exist,
    so keep reading until ``size`` items have been collected or the stream
    reports end-of-data with an empty (falsy) result.

    The first result is returned untouched when it is falsy or already a
    full chunk, so well-behaved streams cost exactly one ``read()`` call.
    """
    first = stream.read(size)
    if not first or len(first) >= size:
        return first

    pieces = [first]
    missing = size - len(first)
    while missing > 0:
        piece = stream.read(missing)
        if not piece:
            break
        pieces.append(piece)
        missing -= len(piece)

    # ``first[:0]`` is an empty value of the same type as the data, so the
    # join works for both binary and text streams.
    return first[:0].join(pieces)


def cmp_contents(f1, f2, *, buffer_size=_DEFAULT_BUFFER_SIZE):
    """
    Returns True if the contents of the file-like objects ``f1`` and ``f2``
    are equal.

    That is, they return the same bytes/text when calling ``read()``.
    A binary stream and a text stream never compare equal.

    Both streams are consumed from their current position; they are not
    rewound or closed.

    ``buffer_size`` is the number of bytes/characters compared at a time
    and must be a positive integer.

    Passing in the same file-like object twice is an error, as is a
    non-positive ``buffer_size``; both raise ``ValueError``.

    """
    # Handle the case where the same stream has been handed in twice.
    #
    # Although they arguably contain the same contents; you can't call
    # read() on them both and get the same bytes/text.  It's probably a
    # sign something has gone wrong, so throw an error.
    if f1 is f2:
        raise ValueError("f1 and f2 are the same stream! %r" % f1)

    # read(0) always returns an empty result, which would make any two
    # streams look identical; reject it rather than return a wrong answer.
    if buffer_size <= 0:
        raise ValueError(
            "buffer_size must be a positive integer, got %r" % (buffer_size,)
        )

    while True:
        chunk1 = _read_chunk(f1, buffer_size)
        chunk2 = _read_chunk(f2, buffer_size)
        if chunk1 != chunk2:
            return False

        # Both streams are exhausted and there's nothing left to read.
        if not chunk1:
            return True


def cmp_path_contents(path1, path2, *, buffer_size=_DEFAULT_BUFFER_SIZE):
    """
    Returns True if the files at paths ``path1`` and ``path2``
    have the same contents.

    That is, the files both contain the same bytes.  Both files are opened
    in binary mode and closed again before returning.

    ``buffer_size`` is passed through to ``cmp_contents``.
    """
    with open(path1, "rb") as f1, open(path2, "rb") as f2:
        return cmp_contents(f1, f2, buffer_size=buffer_size)


def cmp_stat(path1, path2):
    """
    Returns True if ``path1`` and ``path2`` are equal according to the
    shallow comparison done by ``filecmp.cmp()``.

    Files whose os.stat() signatures (file type, size and modification
    time) are identical are taken to be equal without reading them.

    Note that this delegates entirely to ``filecmp.cmp(shallow=True)``:
    when the signatures differ but the sizes match, ``filecmp`` falls back
    to comparing the contents, so two files with identical contents but
    different modification times still compare equal.
    """
    return filecmp.cmp(path1, path2, shallow=True)


def cmp_same_file(path1, path2):
    """
    Returns True if ``path1`` and ``path2`` point to the same file on disk
    (for example, the same path twice or two hard links to one file).

    This is ``os.path.samefile()``, which raises an exception if either
    path cannot be stat'ed.
    """
    return os.path.samefile(path1, path2)
# -*- encoding: utf-8
"""Tests for the comparison functions in ``filecmp2``."""

import io
import os

import pytest

import filecmp2


def _write_file(tmpdir, name, contents):
    """Create ``name`` inside ``tmpdir`` holding ``contents``; return its path."""
    path = tmpdir.join(name)
    path.write(contents)
    return path


def test_file_is_same_as_itself(tmpdir):
    path = _write_file(tmpdir, "greeting.txt", b"hello world")

    assert filecmp2.cmp_path_contents(path1=path, path2=path)
    assert filecmp2.cmp_stat(path1=path, path2=path)
    assert filecmp2.cmp_same_file(path1=path, path2=path)


def test_different_files_are_different(tmpdir):
    path1 = _write_file(tmpdir, "greeting.txt", b"hello world")
    path2 = _write_file(tmpdir, "name.txt", b"lexie")

    assert not filecmp2.cmp_path_contents(path1=path1, path2=path2)
    assert not filecmp2.cmp_stat(path1=path1, path2=path2)
    assert not filecmp2.cmp_same_file(path1=path1, path2=path2)


def test_same_size_and_mtime_but_different_contents(tmpdir):
    path1 = _write_file(tmpdir, "greeting1.txt", b"hello world")
    path2 = _write_file(tmpdir, "greeting2.txt", b"howdy world")

    # Give both files the same timestamps so their stat signatures match
    # even though their contents differ.
    path1_stat = os.stat(str(path1))
    os.utime(str(path2), times=(path1_stat.st_atime, path1_stat.st_mtime))

    assert not filecmp2.cmp_path_contents(path1=path1, path2=path2)
    assert filecmp2.cmp_stat(path1=path1, path2=path2)
    assert not filecmp2.cmp_same_file(path1=path1, path2=path2)


def test_two_files_with_equal_contents_match_contents_but_not_same(tmpdir):
    path1 = _write_file(tmpdir, "greeting1.txt", b"hello world")
    path2 = _write_file(tmpdir, "greeting2.txt", b"hello world")

    assert filecmp2.cmp_path_contents(path1=path1, path2=path2)
    assert filecmp2.cmp_stat(path1=path1, path2=path2)
    assert not filecmp2.cmp_same_file(path1=path1, path2=path2)


def test_comparing_contents_of_file_object_to_itself():
    f = io.BytesIO(b"hello world")

    with pytest.raises(ValueError, match="f1 and f2 are the same stream"):
        filecmp2.cmp_contents(f, f)


def test_can_compare_equal_file_like_objects():
    f1 = io.BytesIO(b"hello world")
    f2 = io.BytesIO(b"hello world")

    assert filecmp2.cmp_contents(f1, f2)


def test_can_compare_equal_file_like_objects_which_are_text():
    f1 = io.StringIO(u"hello world")
    f2 = io.StringIO(u"hello world")

    assert filecmp2.cmp_contents(f1, f2)


def test_mixed_binary_and_text_objects_are_different():
    f1 = io.BytesIO(b"hello world")
    f2 = io.StringIO(u"hello world")

    assert not filecmp2.cmp_contents(f1, f2)


def test_compares_large_files():
    # The current buffer used by ``cmp_contents`` is 8192 bytes; make sure
    # we can detect differences beyond the end of the first buffer.
    common_prefix = b"padding123" * 8200

    f1 = io.BytesIO(common_prefix + b"1")
    f2 = io.BytesIO(common_prefix + b"2")

    assert not filecmp2.cmp_contents(f1=f1, f2=f2)


def test_hard_links_are_the_same(tmpdir):
    path1 = _write_file(tmpdir, "greeting1.txt", b"hello world")
    path2 = tmpdir.join("greeting2.txt")

    # Convert to str before calling into ``os``, matching the os.stat() /
    # os.utime() calls above: os.link() only accepts path-like objects
    # from Python 3.6 onwards.
    os.link(str(path1), str(path2))

    assert filecmp2.cmp_same_file(path1, path2)