├── .gitignore
├── LICENSE
├── README.rst
├── mpt
│   ├── __init__.py
│   ├── __main__.py
│   ├── codes.py
│   ├── defaults.py
│   ├── email.py
│   ├── filemanager.py
│   ├── hashing.py
│   ├── paths.py
│   ├── results.py
│   ├── staging.py
│   └── timing.py
├── mptreport
│   ├── __init__.py
│   ├── __main__.py
│   └── reportcollator.py
└── setup.py
/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # Distribution / packaging 7 | .Python 8 | env/ 9 | build/ 10 | develop-eggs/ 11 | dist/ 12 | downloads/ 13 | eggs/ 14 | .eggs/ 15 | lib/ 16 | lib64/ 17 | parts/ 18 | sdist/ 19 | var/ 20 | *.egg-info/ 21 | .installed.cfg 22 | *.egg 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | =============================== 2 | MPT (Minimum Preservation Tool) 3 | =============================== 4 | 5 | A utility for staging files, calculating and validating file checksums, and comparing checksum values between storage 6 | locations. 
7 | 8 | Requirements 9 | ============ 10 | 11 | * Python (version 3.6+) 12 | * Pip (version 19.0+) 13 | 14 | How to install 15 | ============== 16 | 17 | MPT works best within a `virtual environment `_. To create a new 18 | virtual environment, start a command prompt and enter the following command: 19 | :: 20 | 21 | python -m venv [path-to-venv-directory] 22 | 23 | This will create a directory structure in ``[path-to-venv-directory]`` containing all the necessary configuration and 24 | data files. The virtual environment can be activated by entering one of the following at the command prompt: 25 | 26 | Windows: 27 | :: 28 | 29 | [path-to-venv-directory]\Scripts\activate.bat 30 | 31 | Linux: 32 | :: 33 | 34 | source [path-to-venv-directory]/bin/activate 35 | 36 | When you've activated the virtual environment, install MPT from a Git repository: 37 | :: 38 | 39 | pip install git+http://github.com/britishlibrary/mpt 40 | 41 | 42 | Or from a local source: 43 | :: 44 | 45 | pip install /path/to/mpt-source/ 46 | 47 | 48 | All dependencies should be automatically downloaded and installed as part of pip's install process. 49 | 50 | Configuration 51 | ============= 52 | 53 | In order to automatically e-mail summary reports, MPT requires that three environment variables be set: 54 | :: 55 | 56 | MAIL_SERVER = mail.example.com 57 | MAIL_SERVER_PORT = 587 58 | MAIL_SENDER_ADDRESS = 59 | 60 | An example of ``MAIL_SENDER_ADDRESS`` might be ``Bitwise Checks <bitwise.checks@example.com>`` 61 | 62 | On Windows, these should be set via Control Panel > System > Advanced System Settings > Environment Variables. 63 | 64 | On Linux, these should be added to the ``~/.bash_profile`` or ``~/.profile`` file for the user running MPT. 65 | 66 | How to use 67 | ========== 68 | 69 | MPT has several modes of operation. 70 | 71 | Checksum Creation 72 | ----------------- 73 | 74 | MPT can calculate checksums for an existing collection of files, and store those checksums in a 'checksum 75 | tree' which mimics the directory structure of the original files. Optionally it can also store these checksum values 76 | in a single manifest file. 77 | :: 78 | 79 | mpt create dir -t TREE [-a ALGORITHM] [--formats FORMATS] [-m MANIFEST] [-r] 80 | 81 | The various command line options and arguments are described below. 82 | 83 | Directory to check (required) 84 | """"""""""""""""""""""""""""" 85 | 86 | The directory of files to process. 87 | 88 | Directory for checksum tree (required) 89 | """""""""""""""""""""""""""""""""""""" 90 | 91 | Use the ``-t`` or ``--tree`` option to specify the directory in which the 'checksum tree' should be created. A 92 | checksum file will be created in the tree for each file checked. The name and path of the checksum file will mirror 93 | those of the original file checked. 94 | 95 | Recursive operation (optional) 96 | """""""""""""""""""""""""""""" 97 | 98 | Use the ``-r`` or ``--recursive`` option to process all sub-folders beneath the given directory. By default only 99 | the top-level directory will be processed. 100 | 101 | Specify checksum algorithm (optional) 102 | """"""""""""""""""""""""""""""""""""" 103 | 104 | Use the ``-a`` or ``--algorithm`` option to specify the checksum algorithm to use. A number of different algorithms 105 | are supported (use ``mpt create -h`` to list them all). The default algorithm is ``sha256``. 
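The stored values are plain hexadecimal digests, so a file's recorded checksum can be reproduced or spot-checked outside MPT with standard tools. A minimal sketch using Python's ``hashlib`` (illustrative only - MPT's own hashing code lives in ``mpt/hashing.py``; the 2 MB block size here simply mirrors ``default_blocksize`` in ``mpt/defaults.py``):
::

    import hashlib

    def sha256_digest(path, blocksize=1024 * 2048):
        # Read the file in blocks so memory use stays flat for large files
        digest = hashlib.sha256()
        with open(path, 'rb') as f:
            for block in iter(lambda: f.read(blocksize), b''):
                digest.update(block)
        return digest.hexdigest()

    print(sha256_digest(r'c:\storage\files\image.tif'))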
106 | 107 | Limit to certain file extensions (optional) 108 | """"""""""""""""""""""""""""""""""""""""""" 109 | 110 | Use the ``--formats`` option to limit checksum creation to files with a particular file extension. 111 | 112 | Specify manifest file (optional) 113 | """""""""""""""""""""""""""""""" 114 | Use the ``-m`` or ``--manifest`` option to specify a manifest file to be created in addition to the 'checksum tree'. 115 | 116 | Example of command syntax 117 | """"""""""""""""""""""""" 118 | :: 119 | 120 | mpt create -r c:\storage\files 121 | -t c:\storage\checksums 122 | -m c:\storage\manifest.sha256 123 | --formats tiff tif 124 | 125 | This will create checksums for all files ending in ``tiff`` or ``tif`` in ``c:\storage\files`` and all subdirectories. The SHA256 126 | algorithm will be used, as it is the default. The resulting 'checksum tree' will be created in ``c:\storage\checksums`` 127 | mirroring the original directory structure. A manifest file containing all checksums will also be created at 128 | ``c:\storage\manifest.sha256``, or updated if it already exists. 129 | 130 | Checksum Validation (Checksum Tree) 131 | ----------------------------------- 132 | 133 | MPT can verify the checksums of all files listed in a 'checksum tree' created by the creation or staging mode. 134 | :: 135 | 136 | mpt validate_tree dir -t TREE [-r] 137 | 138 | The various command line options and arguments are described below. 139 | 140 | Data directory root (required) 141 | """""""""""""""""""""""""""""" 142 | 143 | The root directory of files to validate. 144 | 145 | Checksum tree root (required) 146 | """"""""""""""""""""""""""""" 147 | 148 | Use the ``-t`` or ``--tree`` option to specify the root directory of the 'checksum tree' used to validate the data 149 | files. 150 | 151 | Recursive operation (optional) 152 | """""""""""""""""""""""""""""" 153 | 154 | Use the ``-r`` or ``--recursive`` option to process all sub-folders beneath the given directory. By default only 155 | the top-level directory will be processed. 156 | 157 | Example of command syntax 158 | """"""""""""""""""""""""" 159 | :: 160 | 161 | mpt validate_tree -r c:\storage\files -t c:\storage\checksums 162 | 163 | This will validate all data files in ``c:\storage\files`` and all subdirectories. Each file will be validated using its 164 | checksum file in the 'checksum tree' in ``c:\storage\checksums``. 165 | 166 | Checksum Validation (Manifest) 167 | ------------------------------ 168 | 169 | MPT can verify the checksums of all files listed in a manifest file created by the creation or staging mode. 170 | :: 171 | 172 | mpt validate_manifest dir -m MANIFEST [-a ALGORITHM] 173 | 174 | The various command line options and arguments are described below. 175 | 176 | Data directory root (required) 177 | """""""""""""""""""""""""""""" 178 | 179 | The root directory of files to validate. 180 | 181 | Manifest file path (required) 182 | """"""""""""""""""""""""""""" 183 | 184 | Use the ``-m`` or ``--manifest`` option to specify the location of the manifest file used to validate the data 185 | files. 186 | 187 | Specify checksum algorithm (optional) 188 | """"""""""""""""""""""""""""""""""""" 189 | 190 | Use the ``-a`` or ``--algorithm`` option to specify the checksum algorithm to use. A number of different algorithms 191 | are supported (use ``mpt validate_manifest -h`` to list them all). The default algorithm is ``sha256``. 
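For reference, each line of a manifest written by MPT follows the conventional ``sha256sum``-style layout - a hexadecimal digest, a single space, then ``*`` followed by the file's path relative to the data directory root. This mirrors the format string used when creating checksums in ``mpt/filemanager.py``; the digest below is purely illustrative:
::

    9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08 *\dir1\image.tif

During validation, MPT recomputes each listed file's checksum with the chosen algorithm and compares it against the recorded digest.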
192 | 193 | Example of command syntax 194 | """"""""""""""""""""""""" 195 | :: 196 | 197 | mpt validate_manifest c:\storage\files -m c:\storage\manifest.sha256 198 | 199 | This will validate all data files in ``c:\storage\files`` and all subdirectories. Each file will be validated using its 200 | entry in the manifest file ``c:\storage\manifest.sha256``. 201 | 202 | Checksum Comparison (Checksum Trees) 203 | ------------------------------------ 204 | 205 | MPT can compare the checksums stored in a 'checksum tree' to other 'trees' stored in different locations in 206 | order to detect any discrepancies. 207 | :: 208 | 209 | mpt compare_trees dir -t OTHER_TREES 210 | 211 | The various command line options and arguments are described below. 212 | 213 | Checksum tree root (required) 214 | """"""""""""""""""""""""""""" 215 | 216 | The root directory of the master checksum tree to use as the basis for comparison. 217 | 218 | Other checksum tree roots (required) 219 | """""""""""""""""""""""""""""""""""" 220 | 221 | Use the ``-t`` or ``--trees`` option to specify the location of other checksum trees to compare to the master. 222 | 223 | Example of command syntax 224 | """"""""""""""""""""""""" 225 | :: 226 | 227 | mpt compare_trees c:\storage\checksums 228 | -t q:\backup_storage_1\checksums z:\backup_storage_2\checksums 229 | 230 | This will compare all checksum files in the 'checksum tree' located in ``c:\storage\checksums`` against the 231 | corresponding files in ``q:\backup_storage_1\checksums`` and ``z:\backup_storage_2\checksums`` and highlight any 232 | discrepancies. 233 | 234 | Checksum Comparison (Manifests) 235 | ------------------------------- 236 | 237 | MPT can compare the checksums stored in a manifest file to manifests in other locations in order to detect any 238 | discrepancies. 239 | :: 240 | 241 | mpt compare_manifests manifest -m OTHER_MANIFESTS 242 | 243 | The various command line options and arguments are described below. 244 | 245 | Master manifest file (required) 246 | """"""""""""""""""""""""""""""" 247 | 248 | The path to the master manifest file to use as the basis for comparison. 249 | 250 | Other manifest files (required) 251 | """"""""""""""""""""""""""""""" 252 | 253 | Use the ``-m`` or ``--other_manifests`` option to specify the location of other manifests to compare to the master. 254 | 255 | Example of command syntax 256 | """"""""""""""""""""""""" 257 | :: 258 | 259 | mpt compare_manifests c:\storage\manifest.sha256 260 | -m q:\backup_storage_1\manifest.sha256 z:\backup_storage_2\manifest.sha256 261 | 262 | This will compare all entries in the manifest file ``c:\storage\manifest.sha256`` against the 263 | corresponding entries in ``q:\backup_storage_1\manifest.sha256`` and ``z:\backup_storage_2\manifest.sha256`` and highlight 264 | any discrepancies. 265 | 266 | File Staging 267 | ------------ 268 | 269 | File staging involves processing all files in a particular directory and moving them to one or more storage 270 | locations, calculating their checksums in the process. 271 | 272 | If staging is successful for all destinations then the original file will be removed from the staging area. If any part 273 | of the staging process fails for a particular file, then the entire staging process will be backed out for that file. 274 | This is to ensure that the staged file is present either in all destinations or in none. 
275 | 276 | For example, if a file is successfully copied to three out of four destinations but fails on the fourth, the 277 | file will be removed from the three destinations where the copy succeeded. The final summary report would describe the details of the 278 | error condition for the one destination which failed, while the other three would be listed as "Unstaged." 279 | :: 280 | 281 | mpt stage dir -d DESTINATIONS [-a ALGORITHM] [-t TREES] [-m MANIFESTS] [--no-confirm] [--max-failures MAX_FAILURES] [--keep-staging-folders] 282 | 283 | The various command line options and arguments are described below. 284 | 285 | Staging Directory (required) 286 | """""""""""""""""""""""""""" 287 | 288 | The directory of files to be staged. 289 | 290 | Staging Destinations (required) 291 | """"""""""""""""""""""""""""""" 292 | 293 | Use the ``-d`` or ``--destinations`` option to specify the root directory of each staging destination (i.e. where the 294 | files should be moved to). These destinations can be in any order, but the order must be consistent between this option 295 | and the ``--trees`` and ``--manifests`` options if they are used. 296 | 297 | If the ``--trees`` option to specify 'checksum tree' locations is omitted, then the files will actually be staged to a 298 | subdirectory named ``files`` directly beneath each specified staging destination. 299 | 300 | Specify checksum algorithm (optional) 301 | """"""""""""""""""""""""""""""""""""" 302 | 303 | Use the ``-a`` or ``--algorithm`` option to specify the checksum algorithm to use. A number of different algorithms 304 | are supported (use ``mpt stage -h`` to list them all). The default algorithm is ``sha256``. 305 | 306 | Destination checksum trees (optional) 307 | """"""""""""""""""""""""""""""""""""" 308 | 309 | Use the ``-t`` or ``--trees`` option to specify the root directory of each destination checksum tree (i.e. where the 310 | checksums should be stored in each staging destination). 311 | 312 | If provided, then these destination tree paths *must* be listed in the same order as the staging destinations listed for 313 | the ``--destinations`` option - e.g. the first path listed for ``-t`` must be for the checksum tree corresponding to 314 | the first destination listed for the ``-d`` option, and so on. 315 | 316 | If this option is omitted altogether, then checksum trees will actually be created in a subdirectory named ``checksums`` 317 | directly beneath each specified staging destination. 318 | 319 | Destination manifest files (optional) 320 | """"""""""""""""""""""""""""""""""""" 321 | 322 | Use the ``-m`` or ``--manifests`` option to specify the location of a manifest file to create or update in each staging 323 | destination. 324 | 325 | If provided, then these manifest paths *must* be listed in the same order as the staging destinations listed for the 326 | ``--destinations`` option - e.g. the first manifest listed for ``-m`` must be for the manifest corresponding to the 327 | first destination listed for the ``-d`` option, and so on. 328 | 329 | If this option is omitted altogether, then no manifest files will be created. 330 | 331 | Bypass confirmation prompt (optional) 332 | """"""""""""""""""""""""""""""""""""" 333 | 334 | By default, staging mode will prompt the user to confirm that all file paths are correct before commencing. Using the 335 | ``--no-confirm`` option will bypass this prompt. 
The intention is for the user to prepare and test their command-line 336 | syntax interactively using the confirmation prompt as a guide, and use the ``--no-confirm`` option when scheduling the 337 | staging process to run automatically. 338 | 339 | Override maximum number of consecutive failures (optional) 340 | """""""""""""""""""""""""""""""""""""""""""""""""""""""""" 341 | 342 | By default, staging will be aborted if 10 consecutive write failures occur. Use the ``--max-failures`` option to 343 | override this threshold. 344 | 345 | Keep empty folders in staging directory (optional) 346 | """""""""""""""""""""""""""""""""""""""""""""""""" 347 | 348 | By default, any empty folders left in the staging directory will be deleted once staging is complete. Using the 349 | ``--keep-staging-folders`` option will change this behaviour and leave empty folders untouched. This may be useful in 350 | cases where a complex hierarchical structure needs to be maintained for new files, and keeping an empty folder structure 351 | in the staging directory is easier than recreating it for each run. 352 | 353 | Examples of command syntax 354 | """""""""""""""""""""""""" 355 | 356 | Example 1 (use defaults for file & checksum destinations): 357 | :: 358 | 359 | mpt stage f:\staging 360 | -d c:\storage q:\backup_storage_1 z:\backup_storage_2 361 | 362 | 363 | This will process all files in ``f:\staging`` and create output in the following locations: 364 | 365 | =========== ============================= ================================= ======== 366 | Destination Files Checksums Manifest 367 | ----------- ----------------------------- --------------------------------- -------- 368 | 1 ``c:\storage\files`` ``c:\storage\checksums`` ``None`` 369 | 2 ``q:\backup_storage_1\files`` ``q:\backup_storage_1\checksums`` ``None`` 370 | 3 ``z:\backup_storage_2\files`` ``z:\backup_storage_2\checksums`` ``None`` 371 | =========== ============================= ================================= ======== 372 | 373 | Example 2 (use specific checksum & manifest locations): 374 | :: 375 | 376 | mpt stage f:\staging 377 | -d c:\storage\datastore q:\backup_storage_1\datastore z:\backup_storage_2\file_data 378 | -t c:\storage\checksumdata q:\backup_storage_1\checksumdata 379 | z:\backup_storage_2\meta_data\checksums 380 | -m c:\storage\manifest.sha256 q:\backup_storage_1\manifest.sha256 381 | z:\backup_storage_2\meta_data\manifest.sha256 382 | 383 | This will process all files in ``f:\staging`` and create output in the following locations: 384 | 385 | =========== ================================= =========================================== ================================================= 386 | Destination Files Checksums Manifest 387 | ----------- --------------------------------- ------------------------------------------- ------------------------------------------------- 388 | 1 ``c:\storage\datastore`` ``c:\storage\checksumdata`` ``c:\storage\manifest.sha256`` 389 | 2 ``q:\backup_storage_1\datastore`` ``q:\backup_storage_1\checksumdata`` ``q:\backup_storage_1\manifest.sha256`` 390 | 3 ``z:\backup_storage_2\file_data`` ``z:\backup_storage_2\meta_data\checksums`` ``z:\backup_storage_2\meta_data\manifest.sha256`` 391 | =========== ================================= =========================================== ================================================= 392 | 393 | Common Options 394 | -------------- 395 | 396 | The following options can be used with all modes of operation. 
They should be used in the command line *before* the 397 | mode of operation (e.g. create, stage, etc) is specified. 398 | 399 | Number of processes 400 | """"""""""""""""""" 401 | 402 | Use the ``-p`` or ``--num-processes`` option to specify the number of concurrent processes MPT should use. The 403 | default value is 2. The ideal number will depend on the number of CPUs and processor cores the host machine has. 404 | 405 | E-mail recipients 406 | """""""""""""""" 407 | 408 | Use the ``-e`` or ``--email-results`` option to specify e-mail recipients for MPT's summary reports. 409 | 410 | Output directory 411 | """""""""""""""" 412 | 413 | Use the ``-o`` or ``--output`` option to specify the root directory used to store reports. Subdirectories will be 414 | created beneath this directory for each type of report (creation, validation, comparison and staging), and a separate 415 | dated directory will be created each time MPT runs. 416 | 417 | Disable file count 418 | """""""""""""""""" 419 | 420 | Normally MPT will count the number of files to be processed before it starts. When run interactively, this can provide 421 | a useful picture of its progress - however, this is at the cost of potentially taking a long time to begin processing, 422 | as all files have to be counted before processing can begin. Use the ``--no-count`` option to skip file counting 423 | and simply display a count of how many files have been processed so far. 424 | 425 | Use absolute path in reports 426 | """""""""""""""""""""""""""" 427 | 428 | By default, the summary reports produced by MPT show each file's path relative to the root directory specified 429 | on the command line. Use the ``--absolute-path`` option to instead show an absolute path. Note that this may include 430 | a drive letter (on Windows) or mount point (on Linux) which does not exist for all users. 431 | 432 | Override cache size 433 | """"""""""""""""""" 434 | 435 | MPT produces its output reports as it is running. By default, it caches 1000 records in memory before writing them to 436 | disk. To override this setting, use the ``--cache-size`` option to specify a different number of records. A higher 437 | value will result in higher memory usage, whereas a lower number will cause more frequent writing to disk. Depending on 438 | the number of files being processed by MPT, adjustments to the cache size may improve overall performance. 439 | 440 | Example of command syntax 441 | """"""""""""""""""""""""" 442 | :: 443 | 444 | mpt --email-results recipient@example.com recipient2@example.com 445 | --num-processes 8 446 | --no-count 447 | --cache-size 0 448 | --output c:\storage\reports 449 | validate_tree c:\storage\files 450 | --tree c:\storage\checksums 451 | 452 | This will validate the files stored in ``c:\storage\files`` using the checksum tree in ``c:\storage\checksums``, 453 | using 8 concurrent processes and without counting the files to be processed. Results will be written out to disk 454 | immediately rather than being cached. The resulting reports will be written to the directory ``c:\storage\reports`` 455 | and sent via e-mail to the two listed recipients. 456 | 457 | Licence 458 | ======= 459 | 460 | This project is licensed under the Apache License 2.0. 
461 | For details see the accompanying LICENSE file or visit: 462 | 463 | http://www.apache.org/licenses/LICENSE-2.0 464 | 465 | Copyright (c) 2020, The British Library 466 | -------------------------------------------------------------------------------- /mpt/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '1.2.1' 2 | -------------------------------------------------------------------------------- /mpt/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | 5 | from mpt import __version__ 6 | from .defaults import * 7 | from .filemanager import FileManager 8 | from .hashing import algorithms_supported 9 | from .staging import stage_files 10 | 11 | 12 | def main(args=None): 13 | if args is None: 14 | args = sys.argv[1:] 15 | 16 | # Process CLI arguments 17 | ap = argparse.ArgumentParser(prog="mpt", 18 | description="Minimum Preservation Tool: file staging and checksum validation " 19 | "utilities") 20 | 21 | actionparser = ap.add_subparsers(title='Actions', dest='actions') 22 | # Args for creating manifests 23 | create_parser = actionparser.add_parser("create") 24 | create_parser.add_argument("dir", help="Directory of files to process") 25 | create_parser.add_argument("-a", "--algorithm", dest="algorithm", 26 | choices=algorithms_supported, 27 | default=default_algorithm, 28 | help="the checksum algorithm to use [default: {0}]".format(default_algorithm)) 29 | create_parser.add_argument("--formats", dest="formats", nargs="+", help="list of file extensions to include (only)") 30 | create_parser.add_argument("-m", dest="manifest", help="the manifest to create [default: None]") 31 | create_parser.add_argument("-r", "--recursive", dest="recursive", action="store_true", 32 | help="recurse into sub-folders [default: false]") 33 | create_parser.add_argument("-t", "--tree", required=True, dest="tree", 34 | help="directory in which to create 'checksum tree' mirroring original data structure") 35 | 36 | # Args for validating manifests 37 | validate_m_parser = actionparser.add_parser("validate_manifest") 38 | validate_m_parser.add_argument("dir", help="Directory of files to process") 39 | validate_m_parser.add_argument("-a", "--algorithm", dest="algorithm", choices=algorithms_supported, 40 | default=default_algorithm, 41 | help="the checksum algorithm to use [default: {0}]".format(default_algorithm)) 42 | validate_m_parser.add_argument("-m", required=True, dest="manifest", help="the manifest to validate") 43 | 44 | # Args for validating checksum tree 45 | validate_t_parser = actionparser.add_parser("validate_tree") 46 | validate_t_parser.add_argument("dir", help="Directory of files to process") 47 | validate_t_parser.add_argument("-r", "--recursive", dest="recursive", action="store_true", 48 | help="recurse into sub-folders [default: false]") 49 | validate_t_parser.add_argument("-t", "--tree", required=True, dest="tree", 50 | help="directory containing checksum files mirroring original data structure") 51 | 52 | # Args for comparing checksum trees 53 | compare_t_parser = actionparser.add_parser("compare_trees") 54 | compare_t_parser.add_argument("dir", help="root directory of master checksum tree") 55 | compare_t_parser.add_argument("-t", "--trees", required=True, dest="other_paths", nargs="+", 56 | help="list of other 'checksum tree' root directories to compare to master") 57 | 58 | # Args for comparing manifests 59 | compare_m_parser = 
actionparser.add_parser("compare_manifests") 60 | compare_m_parser.add_argument("manifest", help="master manifest file to check") 61 | compare_m_parser.add_argument("-m", "--other_manifests", required=True, dest="other_paths", nargs="+", 62 | help="list of other manifests to compare to master") 63 | 64 | # Args for staging files 65 | 66 | stage_description = ("Move files from a staging directory to one or more destination directories, calculating " 67 | "checksums and saving to a checksum tree and optional manifest file for each destination.") 68 | 69 | stage_epilog = ("Any number of destination directories can be specified (using the -d argument), but the number of " 70 | "trees (-t) and manifests (-m) must either match the number of destination directories or be " 71 | "omitted entirely. If no trees are specified, then files will be staged to a 'files' directory " 72 | " in each destination root, and checksums created in a corresponding 'checksums' directory.") 73 | 74 | stage_parser = actionparser.add_parser("stage", description=stage_description, epilog=stage_epilog) 75 | stage_parser.add_argument("dir", help="Directory of files to process") 76 | stage_parser.add_argument("-a", "--algorithm", dest="algorithm", 77 | choices=algorithms_supported, default=default_algorithm, 78 | help="the checksum algorithm to use [default: {0}]".format(default_algorithm)) 79 | stage_parser.add_argument("-t", "--trees", dest="trees", nargs="+", default=[], 80 | help="list of directories in which to create 'checksum tree' mirroring original data " 81 | "structure. Should match the number of destination directories in number, or be " 82 | "omitted") 83 | stage_parser.add_argument("-m", "--manifests", dest="manifests", nargs="+", default=[], 84 | help="list of manifest files to create. 
Should match the number of destination " 85 | "directories in number, or be omitted ") 86 | stage_parser.add_argument("--no-confirm", dest="no_confirm", action="store_true", 87 | help="run without requesting confirmation") 88 | stage_parser.add_argument("--max-failures", dest="max_failures", type=int, default=None, 89 | help="maximum number of consecutive write failures allowed " 90 | "[default: {0}]".format(max_failures)) 91 | stage_parser.add_argument("--keep-staging-folders", dest="keep_empty_folders", action="store_true", 92 | help="keep empty folders in staging directory after completion") 93 | stage_parser.add_argument("-d", "--destinations", required=True, dest="targets", nargs="+", metavar="DESTINATIONS", 94 | help="list of destination directories into which the files should be staged") 95 | 96 | # Common args 97 | 98 | ap.add_argument("-v", "--version", action="version", version='%(prog)s v' + __version__, 99 | help="display program version") 100 | ap.add_argument("-p", "--num-processes", dest="processes", default=default_processes, type=int, 101 | help="number of concurrent processes to run [default: {0}]".format(default_processes)) 102 | ap.add_argument("-e", "--email-results", dest="email", metavar="ADDRESS", nargs="+", 103 | help="email recipients for results [default: none]") 104 | ap.add_argument("-o", "--output", dest="output", default=base_output_dir, 105 | help="directory in which to create reports [default: {0}]".format(base_output_dir)) 106 | ap.add_argument("--no-count", dest="count_files", action="store_false", help="don't count files before processing") 107 | ap.add_argument("--absolute-path", dest="abspath", action="store_true", help="use absolute path in reports") 108 | ap.add_argument("--cache-size", dest="cache_size", type=int, help="number of results to cache before writing to " 109 | "disk") 110 | 111 | args = ap.parse_args() 112 | 113 | if hasattr(args, "dir"): 114 | if not os.path.exists(args.dir): 115 | print("Specified directory ({0}) does not exist.".format(args.dir)) 116 | ap.print_help() 117 | return 118 | try: 119 | if args.actions == 'stage': 120 | stage_files(args) 121 | elif args.actions == 'create': 122 | fm = FileManager(primary_path=args.dir, cs_dir=args.tree, manifest_file=args.manifest, 123 | algorithm=args.algorithm, recursive=args.recursive, count_files=args.count_files, 124 | num_procs=args.processes, email=args.email, formats=args.formats, 125 | output_dir=args.output, absolute_path=args.abspath, cache_size=args.cache_size) 126 | fm.create_checksums() 127 | elif args.actions == 'validate_manifest': 128 | fm = FileManager(primary_path=args.dir, manifest_file=args.manifest, 129 | algorithm=args.algorithm, num_procs=args.processes, count_files=args.count_files, 130 | email=args.email, output_dir=args.output, absolute_path=args.abspath, 131 | cache_size=args.cache_size) 132 | fm.validate_manifest() 133 | elif args.actions == 'validate_tree': 134 | fm = FileManager(primary_path=args.dir, cs_dir=args.tree, recursive=args.recursive, 135 | num_procs=args.processes, count_files=args.count_files, email=args.email, output_dir=args.output, 136 | absolute_path=args.abspath, cache_size=args.cache_size) 137 | fm.validate_tree() 138 | elif args.actions == "compare_trees": 139 | fm = FileManager(primary_path=args.dir, cs_dir=args.dir, num_procs=args.processes, count_files=args.count_files, 140 | email=args.email, output_dir=args.output, other_paths=args.other_paths, recursive=True, 141 | absolute_path=args.abspath, cache_size=args.cache_size) 142 | 
fm.compare_trees() 143 | elif args.actions == "compare_manifests": 144 | fm = FileManager(primary_path=args.manifest, num_procs=args.processes, count_files=args.count_files, 145 | email=args.email, output_dir=args.output, other_paths=args.other_paths, 146 | absolute_path=args.abspath, cache_size=args.cache_size) 147 | fm.compare_manifests() 148 | except AttributeError as e: 149 | print(str(e)) 150 | ap.print_help() 151 | return 152 | 153 | 154 | if __name__ == '__main__': 155 | main() 156 | -------------------------------------------------------------------------------- /mpt/codes.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class Action(Enum): 5 | CREATE = "Checksum creation" 6 | VALIDATE_MANIFEST = "Manifest validation" 7 | VALIDATE_TREE = "Checksum tree validation" 8 | STAGE_FILES = "File staging" 9 | COMPARE_TREES = "Checksum tree comparison" 10 | COMPARE_MANIFESTS = "Manifest comparison" 11 | 12 | 13 | class StagingStatus(Enum): 14 | READY = "Ready for staging" 15 | STAGED = "Staged" 16 | DUPLICATE_FILE = "Duplicate data file" 17 | DUPLICATE_CHECKSUM = "Duplicate checksum file" 18 | DATA_WRITE_FAILURE = "Failed to write data file" 19 | CHECKSUM_WRITE_FAILURE = "Failed to write checksum file" 20 | CHECKSUM_MISMATCH = "Checksum mismatch" 21 | COULD_NOT_REMOVE = "Could not remove unstaged file" 22 | IN_PROGRESS = "Staging in progress" 23 | UNSTAGED = "Unstaged" 24 | 25 | 26 | class Result(Enum): 27 | pass 28 | 29 | 30 | class ComparisonResult(Result): 31 | MATCHED = "File checksum matches on all nodes" 32 | UNMATCHED = "File checksum does not match on all nodes" 33 | MISSING = "Checksum missing from node" 34 | OSERROR = "OS Error: cannot open checksum file" 35 | 36 | 37 | class CreationResult(Result): 38 | ADDED = "File added to checksum tree" 39 | SKIPPED = "File already listed in checksum tree" 40 | FAILED = "Hash generation failed for file" 41 | 42 | 43 | class ValidationResult(Result): 44 | VALID = "File found and checksum valid" 45 | INVALID = "File found but checksum not valid" 46 | MISSING = "File not found" 47 | ADDITIONAL = "Unexpected file found" 48 | OSERROR = "OS Error: cannot open file" 49 | 50 | 51 | ExceptionsResults = [ComparisonResult.UNMATCHED, ComparisonResult.MISSING, ComparisonResult.OSERROR, 52 | CreationResult.ADDED, CreationResult.FAILED, 53 | ValidationResult.INVALID, ValidationResult.MISSING, ValidationResult.ADDITIONAL, 54 | ValidationResult.OSERROR] 55 | -------------------------------------------------------------------------------- /mpt/defaults.py: -------------------------------------------------------------------------------- 1 | from os.path import expanduser, join 2 | 3 | default_algorithm = "sha256" 4 | default_blocksize = 1024 * 2048 5 | default_cachesize = 1000 6 | default_processes = 2 7 | base_output_dir = join(join(expanduser("~"), "mpt")) 8 | mail_size_threshold = 10000000 9 | max_failures = 10 10 | fallback_to_insecure_smtp = False 11 | email_only_exceptions = True 12 | -------------------------------------------------------------------------------- /mpt/email.py: -------------------------------------------------------------------------------- 1 | import os 2 | import smtplib 3 | import ssl 4 | import tempfile 5 | import zipfile 6 | from email import encoders 7 | from email.mime.base import MIMEBase 8 | from email.mime.multipart import MIMEMultipart 9 | from email.mime.text import MIMEText 10 | from typing import List 11 | 12 | from .defaults import 
fallback_to_insecure_smtp, mail_size_threshold 13 | 14 | 15 | def send_email(subject: str, recipients: List, message: str, 16 | attachments: List = None, zip_files: bool = False): 17 | """ Send an e-mail to a recipient by connecting to an SMTP server 18 | :param subject: the e-mail subject 19 | :param recipients: a list of e-mail addresses 20 | :param message: the text of the message 21 | :param attachments: a list of file paths to attach to the e-mail 22 | :param zip_files: boolean value indicating whether the attachments should be compressed into a ZIP archive 23 | """ 24 | 25 | mail_server = os.environ['MAIL_SERVER'] 26 | mail_server_port = os.environ['MAIL_SERVER_PORT'] 27 | mail_address = os.environ['MAIL_SENDER_ADDRESS'] 28 | 29 | mail_msg = MIMEMultipart() 30 | mail_msg['Subject'] = subject 31 | mail_msg['From'] = mail_address 32 | mail_msg['BCC'] = ','.join(recipients) 33 | mail_msg.attach(MIMEText(message)) 34 | if zip_files: 35 | with tempfile.TemporaryFile(prefix="reports", suffix="zip") as zf: 36 | zip = zipfile.ZipFile(zf, 'w', zipfile.ZIP_DEFLATED) 37 | for path in attachments: 38 | zip.write(path, arcname=os.path.basename(path)) 39 | zip.close() 40 | size = zf.tell() 41 | if size < mail_size_threshold: 42 | zf.seek(0) 43 | part = MIMEBase("application", "zip") 44 | part.set_payload(zf.read()) 45 | encoders.encode_base64(part) 46 | part.add_header('Content-Disposition', 47 | 'attachment; filename="reports.zip"') 48 | mail_msg.attach(part) 49 | else: 50 | if attachments is not None: 51 | for path in attachments: 52 | part = MIMEBase("application", "octet-stream") 53 | with open(path, 'rb') as file: 54 | part.set_payload(file.read()) 55 | encoders.encode_base64(part) 56 | part.add_header('Content-Disposition', 57 | 'attachment; filename="{}"'.format(os.path.basename(path))) 58 | mail_msg.attach(part) 59 | 60 | server = smtplib.SMTP(mail_server, mail_server_port) 61 | try: 62 | server.ehlo() 63 | if mail_server_port == '587': 64 | context = ssl.SSLContext(ssl.PROTOCOL_TLS) 65 | server.starttls(context=context) 66 | server.ehlo() 67 | server.sendmail(mail_address, recipients, mail_msg.as_string()) 68 | except ssl.SSLError as ssl_e: 69 | print('SSL error: ' + str(ssl_e)) 70 | if fallback_to_insecure_smtp: 71 | print("Falling back to insecure SMTP") 72 | server = smtplib.SMTP(mail_server, 25) 73 | server.ehlo() 74 | server.sendmail(mail_address, recipients, mail_msg.as_string()) 75 | pass 76 | except Exception as e: 77 | print("Cannot send email, details: " + str(e)) 78 | finally: 79 | try: 80 | server.quit() 81 | except smtplib.SMTPServerDisconnected: 82 | pass 83 | -------------------------------------------------------------------------------- /mpt/filemanager.py: -------------------------------------------------------------------------------- 1 | import mmap 2 | import multiprocessing 3 | import os 4 | 5 | from tqdm import tqdm 6 | 7 | from .codes import (Action, ComparisonResult, CreationResult, 8 | ExceptionsResults, ValidationResult) 9 | from .defaults import * 10 | from .email import send_email 11 | from .hashing import hash_file, algorithms_supported 12 | from .paths import fix_path 13 | from .results import ReportHandler 14 | 15 | 16 | def scan_tree(path, recursive=False, formats: list = None): 17 | """ 18 | A generator to return all files within a directory. 
19 | :param path: top-level directory to scan 20 | :param recursive: true if the scan should be recursive 21 | :param formats: an optional list of file extensions - if provided, the scan will be limited to files with these 22 | extensions 23 | :return: an iterable sequence of files found by the scan 24 | """ 25 | for entry in os.scandir(path): 26 | try: 27 | if entry.is_dir(follow_symlinks=False) and recursive: 28 | yield from scan_tree(entry.path, recursive, formats) 29 | else: 30 | if entry.is_file(): 31 | if formats is None: 32 | yield entry.path 33 | else: 34 | if entry.path.endswith(tuple(formats)): 35 | yield entry.path 36 | except PermissionError: 37 | pass 38 | 39 | 40 | def walk_tree(path, recursive=False, formats: list = None): 41 | """ 42 | Alternative generator to get all files in directory. May be useful in the 43 | event of network issues 44 | :param path: top-level directory to scan 45 | :param recursive: true if the scan should be recursive 46 | :param formats: an optional list of file extensions - if provided, the scan will be limited to files with these 47 | extensions 48 | :return: an iterable sequence of files found by the walk 49 | """ 50 | if recursive: 51 | for root, dirs, files in os.walk(path): 52 | for file in files: 53 | full_path = os.path.join(root, file) 54 | if formats is None: 55 | yield full_path 56 | else: 57 | if full_path.endswith(tuple(formats)): 58 | yield full_path 59 | else: 60 | for file in os.listdir(path): 61 | full_path = os.path.join(path, file) 62 | if os.path.isfile(full_path): 63 | if formats is None: 64 | yield full_path 65 | else: 66 | if full_path.endswith(tuple(formats)): 67 | yield full_path 68 | 69 | 70 | def count_lines(file_path: str): 71 | """ 72 | Count the number of lines in a text file 73 | :param file_path: path to the file 74 | :return: the number of lines 75 | """ 76 | with open(file_path, 'r', errors='ignore') as f: 77 | for i, l in enumerate(f): 78 | pass 79 | return i + 1 80 | 81 | 82 | def iterate_manifest(file_path: str): 83 | """ 84 | A generator to read all valid lines in a checksum manifest file 85 | :param file_path: path to the manifest file 86 | :return: an iterable sequence of list items, each containing the components [checksum, file path] of a 87 | manifest record 88 | """ 89 | with open(file_path, 'r', encoding='utf8', errors='surrogateescape') as in_file: 90 | for line in in_file: 91 | line_s = line.rstrip('\r\n').split(' ', 1) 92 | if len(line_s) > 1: 93 | yield line_s 94 | 95 | 96 | class FileManager: 97 | """ 98 | The FileManager class, used for all MPT operations except staging. 99 | """ 100 | primary_path = None 101 | cs_dir = None 102 | manifest_file = None 103 | email = None 104 | formats = None 105 | last_action = None 106 | output_dir = base_output_dir 107 | algorithm = default_algorithm 108 | num_procs = default_processes 109 | blocksize = default_blocksize 110 | cache_size = default_cachesize 111 | count_files = True 112 | recursive = True 113 | absolute_path = False 114 | other_paths = [] 115 | report_handler = None 116 | debug_mode = False 117 | 118 | def __init__(self, primary_path: str, cs_dir: str = None, manifest_file: str = None, 119 | algorithm: str = None, blocksize: int = None, num_procs: int = None, 120 | recursive: bool = False, count_files: bool = True, email: list = None, 121 | formats: list = None, output_dir: str = None, other_paths: list = None, 122 | absolute_path: bool = False, cache_size: int = None): 123 | """ 124 | Initialisation function for the FileManager class. 
125 | :param primary_path: path to the primary directory (containing data files or checksum files, depending on 126 | the action being carried out) 127 | :param cs_dir: the top-level directory of the "checksum tree" used to hold checksum files 128 | :param manifest_file: path to the manifest file being used 129 | :param algorithm: the checksum algorithm to be used 130 | :param blocksize: the block size used for I/O operations when calculating checksums 131 | :param num_procs: the number of concurrent processes to spawn 132 | :param recursive: true if directories beneath primary_path should be processed recursively 133 | :param count_files: true to count files prior to processing (increases startup time) 134 | :param email: list of email addresses to send reports to on completion 135 | :param formats: list of file extensions to restrict checksum creation to 136 | :param output_dir: base directory in which report subfolders should be created 137 | :param other_paths: other directories containing checksum information (trees or manifests) used when comparing 138 | checksums 139 | :param absolute_path: true if absolute paths are required in reports 140 | :param cache_size: number of output records to cache in memory before writing to disk 141 | """ 142 | self.primary_path = os.path.abspath(primary_path) 143 | if not self.primary_path.endswith(os.sep): 144 | self.primary_path = self.primary_path + os.sep 145 | if cs_dir is not None: 146 | self.cs_dir = os.path.abspath(cs_dir) 147 | self.recursive = recursive 148 | self.count_files = count_files 149 | self.email = email 150 | self.formats = formats 151 | self.absolute_path = absolute_path 152 | self.other_paths = other_paths 153 | if manifest_file is not None: 154 | self.manifest_file = os.path.abspath(manifest_file) 155 | if output_dir is not None: 156 | self.output_dir = output_dir 157 | if algorithm is not None: 158 | self.algorithm = algorithm 159 | if blocksize is not None: 160 | self.blocksize = blocksize 161 | if num_procs is not None: 162 | self.num_procs = num_procs 163 | if cache_size is not None: 164 | self.cache_size = cache_size 165 | if self.debug_mode: 166 | for k, v in self.__dict__.items(): 167 | print("{}: {}".format(k, v)) 168 | 169 | def _email_report(self): 170 | """ 171 | Email the results of checksum operations to the configured recipients 172 | """ 173 | mail_body = self.report_handler.summary() 174 | if self.report_handler.errors_detected: 175 | mail_subject = "BL MPT {}: Errors encountered".format(self.last_action.value) 176 | else: 177 | if self.last_action in [Action.VALIDATE_MANIFEST, Action.VALIDATE_TREE, Action.COMPARE_MANIFESTS, 178 | Action.COMPARE_TREES]: 179 | mail_subject = "BL MPT {}: No errors encountered".format(self.last_action.value) 180 | elif self.last_action == Action.CREATE: 181 | if self.report_handler.results[CreationResult.ADDED] == 0: 182 | mail_subject = "BL MPT {}: No new files".format(self.last_action.value) 183 | else: 184 | mail_subject = "BL MPT {}: New files detected".format(self.last_action.value) 185 | 186 | if email_only_exceptions: 187 | exceptions = [f.name.lower() for f in ExceptionsResults] 188 | attachments = [os.path.join(self.report_handler.out_dir, f) for f in os.listdir(self.report_handler.out_dir) 189 | if (os.path.isfile(os.path.join(self.report_handler.out_dir, f)) and f.endswith("csv") 190 | and os.path.splitext(f)[0] in exceptions)] 191 | else: 192 | attachments = [os.path.join(self.report_handler.out_dir, f) for f in os.listdir(self.report_handler.out_dir) 193 | if 
(os.path.isfile(os.path.join(self.report_handler.out_dir, f)) and f.endswith("csv"))] 194 | 195 | size = sum(os.path.getsize(f) for f in attachments) 196 | zip = size >= mail_size_threshold 197 | send_email(subject=mail_subject, recipients=self.email, message=mail_body, attachments=attachments, 198 | zip_files=zip) 199 | 200 | def _normalise_path(self, file_path: str): 201 | """ 202 | Normalise a file path according to the configured absolute_path parameter 203 | :param file_path: the reported file path in its expected relative form 204 | :return: the file path in either absolute or relative form depending on the FileManager's absolute_path setting 205 | """ 206 | if file_path[0] == '*': 207 | if self.absolute_path: 208 | return file_path.replace('*', self.primary_path) 209 | return file_path 210 | 211 | def _show_results(self): 212 | """ 213 | Display the results of checksum operations on screen and send emails as required 214 | """ 215 | summary = self.report_handler.summary() 216 | print(summary) 217 | self.report_handler.write_summary() 218 | 219 | if self.email is not None: 220 | self._email_report() 221 | 222 | def _check_for_cs_file(self, data_file_path: str): 223 | """ 224 | Checks whether a checksum file exists in a tree for the given data file 225 | :param data_file_path: path to the data file 226 | :return: a tuple in the form (relative path to data file, validation result) or (None, None) if file exists 227 | """ 228 | rel_path = os.path.relpath(data_file_path, self.primary_path) 229 | for ext in algorithms_supported: 230 | cs_file_path = fix_path(os.path.join(self.cs_dir, rel_path) + '.' + ext) 231 | if os.path.exists(cs_file_path): 232 | return None, None 233 | return rel_path, ValidationResult.ADDITIONAL 234 | 235 | def _check_for_file_in_manifest(self, data_file_path: str): 236 | """ 237 | Checks whether a given data file is listed in the current manifest file 238 | :param data_file_path: path to the data file 239 | :return: a tuple in the form (relative path to data file, validation result) or (None, None) if the file is 240 | listed 241 | """ 242 | with open(self.manifest_file, 'r') as manifest: 243 | # Make allowance for paths containing using either forward slashes or escaped 244 | # backslashes as separators 245 | rel_path = os.path.relpath(data_file_path, self.primary_path) 246 | paths = [ 247 | "*{sep}{path}".format(sep=os.sep, path=rel_path), 248 | "*{sep}{path}".format(sep="/", path=rel_path.replace("\\","/")) 249 | ] 250 | manifest_map = mmap.mmap(manifest.fileno(), 0, access=mmap.ACCESS_READ) 251 | for next_path in paths: 252 | found = manifest_map.find(next_path.encode("utf-8")) 253 | if found != -1: 254 | return None, None 255 | return rel_path, ValidationResult.ADDITIONAL 256 | 257 | def _check_other_manifests(self, manifest_line: str): 258 | """ 259 | Compare an entry in a manifest file to the corresponding entry in other manifests 260 | :param manifest_line: the text of the manifest entry 261 | :return: a Tuple in the form (file path, {results}) where (results) is a dictionary containing 262 | the status of the checksum data in other trees 263 | """ 264 | manifest_maps = {} 265 | results = {} 266 | 267 | for m in self.other_paths: 268 | next_file = open(m, 'r') 269 | next_map = mmap.mmap(next_file.fileno(), 0, access=mmap.ACCESS_READ) 270 | manifest_maps[m] = next_map 271 | 272 | try: 273 | file_cs = manifest_line.split()[0] 274 | file_path = ' '.join(manifest_line.split()[1:]) 275 | # Handle blank lines 276 | except IndexError: 277 | return None, None 278 | 
if len(file_path) == 0: 279 | return None, None 280 | 281 | for manifest_path, manifest_map in manifest_maps.items(): 282 | found = manifest_map.find(file_path.encode("utf-8")) 283 | if found == -1: 284 | results[manifest_path] = ComparisonResult.MISSING 285 | else: 286 | s_pos = found - (len(file_cs) + 1) 287 | e_pos = s_pos + len(file_cs) 288 | manifest_cs = manifest_map[s_pos:e_pos] 289 | if manifest_cs.decode("utf-8") != file_cs: 290 | results[manifest_path] = ComparisonResult.UNMATCHED 291 | else: 292 | results[manifest_path] = ComparisonResult.MATCHED 293 | 294 | for m in manifest_maps.values(): 295 | m.close() 296 | 297 | r_val = self._normalise_path(file_path), results 298 | return r_val 299 | 300 | def _compare_checksum_file_to_other_trees(self, checksum_file_path: str): 301 | """ 302 | Compares the hash value in a given checksum file to its corresponding 303 | version in other checksum trees 304 | :param checksum_file_path: the path to the checksum file 305 | :return: a Tuple in the form (file path, {results}) where (results) is a dictionary containing 306 | the status of the checksum data in other trees 307 | """ 308 | results = {} 309 | rel_path = os.path.relpath(checksum_file_path, self.primary_path) 310 | file_key = "*{sep}{path}".format(sep=os.sep, path=rel_path) 311 | in_path = fix_path(checksum_file_path) 312 | try: 313 | with open(in_path, "r", encoding="utf-8", errors="surrogateescape") as cs_file: 314 | cs_line = cs_file.read().rstrip('\r\n').split(' ') 315 | master_cs = cs_line[0] 316 | for next_tree in self.other_paths: 317 | try: 318 | other_cs_path = fix_path(os.path.join(next_tree, rel_path)) 319 | if os.path.exists(other_cs_path): 320 | with open(other_cs_path, "r", encoding="utf-8", errors="surrogateescape") as cs_file: 321 | cs_line = cs_file.read().rstrip('\r\n').split(' ') 322 | other_cs = cs_line[0] 323 | if master_cs == other_cs: 324 | results[next_tree] = ComparisonResult.MATCHED 325 | else: 326 | results[next_tree] = ComparisonResult.UNMATCHED 327 | else: 328 | results[next_tree] = ComparisonResult.MISSING 329 | except OSError: 330 | results[next_tree] = ComparisonResult.OSERROR 331 | pass 332 | except OSError: 333 | results["ALL"] = ComparisonResult.OSERROR 334 | pass 335 | 336 | r_val = self._normalise_path(file_key), results 337 | return r_val 338 | 339 | def _create_checksum_or_skip_file(self, in_file: str, algorithm: str = None): 340 | """ Generate a checksum for a file if it has not already been created 341 | :param in_file: the file to be checked 342 | :param algorithm: the checksum algorithm to be used 343 | :return: a triple containing the data file path, its creation status (added/skipped/failed) and its size 344 | """ 345 | if algorithm is None: 346 | algorithm = self.algorithm 347 | r_path = os.path.relpath(in_file, self.primary_path) 348 | out_file = fix_path( 349 | os.path.join(self.cs_dir, r_path) + '.'
+ algorithm 350 | ) 351 | if os.path.exists(out_file): 352 | return in_file, CreationResult.SKIPPED, None 353 | else: 354 | if not os.path.exists(os.path.dirname(out_file)): 355 | try: 356 | os.makedirs(os.path.dirname(out_file)) 357 | except FileExistsError: 358 | pass 359 | try: 360 | checksum, size = hash_file(in_file, algorithm=algorithm) 361 | with open(out_file, 'w', encoding='utf-8', errors="surrogateescape") as cs_file: 362 | cs_file.write("{cs} *{sep}{path}\n".format(cs=checksum, 363 | sep=os.sep, 364 | path=os.path.basename(in_file))) 365 | except Exception as e: 366 | print(str(e)) 367 | return in_file, CreationResult.FAILED, None 368 | if self.manifest_file is not None: 369 | with open(self.manifest_file, 'a+', encoding='utf-8') as manifest_file: 370 | manifest_file.write("{cs} *{sep}{path}\n".format(cs=checksum, sep=os.sep, path=r_path)) 371 | return self._normalise_path(in_file), CreationResult.ADDED, size 372 | 373 | def _validate_checksum_file(self, checksum_file_path: str, algorithm: str = None): 374 | """ Use a checksum file within a checksum tree to validate its 375 | corresponding data file 376 | :param checksum_file_path: the path to the checksum file 377 | :param algorithm: the checksum algorithm to use 378 | :return: a triple containing the data file path, its status (correct/incorrect/missing) and its size 379 | """ 380 | if algorithm is None: 381 | algorithm = os.path.splitext(checksum_file_path)[1][1:] 382 | 383 | fixed_path = fix_path(checksum_file_path) 384 | 385 | try: 386 | with open(fixed_path, "r", encoding="utf-8", errors="surrogateescape") as cs_file: 387 | cs_line = cs_file.read().rstrip('\r\n').split(' ') 388 | original_cs = cs_line[0] 389 | except OSError: 390 | r_val = self._normalise_path(checksum_file_path), ValidationResult.OSERROR, 0 391 | return r_val 392 | cs_rel_path = os.path.relpath(checksum_file_path, self.cs_dir) 393 | data_rel_path = os.path.splitext(cs_rel_path)[0] 394 | full_path = fix_path(os.path.join(self.primary_path, data_rel_path)) 395 | file_key = "*{sep}{path}".format(sep=os.sep, path=data_rel_path) 396 | size = 0 397 | if os.path.exists(full_path): 398 | try: 399 | current_cs, size = hash_file(full_path, algorithm=algorithm) 400 | if current_cs == original_cs: 401 | file_status = ValidationResult.VALID 402 | else: 403 | file_status = ValidationResult.INVALID 404 | except OSError: 405 | file_status = ValidationResult.OSERROR 406 | else: 407 | file_status = ValidationResult.MISSING 408 | r_val = self._normalise_path(file_key), file_status, size 409 | return r_val 410 | 411 | def _validate_file_with_checksum(self, original_checksum_data): 412 | """ Validate a data file against the checksum value provided 413 | :param original_checksum_data: a tuple in the format (file_checksum, file_path) 414 | :return: a triple containing the data file path, its status (correct/incorrect/missing) and its size 415 | """ 416 | original_cs, rel_path = original_checksum_data 417 | full_path = fix_path(rel_path.replace('*', self.primary_path)) 418 | size = None 419 | if os.path.exists(full_path): 420 | try: 421 | current_cs, size = hash_file(full_path, algorithm=self.algorithm) 422 | if current_cs == original_cs: 423 | file_status = ValidationResult.VALID 424 | else: 425 | file_status = ValidationResult.INVALID 426 | except OSError: 427 | file_status = ValidationResult.OSERROR 428 | pass 429 | else: 430 | file_status = ValidationResult.MISSING 431 | r_val = self._normalise_path(rel_path), file_status, size 432 | return r_val 433 | 434 | def
compare_manifests(self): 435 | """ 436 | Compare the contents of a master manifest file to other files 437 | """ 438 | self.last_action = Action.COMPARE_MANIFESTS 439 | self.report_handler = ReportHandler(action=self.last_action, out_dir=self.output_dir, 440 | summary_data={ 441 | "primary_path": self.primary_path 442 | }) 443 | 444 | pool = multiprocessing.Pool(processes=self.num_procs) 445 | if self.count_files: 446 | line_count = count_lines(self.primary_path) 447 | else: 448 | line_count = None 449 | 450 | results_cache = [] 451 | 452 | with open(self.primary_path, 'r') as manifest_file: 453 | for file_path, status in tqdm(pool.imap_unordered(self._check_other_manifests, manifest_file), 454 | total=line_count, desc="MPT({}p)/Comparing manifests".format(self.num_procs)): 455 | if file_path is not None: 456 | results_cache.append((file_path, status)) 457 | if len(results_cache) >= self.cache_size: 458 | for next_path, next_status in results_cache: 459 | self.report_handler.assign_comparison_result(file_path=next_path, file_status=next_status) 460 | self.report_handler.write_summary() 461 | results_cache = [] 462 | # Write any records remaining in the cache after all files are processed 463 | for next_path, next_status in results_cache: 464 | self.report_handler.assign_comparison_result(file_path=next_path, file_status=next_status) 465 | self.report_handler.close() 466 | self._show_results() 467 | 468 | def compare_trees(self): 469 | """ 470 | Compare each checksum file in a tree against its counterparts in other checksum trees 471 | """ 472 | self.last_action = Action.COMPARE_TREES 473 | self.report_handler = ReportHandler(action=self.last_action, out_dir=self.output_dir, 474 | summary_data={ 475 | "cs_dir": self.cs_dir, 476 | "primary_path": self.primary_path 477 | }) 478 | 479 | pool = multiprocessing.Pool(processes=self.num_procs) 480 | files_iterable = scan_tree(path=self.primary_path, recursive=self.recursive) 481 | 482 | if self.count_files: 483 | file_count = sum([1 for x in files_iterable]) 484 | files_iterable = scan_tree(path=self.primary_path, recursive=self.recursive) 485 | else: 486 | file_count = None 487 | 488 | results_cache = [] 489 | 490 | for file_path, status in tqdm(pool.imap_unordered(self._compare_checksum_file_to_other_trees, files_iterable), 491 | total=file_count, desc="MPT({}p)/Comparing checksums".format(self.num_procs)): 492 | results_cache.append((file_path, status)) 493 | if len(results_cache) >= self.cache_size: 494 | for next_path, next_status in results_cache: 495 | self.report_handler.assign_comparison_result(file_path=next_path, file_status=next_status) 496 | self.report_handler.write_summary() 497 | results_cache = [] 498 | # Write any records remaining in the cache after all files are processed 499 | for next_path, next_status in results_cache: 500 | self.report_handler.assign_comparison_result(file_path=next_path, file_status=next_status) 501 | self.report_handler.close() 502 | self._show_results() 503 | 504 | def create_checksums(self): 505 | """ Create checksums and update manifest 506 | """ 507 | self.last_action = Action.CREATE 508 | self.report_handler = ReportHandler(action=self.last_action, out_dir=self.output_dir, 509 | summary_data={ 510 | "primary_path": self.primary_path, 511 | "cs_dir": self.cs_dir, 512 | "manifest_file": self.manifest_file, 513 | "formats": self.formats 514 | }) 515 | pool = multiprocessing.Pool(processes=self.num_procs) 516 | files_iterable = scan_tree(path=self.primary_path, recursive=self.recursive, 
formats=self.formats) 517 | if self.count_files: 518 | file_count = sum([1 for x in files_iterable]) 519 | files_iterable = scan_tree(path=self.primary_path, recursive=self.recursive, formats=self.formats) 520 | else: 521 | file_count = None 522 | 523 | results_cache = [] 524 | 525 | for file_path, status, file_size in tqdm(pool.imap_unordered(self._create_checksum_or_skip_file, 526 | files_iterable), total=file_count, 527 | desc="MPT({}p)/Creating checksums".format(self.num_procs)): 528 | results_cache.append((file_path, status, file_size)) 529 | if len(results_cache) >= self.cache_size: 530 | for next_path, next_status, next_size in results_cache: 531 | self.report_handler.add_result(description=next_status, data={"path": next_path, "size": next_size}) 532 | self.report_handler.write_summary() 533 | results_cache = [] 534 | # Write any records remaining in the cache after all files are processed 535 | for next_path, next_status, next_size in results_cache: 536 | self.report_handler.add_result(description=next_status, data={"path": next_path, "size": next_size}) 537 | self.report_handler.close() 538 | self._show_results() 539 | 540 | def validate_manifest(self): 541 | """ Validate files using the checksums listed in a manifest file 542 | """ 543 | self.last_action = Action.VALIDATE_MANIFEST 544 | self.report_handler = ReportHandler(action=self.last_action, out_dir=self.output_dir, 545 | summary_data={ 546 | "manifest_file": self.manifest_file, 547 | "primary_path": self.primary_path 548 | }) 549 | 550 | if not os.path.exists(self.manifest_file): 551 | raise EnvironmentError("Manifest file " + self.manifest_file + " not found") 552 | 553 | pool = multiprocessing.Pool(processes=self.num_procs) 554 | files_iterable = scan_tree(path=self.primary_path, recursive=True) 555 | lines_iterable = iterate_manifest(self.manifest_file) 556 | 557 | if self.count_files: 558 | file_count = count_lines(self.manifest_file) 559 | else: 560 | file_count = None 561 | 562 | results_cache = [] 563 | 564 | for file_path, status, file_size in tqdm( 565 | pool.imap_unordered(self._validate_file_with_checksum, lines_iterable), 566 | total=file_count, desc="MPT({}p)/Validating files".format(self.num_procs)): 567 | results_cache.append((file_path, status, file_size)) 568 | if len(results_cache) >= self.cache_size: 569 | for next_path, next_status, next_size in results_cache: 570 | self.report_handler.add_result(description=next_status, data={"path": next_path, "size": next_size}) 571 | self.report_handler.write_summary() 572 | results_cache = [] 573 | # Write any records remaining in the cache after all files are processed 574 | for next_path, next_status, next_size in results_cache: 575 | self.report_handler.add_result(description=next_status, data={"path": next_path, "size": next_size}) 576 | 577 | # Look for data files not listed in manifest 578 | for file_path, status in tqdm(pool.imap_unordered(self._check_for_file_in_manifest, files_iterable), 579 | desc="MPT({}p)/Finding additional files".format(self.num_procs)): 580 | if status is not None: 581 | self.report_handler.add_result(status, {"path": "*{sep}{path}".format(sep=os.sep, path=file_path)}) 582 | 583 | self.report_handler.close() 584 | self._show_results() 585 | 586 | def validate_tree(self): 587 | """ Validate files using the checksums listed in a checksum tree 588 | """ 589 | self.last_action = Action.VALIDATE_TREE 590 | self.report_handler = ReportHandler(action=self.last_action, out_dir=self.output_dir, 591 | summary_data={ 592 | "cs_dir": self.cs_dir, 
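All of the create, compare and validate loops in this module share the same buffering pattern: results coming back from the worker pool are collected in results_cache and only flushed to the ReportHandler (followed by a summary rewrite) once cache_size items have accumulated, with a final drain after the pool is exhausted. A condensed, self-contained sketch of that pattern; handle_result and cache_size below are stand-ins for the report-handler call and the configured cache size, not MPT names:

def drain_in_batches(results, cache_size, handle_result):
    # Buffer (path, status) pairs and flush them in blocks of cache_size,
    # mirroring the results_cache handling used by the loops above.
    cache = []
    for item in results:
        cache.append(item)
        if len(cache) >= cache_size:
            for next_path, next_status in cache:
                handle_result(next_path, next_status)
            cache = []
    # Flush any records remaining after all items are processed
    for next_path, next_status in cache:
        handle_result(next_path, next_status)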
593 | "primary_path": self.primary_path 594 | }) 595 | 596 | if not os.path.exists(self.cs_dir): 597 | raise EnvironmentError("Checksum tree directory " + self.cs_dir + " not found") 598 | 599 | pool = multiprocessing.Pool(processes=self.num_procs) 600 | cs_files_iterable = scan_tree(path=self.cs_dir, recursive=self.recursive) 601 | data_files_iterable = scan_tree(path=self.primary_path, recursive=self.recursive) 602 | 603 | if self.count_files: 604 | file_count = sum([1 for x in cs_files_iterable]) 605 | cs_files_iterable = scan_tree(path=self.cs_dir, recursive=self.recursive) 606 | else: 607 | file_count = None 608 | 609 | results_cache = [] 610 | 611 | for file_path, status, file_size in tqdm(pool.imap_unordered(self._validate_checksum_file, cs_files_iterable), 612 | total=file_count, desc="MPT({}p)/Validating files".format(self.num_procs)): 613 | results_cache.append((file_path, status, file_size)) 614 | if len(results_cache) >= self.cache_size: 615 | for next_path, next_status, next_size in results_cache: 616 | self.report_handler.add_result(description=next_status, data={"path": next_path, "size": next_size}) 617 | self.report_handler.write_summary() 618 | results_cache = [] 619 | # Write any records remaining in the cache after all files are processed 620 | for next_path, next_status, next_size in results_cache: 621 | self.report_handler.add_result(description=next_status, data={"path": next_path, "size": next_size}) 622 | 623 | # Look for data files with no checksum file 624 | for file_path, status in tqdm(pool.imap_unordered(self._check_for_cs_file, data_files_iterable), 625 | desc="MPT({}p)/Finding additional files".format(self.num_procs)): 626 | if status is not None: 627 | self.report_handler.add_result(status, {"path": "*{sep}{path}".format(sep=os.sep, path=file_path)}) 628 | self.report_handler.close() 629 | self._show_results() 630 | -------------------------------------------------------------------------------- /mpt/hashing.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import xxhash 3 | from typing import List 4 | 5 | from .paths import fix_path 6 | 7 | algorithms_supported = set.union(hashlib.algorithms_guaranteed, xxhash.algorithms_available) 8 | 9 | def hash_files(file_list: List, algorithm: str = "sha256", blocksize: int = 131072): 10 | """ 11 | Hash all files in a list using the algorithm and blocksize specified 12 | :param file_list: list of files to hash 13 | :param algorithm: the algorithm to use 14 | :param blocksize: block size to use 15 | :return: a list of tuples in the form (path, (hash value, bytes hashed)) 16 | """ 17 | result = [] 18 | for f in file_list: 19 | cs = hash_file(f, algorithm, blocksize) 20 | next_file = (f, cs) 21 | result.append(next_file) 22 | return result 23 | 24 | def hash_file(in_path: str, algorithm: str = "sha256", blocksize: int = 131072): 25 | """ Return checksum value for a given file 26 | :param in_path: file to hash 27 | :param algorithm: hash algorithm to use 28 | :param blocksize: block size to use for file read 29 | :return: a tuple containing the hash value of the file and the 30 | number of bytes hashed 31 | """ 32 | if algorithm in xxhash.algorithms_available: 33 | if algorithm == "xxh32": 34 | hasher = xxhash.xxh32() 35 | elif algorithm == "xxh64": 36 | hasher = xxhash.xxh64() 37 | elif algorithm == "xxh128": 38 | hasher = xxhash.xxh128() 39 | elif algorithm == "xxh3_64": 40 | hasher = xxhash.xxh3_64() 41 | elif algorithm == "xxh3_128": 42 | hasher = xxhash.xxh3_128() 43 |
else: 44 | hasher = hashlib.new(algorithm) 45 | path = fix_path(in_path) 46 | size = 0 47 | with open(path, 'rb') as f: 48 | for block in iter(lambda: f.read(blocksize), b""): 49 | hasher.update(block) 50 | size += len(block) 51 | return hasher.hexdigest(), size 52 | -------------------------------------------------------------------------------- /mpt/paths.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | def fix_path(path: str): 4 | """ 5 | Insert a 'magic prefix' to any path longer than 259 characters. 6 | Workaround for python-Bugs-542314 7 | (https://mail.python.org/pipermail/python-bugs-list/2007-March/037810.html) 8 | :param path: the original path 9 | :return: the fixed path including a prefix if necessary 10 | """ 11 | if sys.platform == "win32": 12 | if len(path) > 259: 13 | if '\\\\?\\' not in path: 14 | if path.startswith("\\\\"): 15 | # Alternative prefix for UNC paths 16 | path = u'\\\\?\\UNC\\' + path[2:] 17 | else: 18 | # Standard prefix for drive letter paths 19 | path = u'\\\\?\\' + path 20 | 21 | return path 22 | -------------------------------------------------------------------------------- /mpt/results.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | from datetime import datetime 4 | 5 | from .codes import (Action, ComparisonResult, CreationResult, Result, 6 | ValidationResult) 7 | 8 | 9 | class Report: 10 | """ 11 | The Report class, representing a single report file created for an MPT run 12 | """ 13 | io_handler = None 14 | csv_handler = None 15 | 16 | def __init__(self, path: str, columns: list): 17 | """ 18 | Initialisation function for the Report class 19 | :param path: absolute file path for the report to be created 20 | :param columns: list of columns included in the report 21 | """ 22 | if not os.path.exists(os.path.dirname(path)): 23 | try: 24 | os.makedirs(os.path.dirname(path)) 25 | except FileExistsError: 26 | pass 27 | self.io_handler = open(path, 'w+', newline='', encoding='utf-8', errors="surrogateescape") 28 | self.csv_handler = csv.DictWriter(self.io_handler, fieldnames=columns) 29 | self.csv_handler.writeheader() 30 | 31 | def write(self, data: dict): 32 | """ 33 | Write one record to the report's output file 34 | :param data: dictionary containing report data in the form { column_name: data } 35 | """ 36 | self.csv_handler.writerow(data) 37 | 38 | def close(self): 39 | """ 40 | Close the report's output file 41 | :return: 42 | """ 43 | self.io_handler.close() 44 | 45 | 46 | class ReportHandler: 47 | """ 48 | The ReportHandler class, which manages all report output for MPT 49 | """ 50 | action = None 51 | out_dir = None 52 | start_time = None 53 | stop_time = None 54 | errors_detected = False 55 | summary_data = {} 56 | out_files = {} 57 | results = {} 58 | file_count = 0 59 | 60 | def __init__(self, action: Action, out_dir: str, summary_data: dict = None): 61 | """ 62 | Initialisation function for the ReportHandler class.
63 | :param action: an Action object representing the checksum action being reported on 64 | :param out_dir: the directory in which to create reports 65 | :param summary_data: a dictionary containing additional static data used to create reports 66 | """ 67 | self.action = action 68 | self.start_time = datetime.now().replace(microsecond=0) 69 | if action == Action.CREATE: 70 | self.results = {x: {"count": 0, "size": 0} for x in CreationResult} 71 | category_dir = os.path.join(out_dir, "creation_reports") 72 | elif action in [Action.COMPARE_TREES, Action.COMPARE_MANIFESTS]: 73 | self.results = {x: {"count": 0, "size": 0} for x in ComparisonResult} 74 | category_dir = os.path.join(out_dir, "comparison_reports") 75 | elif action in [Action.VALIDATE_MANIFEST, Action.VALIDATE_TREE]: 76 | self.results = {x: {"count": 0, "size": 0} for x in ValidationResult} 77 | category_dir = os.path.join(out_dir, "validation_reports") 78 | else: 79 | category_dir = os.path.join(out_dir, "other_reports") 80 | self.out_dir = os.path.join(category_dir, self.start_time.strftime("%Y-%m-%dT%H%M")) 81 | if summary_data is not None: 82 | self.summary_data = summary_data 83 | 84 | def add_out_file(self, description: Result, columns: list): 85 | """ 86 | Add a new Report object and output file 87 | :param description: a Result object representing the category of this report 88 | :param columns: a list of columns in the report 89 | :return: 90 | """ 91 | out_path = os.path.join(self.out_dir, description.name.lower() + '.csv') 92 | if description not in self.out_files: 93 | self.out_files[description] = Report(path=out_path, columns=columns) 94 | if description not in self.results: 95 | # Initialise counters for a result category not created in __init__ 96 | self.results[description] = {"count": 0, "size": 0} 97 | 98 | def add_result(self, description: Result, data: dict): 99 | """ 100 | Add the result of a single checksum operation to the relevant output file 101 | :param description: a Result object representing the category of this result 102 | :param data: a dictionary containing result data in the format { column_name: data } 103 | """ 104 | if description not in self.out_files: 105 | self.add_out_file(description=description, columns=[k for k, v in data.items() if v is not None]) 106 | self.out_files[description].write({k: v for k, v in data.items() if v is not None}) 107 | self.results[description]["count"] += 1 108 | if "size" in data: 109 | if data["size"] is not None: 110 | self.results[description]["size"] += data["size"] 111 | 112 | def write_summary(self): 113 | """ 114 | Write out the summary of this MPT run's results to a text file 115 | """ 116 | if not os.path.exists(self.out_dir): 117 | try: 118 | os.makedirs(self.out_dir) 119 | except FileExistsError: 120 | pass 121 | out_path = os.path.join(self.out_dir, "summary.txt") 122 | with open(out_path, "w+") as out_file: 123 | out_file.write(self.summary()) 124 | 125 | def assign_comparison_result(self, file_path: str, file_status: dict): 126 | """ 127 | Add the results of a checksum comparison to any applicable reports. A single comparison may have to be included 128 | in multiple reports - e.g. the checksum on node A matches that on node B, but is missing on node C and incorrect 129 | on node D.
130 | :param file_path: path to the checksum file 131 | :param file_status: a dictionary containing the results of comparison in the format { node_path: Result } 132 | """ 133 | failed = any(v != ComparisonResult.MATCHED for v in file_status.values()) 134 | file_results = {"path": file_path} 135 | for root, status in file_status.items(): 136 | file_results[root] = status.name.lower() 137 | if failed: 138 | for k, v in file_status.items(): 139 | if k != "path": 140 | if v != ComparisonResult.MATCHED: 141 | self.add_result(description=v, data=file_results) 142 | else: 143 | self.add_result(description=ComparisonResult.MATCHED, data=file_results) 144 | 145 | def close(self): 146 | """ 147 | Complete MPT reporting. Set the finish time and close all report files 148 | """ 149 | self.stop_time = datetime.now().replace(microsecond=0) 150 | for x in self.out_files.values(): 151 | x.close() 152 | 153 | def results_detail(self): 154 | """ 155 | Combine all results into a list with file counts and sizes where applicable. 156 | """ 157 | results_out = [] 158 | for status, data in self.results.items(): 159 | if data["count"] > 0: 160 | if "size" in data: 161 | if data["size"] == 0: 162 | results_out.append("\n{}: {:,}".format(status.value, data["count"])) 163 | else: 164 | results_out.append("\n{}: {:,} ({:,} bytes)".format(status.value, 165 | data["count"], 166 | data["size"])) 167 | return results_out 168 | 169 | def summary(self): 170 | """ 171 | Summarise the results of this MPT run in a form which can be used in an email. 172 | :return: a string containing the summary 173 | """ 174 | import platform 175 | hostname = platform.node() 176 | summary_header = "Minimum Preservation Tool (MPT): processing report for host {}".format(hostname) 177 | summary_intro = "{} results for {}".format(self.action.value, self.summary_data["primary_path"]) 178 | if self.action == Action.VALIDATE_MANIFEST: 179 | summary_intro += "\n\nValidation performed using manifest file " \ 180 | "{}".format(self.summary_data["manifest_file"]) 181 | elif self.action == Action.VALIDATE_TREE: 182 | summary_intro += "\n\nValidation performed using checksum tree {}".format(self.summary_data["cs_dir"]) 183 | elif self.action == Action.CREATE: 184 | if self.summary_data["formats"] is not None: 185 | summary_intro += "\n\nLimited processing to file formats {}".format(str(self.summary_data["formats"])) 186 | if self.action in [Action.COMPARE_TREES, Action.COMPARE_MANIFESTS]: 187 | if self.results[ComparisonResult.MISSING]["count"] == 0 \ 188 | and self.results[ComparisonResult.UNMATCHED]["count"] == 0: 189 | if self.results[ComparisonResult.MATCHED]["count"] == 0: 190 | summary_detail = "No checksum files found to compare!" 191 | else: 192 | summary_detail = "All checksums matched." 193 | else: 194 | self.errors_detected = True 195 | summary_detail = "Checksums do not match on all nodes.\n" 196 | elif self.action == Action.CREATE: 197 | if self.results[CreationResult.FAILED]["count"] > 0: 198 | self.errors_detected = True 199 | summary_detail = "Checksums could not be generated for some files."
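As a usage illustration of assign_comparison_result above: a single comparison can fan out to several reports, because each non-matching node contributes a row to the report for its own failure category. A hedged sketch, with made-up node paths and an assumed Action.COMPARE_TREES run:

from mpt.codes import Action, ComparisonResult
from mpt.results import ReportHandler

handler = ReportHandler(action=Action.COMPARE_TREES, out_dir="reports",
                        summary_data={"primary_path": "node_a", "cs_dir": "node_a/checksums"})
# node_b matched, but node_c is missing its copy of the checksum: this call
# writes the file into the 'missing' report; with a second failing node the
# same file would also appear in that node's failure-category report.
handler.assign_comparison_result(file_path="*\\folder\\file1.bin",
                                 file_status={"node_b": ComparisonResult.MATCHED,
                                              "node_c": ComparisonResult.MISSING})
handler.close()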
200 | elif self.results[CreationResult.ADDED]["count"] > 0: 201 | summary_detail = "New files detected.\n" 202 | else: 203 | summary_detail = "No new files detected.\n" 204 | elif self.action in [Action.VALIDATE_MANIFEST, Action.VALIDATE_TREE]: 205 | if self.action == Action.VALIDATE_MANIFEST: 206 | reference = "manifest" 207 | else: 208 | reference = "checksum tree" 209 | if self.results[ValidationResult.MISSING]["count"] == 0 \ 210 | and self.results[ValidationResult.INVALID]["count"] == 0: 211 | if self.results[ValidationResult.VALID]["count"] == 0: 212 | summary_detail = "No files found in {} to validate!".format(reference) 213 | else: 214 | summary_detail = "All files in {} correct.\n".format(reference) 215 | else: 216 | self.errors_detected = True 217 | summary_detail = "Some files could not be validated against {}\n".format(reference) 218 | else: 219 | summary_detail = "" 220 | summary_detail += "".join(self.results_detail()) 221 | if self.stop_time is None: 222 | summary_trailer = "MPT processing still ongoing, started at: " \ 223 | "{}".format(self.start_time.strftime("%Y-%m-%d %H:%M")) 224 | else: 225 | summary_trailer = "Time taken: {}\n\nDetailed reports created " \ 226 | "in: {}".format(str(self.stop_time - self.start_time), self.out_dir) 227 | 228 | summary = "{}\n\n{}\n\n{}\n\n{}".format(summary_header, summary_intro, summary_detail, summary_trailer) 229 | return summary 230 | -------------------------------------------------------------------------------- /mpt/staging.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import multiprocessing 3 | import os 4 | from argparse import Namespace 5 | from datetime import datetime 6 | from typing import Dict, List 7 | 8 | from tqdm import tqdm 9 | 10 | from .codes import StagingStatus 11 | from .defaults import * 12 | from .email import send_email 13 | from .hashing import hash_file 14 | 15 | 16 | class FileStager(): 17 | """ 18 | Class instantiated to carry out file staging 19 | """ 20 | source = None 21 | checksum = None 22 | algorithm = None 23 | blocksize = 0 24 | destinations = {} 25 | remove_original = None 26 | 27 | def __init__(self, 28 | source: str, 29 | destinations: List, 30 | algorithm: str = default_algorithm, 31 | blocksize: int = default_blocksize, 32 | remove_original: bool = True): 33 | """ 34 | Initialise the class instance 35 | :param source: path to the source filename 36 | :param destinations: list of destinations (data directories, checksum directories and manifests) 37 | :param algorithm: the hashing algorithm to use 38 | :param blocksize: the blocksize to use when hashing/copying files 39 | :param remove_original: remove the original file when all operations complete 40 | """ 41 | self.algorithm = algorithm 42 | self.blocksize = blocksize 43 | self.remove_original = remove_original 44 | self.next_file(source, destinations) 45 | 46 | def next_file(self, source: str, destinations: List): 47 | """ 48 | Set the details of the next file to be staged 49 | :param source: the source filename 50 | :param destinations: list of destinations (data directories, checksum directories and manifests) 51 | """ 52 | 53 | if len(self.destinations) > 0: 54 | for v in self.destinations.values(): 55 | if v["handler"] is not None: 56 | v["handler"].close() 57 | self.destinations = {} 58 | self.source = source 59 | self.destinations = { 60 | f["root_path"]: { 61 | "data_file": f["destination"], 62 | "checksum_file": f["checksum"], 63 | "checksum_value": None, 64 |
"manifest_file": f["manifest"], 65 | "status": StagingStatus.READY, 66 | "substatus": None, 67 | "handler": None 68 | } 69 | for f in destinations 70 | } 71 | 72 | def aborted(self): 73 | """ 74 | Indicates whether file staging failed and some unstaged files could not be removed from the destination 75 | :return: True if any destination file handlers are in a COULD_NOT_REMOVE state, False otherwise 76 | """ 77 | return any(r["status"] in [StagingStatus.COULD_NOT_REMOVE] for r in self.destinations.values()) 78 | 79 | def completed(self): 80 | """ 81 | Indicates whether file staging has completed without errors 82 | :return: True if all destination file handlers are in a STAGED state, False otherwise 83 | """ 84 | return all(r["status"] == StagingStatus.STAGED for r in self.destinations.values()) 85 | 86 | def failed(self): 87 | """ 88 | Indicates whether file staging has failed for any destinations 89 | :return: True if any destination file handlers are in an error state, False otherwise 90 | """ 91 | return any(r["status"] not in [StagingStatus.STAGED, StagingStatus.READY, StagingStatus.IN_PROGRESS] for r in 92 | self.destinations.values()) 93 | 94 | def ready(self): 95 | """ 96 | Indicates whether file staging can commence 97 | :return: True if all destination file handlers are in a READY state, False otherwise 98 | """ 99 | return all(r["status"] == StagingStatus.READY for r in self.destinations.values()) 100 | 101 | def status(self): 102 | """ 103 | Gives a quick view of the state of all destination file handlers 104 | :return: List containing the status of all destination file handlers 105 | """ 106 | return [d["status"] for d in self.destinations.values()] 107 | 108 | def open_destination_files(self): 109 | """ 110 | Attempt to open all destination files if they (or their corresponding checksum file) do not already exist. 111 | Set the state of each file handler accordingly. 
112 | :return: True if all file handlers are in a READY state, False otherwise 113 | """ 114 | for k, v in self.destinations.items(): 115 | if os.path.exists(v["data_file"]): 116 | v["status"] = StagingStatus.DUPLICATE_FILE 117 | elif os.path.exists(v["checksum_file"]): 118 | v["status"] = StagingStatus.DUPLICATE_CHECKSUM 119 | else: 120 | try: 121 | v["handler"] = self._open_file(v["data_file"]) 122 | except Exception as e: 123 | v["status"] = StagingStatus.DATA_WRITE_FAILURE 124 | v["substatus"] = str(e) 125 | else: 126 | v["status"] = StagingStatus.READY 127 | return self.ready() 128 | 129 | def _open_file(self, path: str, binary: bool = True, can_exist=False): 130 | """ 131 | Open a file and return a file object 132 | :param path: path to the file 133 | :param binary: open the file in binary mode - otherwise open as text 134 | :param can_exist: allow appending to an existing file - otherwise only allow creation of a new file 135 | :return: the opened file object 136 | """ 137 | if not os.path.exists(os.path.dirname(path)): 138 | try: 139 | os.makedirs(os.path.dirname(path)) 140 | except FileExistsError: 141 | pass 142 | if can_exist: 143 | if binary: 144 | options = 'ab+' 145 | else: 146 | options = 'a+' 147 | else: 148 | if binary: 149 | options = 'xb' 150 | else: 151 | options = 'x' 152 | if not binary: 153 | encoding = "utf-8" 154 | else: 155 | encoding = None 156 | 157 | for n in range(1, 5): 158 | try: 159 | if encoding is None: 160 | return open(path, options) 161 | else: 162 | return open(path, options, encoding=encoding) 163 | except FileNotFoundError: 164 | # Retry a few times before giving up 165 | continue 166 | raise FileNotFoundError(path) 167 | 168 | def undo_staging(self): 169 | """ 170 | Undo the staging process, removing any files which were copied to their destination 171 | """ 172 | for k, v in self.destinations.items(): 173 | if v["handler"] is not None: 174 | v["handler"].close() 175 | v["handler"] = None 176 | try: 177 | os.remove(v["data_file"]) 178 | os.remove(v["checksum_file"]) 179 | except Exception as e: 180 | v["status"] = StagingStatus.COULD_NOT_REMOVE 181 | v["substatus"] = str(e) 182 | else: 183 | if v["status"] in [StagingStatus.READY, StagingStatus.IN_PROGRESS, StagingStatus.STAGED]: 184 | v["status"] = StagingStatus.UNSTAGED 185 | old_dir = os.getcwd() 186 | os.chdir(k) 187 | try: 188 | os.removedirs(os.path.dirname(v["data_file"])) 189 | os.removedirs(os.path.dirname(v["checksum_file"])) 190 | except OSError: 191 | pass 192 | os.chdir(old_dir) 193 | 194 | def write_files(self): 195 | """ 196 | Write data from the source file to all destination files, obtaining the source file's hex digest value 197 | in the process 198 | :return: True if any file writes have failed, False otherwise 199 | """ 200 | hasher = hashlib.new(self.algorithm) 201 | for v in self.destinations.values(): 202 | v["status"] = StagingStatus.IN_PROGRESS 203 | with open(self.source, "rb") as in_file: 204 | for block in iter(lambda: in_file.read(self.blocksize), b''): 205 | hasher.update(block) 206 | for k, v in self.destinations.items(): 207 | try: 208 | v["handler"].write(block) 209 | except Exception as e: 210 | v["status"] = StagingStatus.DATA_WRITE_FAILURE 211 | v["substatus"] = str(e) 212 | break 213 | if self.failed(): 214 | break 215 | if not self.failed(): 216 | self.checksum = hasher.hexdigest() 217 | self.close_destination_files() 218 | return self.failed() 219 | 220 | def close_destination_files(self): 221 | """ 222 | Close all open file handlers 223 | """ 224 | for v in self.destinations.values(): 225 |
v["handler"].close() 226 | v["handler"] = None 227 | 228 | def check_files(self): 229 | """ 230 | Calculate the hex digest value for all destination files and compare it to that of the source file. 231 | Create destination checksum files in the appropriate locations and update the manifest file, if applicable. 232 | :return: True if any failures have occurred, False otherwise 233 | """ 234 | if self.checksum is None: 235 | return False 236 | for k, v in self.destinations.items(): 237 | next_cs, _ = hash_file(v["data_file"]) 238 | v["checksum_value"] = next_cs 239 | if next_cs != self.checksum: 240 | v["status"] = StagingStatus.CHECKSUM_MISMATCH 241 | else: 242 | if v["checksum_file"] is None and v["manifest_file"] is None: 243 | v["status"] = StagingStatus.STAGED 244 | else: 245 | if v["checksum_file"] is not None: 246 | self.write_checksum(destination_key=k) 247 | if v["manifest_file"] is not None: 248 | self.write_checksum(destination_key=k, manifest_file=True) 249 | if v["status"] == StagingStatus.IN_PROGRESS: 250 | v["status"] = StagingStatus.STAGED 251 | if self.completed() and self.remove_original: 252 | try: 253 | os.remove(self.source) 254 | except Exception as e: 255 | print(str(e)) 256 | return self.failed() 257 | 258 | def write_checksum(self, destination_key: str, manifest_file: bool = False): 259 | """ 260 | Write the checksum data for the given destination either as a standalone checksum file or by appending to a 261 | manifest 262 | :param destination_key: the root path of the destination 263 | :param manifest_file: True if writing to a manifest file, False otherwise 264 | :return: True if a write failure has occurred, False otherwise 265 | """ 266 | dest_data = self.destinations[destination_key] 267 | if manifest_file: 268 | out_file = dest_data["manifest_file"] 269 | data_path = os.path.relpath(dest_data["data_file"], destination_key) 270 | else: 271 | out_file = dest_data["checksum_file"] 272 | data_path = os.path.basename(dest_data["data_file"]) 273 | cs_value = dest_data["checksum_value"] 274 | try: 275 | with self._open_file(out_file, binary=False, can_exist=manifest_file) as o: 276 | o.write("{0} *\\{1}\n".format(cs_value, data_path)) 277 | except Exception as e: 278 | dest_data["status"] = StagingStatus.CHECKSUM_WRITE_FAILURE 279 | dest_data["substatus"] = str(e) 280 | return True 281 | return False 282 | 283 | def start_copy(self): 284 | """ 285 | Initiate the staging process for the current file. 286 | :return: True if the staging process failed, False otherwise 287 | """ 288 | self.open_destination_files() 289 | 290 | if self.ready(): 291 | self.write_files() 292 | 293 | if not self.failed(): 294 | self.check_files() 295 | 296 | if self.failed(): 297 | self.undo_staging() 298 | 299 | return self.failed() 300 | 301 | 302 | def _remove_empty_folders(path, remove_root=True): 303 | """ 304 | Delete empty directories beneath a given root. 
305 | :param path: the root path 306 | :param remove_root: True if the root directory itself should be deleted 307 | """ 308 | if not os.path.isdir(path): 309 | return 310 | 311 | files = os.listdir(path) 312 | if len(files): 313 | for f in files: 314 | fullpath = os.path.join(path, f) 315 | if os.path.isdir(fullpath): 316 | _remove_empty_folders(fullpath) 317 | 318 | files = os.listdir(path) 319 | if len(files) == 0 and remove_root: 320 | os.rmdir(path) 321 | 322 | 323 | def _count_files(path: str, formats: List = None, recursive: bool = True): 324 | """ Counts the number of files within the specified directory 325 | :param path: the top level path to count files in 326 | :param formats: a list of file endings to count; if omitted, count all files 327 | :param recursive: true if counting should include sub-directories 328 | :return: the number of files in the specified path 329 | """ 330 | count = 0 331 | try: 332 | for p in os.scandir(path): 333 | if p.is_file(): 334 | if formats is None: 335 | count += 1 336 | else: 337 | if (p.path.endswith(tuple(formats))): 338 | count += 1 339 | if recursive and p.is_dir(): 340 | count += _count_files(p.path, formats=formats, recursive=recursive) 341 | except (IOError, OSError) as e: 342 | print("Permission Error ({0}): {1} for {2}".format(e.errno, e.strerror, path)) 343 | return count 344 | 345 | 346 | def _get_files_to_stage(directory: str, target_roots: List, checksum_roots: List, manifest_files: List, 347 | algorithm: str = None, formats: List = None, recursive: bool = True): 348 | """ Create a generator to iterate all files in a directory 349 | :param directory: the root directory to traverse 350 | :param target_roots: a list of root directories to which the file should be copied 351 | :param checksum_roots: a list of root directories in which checksums should be created 352 | :param manifest_files: a list of manifest files to update 353 | :param algorithm: the algorithm to use for hashing 354 | :param formats: a list of file endings to list; if omitted, list all files 355 | :param recursive: true if listing should include sub-directories 356 | :return: an iterable containing all matching files 357 | """ 358 | 359 | if algorithm is None: 360 | algorithm = default_algorithm 361 | if os.path.isdir(directory): 362 | for root, dirs, files in os.walk(directory): 363 | if formats is None: 364 | filtered_files = files 365 | else: 366 | filtered_files = [file for file in files if (file.endswith(tuple(formats)))] 367 | for f in filtered_files: 368 | source_file = os.path.join(root, f) 369 | r_path = os.path.relpath(source_file, directory) 370 | index = 0 371 | dest_dicts = [] 372 | for target_root in target_roots: 373 | dest_file = os.path.join(target_root, r_path) 374 | dest_dict = { 375 | "root_path": u"{}".format(target_root), 376 | "destination": u"{}".format(dest_file), 377 | "checksum": None, 378 | "manifest": None, 379 | } 380 | if len(checksum_roots) > 0: 381 | cs_root = checksum_roots[index] 382 | cs_file = u"{}".format(os.path.join(cs_root, "{0}.{1}".format(r_path, algorithm))) 383 | dest_dict["checksum"] = cs_file 384 | if len(manifest_files) > 0: 385 | manifest_file = u"{}".format(manifest_files[index]) 386 | dest_dict["manifest"] = manifest_file 387 | dest_dicts.append(dest_dict) 388 | index += 1 389 | file = { 390 | "source": source_file, 391 | "algorithm": algorithm, 392 | "destinations": dest_dicts 393 | } 394 | yield(file) 395 | if not recursive: 396 | return 397 | 398 | 399 | def _confirm_staging_targets(staging_summary: Dict): 400 | 
""" Print a summary of planned staging actions, including manifest and checksum files, and prompt for confirmation 401 | :param staging_summary: a dictionary containin a summary of all staging directories and files 402 | :return: True to continue, False to abort 403 | """ 404 | 405 | print("Source: {0}".format(staging_summary["source"])) 406 | n = 1 407 | for next_target in staging_summary["destinations"]: 408 | print("-" * 40) 409 | print("Destination {0}".format(n)) 410 | print("Data path: {0}".format(next_target["data_path"])) 411 | print("Checksum path: {0}".format(next_target["checksum_path"])) 412 | if "manifest_file" in next_target: 413 | print("Manifest file: {0}".format(next_target["manifest_file"])) 414 | n += 1 415 | print("\n") 416 | response = input("Begin staging using these settings? (y/N)").lower() 417 | if len(response) > 0: 418 | return response[0] == 'y' 419 | else: 420 | return False 421 | 422 | 423 | def _stage_file(next_file: Dict): 424 | """ 425 | Instantiate the FileStager object and begin staging for a given file 426 | :param next_file: a dictionary containing the file and directory names used for staging the file 427 | :return: a triple consisting of the original file name, the staging status, and the details of all destinations 428 | """ 429 | fs = FileStager(source=next_file["source"], destinations=next_file["destinations"], 430 | algorithm=next_file["algorithm"]) 431 | fs.start_copy() 432 | if fs.completed(): 433 | result = (next_file["source"], "staged", fs.destinations) 434 | elif fs.aborted(): 435 | result = (next_file["source"], "aborted", fs.destinations) 436 | elif fs.failed(): 437 | result = (next_file["source"], "failed", fs.destinations) 438 | else: 439 | result = (next_file["source"], "unknown", fs.destinations) 440 | return result 441 | 442 | 443 | def stage_files(args: Namespace): 444 | """ 445 | Main entry point to staging functions. 
Called by the main module 446 | :param args: a Namespace returned by argparse passed from the main module 447 | """ 448 | # Set start time 449 | start_time = datetime.now().replace(microsecond=0) 450 | 451 | # Initialise list of target and checksum directories 452 | targets = [] 453 | checksums = [] 454 | 455 | # Override the consecutive failure threshold if necessary 456 | if args.max_failures is None: 457 | failure_threshold = max_failures 458 | else: 459 | failure_threshold = args.max_failures 460 | 461 | # Initialise staging summary to be used to confirm actions 462 | summary = { 463 | "source": args.dir, 464 | "destinations": [] 465 | } 466 | 467 | # Initialise staging results 468 | results = { 469 | "staged": [], 470 | "failed": [], 471 | "aborted": [], 472 | "unknown": [], 473 | "consecutive_failures": 0, 474 | "failure_threshold": failure_threshold 475 | } 476 | 477 | # Check that the number of target folders, checksum folders and manifests matches 478 | if len(args.targets) != len(args.trees) and len(args.trees) > 0: 479 | print("Number of target directories does not match number of tree directories") 480 | return 481 | elif len(args.targets) != len(args.manifests) and len(args.manifests) > 0: 482 | print("Number of target directories does not match number of manifest files") 483 | return 484 | 485 | # If no checksum directories have been specified, then assume that each target directory 486 | # should contain "files" and "checksums" folders to hold the staged files and checksums respectively 487 | if len(args.trees) == 0: 488 | for t in args.targets: 489 | targets.append(os.path.join(t, "files")) 490 | checksums.append(os.path.join(t, "checksums")) 491 | else: 492 | targets = args.targets 493 | checksums = args.trees 494 | 495 | # Iterate over list of target directories, find the corresponding checksum directory and manifest file 496 | # (if applicable) and add it to the summary 497 | n = 0 498 | for next_target in targets: 499 | destination = { 500 | "data_path": next_target, 501 | "checksum_path": checksums[n] 502 | } 503 | if len(args.manifests) > 0: 504 | destination["manifest_file"] = args.manifests[n] 505 | summary["destinations"].append(destination) 506 | n += 1 507 | 508 | # Unless overridden by command line options, ask the user to confirm the staging details 509 | if args.no_confirm: 510 | stg_continue = True 511 | else: 512 | stg_continue = _confirm_staging_targets(summary) 513 | if not stg_continue: 514 | return 1 515 | 516 | # Build a generator to list all files to be staged, along with their staging destinations, checksum 517 | # destination and manifest files 518 | files_iterable = _get_files_to_stage(directory=args.dir, target_roots=targets, checksum_roots=checksums, 519 | manifest_files=args.manifests) 520 | 521 | # Create a multiprocessing pool with the appropriate number of processes 522 | pool = multiprocessing.Pool(processes=args.processes) 523 | 524 | if args.count_files: 525 | file_count = _count_files(path=args.dir) 526 | else: 527 | file_count = None 528 | 529 | # Have the multiprocessing pool pass each item returned by the generator to stage_files and monitor 530 | # progress via tqdm 531 | for file, status, destinations in tqdm(pool.imap_unordered(_stage_file, files_iterable), total=file_count, 532 | desc="MPT({}p)/Staging files".format(args.processes)): 533 | # Terminate processing if the failure threshold has been exceeded 534 | terminate = _add_to_results(results, file, status, destinations) 535 | if terminate: 536 | break 537 | 538 | # Remove any 
empty folders left behind in the staging area after files have been staged 539 | if not args.keep_empty_folders: 540 | _remove_empty_folders(args.dir, remove_root=False) 541 | 542 | stop_time = datetime.now().replace(microsecond=0) 543 | 544 | results["start_time"] = start_time 545 | results["stop_time"] = stop_time 546 | 547 | # Display, print and email results 548 | _show_results(args, results) 549 | 550 | 551 | def _show_results(args: Namespace, results: Dict): 552 | """ 553 | Display all the results of file staging 554 | :param args: a Namespace returned by argparse passed from the main module 555 | :param results: a Dict containing the results of staging 556 | """ 557 | if results["consecutive_failures"] > results["failure_threshold"]: 558 | staging_interrupted = True 559 | else: 560 | staging_interrupted = False 561 | 562 | summary = _produce_summary(args=args, results=results) 563 | print(summary) 564 | 565 | try: 566 | results.pop("consecutive_failures") 567 | results.pop("failure_threshold") 568 | results.pop("start_time") 569 | results.pop("stop_time") 570 | except KeyError: 571 | pass 572 | 573 | report_dir = _write_reports(args=args, results=results, summary=summary) 574 | print("\nDetailed reports created in: " + report_dir) 575 | 576 | if args.email is not None: 577 | _email_report(recipients=args.email, results=results, reports_dir=report_dir, mail_body=summary, 578 | staging_interrupted=staging_interrupted) 579 | 580 | 581 | def _produce_summary(args: Namespace, results: Dict): 582 | """ 583 | Produce staging summary based on results and initial arguments 584 | :param args: a Namespace returned by argparse, passed from the main module 585 | :param results: a Dict containing the results of staging 586 | :return: a string containing a summary of the staging process 587 | """ 588 | import platform 589 | summary = "Minimum Preservation Tool (MPT): processing report for host " + platform.node() 590 | summary = summary + "\n\nFile staging results for " + args.dir 591 | if results["consecutive_failures"] > results["failure_threshold"]: 592 | summary = summary + "\n\nFile staging was interrupted due to consecutive error threshold breach." 593 | if len(results["staged"]) == 0: 594 | summary = summary + "\n\nNo new files staged."
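The interruption message above is driven by the consecutive-failure accounting in _add_to_results further down: a write failure on any destination increments a streak counter, any other outcome resets it, and stage_files stops submitting work once the streak exceeds the threshold. A minimal sketch of that accounting, using an invented list of per-file outcomes:

def should_terminate(write_failures, failure_threshold):
    # True once more than failure_threshold files fail back-to-back;
    # a single successful file resets the streak, as in _add_to_results.
    streak = 0
    for failed in write_failures:  # e.g. [False, True, True, True]
        streak = streak + 1 if failed else 0
        if streak > failure_threshold:
            return True
    return False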
595 | else: 596 | summary = summary + "\n\nNew files added to storage: " + str(len(results["staged"])) 597 | if len(results["failed"]) > 0: 598 | summary = summary + "\n\nFiles which failed staging: " + str(len(results["failed"])) 599 | if len(results["aborted"]) > 0: 600 | summary = summary + "\n\nFiles incompletely staged: " + str(len(results["aborted"])) 601 | summary = summary + "\n\nTime taken: " + str(results["stop_time"] - results["start_time"]) 602 | return summary 603 | 604 | 605 | def _write_reports(args: Namespace, results: Dict, summary: str): 606 | """ 607 | Write checksum validation results to a file 608 | :param args: a Namespace returned by argparse, passed from the main module 609 | :param results: a Dict containing the results of staging 610 | :param summary: the summary text to write alongside the detailed reports 611 | :return: the directory in which the reports were created 612 | """ 613 | reports_dir = os.path.join(args.output, "staging_reports") 614 | dated_dir = os.path.join(reports_dir, datetime.now().strftime("%Y-%m-%dT%H%M")) 615 | 616 | if not os.path.exists(dated_dir): 617 | try: 618 | os.makedirs(dated_dir) 619 | except Exception as e: 620 | print("Cannot create report directory, error: " + str(e)) 621 | return True 622 | 623 | with open(os.path.join(dated_dir, "summary.txt"), 'w') as out_file: 624 | out_file.write(summary) 625 | 626 | _write_csv_files_from_dictionary(args=args, dictionary=results, output_dir=dated_dir) 627 | 628 | return dated_dir 629 | 630 | 631 | def _write_csv_files_from_dictionary(args: Namespace, dictionary: Dict, output_dir: str): 632 | """ 633 | Write out each list item in a dictionary as a csv file 634 | :param args: a Namespace returned by argparse, passed from the main module 635 | :param dictionary: the dictionary to iterate 636 | :param output_dir: the directory in which to write files 637 | """ 638 | import csv 639 | try: 640 | for k, v in dictionary.items(): 641 | if isinstance(v, list): 642 | if len(v) > 0: 643 | file_name = k + ".csv" 644 | with open(os.path.join(output_dir, file_name), 'w', encoding='utf-8', newline='') as csv_file: 645 | if isinstance(v[0], dict): 646 | output = csv.DictWriter(csv_file, fieldnames=v[0].keys()) 647 | output.writeheader() 648 | else: 649 | output = csv.writer(csv_file) 650 | for el in v: 651 | if isinstance(el, dict): 652 | output.writerow(el) 653 | else: 654 | if getattr(args, "absolute_path", False): 655 | output.writerow([el.replace("*\\",args.dir + "\\")]) 656 | else: 657 | output.writerow([el]) 658 | elif isinstance(v, dict): 659 | if next(iter(v.values())) is None: 660 | _write_csv_files_from_dictionary(args=args, dictionary={k: list(v.keys())}, output_dir=output_dir) 661 | else: 662 | _write_csv_files_from_dictionary(args=args, dictionary=v, output_dir=output_dir) 663 | else: 664 | pass 665 | except StopIteration: 666 | pass 667 | 668 | 669 | def _email_report(recipients: List, results: Dict, reports_dir: str, mail_body: str, staging_interrupted: bool = False): 670 | """ 671 | Email the results of file staging 672 | :param recipients: a List of email recipients 673 | :param results: a Dict containing the results of staging 674 | :param reports_dir: the directory containing any reports which are to be attached 675 | :param mail_body: a string to be used as the mail body text 676 | :param staging_interrupted: whether staging was prematurely terminated due to exceeding the failure threshold 677 | """ 678 | if staging_interrupted: 679 | mail_subject = "BL MPT Staging: Staging interrupted" 680 | elif len(results["failed"]) > 0 or len(results["aborted"])
> 0: 681 | mail_subject = "BL MPT Staging: Errors encountered" 682 | elif len(results["staged"]) == 0: 683 | mail_subject = "BL MPT Staging: No files to stage" 684 | else: 685 | mail_subject = "BL MPT Staging: All files staged successfully" 686 | attachments = [os.path.join(reports_dir, f) for f in os.listdir(reports_dir) if ( 687 | os.path.isfile(os.path.join(reports_dir, f)) and f.endswith("csv"))] 688 | 689 | size = sum(os.path.getsize(f) for f in attachments) 690 | zip = size >= mail_size_threshold 691 | send_email(subject=mail_subject, recipients=recipients, message=mail_body, attachments=attachments, 692 | zip_files=zip) 693 | 694 | 695 | def _add_to_results(existing_results: Dict, new_file: str, file_status: str, new_results: Dict): 696 | """ 697 | Add the outcome of staging for a single file to the dictionary of overall results 698 | :param existing_results: a Dict containing the overall staging results for all files so far 699 | :param new_file: the original path of the file 700 | :param file_status: the summary status of staging for the file 701 | :param new_results: a Dict containing the detailed results of staging for the file 702 | :return: True if the consecutive failure threshold has been exceeded and staging should stop, False otherwise 703 | """ 704 | write_failed = any(r["status"] in [StagingStatus.DATA_WRITE_FAILURE, StagingStatus.CHECKSUM_WRITE_FAILURE] 705 | for r in new_results.values()) 706 | new_entry = {"path": new_file} 707 | for root, values in new_results.items(): 708 | new_entry[root] = values["status"].value 709 | existing_results[file_status].append(new_entry) 710 | if write_failed: 711 | existing_results["consecutive_failures"] += 1 712 | else: 713 | existing_results["consecutive_failures"] = 0 714 | return existing_results["consecutive_failures"] > existing_results["failure_threshold"] 715 | -------------------------------------------------------------------------------- /mpt/timing.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | 4 | def time_usage(func): 5 | def timing_wrapper(*args, **kwargs): 6 | start = time.time() 7 | value = func(*args, **kwargs) 8 | end = time.time() 9 | runtime = end - start 10 | msg = "{func} took {time} seconds" 11 | print(msg.format(func=func.__name__, time=runtime)) 12 | return value 13 | return timing_wrapper 14 | -------------------------------------------------------------------------------- /mptreport/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.0.0" 2 | -------------------------------------------------------------------------------- /mptreport/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | 4 | from .reportcollator import ReportCollator 5 | 6 | 7 | def main(args=None): 8 | if args is None: 9 | args = sys.argv[1:] 10 | 11 | ap = argparse.ArgumentParser(prog="mptreport", description="MPT report collation tool") 12 | ap.add_argument("-i", "--in-dir", dest="base_path", help="full path to base report location") 13 | ap.add_argument("-o", "--out-file", dest="out_file", help="full path to output CSV file") 14 | ap.add_argument("-s", "--start-date", dest="date_start", metavar="START_DATE", 15 | help="earliest date to include (yyyymmdd)") 16 | ap.add_argument("-f", "--finish-date", dest="date_end", metavar="FINISH_DATE", 17 | help="latest date to include (yyyymmdd)") 18 | ap.add_argument("-e", "--email-recipients", dest="email_recipients", nargs="+", 19 | help="e-mail
addresses to receive collated CSV file") 20 | 21 | parsed_args = ap.parse_args(args) 22 | 23 | rc = ReportCollator(**parsed_args.__dict__) 24 | rc.start() 25 | 26 | 27 | if __name__ == '__main__': 28 | main() 29 | -------------------------------------------------------------------------------- /mptreport/reportcollator.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import multiprocessing 3 | import os 4 | from datetime import datetime 5 | 6 | from tqdm import tqdm 7 | 8 | from mpt.defaults import base_output_dir 9 | from mpt.email import send_email 10 | from mpt.filemanager import scan_tree 11 | 12 | 13 | def split_all(path): 14 | allparts = [] 15 | while 1: 16 | parts = os.path.split(path) 17 | # sentinel: nothing further to split for absolute or relative paths 18 | if parts[0] == path or parts[1] == path: 19 | allparts.insert(0, parts[0] or parts[1]) 20 | break 21 | else: 22 | path = parts[0] 23 | allparts.insert(0, parts[1]) 24 | 25 | return allparts 26 | 27 | 28 | def convert_time_string(time_string: str): 29 | split_time = time_string.split(":") 30 | seconds = (int(split_time[0]) * 3600) + (int(split_time[1]) * 60) + int(split_time[2]) 31 | return seconds 32 | 33 | 34 | class ReportCollator: 35 | num_procs = None 36 | base_path = None 37 | date_start = None 38 | date_end = None 39 | out_file = None 40 | email_recipients = None 41 | results = [] 42 | results_sorted = {} 43 | 44 | def __init__(self, 45 | base_path: str = None, 46 | date_start: str = None, 47 | date_end: str = None, 48 | out_file: str = None, 49 | email_recipients: list = None): 50 | if base_path is None: 51 | self.base_path = base_output_dir 52 | else: 53 | self.base_path = base_path 54 | if date_start is None: 55 | self.date_start = datetime(1970, 1, 1, 0, 0, 0) 56 | else: 57 | self.date_start = datetime.strptime(date_start, "%Y%m%d") 58 | if date_end is None: 59 | self.date_end = datetime.now() 60 | else: 61 | self.date_end = datetime.strptime(date_end, "%Y%m%d") 62 | self.out_file = out_file 63 | self.email_recipients = email_recipients 64 | try: 65 | self.num_procs = int(os.environ["NUMBER_OF_PROCESSORS"]) 66 | except KeyError: 67 | self.num_procs = 2 68 | # NUMBER_OF_PROCESSORS is a Windows environment variable; fall back to 2 processes elsewhere 69 | 70 | def _parse_file(self, file_path: str): 71 | result = { 72 | "datetime": None, 73 | "hostname": None, 74 | "action": None, 75 | "path": None, 76 | "time_taken": None, 77 | "total_files": None, 78 | "status": [] 79 | } 80 | path_parts = split_all(file_path) 81 | result["datetime"] = datetime.strptime(path_parts[-2], "%Y-%m-%dT%H%M") 82 | skip = False 83 | if self.date_start <= result["datetime"] <= self.date_end: 84 | with open(file_path, 'r') as in_file: 85 | next_line = in_file.readline().strip() 86 | result["hostname"] = next_line.split(" ")[-1] 87 | for next_line in in_file.readlines(): 88 | next_line = next_line.strip() 89 | if next_line != "": 90 | if "results" in next_line: 91 | result["action"], result["path"] = next_line.split(" results for ") 92 | if result["action"] == "File staging": 93 | return None 94 | else: 95 | if ": " in next_line: 96 | if "ongoing" in next_line: 97 | skip = True 98 | elif "reports created" in next_line: 99 | pass 100 | elif "Time taken" in next_line: 101 | time_str = next_line.split(": ")[-1] 102 | result["time_taken"] = time_str 103 | else: 104 | try: 105 | file_status, file_results = next_line.rsplit(": ", 1) 106 | except Exception as e: 107 | print(f"Error in file: {file_path}, text: '{next_line}'") 108 | raise e 109 | status = { 110 |
"file_status": file_status, 111 | "file_count": None, 112 | "file_size": None 113 | } 114 | if "(" in file_results: 115 | file_count, file_size = file_results.split(" (") 116 | status["file_count"] = int("".join(c for c in file_count if c.isdigit())) 117 | status["file_size"] = int("".join(c for c in file_size if c.isdigit())) 118 | else: 119 | status["file_count"] = int(file_results.replace(",","")) 120 | result["status"].append(status) 121 | else: 122 | skip = True 123 | if not skip: 124 | result["total_files"] = sum([n["file_count"] for n in result["status"]]) 125 | return result 126 | 127 | def _write_report(self): 128 | cols = ["datetime", "hostname", "action", "path", 129 | "time_taken", "total_files", "file_status", "file_count", "file_size" 130 | ] 131 | if self.out_file is not None: 132 | with open(self.out_file, "w+", encoding="utf-8", newline="") as o: 133 | dw = csv.DictWriter(o, fieldnames=cols) 134 | dw.writeheader() 135 | for item in self.results: 136 | new_row = {k:v for k, v in item.items() if k != "status"} 137 | for status in item["status"]: 138 | new_row["file_status"] = status["file_status"] 139 | new_row["file_count"] = status["file_count"] 140 | new_row["file_size"] = status["file_size"] 141 | dw.writerow(new_row) 142 | return self.out_file 143 | 144 | def _email_report(self): 145 | server = self.results[0]["hostname"] 146 | subject = "Collated MPT statistics for {}".format(server) 147 | message = "Minimum Preservation Tool (MPT): collated reports for host {host}\n\n" \ 148 | "Attached are the collated statistics " \ 149 | "for the period {start} - {end}".format(host=server, 150 | start=self.date_start.strftime("%Y-%m-%d"), 151 | end=self.date_end.strftime("%Y-%m-%d")) 152 | send_email(subject=subject, recipients=self.email_recipients, message=message, attachments=[self.out_file]) 153 | 154 | def start(self): 155 | files_iterable = scan_tree(path=self.base_path, recursive=True, formats="summary.txt") 156 | pool = multiprocessing.Pool(processes=self.num_procs) 157 | for result in tqdm(pool.imap_unordered(self._parse_file, files_iterable), 158 | desc="MPTReport({}p)/Collating summaries".format(self.num_procs)): 159 | if result is not None: 160 | self.results.append(result) 161 | self._write_report() 162 | if self.email_recipients is not None: 163 | self._email_report() 164 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import os 3 | import re 4 | 5 | from setuptools import find_packages, setup 6 | 7 | # Project version approach from https://packaging.python.org/guides/single-sourcing-package-version/#single-sourcing-the-version 8 | # Version specified in mpt.__init__.py 9 | here = os.path.abspath(os.path.dirname(__file__)) 10 | 11 | 12 | def read(*parts): 13 | with codecs.open(os.path.join(here, *parts), 'r') as fp: 14 | return fp.read() 15 | 16 | 17 | def find_version(*file_paths): 18 | version_file = read(*file_paths) 19 | version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", 20 | version_file, re.M) 21 | if version_match: 22 | return version_match.group(1) 23 | raise RuntimeError("Unable to find version string.") 24 | 25 | 26 | setup( 27 | name='bl-mpt', 28 | version=find_version('mpt', '__init__.py'), 29 | packages=find_packages(), 30 | url='http://github.com/britishlibrary/mpt', 31 | license='Apache License 2.0', 32 | author='The British Library', 33 | author_email='digital.preservation@bl.uk', 34 | 
34 |     description='Tools for creating and validating checksums',
35 |     entry_points={
36 |         'console_scripts': [
37 |             'mpt = mpt.__main__:main',
38 |             'mptreport = mptreport.__main__:main'
39 |         ]
40 |     },
41 |     install_requires=[
42 |         'tqdm>=4.32',
43 |     ],
44 |     classifiers=[
45 |         'Development Status :: 5 - Production/Stable',
46 | 
47 |         # Indicate who your project is intended for
48 |         'Intended Audience :: End Users/Desktop',
49 |         'Topic :: Utilities',
50 | 
51 |         # Pick your license as you wish (should match "license" above)
52 |         'License :: OSI Approved :: Apache Software License',
53 | 
54 |         # Specify the Python versions you support here. In particular, ensure
55 |         # that you indicate whether you support Python 2, Python 3 or both.
56 |         'Programming Language :: Python :: 3.6',
57 |     ]
58 | )
59 | 
--------------------------------------------------------------------------------
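Note: once installed (e.g. with pip install .), the console_scripts entries above expose the two tools as the mpt and mptreport commands. A collation run can also be driven programmatically through mptreport's main(); a minimal sketch, where the report directory and output file names are hypothetical placeholders:

from mptreport.__main__ import main

# equivalent to: mptreport -i /data/reports -o collated.csv -s 20200101 -f 20201231
main(["-i", "/data/reports",   # base report location to scan for summary.txt files
      "-o", "collated.csv",    # output CSV of collated statistics
      "-s", "20200101",        # earliest date to include (yyyymmdd)
      "-f", "20201231"])       # latest date to include (yyyymmdd)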